[CRIU] [PATCH 08/12] build: Move everything criu related into criu directory
Cyrill Gorcunov
gorcunov at openvz.org
Fri Feb 12 10:05:14 PST 2016
Signed-off-by: Cyrill Gorcunov <gorcunov at openvz.org>
---
.gitignore | 24 +-
Documentation/Makefile | 18 +-
Makefile | 408 +--
Makefile.config | 54 -
Makefile.crtools | 94 -
Makefile.inc | 18 -
Makefile.install | 31 +
action-scripts.c | 77 -
aio.c | 120 -
arch/aarch64/Makefile | 59 -
arch/aarch64/cpu.c | 45 -
arch/aarch64/crtools.c | 233 --
arch/aarch64/include/asm/atomic.h | 98 -
arch/aarch64/include/asm/bitops.h | 7 -
arch/aarch64/include/asm/bitsperlong.h | 6 -
arch/aarch64/include/asm/cpu.h | 1 -
arch/aarch64/include/asm/dump.h | 14 -
arch/aarch64/include/asm/fpu.h | 4 -
arch/aarch64/include/asm/int.h | 6 -
arch/aarch64/include/asm/linkage.h | 24 -
arch/aarch64/include/asm/page.h | 21 -
arch/aarch64/include/asm/parasite-syscall.h | 18 -
arch/aarch64/include/asm/parasite.h | 11 -
arch/aarch64/include/asm/processor-flags.h | 4 -
arch/aarch64/include/asm/restore.h | 28 -
arch/aarch64/include/asm/restorer.h | 121 -
arch/aarch64/include/asm/string.h | 7 -
arch/aarch64/include/asm/syscall-aux.S | 37 -
arch/aarch64/include/asm/syscall-aux.h | 1 -
arch/aarch64/include/asm/types.h | 102 -
arch/aarch64/include/asm/vdso.h | 26 -
arch/aarch64/intraprocedure.S | 22 -
arch/aarch64/parasite-head.S | 21 -
arch/aarch64/restorer.c | 15 -
arch/aarch64/syscall-common.S | 19 -
arch/aarch64/vdso-pie.c | 35 -
arch/arm/Makefile | 59 -
arch/arm/cpu.c | 45 -
arch/arm/crtools.c | 248 --
arch/arm/include/asm/atomic.h | 131 -
arch/arm/include/asm/bitops.h | 7 -
arch/arm/include/asm/bitsperlong.h | 6 -
arch/arm/include/asm/cpu.h | 1 -
arch/arm/include/asm/dump.h | 14 -
arch/arm/include/asm/fpu.h | 4 -
arch/arm/include/asm/int.h | 6 -
arch/arm/include/asm/linkage.h | 24 -
arch/arm/include/asm/page.h | 19 -
arch/arm/include/asm/parasite-syscall.h | 18 -
arch/arm/include/asm/parasite.h | 9 -
arch/arm/include/asm/processor-flags.h | 42 -
arch/arm/include/asm/processor.h | 28 -
arch/arm/include/asm/restore.h | 29 -
arch/arm/include/asm/restorer.h | 163 -
arch/arm/include/asm/string.h | 7 -
arch/arm/include/asm/syscall-aux.S | 13 -
arch/arm/include/asm/syscall-aux.h | 8 -
arch/arm/include/asm/types.h | 137 -
arch/arm/parasite-head.S | 23 -
arch/arm/restorer.c | 15 -
arch/arm/syscall-common.S | 34 -
arch/arm/syscall.def | 107 -
arch/arm/uidiv.S | 186 --
arch/ppc64/Makefile | 55 -
arch/ppc64/cpu.c | 149 -
arch/ppc64/crtools.c | 524 ----
arch/ppc64/include/asm/atomic.h | 112 -
arch/ppc64/include/asm/bitops.h | 174 --
arch/ppc64/include/asm/bitsperlong.h | 6 -
arch/ppc64/include/asm/cmpxchg.h | 96 -
arch/ppc64/include/asm/cpu.h | 1 -
arch/ppc64/include/asm/dump.h | 11 -
arch/ppc64/include/asm/fpu.h | 4 -
arch/ppc64/include/asm/int.h | 6 -
arch/ppc64/include/asm/linkage.h | 301 --
arch/ppc64/include/asm/page.h | 25 -
arch/ppc64/include/asm/parasite-syscall.h | 17 -
arch/ppc64/include/asm/parasite.h | 7 -
arch/ppc64/include/asm/prlimit.h | 14 -
arch/ppc64/include/asm/processor-flags.h | 4 -
arch/ppc64/include/asm/restore.h | 31 -
arch/ppc64/include/asm/restorer.h | 133 -
arch/ppc64/include/asm/string.h | 28 -
arch/ppc64/include/asm/types.h | 113 -
arch/ppc64/include/asm/vdso.h | 34 -
arch/ppc64/memcmp_64.S | 236 --
arch/ppc64/memcpy_power7.S | 213 --
arch/ppc64/misc.S | 197 --
arch/ppc64/parasite-head.S | 46 -
arch/ppc64/restorer.c | 31 -
arch/ppc64/syscall-common-ppc64.S | 24 -
arch/ppc64/syscall-ppc64.def | 105 -
arch/ppc64/syscalls-ppc64.sh | 57 -
arch/ppc64/vdso-pie.c | 155 -
arch/ppc64/vdso-trampoline.S | 11 -
arch/scripts/arm/gen-sys-exec-tbl.pl | 39 -
arch/scripts/arm/gen-syscalls.pl | 95 -
arch/x86/Makefile | 91 -
arch/x86/cpu.c | 491 ---
arch/x86/crtools.c | 572 ----
arch/x86/include/asm/atomic.h | 78 -
arch/x86/include/asm/bitops.h | 113 -
arch/x86/include/asm/bitsperlong.h | 10 -
arch/x86/include/asm/cmpxchg.h | 105 -
arch/x86/include/asm/cpu.h | 207 --
arch/x86/include/asm/dump.h | 11 -
arch/x86/include/asm/fpu.h | 102 -
arch/x86/include/asm/int.h | 6 -
arch/x86/include/asm/linkage.h | 24 -
arch/x86/include/asm/page.h | 19 -
arch/x86/include/asm/parasite-syscall.h | 20 -
arch/x86/include/asm/parasite.h | 10 -
arch/x86/include/asm/prlimit.h | 14 -
arch/x86/include/asm/processor-flags.h | 28 -
arch/x86/include/asm/restore.h | 36 -
arch/x86/include/asm/restorer.h | 181 --
arch/x86/include/asm/string.h | 24 -
arch/x86/include/asm/syscall32.h | 25 -
arch/x86/include/asm/types.h | 142 -
arch/x86/include/asm/vdso.h | 30 -
arch/x86/parasite-head.S | 40 -
arch/x86/prlimit.c | 68 -
arch/x86/restorer.c | 33 -
arch/x86/syscalls/syscall-common-x86-32.S | 36 -
arch/x86/syscalls/syscall-common-x86-64.S | 21 -
arch/x86/syscalls/syscall32.c | 85 -
arch/x86/syscalls/syscall_32.tbl | 92 -
arch/x86/syscalls/syscall_64.tbl | 103 -
arch/x86/vdso-pie.c | 59 -
bfd.c | 327 --
bitmap.c | 54 -
cgroup.c | 1571 ----------
cr-check.c | 958 ------
cr-dedup.c | 197 --
cr-dump.c | 1720 -----------
cr-errno.c | 12 -
cr-exec.c | 170 --
cr-restore.c | 3364 ---------------------
cr-service.c | 1101 -------
cr-show.c | 574 ----
criu/Makefile | 173 ++
criu/Makefile.config | 61 +
criu/Makefile.crtools | 92 +
criu/Makefile.version | 31 +
criu/action-scripts.c | 77 +
criu/aio.c | 120 +
criu/arch/aarch64/Makefile | 7 +
criu/arch/aarch64/Makefile.syscalls | 50 +
criu/arch/aarch64/cpu.c | 45 +
criu/arch/aarch64/crtools.c | 233 ++
criu/arch/aarch64/include/asm/atomic.h | 98 +
criu/arch/aarch64/include/asm/bitops.h | 7 +
criu/arch/aarch64/include/asm/bitsperlong.h | 6 +
criu/arch/aarch64/include/asm/cpu.h | 1 +
criu/arch/aarch64/include/asm/dump.h | 14 +
criu/arch/aarch64/include/asm/fpu.h | 4 +
criu/arch/aarch64/include/asm/int.h | 6 +
criu/arch/aarch64/include/asm/linkage.h | 24 +
criu/arch/aarch64/include/asm/page.h | 21 +
criu/arch/aarch64/include/asm/parasite-syscall.h | 18 +
criu/arch/aarch64/include/asm/parasite.h | 11 +
criu/arch/aarch64/include/asm/processor-flags.h | 4 +
criu/arch/aarch64/include/asm/restore.h | 28 +
criu/arch/aarch64/include/asm/restorer.h | 121 +
criu/arch/aarch64/include/asm/string.h | 7 +
criu/arch/aarch64/include/asm/syscall-aux.S | 37 +
criu/arch/aarch64/include/asm/syscall-aux.h | 1 +
criu/arch/aarch64/include/asm/types.h | 102 +
criu/arch/aarch64/include/asm/vdso.h | 26 +
criu/arch/aarch64/intraprocedure.S | 22 +
criu/arch/aarch64/parasite-head.S | 21 +
criu/arch/aarch64/restorer.c | 15 +
criu/arch/aarch64/syscalls/syscall-common.S | 19 +
criu/arch/aarch64/syscalls/syscall.def | 1 +
criu/arch/aarch64/vdso-pie.c | 35 +
criu/arch/arm/Makefile | 6 +
criu/arch/arm/Makefile.syscalls | 50 +
criu/arch/arm/cpu.c | 45 +
criu/arch/arm/crtools.c | 248 ++
criu/arch/arm/include/asm/atomic.h | 131 +
criu/arch/arm/include/asm/bitops.h | 7 +
criu/arch/arm/include/asm/bitsperlong.h | 6 +
criu/arch/arm/include/asm/cpu.h | 1 +
criu/arch/arm/include/asm/dump.h | 14 +
criu/arch/arm/include/asm/fpu.h | 4 +
criu/arch/arm/include/asm/int.h | 6 +
criu/arch/arm/include/asm/linkage.h | 24 +
criu/arch/arm/include/asm/page.h | 19 +
criu/arch/arm/include/asm/parasite-syscall.h | 18 +
criu/arch/arm/include/asm/parasite.h | 9 +
criu/arch/arm/include/asm/processor-flags.h | 42 +
criu/arch/arm/include/asm/processor.h | 28 +
criu/arch/arm/include/asm/restore.h | 29 +
criu/arch/arm/include/asm/restorer.h | 163 +
criu/arch/arm/include/asm/string.h | 7 +
criu/arch/arm/include/asm/syscall-aux.S | 13 +
criu/arch/arm/include/asm/syscall-aux.h | 8 +
criu/arch/arm/include/asm/types.h | 137 +
criu/arch/arm/parasite-head.S | 23 +
criu/arch/arm/restorer.c | 15 +
criu/arch/arm/syscalls/syscall-common.S | 34 +
criu/arch/arm/syscalls/syscall.def | 107 +
criu/arch/arm/uidiv.S | 186 ++
criu/arch/ppc64/Makefile | 6 +
criu/arch/ppc64/Makefile.syscalls | 50 +
criu/arch/ppc64/cpu.c | 149 +
criu/arch/ppc64/crtools.c | 524 ++++
criu/arch/ppc64/include/asm/atomic.h | 112 +
criu/arch/ppc64/include/asm/bitops.h | 174 ++
criu/arch/ppc64/include/asm/bitsperlong.h | 6 +
criu/arch/ppc64/include/asm/cmpxchg.h | 96 +
criu/arch/ppc64/include/asm/cpu.h | 1 +
criu/arch/ppc64/include/asm/dump.h | 11 +
criu/arch/ppc64/include/asm/fpu.h | 4 +
criu/arch/ppc64/include/asm/int.h | 6 +
criu/arch/ppc64/include/asm/linkage.h | 301 ++
criu/arch/ppc64/include/asm/page.h | 25 +
criu/arch/ppc64/include/asm/parasite-syscall.h | 17 +
criu/arch/ppc64/include/asm/parasite.h | 7 +
criu/arch/ppc64/include/asm/prlimit.h | 14 +
criu/arch/ppc64/include/asm/processor-flags.h | 4 +
criu/arch/ppc64/include/asm/restore.h | 31 +
criu/arch/ppc64/include/asm/restorer.h | 133 +
criu/arch/ppc64/include/asm/string.h | 28 +
criu/arch/ppc64/include/asm/types.h | 113 +
criu/arch/ppc64/include/asm/vdso.h | 34 +
criu/arch/ppc64/memcmp_64.S | 236 ++
criu/arch/ppc64/memcpy_power7.S | 213 ++
criu/arch/ppc64/misc.S | 197 ++
criu/arch/ppc64/parasite-head.S | 46 +
criu/arch/ppc64/restorer.c | 31 +
criu/arch/ppc64/syscalls/syscall-common-ppc64.S | 24 +
criu/arch/ppc64/syscalls/syscall-ppc64.tbl | 105 +
criu/arch/ppc64/vdso-pie.c | 155 +
criu/arch/ppc64/vdso-trampoline.S | 11 +
criu/arch/scripts/arm/gen-sys-exec-tbl.pl | 39 +
criu/arch/scripts/arm/gen-syscalls.pl | 95 +
criu/arch/x86/Makefile | 7 +
criu/arch/x86/Makefile.syscalls | 66 +
criu/arch/x86/cpu.c | 491 +++
criu/arch/x86/crtools.c | 572 ++++
criu/arch/x86/include/asm/atomic.h | 78 +
criu/arch/x86/include/asm/bitops.h | 113 +
criu/arch/x86/include/asm/bitsperlong.h | 10 +
criu/arch/x86/include/asm/cmpxchg.h | 105 +
criu/arch/x86/include/asm/cpu.h | 207 ++
criu/arch/x86/include/asm/dump.h | 11 +
criu/arch/x86/include/asm/fpu.h | 102 +
criu/arch/x86/include/asm/int.h | 6 +
criu/arch/x86/include/asm/linkage.h | 24 +
criu/arch/x86/include/asm/page.h | 19 +
criu/arch/x86/include/asm/parasite-syscall.h | 20 +
criu/arch/x86/include/asm/parasite.h | 10 +
criu/arch/x86/include/asm/prlimit.h | 14 +
criu/arch/x86/include/asm/processor-flags.h | 28 +
criu/arch/x86/include/asm/restore.h | 36 +
criu/arch/x86/include/asm/restorer.h | 181 ++
criu/arch/x86/include/asm/string.h | 24 +
criu/arch/x86/include/asm/syscall32.h | 25 +
criu/arch/x86/include/asm/types.h | 142 +
criu/arch/x86/include/asm/vdso.h | 30 +
criu/arch/x86/parasite-head.S | 40 +
criu/arch/x86/prlimit.c | 68 +
criu/arch/x86/restorer.c | 33 +
criu/arch/x86/syscalls/syscall-common-x86-32.S | 36 +
criu/arch/x86/syscalls/syscall-common-x86-64.S | 21 +
criu/arch/x86/syscalls/syscall32.c | 85 +
criu/arch/x86/syscalls/syscall_32.tbl | 92 +
criu/arch/x86/syscalls/syscall_64.tbl | 103 +
criu/arch/x86/vdso-pie.c | 59 +
criu/bfd.c | 327 ++
criu/bitmap.c | 54 +
criu/cgroup.c | 1571 ++++++++++
criu/cr-check.c | 958 ++++++
criu/cr-dedup.c | 197 ++
criu/cr-dump.c | 1720 +++++++++++
criu/cr-errno.c | 12 +
criu/cr-exec.c | 170 ++
criu/cr-restore.c | 3364 +++++++++++++++++++++
criu/cr-service.c | 1101 +++++++
criu/cr-show.c | 574 ++++
criu/crtools.c | 836 ++++++
criu/eventfd.c | 129 +
criu/eventpoll.c | 229 ++
criu/fault-injection.c | 22 +
criu/fifo.c | 168 ++
criu/file-ids.c | 113 +
criu/file-lock.c | 377 +++
criu/files-ext.c | 93 +
criu/files-reg.c | 1643 ++++++++++
criu/files.c | 1587 ++++++++++
criu/fsnotify.c | 940 ++++++
criu/image-desc.c | 117 +
criu/image.c | 561 ++++
criu/include/action-scripts.h | 29 +
criu/include/aio.h | 15 +
criu/include/asm-generic/bitops.h | 123 +
criu/include/asm-generic/int.h | 15 +
criu/include/asm-generic/string.h | 51 +
criu/include/asm-generic/vdso.h | 12 +
criu/include/bfd.h | 40 +
criu/include/bitmap.h | 7 +
criu/include/bug.h | 39 +
criu/include/cgroup.h | 65 +
criu/include/compiler.h | 87 +
criu/include/config-base.h | 40 +
criu/include/cpu.h | 13 +
criu/include/cr-errno.h | 17 +
criu/include/cr-service-const.h | 6 +
criu/include/cr-service.h | 14 +
criu/include/cr-show.h | 25 +
criu/include/cr_options.h | 117 +
criu/include/criu-log.h | 88 +
criu/include/criu-plugin.h | 132 +
criu/include/crtools.h | 32 +
criu/include/err.h | 53 +
criu/include/errno.h | 9 +
criu/include/eventfd.h | 10 +
criu/include/eventpoll.h | 11 +
criu/include/fault-injection.h | 19 +
criu/include/fcntl.h | 36 +
criu/include/fifo.h | 11 +
criu/include/file-ids.h | 20 +
criu/include/file-lock.h | 64 +
criu/include/files-reg.h | 59 +
criu/include/files.h | 183 ++
criu/include/fs-magic.h | 52 +
criu/include/fsnotify.h | 26 +
criu/include/image-desc.h | 119 +
criu/include/image.h | 190 ++
criu/include/imgset.h | 37 +
criu/include/inet_diag.h | 136 +
criu/include/ipc_ns.h | 9 +
criu/include/irmap.h | 13 +
criu/include/kcmp-ids.h | 29 +
criu/include/kcmp.h | 16 +
criu/include/kerndat.h | 58 +
criu/include/libnetlink.h | 20 +
criu/include/list.h | 423 +++
criu/include/lock.h | 157 +
criu/include/log.h | 41 +
criu/include/lsm.h | 35 +
criu/include/magic.h | 115 +
criu/include/mem.h | 27 +
criu/include/mman.h | 17 +
criu/include/mount.h | 129 +
criu/include/namespaces.h | 130 +
criu/include/net.h | 33 +
criu/include/netfilter.h | 11 +
criu/include/netlink_diag.h | 42 +
criu/include/packet_diag.h | 76 +
criu/include/page-pipe.h | 107 +
criu/include/page-read.h | 90 +
criu/include/page-xfer.h | 47 +
criu/include/pagemap-cache.h | 30 +
criu/include/parasite-syscall.h | 139 +
criu/include/parasite-vdso.h | 93 +
criu/include/parasite.h | 253 ++
criu/include/pid.h | 32 +
criu/include/pipes.h | 57 +
criu/include/plugin.h | 46 +
criu/include/posix-timer.h | 27 +
criu/include/prctl.h | 77 +
criu/include/proc_parse.h | 217 ++
criu/include/protobuf-desc.h | 91 +
criu/include/protobuf.h | 57 +
criu/include/pstree.h | 102 +
criu/include/ptrace.h | 84 +
criu/include/rbtree.h | 89 +
criu/include/restorer.h | 241 ++
criu/include/rst-malloc.h | 74 +
criu/include/rst_info.h | 71 +
criu/include/seccomp.h | 32 +
criu/include/seize.h | 8 +
criu/include/servicefd.h | 35 +
criu/include/setproctitle.h | 19 +
criu/include/shmem.h | 15 +
criu/include/sigframe.h | 66 +
criu/include/signalfd.h | 10 +
criu/include/sk-inet.h | 88 +
criu/include/sk-packet.h | 39 +
criu/include/sk-queue.h | 8 +
criu/include/sockets.h | 89 +
criu/include/stats.h | 48 +
criu/include/string.h | 21 +
criu/include/syscall-types.h | 85 +
criu/include/sysctl.h | 39 +
criu/include/sysfs_parse.h | 14 +
criu/include/timerfd.h | 39 +
criu/include/tty.h | 34 +
criu/include/tun.h | 16 +
criu/include/unix_diag.h | 67 +
criu/include/util-pie.h | 66 +
criu/include/util-vdso.h | 65 +
criu/include/util.h | 284 ++
criu/include/uts_ns.h | 9 +
criu/include/vdso.h | 27 +
criu/include/vma.h | 110 +
criu/include/xmalloc.h | 67 +
criu/ipc_ns.c | 936 ++++++
criu/irmap.c | 489 +++
criu/kcmp-ids.c | 153 +
criu/kerndat.c | 556 ++++
criu/libnetlink.c | 160 +
criu/log.c | 199 ++
criu/lsm.c | 251 ++
criu/mem.c | 473 +++
criu/mount.c | 3455 ++++++++++++++++++++++
criu/namespaces.c | 1403 +++++++++
criu/net.c | 1429 +++++++++
criu/netfilter.c | 124 +
criu/page-pipe.c | 238 ++
criu/page-read.c | 360 +++
criu/page-xfer.c | 880 ++++++
criu/pagemap-cache.c | 173 ++
criu/parasite-syscall.c | 1408 +++++++++
criu/pie-util-fd.c | 1 +
criu/pie-util-vdso.c | 1 +
criu/pie-util.c | 1 +
criu/pie/Makefile | 103 +
criu/pie/Makefile.library | 43 +
criu/pie/log-simple.c | 291 ++
criu/pie/parasite-vdso.c | 218 ++
criu/pie/parasite.c | 727 +++++
criu/pie/pie-reloc.lds.S.in | 30 +
criu/pie/pie-relocs.c | 47 +
criu/pie/pie-relocs.h | 29 +
criu/pie/pie.lds.S.in | 29 +
criu/pie/piegen/Makefile | 17 +
criu/pie/piegen/elf-ppc64.c | 16 +
criu/pie/piegen/elf-x86-32.c | 16 +
criu/pie/piegen/elf-x86-64.c | 16 +
criu/pie/piegen/elf.c | 512 ++++
criu/pie/piegen/main.c | 154 +
criu/pie/piegen/piegen.h | 35 +
criu/pie/piegen/uapi/types.h | 15 +
criu/pie/restorer.c | 1335 +++++++++
criu/pie/util-fd.c | 168 ++
criu/pie/util-vdso.c | 210 ++
criu/pie/util.c | 47 +
criu/pipes.c | 521 ++++
criu/plugin.c | 247 ++
criu/proc_parse.c | 2444 +++++++++++++++
criu/protobuf-desc.c | 104 +
criu/protobuf.c | 692 +++++
criu/pstree.c | 846 ++++++
criu/ptrace.c | 331 +++
criu/rbtree.c | 357 +++
criu/rst-malloc.c | 223 ++
criu/seccomp.c | 272 ++
criu/seize.c | 688 +++++
criu/shmem.c | 449 +++
criu/sigframe.c | 36 +
criu/signalfd.c | 123 +
criu/sk-inet.c | 758 +++++
criu/sk-netlink.c | 233 ++
criu/sk-packet.c | 504 ++++
criu/sk-queue.c | 256 ++
criu/sk-tcp.c | 771 +++++
criu/sk-unix.c | 1435 +++++++++
criu/sockets.c | 731 +++++
criu/stats.c | 157 +
criu/string.c | 60 +
criu/sysctl.c | 467 +++
criu/sysfs_parse.c | 325 ++
criu/timerfd.c | 211 ++
criu/tty.c | 1712 +++++++++++
criu/tun.c | 494 ++++
criu/util.c | 1002 +++++++
criu/uts_ns.c | 71 +
criu/vdso.c | 320 ++
crtools | 1 -
crtools.c | 836 ------
eventfd.c | 129 -
eventpoll.c | 229 --
fault-injection.c | 22 -
fifo.c | 168 --
file-ids.c | 113 -
file-lock.c | 377 ---
files-ext.c | 93 -
files-reg.c | 1643 ----------
files.c | 1587 ----------
fsnotify.c | 940 ------
image-desc.c | 117 -
image.c | 561 ----
images/Makefile | 5 +-
include/action-scripts.h | 29 -
include/aio.h | 15 -
include/asm-generic/bitops.h | 123 -
include/asm-generic/int.h | 15 -
include/asm-generic/string.h | 51 -
include/asm-generic/vdso.h | 12 -
include/bfd.h | 40 -
include/bitmap.h | 7 -
include/bug.h | 39 -
include/cgroup.h | 65 -
include/compiler.h | 87 -
include/config-base.h | 40 -
include/cpu.h | 13 -
include/cr-errno.h | 17 -
include/cr-service-const.h | 6 -
include/cr-service.h | 14 -
include/cr-show.h | 25 -
include/cr_options.h | 117 -
include/criu-log.h | 88 -
include/criu-plugin.h | 132 -
include/crtools.h | 32 -
include/err.h | 53 -
include/errno.h | 9 -
include/eventfd.h | 10 -
include/eventpoll.h | 11 -
include/fault-injection.h | 19 -
include/fcntl.h | 36 -
include/fifo.h | 11 -
include/file-ids.h | 20 -
include/file-lock.h | 64 -
include/files-reg.h | 59 -
include/files.h | 183 --
include/fs-magic.h | 52 -
include/fsnotify.h | 26 -
include/image-desc.h | 119 -
include/image.h | 190 --
include/imgset.h | 37 -
include/inet_diag.h | 136 -
include/ipc_ns.h | 9 -
include/irmap.h | 13 -
include/kcmp-ids.h | 29 -
include/kcmp.h | 16 -
include/kerndat.h | 58 -
include/libnetlink.h | 20 -
include/list.h | 423 ---
include/lock.h | 157 -
include/log.h | 41 -
include/lsm.h | 35 -
include/magic.h | 115 -
include/mem.h | 27 -
include/mman.h | 17 -
include/mount.h | 129 -
include/namespaces.h | 130 -
include/net.h | 33 -
include/netfilter.h | 11 -
include/netlink_diag.h | 42 -
include/packet_diag.h | 76 -
include/page-pipe.h | 107 -
include/page-read.h | 90 -
include/page-xfer.h | 47 -
include/pagemap-cache.h | 30 -
include/parasite-syscall.h | 139 -
include/parasite-vdso.h | 93 -
include/parasite.h | 253 --
include/pid.h | 32 -
include/pipes.h | 57 -
include/plugin.h | 46 -
include/posix-timer.h | 27 -
include/prctl.h | 77 -
include/proc_parse.h | 217 --
include/protobuf-desc.h | 91 -
include/protobuf.h | 57 -
include/pstree.h | 102 -
include/ptrace.h | 84 -
include/rbtree.h | 89 -
include/restorer.h | 241 --
include/rst-malloc.h | 74 -
include/rst_info.h | 71 -
include/seccomp.h | 32 -
include/seize.h | 8 -
include/servicefd.h | 35 -
include/setproctitle.h | 19 -
include/shmem.h | 15 -
include/sigframe.h | 66 -
include/signalfd.h | 10 -
include/sk-inet.h | 88 -
include/sk-packet.h | 39 -
include/sk-queue.h | 8 -
include/sockets.h | 89 -
include/stats.h | 48 -
include/string.h | 21 -
include/syscall-types.h | 85 -
include/sysctl.h | 39 -
include/sysfs_parse.h | 14 -
include/timerfd.h | 39 -
include/tty.h | 34 -
include/tun.h | 16 -
include/unix_diag.h | 67 -
include/util-pie.h | 66 -
include/util-vdso.h | 65 -
include/util.h | 284 --
include/uts_ns.h | 9 -
include/vdso.h | 27 -
include/vma.h | 110 -
include/xmalloc.h | 67 -
ipc_ns.c | 936 ------
irmap.c | 489 ---
kcmp-ids.c | 153 -
kerndat.c | 556 ----
lib/Makefile | 6 +-
lib/c/Makefile | 10 +-
lib/py/Makefile | 2 +-
lib/py/images/Makefile | 19 +-
libnetlink.c | 160 -
log.c | 199 --
lsm.c | 251 --
mem.c | 473 ---
mount.c | 3455 ----------------------
namespaces.c | 1403 ---------
net.c | 1429 ---------
netfilter.c | 124 -
page-pipe.c | 238 --
page-read.c | 360 ---
page-xfer.c | 880 ------
pagemap-cache.c | 173 --
parasite-syscall.c | 1408 ---------
pie/Makefile | 132 -
pie/log-simple.c | 291 --
pie/parasite-vdso.c | 218 --
pie/parasite.c | 727 -----
pie/pie-reloc.lds.S.in | 30 -
pie/pie-relocs.c | 47 -
pie/pie-relocs.h | 29 -
pie/pie.lds.S.in | 29 -
pie/piegen/Makefile | 17 -
pie/piegen/elf-ppc64.c | 16 -
pie/piegen/elf-x86-32.c | 16 -
pie/piegen/elf-x86-64.c | 16 -
pie/piegen/elf.c | 512 ----
pie/piegen/main.c | 154 -
pie/piegen/piegen.h | 35 -
pie/piegen/uapi/types.h | 15 -
pie/restorer.c | 1335 ---------
pie/util-fd.c | 168 --
pie/util-vdso.c | 210 --
pie/util.c | 47 -
pipes.c | 521 ----
plugin.c | 247 --
proc_parse.c | 2444 ---------------
protobuf-desc.c | 104 -
protobuf.c | 692 -----
pstree.c | 846 ------
ptrace.c | 331 ---
rbtree.c | 357 ---
rst-malloc.c | 223 --
scripts/Makefile.build | 251 --
scripts/Makefile.rules | 52 -
scripts/Makefile.version | 36 -
seccomp.c | 272 --
seize.c | 688 -----
shmem.c | 449 ---
sigframe.c | 36 -
signalfd.c | 123 -
sk-inet.c | 758 -----
sk-netlink.c | 233 --
sk-packet.c | 504 ----
sk-queue.c | 256 --
sk-tcp.c | 771 -----
sk-unix.c | 1435 ---------
sockets.c | 731 -----
stats.c | 157 -
string.c | 60 -
sysctl.c | 467 ---
sysfs_parse.c | 325 --
timerfd.c | 211 --
tty.c | 1712 -----------
tun.c | 494 ----
util.c | 1002 -------
uts_ns.c | 71 -
vdso.c | 320 --
666 files changed, 66217 insertions(+), 66580 deletions(-)
delete mode 100644 Makefile.config
delete mode 100644 Makefile.crtools
delete mode 100644 Makefile.inc
create mode 100644 Makefile.install
delete mode 100644 action-scripts.c
delete mode 100644 aio.c
delete mode 100644 arch/aarch64/Makefile
delete mode 100644 arch/aarch64/cpu.c
delete mode 100644 arch/aarch64/crtools.c
delete mode 100644 arch/aarch64/include/asm/atomic.h
delete mode 100644 arch/aarch64/include/asm/bitops.h
delete mode 100644 arch/aarch64/include/asm/bitsperlong.h
delete mode 100644 arch/aarch64/include/asm/cpu.h
delete mode 100644 arch/aarch64/include/asm/dump.h
delete mode 100644 arch/aarch64/include/asm/fpu.h
delete mode 100644 arch/aarch64/include/asm/int.h
delete mode 100644 arch/aarch64/include/asm/linkage.h
delete mode 100644 arch/aarch64/include/asm/page.h
delete mode 100644 arch/aarch64/include/asm/parasite-syscall.h
delete mode 100644 arch/aarch64/include/asm/parasite.h
delete mode 100644 arch/aarch64/include/asm/processor-flags.h
delete mode 100644 arch/aarch64/include/asm/restore.h
delete mode 100644 arch/aarch64/include/asm/restorer.h
delete mode 100644 arch/aarch64/include/asm/string.h
delete mode 100644 arch/aarch64/include/asm/syscall-aux.S
delete mode 100644 arch/aarch64/include/asm/syscall-aux.h
delete mode 100644 arch/aarch64/include/asm/types.h
delete mode 100644 arch/aarch64/include/asm/vdso.h
delete mode 100644 arch/aarch64/intraprocedure.S
delete mode 100644 arch/aarch64/parasite-head.S
delete mode 100644 arch/aarch64/restorer.c
delete mode 100644 arch/aarch64/syscall-common.S
delete mode 100644 arch/aarch64/vdso-pie.c
delete mode 100644 arch/arm/Makefile
delete mode 100644 arch/arm/cpu.c
delete mode 100644 arch/arm/crtools.c
delete mode 100644 arch/arm/include/asm/atomic.h
delete mode 100644 arch/arm/include/asm/bitops.h
delete mode 100644 arch/arm/include/asm/bitsperlong.h
delete mode 100644 arch/arm/include/asm/cpu.h
delete mode 100644 arch/arm/include/asm/dump.h
delete mode 100644 arch/arm/include/asm/fpu.h
delete mode 100644 arch/arm/include/asm/int.h
delete mode 100644 arch/arm/include/asm/linkage.h
delete mode 100644 arch/arm/include/asm/page.h
delete mode 100644 arch/arm/include/asm/parasite-syscall.h
delete mode 100644 arch/arm/include/asm/parasite.h
delete mode 100644 arch/arm/include/asm/processor-flags.h
delete mode 100644 arch/arm/include/asm/processor.h
delete mode 100644 arch/arm/include/asm/restore.h
delete mode 100644 arch/arm/include/asm/restorer.h
delete mode 100644 arch/arm/include/asm/string.h
delete mode 100644 arch/arm/include/asm/syscall-aux.S
delete mode 100644 arch/arm/include/asm/syscall-aux.h
delete mode 100644 arch/arm/include/asm/types.h
delete mode 100644 arch/arm/parasite-head.S
delete mode 100644 arch/arm/restorer.c
delete mode 100644 arch/arm/syscall-common.S
delete mode 100644 arch/arm/syscall.def
delete mode 100644 arch/arm/uidiv.S
delete mode 100644 arch/ppc64/Makefile
delete mode 100644 arch/ppc64/cpu.c
delete mode 100644 arch/ppc64/crtools.c
delete mode 100644 arch/ppc64/include/asm/atomic.h
delete mode 100644 arch/ppc64/include/asm/bitops.h
delete mode 100644 arch/ppc64/include/asm/bitsperlong.h
delete mode 100644 arch/ppc64/include/asm/cmpxchg.h
delete mode 100644 arch/ppc64/include/asm/cpu.h
delete mode 100644 arch/ppc64/include/asm/dump.h
delete mode 100644 arch/ppc64/include/asm/fpu.h
delete mode 100644 arch/ppc64/include/asm/int.h
delete mode 100644 arch/ppc64/include/asm/linkage.h
delete mode 100644 arch/ppc64/include/asm/page.h
delete mode 100644 arch/ppc64/include/asm/parasite-syscall.h
delete mode 100644 arch/ppc64/include/asm/parasite.h
delete mode 100644 arch/ppc64/include/asm/prlimit.h
delete mode 100644 arch/ppc64/include/asm/processor-flags.h
delete mode 100644 arch/ppc64/include/asm/restore.h
delete mode 100644 arch/ppc64/include/asm/restorer.h
delete mode 100644 arch/ppc64/include/asm/string.h
delete mode 100644 arch/ppc64/include/asm/types.h
delete mode 100644 arch/ppc64/include/asm/vdso.h
delete mode 100644 arch/ppc64/memcmp_64.S
delete mode 100644 arch/ppc64/memcpy_power7.S
delete mode 100644 arch/ppc64/misc.S
delete mode 100644 arch/ppc64/parasite-head.S
delete mode 100644 arch/ppc64/restorer.c
delete mode 100644 arch/ppc64/syscall-common-ppc64.S
delete mode 100644 arch/ppc64/syscall-ppc64.def
delete mode 100644 arch/ppc64/syscalls-ppc64.sh
delete mode 100644 arch/ppc64/vdso-pie.c
delete mode 100644 arch/ppc64/vdso-trampoline.S
delete mode 100755 arch/scripts/arm/gen-sys-exec-tbl.pl
delete mode 100755 arch/scripts/arm/gen-syscalls.pl
delete mode 100644 arch/x86/Makefile
delete mode 100644 arch/x86/cpu.c
delete mode 100644 arch/x86/crtools.c
delete mode 100644 arch/x86/include/asm/atomic.h
delete mode 100644 arch/x86/include/asm/bitops.h
delete mode 100644 arch/x86/include/asm/bitsperlong.h
delete mode 100644 arch/x86/include/asm/cmpxchg.h
delete mode 100644 arch/x86/include/asm/cpu.h
delete mode 100644 arch/x86/include/asm/dump.h
delete mode 100644 arch/x86/include/asm/fpu.h
delete mode 100644 arch/x86/include/asm/int.h
delete mode 100644 arch/x86/include/asm/linkage.h
delete mode 100644 arch/x86/include/asm/page.h
delete mode 100644 arch/x86/include/asm/parasite-syscall.h
delete mode 100644 arch/x86/include/asm/parasite.h
delete mode 100644 arch/x86/include/asm/prlimit.h
delete mode 100644 arch/x86/include/asm/processor-flags.h
delete mode 100644 arch/x86/include/asm/restore.h
delete mode 100644 arch/x86/include/asm/restorer.h
delete mode 100644 arch/x86/include/asm/string.h
delete mode 100644 arch/x86/include/asm/syscall32.h
delete mode 100644 arch/x86/include/asm/types.h
delete mode 100644 arch/x86/include/asm/vdso.h
delete mode 100644 arch/x86/parasite-head.S
delete mode 100644 arch/x86/prlimit.c
delete mode 100644 arch/x86/restorer.c
delete mode 100644 arch/x86/syscalls/syscall-common-x86-32.S
delete mode 100644 arch/x86/syscalls/syscall-common-x86-64.S
delete mode 100644 arch/x86/syscalls/syscall32.c
delete mode 100644 arch/x86/syscalls/syscall_32.tbl
delete mode 100644 arch/x86/syscalls/syscall_64.tbl
delete mode 100644 arch/x86/vdso-pie.c
delete mode 100644 bfd.c
delete mode 100644 bitmap.c
delete mode 100644 cgroup.c
delete mode 100644 cr-check.c
delete mode 100644 cr-dedup.c
delete mode 100644 cr-dump.c
delete mode 100644 cr-errno.c
delete mode 100644 cr-exec.c
delete mode 100644 cr-restore.c
delete mode 100644 cr-service.c
delete mode 100644 cr-show.c
create mode 100644 criu/Makefile
create mode 100644 criu/Makefile.config
create mode 100644 criu/Makefile.crtools
create mode 100644 criu/Makefile.version
create mode 100644 criu/action-scripts.c
create mode 100644 criu/aio.c
create mode 100644 criu/arch/aarch64/Makefile
create mode 100644 criu/arch/aarch64/Makefile.syscalls
create mode 100644 criu/arch/aarch64/cpu.c
create mode 100644 criu/arch/aarch64/crtools.c
create mode 100644 criu/arch/aarch64/include/asm/atomic.h
create mode 100644 criu/arch/aarch64/include/asm/bitops.h
create mode 100644 criu/arch/aarch64/include/asm/bitsperlong.h
create mode 100644 criu/arch/aarch64/include/asm/cpu.h
create mode 100644 criu/arch/aarch64/include/asm/dump.h
create mode 100644 criu/arch/aarch64/include/asm/fpu.h
create mode 100644 criu/arch/aarch64/include/asm/int.h
create mode 100644 criu/arch/aarch64/include/asm/linkage.h
create mode 100644 criu/arch/aarch64/include/asm/page.h
create mode 100644 criu/arch/aarch64/include/asm/parasite-syscall.h
create mode 100644 criu/arch/aarch64/include/asm/parasite.h
create mode 100644 criu/arch/aarch64/include/asm/processor-flags.h
create mode 100644 criu/arch/aarch64/include/asm/restore.h
create mode 100644 criu/arch/aarch64/include/asm/restorer.h
create mode 100644 criu/arch/aarch64/include/asm/string.h
create mode 100644 criu/arch/aarch64/include/asm/syscall-aux.S
create mode 100644 criu/arch/aarch64/include/asm/syscall-aux.h
create mode 100644 criu/arch/aarch64/include/asm/types.h
create mode 100644 criu/arch/aarch64/include/asm/vdso.h
create mode 100644 criu/arch/aarch64/intraprocedure.S
create mode 100644 criu/arch/aarch64/parasite-head.S
create mode 100644 criu/arch/aarch64/restorer.c
create mode 100644 criu/arch/aarch64/syscalls/syscall-common.S
create mode 120000 criu/arch/aarch64/syscalls/syscall.def
create mode 100644 criu/arch/aarch64/vdso-pie.c
create mode 100644 criu/arch/arm/Makefile
create mode 100644 criu/arch/arm/Makefile.syscalls
create mode 100644 criu/arch/arm/cpu.c
create mode 100644 criu/arch/arm/crtools.c
create mode 100644 criu/arch/arm/include/asm/atomic.h
create mode 100644 criu/arch/arm/include/asm/bitops.h
create mode 100644 criu/arch/arm/include/asm/bitsperlong.h
create mode 100644 criu/arch/arm/include/asm/cpu.h
create mode 100644 criu/arch/arm/include/asm/dump.h
create mode 100644 criu/arch/arm/include/asm/fpu.h
create mode 100644 criu/arch/arm/include/asm/int.h
create mode 100644 criu/arch/arm/include/asm/linkage.h
create mode 100644 criu/arch/arm/include/asm/page.h
create mode 100644 criu/arch/arm/include/asm/parasite-syscall.h
create mode 100644 criu/arch/arm/include/asm/parasite.h
create mode 100644 criu/arch/arm/include/asm/processor-flags.h
create mode 100644 criu/arch/arm/include/asm/processor.h
create mode 100644 criu/arch/arm/include/asm/restore.h
create mode 100644 criu/arch/arm/include/asm/restorer.h
create mode 100644 criu/arch/arm/include/asm/string.h
create mode 100644 criu/arch/arm/include/asm/syscall-aux.S
create mode 100644 criu/arch/arm/include/asm/syscall-aux.h
create mode 100644 criu/arch/arm/include/asm/types.h
create mode 100644 criu/arch/arm/parasite-head.S
create mode 100644 criu/arch/arm/restorer.c
create mode 100644 criu/arch/arm/syscalls/syscall-common.S
create mode 100644 criu/arch/arm/syscalls/syscall.def
create mode 100644 criu/arch/arm/uidiv.S
create mode 100644 criu/arch/ppc64/Makefile
create mode 100644 criu/arch/ppc64/Makefile.syscalls
create mode 100644 criu/arch/ppc64/cpu.c
create mode 100644 criu/arch/ppc64/crtools.c
create mode 100644 criu/arch/ppc64/include/asm/atomic.h
create mode 100644 criu/arch/ppc64/include/asm/bitops.h
create mode 100644 criu/arch/ppc64/include/asm/bitsperlong.h
create mode 100644 criu/arch/ppc64/include/asm/cmpxchg.h
create mode 100644 criu/arch/ppc64/include/asm/cpu.h
create mode 100644 criu/arch/ppc64/include/asm/dump.h
create mode 100644 criu/arch/ppc64/include/asm/fpu.h
create mode 100644 criu/arch/ppc64/include/asm/int.h
create mode 100644 criu/arch/ppc64/include/asm/linkage.h
create mode 100644 criu/arch/ppc64/include/asm/page.h
create mode 100644 criu/arch/ppc64/include/asm/parasite-syscall.h
create mode 100644 criu/arch/ppc64/include/asm/parasite.h
create mode 100644 criu/arch/ppc64/include/asm/prlimit.h
create mode 100644 criu/arch/ppc64/include/asm/processor-flags.h
create mode 100644 criu/arch/ppc64/include/asm/restore.h
create mode 100644 criu/arch/ppc64/include/asm/restorer.h
create mode 100644 criu/arch/ppc64/include/asm/string.h
create mode 100644 criu/arch/ppc64/include/asm/types.h
create mode 100644 criu/arch/ppc64/include/asm/vdso.h
create mode 100644 criu/arch/ppc64/memcmp_64.S
create mode 100644 criu/arch/ppc64/memcpy_power7.S
create mode 100644 criu/arch/ppc64/misc.S
create mode 100644 criu/arch/ppc64/parasite-head.S
create mode 100644 criu/arch/ppc64/restorer.c
create mode 100644 criu/arch/ppc64/syscalls/syscall-common-ppc64.S
create mode 100644 criu/arch/ppc64/syscalls/syscall-ppc64.tbl
create mode 100644 criu/arch/ppc64/vdso-pie.c
create mode 100644 criu/arch/ppc64/vdso-trampoline.S
create mode 100755 criu/arch/scripts/arm/gen-sys-exec-tbl.pl
create mode 100755 criu/arch/scripts/arm/gen-syscalls.pl
create mode 100644 criu/arch/x86/Makefile
create mode 100644 criu/arch/x86/Makefile.syscalls
create mode 100644 criu/arch/x86/cpu.c
create mode 100644 criu/arch/x86/crtools.c
create mode 100644 criu/arch/x86/include/asm/atomic.h
create mode 100644 criu/arch/x86/include/asm/bitops.h
create mode 100644 criu/arch/x86/include/asm/bitsperlong.h
create mode 100644 criu/arch/x86/include/asm/cmpxchg.h
create mode 100644 criu/arch/x86/include/asm/cpu.h
create mode 100644 criu/arch/x86/include/asm/dump.h
create mode 100644 criu/arch/x86/include/asm/fpu.h
create mode 100644 criu/arch/x86/include/asm/int.h
create mode 100644 criu/arch/x86/include/asm/linkage.h
create mode 100644 criu/arch/x86/include/asm/page.h
create mode 100644 criu/arch/x86/include/asm/parasite-syscall.h
create mode 100644 criu/arch/x86/include/asm/parasite.h
create mode 100644 criu/arch/x86/include/asm/prlimit.h
create mode 100644 criu/arch/x86/include/asm/processor-flags.h
create mode 100644 criu/arch/x86/include/asm/restore.h
create mode 100644 criu/arch/x86/include/asm/restorer.h
create mode 100644 criu/arch/x86/include/asm/string.h
create mode 100644 criu/arch/x86/include/asm/syscall32.h
create mode 100644 criu/arch/x86/include/asm/types.h
create mode 100644 criu/arch/x86/include/asm/vdso.h
create mode 100644 criu/arch/x86/parasite-head.S
create mode 100644 criu/arch/x86/prlimit.c
create mode 100644 criu/arch/x86/restorer.c
create mode 100644 criu/arch/x86/syscalls/syscall-common-x86-32.S
create mode 100644 criu/arch/x86/syscalls/syscall-common-x86-64.S
create mode 100644 criu/arch/x86/syscalls/syscall32.c
create mode 100644 criu/arch/x86/syscalls/syscall_32.tbl
create mode 100644 criu/arch/x86/syscalls/syscall_64.tbl
create mode 100644 criu/arch/x86/vdso-pie.c
create mode 100644 criu/bfd.c
create mode 100644 criu/bitmap.c
create mode 100644 criu/cgroup.c
create mode 100644 criu/cr-check.c
create mode 100644 criu/cr-dedup.c
create mode 100644 criu/cr-dump.c
create mode 100644 criu/cr-errno.c
create mode 100644 criu/cr-exec.c
create mode 100644 criu/cr-restore.c
create mode 100644 criu/cr-service.c
create mode 100644 criu/cr-show.c
create mode 100644 criu/crtools.c
create mode 100644 criu/eventfd.c
create mode 100644 criu/eventpoll.c
create mode 100644 criu/fault-injection.c
create mode 100644 criu/fifo.c
create mode 100644 criu/file-ids.c
create mode 100644 criu/file-lock.c
create mode 100644 criu/files-ext.c
create mode 100644 criu/files-reg.c
create mode 100644 criu/files.c
create mode 100644 criu/fsnotify.c
create mode 100644 criu/image-desc.c
create mode 100644 criu/image.c
create mode 100644 criu/include/action-scripts.h
create mode 100644 criu/include/aio.h
create mode 100644 criu/include/asm-generic/bitops.h
create mode 100644 criu/include/asm-generic/int.h
create mode 100644 criu/include/asm-generic/string.h
create mode 100644 criu/include/asm-generic/vdso.h
create mode 100644 criu/include/bfd.h
create mode 100644 criu/include/bitmap.h
create mode 100644 criu/include/bug.h
create mode 100644 criu/include/cgroup.h
create mode 100644 criu/include/compiler.h
create mode 100644 criu/include/config-base.h
create mode 100644 criu/include/cpu.h
create mode 100644 criu/include/cr-errno.h
create mode 100644 criu/include/cr-service-const.h
create mode 100644 criu/include/cr-service.h
create mode 100644 criu/include/cr-show.h
create mode 100644 criu/include/cr_options.h
create mode 100644 criu/include/criu-log.h
create mode 100644 criu/include/criu-plugin.h
create mode 100644 criu/include/crtools.h
create mode 100644 criu/include/err.h
create mode 100644 criu/include/errno.h
create mode 100644 criu/include/eventfd.h
create mode 100644 criu/include/eventpoll.h
create mode 100644 criu/include/fault-injection.h
create mode 100644 criu/include/fcntl.h
create mode 100644 criu/include/fifo.h
create mode 100644 criu/include/file-ids.h
create mode 100644 criu/include/file-lock.h
create mode 100644 criu/include/files-reg.h
create mode 100644 criu/include/files.h
create mode 100644 criu/include/fs-magic.h
create mode 100644 criu/include/fsnotify.h
create mode 100644 criu/include/image-desc.h
create mode 100644 criu/include/image.h
create mode 100644 criu/include/imgset.h
create mode 100644 criu/include/inet_diag.h
create mode 100644 criu/include/ipc_ns.h
create mode 100644 criu/include/irmap.h
create mode 100644 criu/include/kcmp-ids.h
create mode 100644 criu/include/kcmp.h
create mode 100644 criu/include/kerndat.h
create mode 100644 criu/include/libnetlink.h
create mode 100644 criu/include/list.h
create mode 100644 criu/include/lock.h
create mode 100644 criu/include/log.h
create mode 100644 criu/include/lsm.h
create mode 100644 criu/include/magic.h
create mode 100644 criu/include/mem.h
create mode 100644 criu/include/mman.h
create mode 100644 criu/include/mount.h
create mode 100644 criu/include/namespaces.h
create mode 100644 criu/include/net.h
create mode 100644 criu/include/netfilter.h
create mode 100644 criu/include/netlink_diag.h
create mode 100644 criu/include/packet_diag.h
create mode 100644 criu/include/page-pipe.h
create mode 100644 criu/include/page-read.h
create mode 100644 criu/include/page-xfer.h
create mode 100644 criu/include/pagemap-cache.h
create mode 100644 criu/include/parasite-syscall.h
create mode 100644 criu/include/parasite-vdso.h
create mode 100644 criu/include/parasite.h
create mode 100644 criu/include/pid.h
create mode 100644 criu/include/pipes.h
create mode 100644 criu/include/plugin.h
create mode 100644 criu/include/posix-timer.h
create mode 100644 criu/include/prctl.h
create mode 100644 criu/include/proc_parse.h
create mode 100644 criu/include/protobuf-desc.h
create mode 100644 criu/include/protobuf.h
create mode 100644 criu/include/pstree.h
create mode 100644 criu/include/ptrace.h
create mode 100644 criu/include/rbtree.h
create mode 100644 criu/include/restorer.h
create mode 100644 criu/include/rst-malloc.h
create mode 100644 criu/include/rst_info.h
create mode 100644 criu/include/seccomp.h
create mode 100644 criu/include/seize.h
create mode 100644 criu/include/servicefd.h
create mode 100644 criu/include/setproctitle.h
create mode 100644 criu/include/shmem.h
create mode 100644 criu/include/sigframe.h
create mode 100644 criu/include/signalfd.h
create mode 100644 criu/include/sk-inet.h
create mode 100644 criu/include/sk-packet.h
create mode 100644 criu/include/sk-queue.h
create mode 100644 criu/include/sockets.h
create mode 100644 criu/include/stats.h
create mode 100644 criu/include/string.h
create mode 100644 criu/include/syscall-types.h
create mode 100644 criu/include/sysctl.h
create mode 100644 criu/include/sysfs_parse.h
create mode 100644 criu/include/timerfd.h
create mode 100644 criu/include/tty.h
create mode 100644 criu/include/tun.h
create mode 100644 criu/include/unix_diag.h
create mode 100644 criu/include/util-pie.h
create mode 100644 criu/include/util-vdso.h
create mode 100644 criu/include/util.h
create mode 100644 criu/include/uts_ns.h
create mode 100644 criu/include/vdso.h
create mode 100644 criu/include/vma.h
create mode 100644 criu/include/xmalloc.h
create mode 100644 criu/ipc_ns.c
create mode 100644 criu/irmap.c
create mode 100644 criu/kcmp-ids.c
create mode 100644 criu/kerndat.c
create mode 100644 criu/libnetlink.c
create mode 100644 criu/log.c
create mode 100644 criu/lsm.c
create mode 100644 criu/mem.c
create mode 100644 criu/mount.c
create mode 100644 criu/namespaces.c
create mode 100644 criu/net.c
create mode 100644 criu/netfilter.c
create mode 100644 criu/page-pipe.c
create mode 100644 criu/page-read.c
create mode 100644 criu/page-xfer.c
create mode 100644 criu/pagemap-cache.c
create mode 100644 criu/parasite-syscall.c
create mode 120000 criu/pie-util-fd.c
create mode 120000 criu/pie-util-vdso.c
create mode 120000 criu/pie-util.c
create mode 100644 criu/pie/Makefile
create mode 100644 criu/pie/Makefile.library
create mode 100644 criu/pie/log-simple.c
create mode 100644 criu/pie/parasite-vdso.c
create mode 100644 criu/pie/parasite.c
create mode 100644 criu/pie/pie-reloc.lds.S.in
create mode 100644 criu/pie/pie-relocs.c
create mode 100644 criu/pie/pie-relocs.h
create mode 100644 criu/pie/pie.lds.S.in
create mode 100644 criu/pie/piegen/Makefile
create mode 100644 criu/pie/piegen/elf-ppc64.c
create mode 100644 criu/pie/piegen/elf-x86-32.c
create mode 100644 criu/pie/piegen/elf-x86-64.c
create mode 100644 criu/pie/piegen/elf.c
create mode 100644 criu/pie/piegen/main.c
create mode 100644 criu/pie/piegen/piegen.h
create mode 100644 criu/pie/piegen/uapi/types.h
create mode 100644 criu/pie/restorer.c
create mode 100644 criu/pie/util-fd.c
create mode 100644 criu/pie/util-vdso.c
create mode 100644 criu/pie/util.c
create mode 100644 criu/pipes.c
create mode 100644 criu/plugin.c
create mode 100644 criu/proc_parse.c
create mode 100644 criu/protobuf-desc.c
create mode 100644 criu/protobuf.c
create mode 100644 criu/pstree.c
create mode 100644 criu/ptrace.c
create mode 100644 criu/rbtree.c
create mode 100644 criu/rst-malloc.c
create mode 100644 criu/seccomp.c
create mode 100644 criu/seize.c
create mode 100644 criu/shmem.c
create mode 100644 criu/sigframe.c
create mode 100644 criu/signalfd.c
create mode 100644 criu/sk-inet.c
create mode 100644 criu/sk-netlink.c
create mode 100644 criu/sk-packet.c
create mode 100644 criu/sk-queue.c
create mode 100644 criu/sk-tcp.c
create mode 100644 criu/sk-unix.c
create mode 100644 criu/sockets.c
create mode 100644 criu/stats.c
create mode 100644 criu/string.c
create mode 100644 criu/sysctl.c
create mode 100644 criu/sysfs_parse.c
create mode 100644 criu/timerfd.c
create mode 100644 criu/tty.c
create mode 100644 criu/tun.c
create mode 100644 criu/util.c
create mode 100644 criu/uts_ns.c
create mode 100644 criu/vdso.c
delete mode 120000 crtools
delete mode 100644 crtools.c
delete mode 100644 eventfd.c
delete mode 100644 eventpoll.c
delete mode 100644 fault-injection.c
delete mode 100644 fifo.c
delete mode 100644 file-ids.c
delete mode 100644 file-lock.c
delete mode 100644 files-ext.c
delete mode 100644 files-reg.c
delete mode 100644 files.c
delete mode 100644 fsnotify.c
delete mode 100644 image-desc.c
delete mode 100644 image.c
delete mode 100644 include/action-scripts.h
delete mode 100644 include/aio.h
delete mode 100644 include/asm-generic/bitops.h
delete mode 100644 include/asm-generic/int.h
delete mode 100644 include/asm-generic/string.h
delete mode 100644 include/asm-generic/vdso.h
delete mode 100644 include/bfd.h
delete mode 100644 include/bitmap.h
delete mode 100644 include/bug.h
delete mode 100644 include/cgroup.h
delete mode 100644 include/compiler.h
delete mode 100644 include/config-base.h
delete mode 100644 include/cpu.h
delete mode 100644 include/cr-errno.h
delete mode 100644 include/cr-service-const.h
delete mode 100644 include/cr-service.h
delete mode 100644 include/cr-show.h
delete mode 100644 include/cr_options.h
delete mode 100644 include/criu-log.h
delete mode 100644 include/criu-plugin.h
delete mode 100644 include/crtools.h
delete mode 100644 include/err.h
delete mode 100644 include/errno.h
delete mode 100644 include/eventfd.h
delete mode 100644 include/eventpoll.h
delete mode 100644 include/fault-injection.h
delete mode 100644 include/fcntl.h
delete mode 100644 include/fifo.h
delete mode 100644 include/file-ids.h
delete mode 100644 include/file-lock.h
delete mode 100644 include/files-reg.h
delete mode 100644 include/files.h
delete mode 100644 include/fs-magic.h
delete mode 100644 include/fsnotify.h
delete mode 100644 include/image-desc.h
delete mode 100644 include/image.h
delete mode 100644 include/imgset.h
delete mode 100644 include/inet_diag.h
delete mode 100644 include/ipc_ns.h
delete mode 100644 include/irmap.h
delete mode 100644 include/kcmp-ids.h
delete mode 100644 include/kcmp.h
delete mode 100644 include/kerndat.h
delete mode 100644 include/libnetlink.h
delete mode 100644 include/list.h
delete mode 100644 include/lock.h
delete mode 100644 include/log.h
delete mode 100644 include/lsm.h
delete mode 100644 include/magic.h
delete mode 100644 include/mem.h
delete mode 100644 include/mman.h
delete mode 100644 include/mount.h
delete mode 100644 include/namespaces.h
delete mode 100644 include/net.h
delete mode 100644 include/netfilter.h
delete mode 100644 include/netlink_diag.h
delete mode 100644 include/packet_diag.h
delete mode 100644 include/page-pipe.h
delete mode 100644 include/page-read.h
delete mode 100644 include/page-xfer.h
delete mode 100644 include/pagemap-cache.h
delete mode 100644 include/parasite-syscall.h
delete mode 100644 include/parasite-vdso.h
delete mode 100644 include/parasite.h
delete mode 100644 include/pid.h
delete mode 100644 include/pipes.h
delete mode 100644 include/plugin.h
delete mode 100644 include/posix-timer.h
delete mode 100644 include/prctl.h
delete mode 100644 include/proc_parse.h
delete mode 100644 include/protobuf-desc.h
delete mode 100644 include/protobuf.h
delete mode 100644 include/pstree.h
delete mode 100644 include/ptrace.h
delete mode 100644 include/rbtree.h
delete mode 100644 include/restorer.h
delete mode 100644 include/rst-malloc.h
delete mode 100644 include/rst_info.h
delete mode 100644 include/seccomp.h
delete mode 100644 include/seize.h
delete mode 100644 include/servicefd.h
delete mode 100644 include/setproctitle.h
delete mode 100644 include/shmem.h
delete mode 100644 include/sigframe.h
delete mode 100644 include/signalfd.h
delete mode 100644 include/sk-inet.h
delete mode 100644 include/sk-packet.h
delete mode 100644 include/sk-queue.h
delete mode 100644 include/sockets.h
delete mode 100644 include/stats.h
delete mode 100644 include/string.h
delete mode 100644 include/syscall-types.h
delete mode 100644 include/sysctl.h
delete mode 100644 include/sysfs_parse.h
delete mode 100644 include/timerfd.h
delete mode 100644 include/tty.h
delete mode 100644 include/tun.h
delete mode 100644 include/unix_diag.h
delete mode 100644 include/util-pie.h
delete mode 100644 include/util-vdso.h
delete mode 100644 include/util.h
delete mode 100644 include/uts_ns.h
delete mode 100644 include/vdso.h
delete mode 100644 include/vma.h
delete mode 100644 include/xmalloc.h
delete mode 100644 ipc_ns.c
delete mode 100644 irmap.c
delete mode 100644 kcmp-ids.c
delete mode 100644 kerndat.c
delete mode 100644 libnetlink.c
delete mode 100644 log.c
delete mode 100644 lsm.c
delete mode 100644 mem.c
delete mode 100644 mount.c
delete mode 100644 namespaces.c
delete mode 100644 net.c
delete mode 100644 netfilter.c
delete mode 100644 page-pipe.c
delete mode 100644 page-read.c
delete mode 100644 page-xfer.c
delete mode 100644 pagemap-cache.c
delete mode 100644 parasite-syscall.c
delete mode 100644 pie/Makefile
delete mode 100644 pie/log-simple.c
delete mode 100644 pie/parasite-vdso.c
delete mode 100644 pie/parasite.c
delete mode 100644 pie/pie-reloc.lds.S.in
delete mode 100644 pie/pie-relocs.c
delete mode 100644 pie/pie-relocs.h
delete mode 100644 pie/pie.lds.S.in
delete mode 100644 pie/piegen/Makefile
delete mode 100644 pie/piegen/elf-ppc64.c
delete mode 100644 pie/piegen/elf-x86-32.c
delete mode 100644 pie/piegen/elf-x86-64.c
delete mode 100644 pie/piegen/elf.c
delete mode 100644 pie/piegen/main.c
delete mode 100644 pie/piegen/piegen.h
delete mode 100644 pie/piegen/uapi/types.h
delete mode 100644 pie/restorer.c
delete mode 100644 pie/util-fd.c
delete mode 100644 pie/util-vdso.c
delete mode 100644 pie/util.c
delete mode 100644 pipes.c
delete mode 100644 plugin.c
delete mode 100644 proc_parse.c
delete mode 100644 protobuf-desc.c
delete mode 100644 protobuf.c
delete mode 100644 pstree.c
delete mode 100644 ptrace.c
delete mode 100644 rbtree.c
delete mode 100644 rst-malloc.c
delete mode 100644 scripts/Makefile.build
delete mode 100644 scripts/Makefile.rules
delete mode 100644 scripts/Makefile.version
delete mode 100644 seccomp.c
delete mode 100644 seize.c
delete mode 100644 shmem.c
delete mode 100644 sigframe.c
delete mode 100644 signalfd.c
delete mode 100644 sk-inet.c
delete mode 100644 sk-netlink.c
delete mode 100644 sk-packet.c
delete mode 100644 sk-queue.c
delete mode 100644 sk-tcp.c
delete mode 100644 sk-unix.c
delete mode 100644 sockets.c
delete mode 100644 stats.c
delete mode 100644 string.c
delete mode 100644 sysctl.c
delete mode 100644 sysfs_parse.c
delete mode 100644 timerfd.c
delete mode 100644 tty.c
delete mode 100644 tun.c
delete mode 100644 util.c
delete mode 100644 uts_ns.c
delete mode 100644 vdso.c
diff --git a/.gitignore b/.gitignore
index d1ee8415e8d4..c46bc5f75ad4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,31 +6,15 @@
*.out
*.swp
*.swo
-*-blob.h
*.so
.git-ignore
*.patch
*.pyc
-criu
cscope*
tags
TAGS
-Makefile.local
-syscall-x86-64.S
-include/syscall.h
-include/syscall-codes.h
-protobuf/*.c
-protobuf/*.h
-protobuf/google/protobuf/*.c
-protobuf/google/protobuf/*.h
-include/version.h
-arch/x86/sys-exec-tbl.c
-arch/x86/syscalls.S
-pie/pie.lds.S
-pie/piegen/piegen
-include/config.h
-protobuf-desc-gen.h
-criu.pc
-build
+images/*.c
+images/*.h
+images/google/protobuf/*.c
+images/google/protobuf/*.h
.gitid
-usr/
diff --git a/Documentation/Makefile b/Documentation/Makefile
index e236635e5b20..77e3a26bb3ad 100644
--- a/Documentation/Makefile
+++ b/Documentation/Makefile
@@ -1,4 +1,5 @@
--include ../Makefile.inc
+include $(__nmk_dir)/include.mk
+include $(__nmk_dir)/macro.mk
ASCIIDOC := asciidoc
A2X := a2x
@@ -9,10 +10,9 @@ XMLS := $(patsubst %.txt,%.xml,$(SRC))
MANS := $(patsubst %.txt,%.8,$(SRC))
MAN8DIR := $(MANDIR)/man8
-GROFF=groff
-PAPER=$(shell paperconf 2>/dev/null || echo letter)
-GROFF_OPTS := -Tps -t -dpaper=$(PAPER) -P-p$(PAPER) \
- -man -msafer -rC1 -rD1 -rS11
+GROFF :=groff
+PAPER :=$(shell paperconf 2>/dev/null || echo letter)
+GROFF_OPTS := -Tps -t -dpaper=$(PAPER) -P-p$(PAPER) -man -msafer -rC1 -rD1 -rS11
PSS := $(MANS:%.8=%.ps)
PDFS := $(MANS:%.8=%.pdf)
@@ -27,20 +27,20 @@ check:
done
%.8: %.txt
- $(E) " GEN " $@
+ $(call msg-gen, $@)
$(Q) $(ASCIIDOC) -b docbook -d manpage -o $(patsubst %.8,%.xml,$@) $<
$(Q) $(XMLTO) man --skip-validation $(patsubst %.8,%.xml,$@) 2>/dev/null
%.ps: %.8
- $(E) " GEN " $@
+ $(call msg-gen, $@)
$(Q) $(GROFF) $(GROFF_OPTS) $^ > $@
%.pdf: %.ps
- $(E) " GEN " $@
+ $(call msg-gen, $@)
$(Q) ps2pdf $< $@
clean:
- $(E) " CLEAN "
+ $(call msg-clean, "Documentation")
$(Q) rm -f $(XMLS) $(MANS) $(PSS) $(PDFS)
install: $(MANS)
diff --git a/Makefile b/Makefile
index 18956b455596..9f113cccd4a4 100644
--- a/Makefile
+++ b/Makefile
@@ -1,98 +1,52 @@
+#
# Import the build engine first
__nmk_dir=$(CURDIR)/scripts/nmk/scripts/
export __nmk_dir
include $(__nmk_dir)/include.mk
-
-VERSION_MAJOR := 1
-VERSION_MINOR := 8
-VERSION_SUBLEVEL :=
-VERSION_EXTRA :=
-VERSION_NAME :=
-VERSION_SO_MAJOR := 1
-VERSION_SO_MINOR := 0
-
-export VERSION_MAJOR VERSION_MINOR VERSION_SUBLEVEL VERSION_EXTRA VERSION_NAME
-export VERSION_SO_MAJOR VERSION_SO_MINOR
-
-#
-# FIXME zdtm building procedure requires implicit rules
-# so I can't use strict make file mode and drop completely
-# all of implicit rules, so I tuned only .SUFFIXES:
-#
-# In future zdtm makefiles need to be fixed and the line below
-# may be uncommented.
-#
-#MAKEFLAGS := -r -R
+include $(__nmk_dir)/macro.mk
#
-# Common definitions
-#
-
-FIND := find
-CSCOPE := cscope
-RM := rm -f
-LD := $(CROSS_COMPILE)ld
-CC := $(CROSS_COMPILE)gcc
-NM := $(CROSS_COMPILE)nm
-SH := bash
-MAKE := make
-OBJCOPY := $(CROSS_COMPILE)objcopy
+# To build host helpers.
HOSTCC ?= gcc
HOSTLD ?= ld
+export HOSTCC HOSTLD
CFLAGS += $(USERCFLAGS)
-HOSTCFLAGS ?= $(CFLAGS)
+export CFLAGS
-export HOSTCC
-export HOSTLD
+HOSTCFLAGS ?= $(CFLAGS)
export HOSTCFLAGS
+#
+# Where we live.
+SRC_DIR := $(CURDIR)
+export SRC_DIR
-ifeq ($(ARCH),x86_64)
- ARCH := x86
-endif
-
-ifeq ($(ARCH),x86)
- SRCARCH := x86
- DEFINES := -DCONFIG_X86_64
- LDARCH := i386:x86-64
- VDSO := y
-endif
-ifeq ($(ARCH),ia32)
- SRCARCH := x86
- DEFINES := -DCONFIG_X86_32
- LDARCH := i386
- ldflags-y += -m elf_i386
- VDSO := y
- USERCFLAGS += -m32
- PROTOUFIX := y
- export PROTOUFIX ldflags-y
-endif
+#
+# General architecture specific options.
+UNAME-M := $(shell uname -m)
+export UNAME-M
-ifeq ($(GCOV),1)
- LDFLAGS += -lgcov
- DEBUG := 1 # disable optimization if we want to measure code coverage
-%.o $(PROGRAM): override CFLAGS += --coverage -fno-exceptions -fno-inline
-endif
+ifeq ($(ARCH),arm)
+ ARMV := $(shell echo $(UNAME-M) | sed -nr 's/armv([[:digit:]]).*/\1/p; t; i7')
+ DEFINES := -DCONFIG_ARMV$(ARMV)
-ifeq ($(shell echo $(ARCH) | sed -e 's/arm.*/arm/'),arm)
- ARMV := $(shell echo $(ARCH) | sed -nr 's/armv([[:digit:]]).*/\1/p; t; i7')
- SRCARCH := arm
- DEFINES := -DCONFIG_ARMV$(ARMV)
+ USERCFLAGS += -Wa,-mimplicit-it=always
- USERCFLAGS += -Wa,-mimplicit-it=always
+ ifeq ($(ARMV),6)
+ USERCFLAGS += -march=armv6
+ endif
- ifeq ($(ARMV),6)
- USERCFLAGS += -march=armv6
- endif
+ ifeq ($(ARMV),7)
+ USERCFLAGS += -march=armv7-a
+ endif
- ifeq ($(ARMV),7)
- USERCFLAGS += -march=armv7-a
- endif
+ PROTOUFIX := y
+endif
- PROTOUFIX := y
- export PROTOUFIX
+ifeq ($(ARCH),x86)
+ DEFINES := -DCONFIG_X86_64
endif
ifeq ($(ARCH),aarch64)
@@ -103,239 +57,138 @@ endif
# The PowerPC 64 bits architecture could be big or little endian.
# They are handled in the same way.
#
-ifeq ($(shell echo $(ARCH) | sed -e 's/ppc64.*/ppc64/'),ppc64)
- ifeq ($(ARCH),ppc64)
- error := $(error ppc64 big endian not yet supported)
- endif
- SRCARCH := ppc64
- DEFINES := -DCONFIG_PPC64
- LDARCH := powerpc:common64
- VDSO := y
-endif
-
-LDARCH ?= $(SRCARCH)
+ifeq ($(ARCH),powerpc)
+ ifeq ($(UNAME-M),ppc64)
+ error := $(error ppc64 big endian not yet supported)
+ endif
-SRC_DIR ?= $(CURDIR)
-ARCH_DIR := arch/$(SRCARCH)
-
-$(if $(wildcard $(ARCH_DIR)),,$(error "The architecture $(ARCH) isn't supported"))
-
-#
-# piegen might be disabled by hands. Don't use it until
-# you know what you're doing.
-ifneq ($(filter ia32 x86 ppc64le, $(ARCH)),)
-ifneq ($(PIEGEN),no)
- piegen-y := y
- export piegen-y
-endif
+ DEFINES := -DCONFIG_PPC64
endif
-cflags-y += -iquote include -iquote pie -iquote . -I/usr/include/libnl3
-cflags-y += -iquote $(ARCH_DIR) -iquote $(ARCH_DIR)/include
-cflags-y += -fno-strict-aliasing
-export cflags-y
+export PROTOUFIX DEFINES USERCFLAGS
-LIBS := -lrt -lpthread -lprotobuf-c -ldl -lnl-3
+#
+# Independent options for all tools.
+DEFINES += -D_FILE_OFFSET_BITS=64
+DEFINES += -D_GNU_SOURCE
-DEFINES += -D_FILE_OFFSET_BITS=64
-DEFINES += -D_GNU_SOURCE
+CFLAGS += $(USERCFLAGS)
-WARNINGS := -Wall
+WARNINGS := -Wall
ifneq ($(WERROR),0)
- WARNINGS += -Werror
+ WARNINGS += -Werror
endif
ifeq ($(DEBUG),1)
- DEFINES += -DCR_DEBUG
- CFLAGS += -O0 -ggdb3
+ DEFINES += -DCR_DEBUG
+ CFLAGS += -O0 -ggdb3
else
- CFLAGS += -O2 -g
-endif
-
-ifeq ($(GMON),1)
- CFLAGS += -pg
- GMONLDOPT := -pg
-endif
-
-CFLAGS += $(WARNINGS) $(DEFINES)
-SYSCALL-LIB := $(ARCH_DIR)/syscalls.built-in.o
-ARCH-LIB := $(ARCH_DIR)/crtools.built-in.o
-CRIU-SO := libcriu
-CRIU-LIB := lib/c/$(CRIU-SO).so
-CRIU-INC := lib/criu.h include/criu-plugin.h include/criu-log.h protobuf/rpc.proto
-ifeq ($(piegen-y),y)
-piegen := pie/piegen/piegen
+ CFLAGS += -O2 -g
endif
-export CC MAKE CFLAGS LIBS SRCARCH DEFINES MAKEFLAGS CRIU-SO
-export SRC_DIR SYSCALL-LIB SH RM ARCH_DIR OBJCOPY LDARCH LD
-export USERCFLAGS
-export cflags-y
-export VDSO
-
-include Makefile.inc
-include Makefile.config
-include scripts/Makefile.version
-include scripts/Makefile.rules
-
-.SUFFIXES:
+CFLAGS += $(WARNINGS) $(DEFINES)
#
-# shorthand
-build-old := -r -R -f scripts/Makefile.build makefile=Makefile obj
-build-old-crtools := -r -R -f scripts/Makefile.build makefile=Makefile.crtools obj
-
-PROGRAM := criu
-
-.PHONY: all zdtm test rebuild clean distclean tags cscope \
- docs help pie protobuf $(ARCH_DIR) clean-built lib crit
-
-all: config pie $(VERSION_HEADER) $(CRIU-LIB)
- $(Q) $(MAKE) $(PROGRAM)
- $(Q) $(MAKE) crit
-
-protobuf/%::
- $(Q) $(MAKE) $(build-old)=protobuf $@
-protobuf:
- $(Q) $(MAKE) $(build-old)=protobuf all
-
-$(ARCH_DIR)/%:: protobuf config
- $(Q) $(MAKE) $(build-old)=$(ARCH_DIR) $@
-$(ARCH_DIR): protobuf config
- $(Q) $(MAKE) $(build-old)=$(ARCH_DIR) all
-
-ifeq ($(piegen-y),y)
-pie/piegen/%: config
- $(Q) CC=$(HOSTCC) LD=$(HOSTLD) CFLAGS="$(HOSTCFLAGS)" $(MAKE) $(build-old)=pie/piegen $@
-pie/piegen: config
- $(Q) CC=$(HOSTCC) LD=$(HOSTLD) CFLAGS="$(HOSTCFLAGS)" $(MAKE) $(build-old)=pie/piegen all
-$(piegen): pie/piegen/built-in.o
- $(E) " LINK " $@
- $(Q) $(HOSTCC) $(HOSTCFLAGS) $^ $(LDFLAGS) -o $@
-.PHONY: pie/piegen
-endif
+# Protobuf images first, they are not depending
+# on anything else.
+$(eval $(call gen-built-in,images))
+PHONY += images
-pie: $(ARCH_DIR) $(piegen)
- $(Q) $(MAKE) $(build-old)=pie all
-
-%.o %.i %.s %.d: $(VERSION_HEADER) pie
- $(Q) $(MAKE) $(build-old-crtools)=. $@
-built-in.o: $(VERSION_HEADER) pie
- $(Q) $(MAKE) $(build-old-crtools)=. $@
+#
+# CRIU building done in own directory
+# with slightly different rules so we
+# can't use nmk engine directly (we
+# build syscalls library and such).
+#
+# But note that we've already included
+# the nmk so we can reuse it there.
+criu/%: images/built-in.o
+ $(Q) $(MAKE) -C criu $@
+criu: images/built-in.o
+ $(Q) $(MAKE) -C criu all
+criu/criu: criu
+PHONY += criu
-lib/%:: $(VERSION_HEADER) config built-in.o
+#
+# Libraries next once criu is ready
+# (we might generate headers and such
+# when building criu itself).
+lib/%: criu
$(Q) $(MAKE) -C lib $@
-lib: $(VERSION_HEADER) config built-in.o
+lib: criu
$(Q) $(MAKE) -C lib all
+PHONY += lib
-$(CRIU-LIB): lib
- @true
-crit: lib
- @true
+all: criu lib
+PHONY += all
+clean-built:
+ $(Q) $(MAKE) $(build)=images clean
+ $(Q) $(MAKE) -C criu clean
+ $(Q) $(MAKE) -C lib clean
+PHONY += clean-built
-PROGRAM-BUILTINS += protobuf/built-in.o
-PROGRAM-BUILTINS += built-in.o
+clean: clean-built
+ $(call msg-clean, criu)
+PHONY += clean
-$(SYSCALL-LIB) $(ARCH-LIB) $(PROGRAM-BUILTINS): config
+#
+# Non-CRIU stuff.
+#
-$(PROGRAM): $(ARCH-LIB) $(PROGRAM-BUILTINS)
- $(E) " LINK " $@
- $(Q) $(CC) $(CFLAGS) $^ $(LIBS) $(LDFLAGS) $(GMONLDOPT) -rdynamic -o $@
+docs:
+ $(Q) $(MAKE) -s -C Documentation all
+PHONY += docs
zdtm: all
$(Q) $(MAKE) -C test/zdtm all
+PHONY += zdtm
test: zdtm
$(Q) $(MAKE) -C test
+PHONY += test
-clean-built:
- $(Q) $(RM) $(VERSION_HEADER)
- $(Q) $(MAKE) $(build-old)=$(ARCH_DIR) clean
- $(Q) $(MAKE) $(build-old)=protobuf clean
- $(Q) $(MAKE) $(build-old)=pie/piegen clean
- $(Q) $(MAKE) $(build-old)=pie clean
- $(Q) $(MAKE) -C lib clean
- $(Q) $(MAKE) $(build-old-crtools)=. clean
- $(Q) $(MAKE) -C Documentation clean
- $(Q) $(RM) ./include/config.h
- $(Q) $(RM) ./$(PROGRAM)
-
-rebuild: clean-built
- $(E) " FORCE-REBUILD"
- $(Q) $(MAKE)
-
-clean: clean-built
- $(E) " CLEAN"
- $(Q) $(RM) ./*.img
- $(Q) $(RM) ./*.out
- $(Q) $(RM) ./*.bin
- $(Q) $(RM) ./*.{gcda,gcno,gcov} ./test/`pwd`/*.{gcda,gcno,gcov}
- $(Q) $(RM) ./pie/*.{gcda,gcno,gcov} ./pie/piegen/*.{gcda,gcno,gcov}
- $(Q) $(RM) -r ./gcov
- $(Q) $(RM) protobuf-desc-gen.h
- $(Q) $(MAKE) -C test $@
- $(Q) $(RM) ./*.pyc
- $(Q) $(RM) -r build
- $(Q) $(RM) -r usr
-
-distclean: clean
- $(E) " DISTCLEAN"
- $(Q) $(RM) ./tags
- $(Q) $(RM) ./cscope*
+dist: tar
+tar: criu-$(CRTOOLSVERSION).tar.bz2
+criu-$(CRTOOLSVERSION).tar.bz2:
+ git archive --format tar --prefix 'criu-$(CRTOOLSVERSION)/' \
+ v$(CRTOOLSVERSION) | bzip2 > $@
+.PHONY: dist tar
tags:
- $(E) " GEN " $@
+ $(call msg-gen, $@)
$(Q) $(RM) tags
$(Q) $(FIND) . -name '*.[hcS]' ! -path './.*' ! -path './test/*' -print | xargs ctags -a
+PHONY += tags
cscope:
- $(E) " GEN " $@
+ $(call msg-gen, $@)
$(Q) $(FIND) . -name '*.[hcS]' ! -path './.*' ! -path './test/*' ! -type l -print > cscope.files
$(Q) $(CSCOPE) -bkqu
+PHONY += cscope
-docs:
- $(Q) $(MAKE) -s -C Documentation all
+gcov:
+ $(E) " GCOV"
+ $(Q) test -d gcov || mkdir gcov && \
+ cp *.{gcno,c} test/`pwd`/ && \
+ geninfo --output-filename gcov/crtools.h.info --no-recursion . && \
+ geninfo --output-filename gcov/crtools.ns.info --no-recursion test/`pwd`/ && \
+ sed -i 's#/test/`pwd`##' gcov/crtools.ns.info && \
+ cd gcov && \
+ lcov --rc lcov_branch_coverage=1 --add-tracefile crtools.h.info \
+ --add-tracefile crtools.ns.info --output-file criu.info && \
+ genhtml --rc lcov_branch_coverage=1 --output-directory html criu.info
+ @echo "Code coverage report is in `pwd`/gcov/html/ directory."
+PHONY += gcov
-dist: tar
-tar: criu-$(CRTOOLSVERSION).tar.bz2
-criu-$(CRTOOLSVERSION).tar.bz2:
- git archive --format tar --prefix 'criu-$(CRTOOLSVERSION)/' \
- v$(CRTOOLSVERSION) | bzip2 > $@
-.PHONY: dist tar
+docker-build:
+ docker build -t criu .
+PHONY += docker-build
-install: install-criu install-man
-
-install-criu: all $(CRIU-LIB) install-crit
- $(E) " INSTALL " $(PROGRAM)
- $(Q) mkdir -p $(DESTDIR)$(SBINDIR)
- $(Q) install -m 755 $(PROGRAM) $(DESTDIR)$(SBINDIR)
- $(Q) mkdir -p $(DESTDIR)$(LIBDIR)
- $(Q) install -m 755 $(CRIU-LIB) \
- $(DESTDIR)$(LIBDIR)/$(CRIU-SO).so.$(VERSION_SO_MAJOR).$(VERSION_SO_MINOR)
- $(Q) ln -fns $(CRIU-SO).so.$(VERSION_SO_MAJOR).$(VERSION_SO_MINOR) \
- $(DESTDIR)$(LIBDIR)/$(CRIU-SO).so.$(VERSION_SO_MAJOR)
- $(Q) ln -fns $(CRIU-SO).so.$(VERSION_SO_MAJOR).$(VERSION_SO_MINOR) \
- $(DESTDIR)$(LIBDIR)/$(CRIU-SO).so
- $(Q) mkdir -p $(DESTDIR)$(INCLUDEDIR)
- $(Q) install -m 644 $(CRIU-INC) $(DESTDIR)$(INCLUDEDIR)
- $(Q) mkdir -p $(DESTDIR)$(SYSTEMDUNITDIR)
- $(Q) sed -e 's,@version@,$(CRTOOLSVERSION),' \
- -e 's,@libdir@,$(LIBDIR),' \
- -e 's,@includedir@,$(dir $(INCLUDEDIR)),' \
- lib/criu.pc.in > criu.pc
- $(Q) mkdir -p $(DESTDIR)$(LIBDIR)/pkgconfig
- $(Q) install -m 644 criu.pc $(DESTDIR)$(LIBDIR)/pkgconfig
-
-install-man:
- $(Q) $(MAKE) -C Documentation install
-
-install-crit: crit
- $(E) " INSTALL crit"
- $(Q) python scripts/crit-setup.py install --root=$(DESTDIR) --prefix=$(PREFIX)
-
-.PHONY: install install-man install-crit install-criu
+docker-test:
+ docker run --rm -it --privileged criu ./test/zdtm.sh -C -x tcp6 -x tcpbuf6 -x static/rtc -x cgroup -x mountpoint
+PHONY += docker-test
help:
@echo ' Targets:'
@@ -350,28 +203,15 @@ help:
@echo ' cscope - Generate cscope database'
@echo ' rebuild - Force-rebuild of [*] targets'
@echo ' test - Run zdtm test-suite'
- @echo ' gcov - Make code coverage report'
-
-gcov:
- $(E) " GCOV"
- $(Q) test -d gcov || mkdir gcov && \
- cp *.{gcno,c} test/`pwd`/ && \
- geninfo --output-filename gcov/crtools.h.info --no-recursion . && \
- geninfo --output-filename gcov/crtools.ns.info --no-recursion test/`pwd`/ && \
- sed -i 's#/test/`pwd`##' gcov/crtools.ns.info && \
- cd gcov && \
- lcov --rc lcov_branch_coverage=1 --add-tracefile crtools.h.info --add-tracefile crtools.ns.info --output-file criu.info && \
- genhtml --rc lcov_branch_coverage=1 --output-directory html criu.info
- @echo "Code coverage report is in `pwd`/gcov/html/ directory."
-.PHONY: gcov
+ @echo ' gcov - Make code coverage report'
+PHONY += help
-docker-build:
- docker build -t criu .
+include Makefile.install
-docker-test:
- docker run --rm -it --privileged criu ./test/zdtm.sh -C -x tcp6 -x tcpbuf6 -x static/rtc -x cgroup -x mountpoint
+.PHONY: $(PHONY)
-.DEFAULT_GOAL := all
+.DEFAULT_GOAL := all
-# include optional local rules
+#
+# Optional local include.
-include Makefile.local
diff --git a/Makefile.config b/Makefile.config
deleted file mode 100644
index 26d581bfab43..000000000000
--- a/Makefile.config
+++ /dev/null
@@ -1,54 +0,0 @@
-include scripts/utilities.mak
-include scripts/feature-tests.mak
-
-CONFIG := include/config.h
-
-ifeq ($(call try-cc,$(LIBBSD_DEV_TEST),-lbsd),y)
- LIBS += -lbsd
- DEFINES += -DCONFIG_HAS_LIBBSD
-endif
-
-ifeq ($(call pkg-config-check,libselinux),y)
- LIBS := -lselinux $(LIBS)
- DEFINES += -DCONFIG_HAS_SELINUX
-endif
-
-$(CONFIG): scripts/utilities.mak scripts/feature-tests.mak include/config-base.h
- $(E) " GEN " $@
- $(Q) @echo '#ifndef __CR_CONFIG_H__' > $@
- $(Q) @echo '#define __CR_CONFIG_H__' >> $@
- $(Q) @echo '' >> $@
- $(Q) @echo '#include "config-base.h"' >> $@
- $(Q) @echo '' >> $@
-ifeq ($(call try-cc,$(TCP_REPAIR_TEST),),y)
- $(Q) @echo '#define CONFIG_HAS_TCP_REPAIR' >> $@
-endif
-ifeq ($(call try-cc,$(PRLIMIT_TEST),),y)
- $(Q) @echo '#define CONFIG_HAS_PRLIMIT' >> $@
-endif
-ifeq ($(call try-cc,$(STRLCPY_TEST),$(LIBS)),y)
- $(Q) @echo '#define CONFIG_HAS_STRLCPY' >> $@
-endif
-ifeq ($(call try-cc,$(STRLCAT_TEST),$(LIBS)),y)
- $(Q) @echo '#define CONFIG_HAS_STRLCAT' >> $@
-endif
-ifeq ($(call try-cc,$(PTRACE_PEEKSIGINFO_TEST),),y)
- $(Q) @echo '#define CONFIG_HAS_PEEKSIGINFO_ARGS' >> $@
-endif
-ifeq ($(VDSO),y)
- $(Q) @echo '#define CONFIG_VDSO' >> $@
-endif
-ifeq ($(call try-cc,$(SETPROCTITLE_INIT_TEST),-lbsd),y)
- $(Q) @echo '#define CONFIG_HAS_SETPROCTITLE_INIT' >> $@
-endif
-ifeq ($(call try-cc,$(MEMFD_TEST),),y)
- $(Q) @echo '#define CONFIG_HAS_MEMFD' >> $@
-endif
-ifeq ($(piegen-y),y)
- $(Q) @echo '#define CONFIG_PIEGEN' >> $@
-endif
- $(Q) @echo '#endif /* __CR_CONFIG_H__ */' >> $@
-
-config: $(CONFIG)
-
-.PHONY: config
diff --git a/Makefile.crtools b/Makefile.crtools
deleted file mode 100644
index 5788ef0304a4..000000000000
--- a/Makefile.crtools
+++ /dev/null
@@ -1,94 +0,0 @@
-obj-y += parasite-syscall.o
-obj-y += mem.o
-obj-y += rst-malloc.o
-obj-y += cr-restore.o
-obj-y += crtools.o
-obj-y += image.o
-obj-y += image-desc.o
-obj-y += net.o
-obj-y += tun.o
-obj-y += proc_parse.o
-obj-y += sysfs_parse.o
-obj-y += cr-dump.o
-obj-y += cr-show.o
-obj-y += cr-check.o
-obj-y += cr-dedup.o
-obj-y += util.o
-obj-y += bfd.o
-obj-y += action-scripts.o
-obj-y += sysctl.o
-obj-y += ptrace.o
-obj-y += kcmp-ids.o
-obj-y += rbtree.o
-obj-y += log.o
-obj-y += libnetlink.o
-obj-y += sockets.o
-obj-y += sk-inet.o
-obj-y += sk-tcp.o
-obj-y += sk-unix.o
-obj-y += sk-packet.o
-obj-y += sk-netlink.o
-obj-y += sk-queue.o
-obj-y += files.o
-obj-y += files-reg.o
-obj-y += files-ext.o
-obj-y += pipes.o
-obj-y += fifo.o
-obj-y += file-ids.o
-obj-y += namespaces.o
-obj-y += uts_ns.o
-obj-y += ipc_ns.o
-obj-y += netfilter.o
-obj-y += shmem.o
-obj-y += eventfd.o
-obj-y += eventpoll.o
-obj-y += mount.o
-obj-y += fsnotify.o
-obj-y += irmap.o
-obj-y += signalfd.o
-obj-y += pstree.o
-obj-y += bitmap.o
-obj-y += protobuf.o
-obj-y += protobuf-desc.o
-obj-y += tty.o
-obj-y += cr-exec.o
-obj-y += file-lock.o
-obj-y += page-pipe.o
-obj-y += page-xfer.o
-obj-y += page-read.o
-obj-y += pagemap-cache.o
-obj-y += kerndat.o
-obj-y += stats.o
-obj-y += cgroup.o
-obj-y += timerfd.o
-obj-y += aio.o
-obj-y += string.o
-obj-y += sigframe.o
-obj-y += lsm.o
-ifeq ($(VDSO),y)
-obj-y += vdso.o
-obj-y += pie/util-vdso.o
-endif
-obj-y += cr-service.o
-obj-y += plugin.o
-obj-y += cr-errno.o
-obj-y += pie/pie-relocs.o
-obj-y += seize.o
-obj-y += fault-injection.o
-obj-y += pie/util-fd.o
-obj-y += pie/util.o
-obj-y += seccomp.o
-
-ifneq ($(MAKECMDGOALS),clean)
-incdeps := y
-endif
-
-PROTOBUF_GEN := scripts/protobuf-gen.sh
-
-protobuf-desc.c: protobuf-desc-gen.h
-
-protobuf-desc-gen.h: $(PROTOBUF_GEN) include/protobuf-desc.h
- $(E) " GEN " $@
- $(Q) $(SH) $(obj)/$(PROTOBUF_GEN) > $@
-
-cleanup-y += protobuf-desc-gen.h
diff --git a/Makefile.inc b/Makefile.inc
deleted file mode 100644
index 4782ea23939d..000000000000
--- a/Makefile.inc
+++ /dev/null
@@ -1,18 +0,0 @@
-# Installation paths
-PREFIX ?= /usr/local
-SBINDIR ?= $(PREFIX)/sbin
-MANDIR ?= $(PREFIX)/share/man
-SYSTEMDUNITDIR ?= $(PREFIX)/lib/systemd/system/
-LOGROTATEDIR ?= $(PREFIX)/etc/logrotate.d/
-LIBDIR ?= $(PREFIX)/lib
-# For recent Debian/Ubuntu with multiarch support
-DEB_HOST_MULTIARCH ?= $(shell dpkg-architecture \
- -qDEB_HOST_MULTIARCH 2>/dev/null)
-ifneq "$(DEB_HOST_MULTIARCH)" ""
-LIBDIR ?= $(PREFIX)/lib/$(DEB_HOST_MULTIARCH)
-# For most other systems
-else ifeq "$(shell uname -m)" "x86_64"
-LIBDIR ?= $(PREFIX)/lib64
-endif
-
-INCLUDEDIR ?= $(PREFIX)/include/criu
diff --git a/Makefile.install b/Makefile.install
new file mode 100644
index 000000000000..4f3b57fb07d8
--- /dev/null
+++ b/Makefile.install
@@ -0,0 +1,31 @@
+install: install-criu install-man
+
+install-criu: all $(CRIU-LIB) install-crit
+ $(E) " INSTALL " $(PROGRAM)
+ $(Q) mkdir -p $(DESTDIR)$(SBINDIR)
+ $(Q) install -m 755 $(PROGRAM) $(DESTDIR)$(SBINDIR)
+ $(Q) mkdir -p $(DESTDIR)$(LIBDIR)
+ $(Q) install -m 755 $(CRIU-LIB) \
+ $(DESTDIR)$(LIBDIR)/$(CRIU-SO).so.$(VERSION_SO_MAJOR).$(VERSION_SO_MINOR)
+ $(Q) ln -fns $(CRIU-SO).so.$(VERSION_SO_MAJOR).$(VERSION_SO_MINOR) \
+ $(DESTDIR)$(LIBDIR)/$(CRIU-SO).so.$(VERSION_SO_MAJOR)
+ $(Q) ln -fns $(CRIU-SO).so.$(VERSION_SO_MAJOR).$(VERSION_SO_MINOR) \
+ $(DESTDIR)$(LIBDIR)/$(CRIU-SO).so
+ $(Q) mkdir -p $(DESTDIR)$(INCLUDEDIR)
+ $(Q) install -m 644 $(CRIU-INC) $(DESTDIR)$(INCLUDEDIR)
+ $(Q) mkdir -p $(DESTDIR)$(SYSTEMDUNITDIR)
+ $(Q) sed -e 's,@version@,$(CRTOOLSVERSION),' \
+ -e 's,@libdir@,$(LIBDIR),' \
+ -e 's,@includedir@,$(dir $(INCLUDEDIR)),' \
+ lib/criu.pc.in > criu.pc
+ $(Q) mkdir -p $(DESTDIR)$(LIBDIR)/pkgconfig
+ $(Q) install -m 644 criu.pc $(DESTDIR)$(LIBDIR)/pkgconfig
+
+install-man:
+ $(Q) $(MAKE) -C Documentation install
+
+install-crit: crit
+ $(E) " INSTALL crit"
+ $(Q) python scripts/crit-setup.py install --root=$(DESTDIR) --prefix=$(PREFIX)
+
+.PHONY: install install-man install-crit install-criu
diff --git a/action-scripts.c b/action-scripts.c
deleted file mode 100644
index 05aa9d01326e..000000000000
--- a/action-scripts.c
+++ /dev/null
@@ -1,77 +0,0 @@
-#include <unistd.h>
-#include <stdio.h>
-#include <limits.h>
-#include <stdlib.h>
-
-#include "cr_options.h"
-#include "list.h"
-#include "xmalloc.h"
-#include "log.h"
-#include "servicefd.h"
-#include "cr-service.h"
-#include "action-scripts.h"
-
-static const char *action_names[ACT_MAX] = {
- [ ACT_PRE_DUMP ] = "pre-dump",
- [ ACT_POST_DUMP ] = "post-dump",
- [ ACT_PRE_RESTORE ] = "pre-restore",
- [ ACT_POST_RESTORE ] = "post-restore",
- [ ACT_NET_LOCK ] = "network-lock",
- [ ACT_NET_UNLOCK ] = "network-unlock",
- [ ACT_SETUP_NS ] = "setup-namespaces",
- [ ACT_POST_SETUP_NS ] = "post-setup-namespaces",
-};
-
-int run_scripts(enum script_actions act)
-{
- struct script *script;
- int ret = 0;
- char image_dir[PATH_MAX];
- const char *action = action_names[act];
-
- pr_debug("Running %s scripts\n", action);
-
- if (unlikely(list_empty(&opts.scripts)))
- return 0;
-
- if (setenv("CRTOOLS_SCRIPT_ACTION", action, 1)) {
- pr_perror("Can't set CRTOOLS_SCRIPT_ACTION=%s", action);
- return -1;
- }
-
- sprintf(image_dir, "/proc/%ld/fd/%d", (long) getpid(), get_service_fd(IMG_FD_OFF));
- if (setenv("CRTOOLS_IMAGE_DIR", image_dir, 1)) {
- pr_perror("Can't set CRTOOLS_IMAGE_DIR=%s", image_dir);
- return -1;
- }
-
- list_for_each_entry(script, &opts.scripts, node) {
- if (script->path == SCRIPT_RPC_NOTIFY) {
- pr_debug("\tRPC\n");
- ret |= send_criu_rpc_script(act, (char *)action, script->arg);
- } else {
- pr_debug("\t[%s]\n", script->path);
- ret |= system(script->path);
- }
- }
-
- unsetenv("CRTOOLS_SCRIPT_ACTION");
- if (ret)
- pr_err("One of more action scripts failed\n");
- return ret;
-}
-
-int add_script(char *path, int arg)
-{
- struct script *script;
-
- script = xmalloc(sizeof(struct script));
- if (script == NULL)
- return 1;
-
- script->path = path;
- script->arg = arg;
- list_add(&script->node, &opts.scripts);
-
- return 0;
-}
diff --git a/aio.c b/aio.c
deleted file mode 100644
index 9965efd8c483..000000000000
--- a/aio.c
+++ /dev/null
@@ -1,120 +0,0 @@
-#include <unistd.h>
-#include <stdio.h>
-#include <stdbool.h>
-#include "vma.h"
-#include "xmalloc.h"
-#include "aio.h"
-#include "parasite.h"
-#include "parasite-syscall.h"
-#include "protobuf/mm.pb-c.h"
-
-int dump_aio_ring(MmEntry *mme, struct vma_area *vma)
-{
- int nr = mme->n_aios;
- AioRingEntry *re;
-
- pr_info("Dumping AIO ring @%"PRIx64", %u reqs\n",
- vma->e->start, vma->aio_nr_req);
-
- mme->aios = xrealloc(mme->aios, (nr + 1) * sizeof(re));
- if (!mme->aios)
- return -1;
-
- re = xmalloc(sizeof(*re));
- if (!re)
- return -1;
-
- aio_ring_entry__init(re);
- re->id = vma->e->start;
- re->nr_req = vma->aio_nr_req;
- re->ring_len = vma->e->end - vma->e->start;
- mme->aios[nr] = re;
- mme->n_aios = nr + 1;
- return 0;
-}
-
-void free_aios(MmEntry *mme)
-{
- int i;
-
- if (mme->aios) {
- for (i = 0; i < mme->n_aios; i++)
- xfree(mme->aios[i]);
- xfree(mme->aios);
- }
-}
-
-static unsigned int aio_estimate_nr_reqs(unsigned int k_max_reqs)
-{
- /*
- * Kernel does
- *
- * nr_reqs = max(nr_reqs, nr_cpus * 4)
- * nr_reqs *= 2
- * nr_reqs += 2
- * ring = roundup(sizeof(head) + nr_reqs * sizeof(req))
- * nr_reqs = (ring - sizeof(head)) / sizeof(req)
- *
- * And the k_max_reqs here is the resulting value.
- *
- * We need to get the initial nr_reqs that would grow
- * up back to the k_max_reqs.
- */
-
- return (k_max_reqs - 2) / 2;
-}
-
-unsigned long aio_rings_args_size(struct vm_area_list *vmas)
-{
- return sizeof(struct parasite_check_aios_args) +
- vmas->nr_aios * sizeof(struct parasite_aio);
-}
-
-int parasite_check_aios(struct parasite_ctl *ctl, struct vm_area_list *vmas)
-{
- struct vma_area *vma;
- struct parasite_check_aios_args *aa;
- struct parasite_aio *pa;
- int i;
-
- if (!vmas->nr_aios)
- return 0;
-
- pr_info("Checking AIO rings\n");
-
- /*
- * Go to parasite and
- * a) check that no requests are currently pengind
- * b) get the maximum number of requests kernel handles
- * to estimate what was the user request on ring
- * creation.
- */
-
- aa = parasite_args_s(ctl, aio_rings_args_size(vmas));
- pa = &aa->ring[0];
- list_for_each_entry(vma, &vmas->h, list) {
- if (!vma_area_is(vma, VMA_AREA_AIORING))
- continue;
-
- pr_debug(" `- Ring #%ld @%"PRIx64"\n",
- (long)(pa - &aa->ring[0]), vma->e->start);
- pa->ctx = vma->e->start;
- pa->max_reqs = 0;
- pa->vma_nr_reqs = &vma->aio_nr_req;
- pa++;
- }
- aa->nr_rings = vmas->nr_aios;
-
- if (parasite_execute_daemon(PARASITE_CMD_CHECK_AIOS, ctl))
- return -1;
-
- pa = &aa->ring[0];
- for (i = 0; i < vmas->nr_aios; i++) {
- pa = &aa->ring[i];
- *pa->vma_nr_reqs = aio_estimate_nr_reqs(pa->max_reqs);
- pr_debug(" `- Ring #%d has %u reqs, estimated to %u\n", i,
- pa->max_reqs, *pa->vma_nr_reqs);
- }
-
- return 0;
-}
diff --git a/arch/aarch64/Makefile b/arch/aarch64/Makefile
deleted file mode 100644
index 200d37c726b9..000000000000
--- a/arch/aarch64/Makefile
+++ /dev/null
@@ -1,59 +0,0 @@
-targets += syscalls
-targets += crtools
-
-SYS-ASM := syscalls.S
-
-syscalls-asm-y += $(SYS-ASM:.S=).o
-crtools-obj-y += crtools.o
-crtools-obj-y += cpu.o
-
-SYS-DEF := ../arm/syscall.def
-SYS-ASM-COMMON := syscall-common.S
-SYS-TYPES := include/syscall-types.h
-
-SYS-CODES := include/syscall-codes.h
-SYS-PROTO := include/syscall.h
-
-SYS-GEN := ../scripts/arm/gen-syscalls.pl
-SYS-GEN-TBL := ../scripts/arm/gen-sys-exec-tbl.pl
-
-SYS-EXEC-TBL := sys-exec-tbl.c
-
-syscalls-asm-y-asmflags += -fpie -Wstrict-prototypes -Wa,--noexecstack
-syscalls-asm-y-asmflags += -nostdlib -fomit-frame-pointer -I$(obj)
-ASMFLAGS += -D__ASSEMBLY__
-
-ARCH_BITS := 64
-
-$(obj)/$(SYS-ASM): $(obj)/$(SYS-GEN) $(obj)/$(SYS-DEF) $(obj)/$(SYS-ASM-COMMON) $(SYS-TYPES)
- $(E) " GEN " $@
- $(Q) perl \
- $(obj)/$(SYS-GEN) \
- $(obj)/$(SYS-DEF) \
- $(SYS-CODES) \
- $(SYS-PROTO) \
- $(obj)/$(SYS-ASM) \
- $(SYS-ASM-COMMON) \
- $(SYS-TYPES) \
- $(ARCH_BITS)
-
-$(obj)/syscalls.o: $(obj)/$(SYS-ASM)
-
-$(obj)/$(SYS-EXEC-TBL): $(obj)/$(SYS-GEN-TBL) $(obj)/$(SYS-DEF)
- $(E) " GEN " $@
- $(Q) perl \
- $(obj)/$(SYS-GEN-TBL) \
- $(obj)/$(SYS-DEF) \
- $(obj)/$(SYS-EXEC-TBL) \
- $(ARCH_BITS)
-
-_all += $(obj)/$(SYS-EXEC-TBL)
-
-cleanup-y += $(obj)/$(SYS-EXEC-TBL) $(obj)/$(SYS-ASM)
-cleanup-y += $(SYS-CODES)
-cleanup-y += $(SYS-PROTO)
-
-ifneq ($(MAKECMDGOALS),clean)
-deps-after := $(obj)/$(SYS-ASM)
-incdeps := y
-endif
diff --git a/arch/aarch64/cpu.c b/arch/aarch64/cpu.c
deleted file mode 100644
index 040fe14fcfb7..000000000000
--- a/arch/aarch64/cpu.c
+++ /dev/null
@@ -1,45 +0,0 @@
-#undef LOG_PREFIX
-#define LOG_PREFIX "cpu: "
-
-#include <errno.h>
-#include "cpu.h"
-
-bool cpu_has_feature(unsigned int feature)
-{
- return false;
-}
-
-int cpu_init(void)
-{
- return 0;
-}
-
-int cpu_dump_cpuinfo(void)
-{
- return 0;
-}
-
-int cpu_validate_cpuinfo(void)
-{
- return 0;
-}
-
-int cpu_dump_cpuinfo_single(void)
-{
- return -ENOTSUP;
-}
-
-int cpu_validate_image_cpuinfo_single(void)
-{
- return -ENOTSUP;
-}
-
-int cpuinfo_dump(void)
-{
- return -ENOTSUP;
-}
-
-int cpuinfo_check(void)
-{
- return -ENOTSUP;
-}
diff --git a/arch/aarch64/crtools.c b/arch/aarch64/crtools.c
deleted file mode 100644
index 5df7f1d5fd6c..000000000000
--- a/arch/aarch64/crtools.c
+++ /dev/null
@@ -1,233 +0,0 @@
-#include <string.h>
-#include <unistd.h>
-
-#include <linux/elf.h>
-
-#include "asm/types.h"
-#include "asm/restorer.h"
-#include "compiler.h"
-#include "ptrace.h"
-#include "asm/processor-flags.h"
-#include "protobuf.h"
-#include "protobuf/core.pb-c.h"
-#include "protobuf/creds.pb-c.h"
-#include "parasite-syscall.h"
-#include "log.h"
-#include "util.h"
-#include "cpu.h"
-#include "parasite-syscall.h"
-#include "restorer.h"
-
-
-/*
- * Injected syscall instruction
- */
-const char code_syscall[] = {
- 0x01, 0x00, 0x00, 0xd4, /* SVC #0 */
- 0x00, 0x00, 0x20, 0xd4 /* BRK #0 */
-};
-
-const int code_syscall_size = round_up(sizeof(code_syscall), sizeof(long));
-
-static inline void __always_unused __check_code_syscall(void)
-{
- BUILD_BUG_ON(sizeof(code_syscall) != BUILTIN_SYSCALL_SIZE);
- BUILD_BUG_ON(!is_log2(sizeof(code_syscall)));
-}
-
-void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs)
-{
- regs->pc = new_ip;
- if (stack)
- regs->sp = (unsigned long)stack;
-}
-
-bool arch_can_dump_task(pid_t pid)
-{
- /*
- * TODO: Add proper check here
- */
- return true;
-}
-
-int syscall_seized(struct parasite_ctl *ctl, int nr, unsigned long *ret,
- unsigned long arg1,
- unsigned long arg2,
- unsigned long arg3,
- unsigned long arg4,
- unsigned long arg5,
- unsigned long arg6)
-{
- user_regs_struct_t regs = ctl->orig.regs;
- int err;
-
- regs.regs[8] = (unsigned long)nr;
- regs.regs[0] = arg1;
- regs.regs[1] = arg2;
- regs.regs[2] = arg3;
- regs.regs[3] = arg4;
- regs.regs[4] = arg5;
- regs.regs[5] = arg6;
- regs.regs[6] = 0;
- regs.regs[7] = 0;
-
- err = __parasite_execute_syscall(ctl, &regs);
-
- *ret = regs.regs[0];
- return err;
-}
-
-
-#define assign_reg(dst, src, e) dst->e = (__typeof__(dst->e))(src).e
-
-int get_task_regs(pid_t pid, user_regs_struct_t regs, CoreEntry *core)
-{
- struct iovec iov;
- struct user_fpsimd_state fpsimd;
- int i, ret;
-
- pr_info("Dumping GP/FPU registers for %d\n", pid);
-
- iov.iov_base = &regs;
- iov.iov_len = sizeof(user_regs_struct_t);
- if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov))) {
- pr_perror("Failed to obtain CPU registers for %d", pid);
- goto err;
- }
-
- iov.iov_base = &fpsimd;
- iov.iov_len = sizeof(fpsimd);
- if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_PRFPREG, &iov))) {
- pr_perror("Failed to obtain FPU registers for %d", pid);
- goto err;
- }
-
-
- // Save the Aarch64 CPU state
- for (i = 0; i < 31; ++i)
- assign_reg(core->ti_aarch64->gpregs, regs, regs[i]);
- assign_reg(core->ti_aarch64->gpregs, regs, sp);
- assign_reg(core->ti_aarch64->gpregs, regs, pc);
- assign_reg(core->ti_aarch64->gpregs, regs, pstate);
-
-
- // Save the FP/SIMD state
- for (i = 0; i < 32; ++i)
- {
- core->ti_aarch64->fpsimd->vregs[2*i] = fpsimd.vregs[i];
- core->ti_aarch64->fpsimd->vregs[2*i + 1] = fpsimd.vregs[i] >> 64;
- }
- assign_reg(core->ti_aarch64->fpsimd, fpsimd, fpsr);
- assign_reg(core->ti_aarch64->fpsimd, fpsimd, fpcr);
-
- ret = 0;
-
-err:
- return ret;
-}
-
-int arch_alloc_thread_info(CoreEntry *core)
-{
- ThreadInfoAarch64 *ti_aarch64;
- UserAarch64RegsEntry *gpregs;
- UserAarch64FpsimdContextEntry *fpsimd;
-
- ti_aarch64 = xmalloc(sizeof(*ti_aarch64));
- if (!ti_aarch64)
- goto err;
- thread_info_aarch64__init(ti_aarch64);
- core->ti_aarch64 = ti_aarch64;
-
- gpregs = xmalloc(sizeof(*gpregs));
- if (!gpregs)
- goto err;
- user_aarch64_regs_entry__init(gpregs);
-
- gpregs->regs = xmalloc(31*sizeof(uint64_t));
- if (!gpregs->regs)
- goto err;
- gpregs->n_regs = 31;
-
- ti_aarch64->gpregs = gpregs;
-
- fpsimd = xmalloc(sizeof(*fpsimd));
- if (!fpsimd)
- goto err;
- user_aarch64_fpsimd_context_entry__init(fpsimd);
- ti_aarch64->fpsimd = fpsimd;
- fpsimd->vregs = xmalloc(64*sizeof(fpsimd->vregs[0]));
- fpsimd->n_vregs = 64;
- if (!fpsimd->vregs)
- goto err;
-
- return 0;
-err:
- return -1;
-}
-
-void arch_free_thread_info(CoreEntry *core)
-{
- if (CORE_THREAD_ARCH_INFO(core)) {
- if (CORE_THREAD_ARCH_INFO(core)->fpsimd) {
- xfree(CORE_THREAD_ARCH_INFO(core)->fpsimd->vregs);
- xfree(CORE_THREAD_ARCH_INFO(core)->fpsimd);
- }
- xfree(CORE_THREAD_ARCH_INFO(core)->gpregs->regs);
- xfree(CORE_THREAD_ARCH_INFO(core)->gpregs);
- xfree(CORE_THREAD_ARCH_INFO(core));
- CORE_THREAD_ARCH_INFO(core) = NULL;
- }
-}
-
-int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core)
-{
- int i;
- struct fpsimd_context *fpsimd = &RT_SIGFRAME_FPU(sigframe);
-
- if (core->ti_aarch64->fpsimd->n_vregs != 64)
- return 1;
-
- for (i = 0; i < 32; ++i)
- fpsimd->vregs[i] = (__uint128_t)core->ti_aarch64->fpsimd->vregs[2*i] |
- ((__uint128_t)core->ti_aarch64->fpsimd->vregs[2*i + 1] << 64);
- assign_reg(fpsimd, *core->ti_aarch64->fpsimd, fpsr);
- assign_reg(fpsimd, *core->ti_aarch64->fpsimd, fpcr);
-
- fpsimd->head.magic = FPSIMD_MAGIC;
- fpsimd->head.size = sizeof(*fpsimd);
-
- return 0;
-}
-
-void *mmap_seized(
- struct parasite_ctl *ctl,
- void *addr, size_t length, int prot,
- int flags, int fd, off_t offset)
-{
- unsigned long map;
- int err;
-
- err = syscall_seized(ctl, __NR_mmap, &map,
- (unsigned long)addr, length, prot, flags, fd, offset);
- if (err < 0 || (long)map < 0)
- map = 0;
-
- return (void *)map;
-}
-
-int restore_gpregs(struct rt_sigframe *f, UserRegsEntry *r)
-{
-#define CPREG1(d) f->uc.uc_mcontext.d = r->d
-
- int i;
-
- for (i = 0; i < 31; ++i)
- CPREG1(regs[i]);
- CPREG1(sp);
- CPREG1(pc);
- CPREG1(pstate);
-
-#undef CPREG1
-
- return 0;
-}
diff --git a/arch/aarch64/include/asm/atomic.h b/arch/aarch64/include/asm/atomic.h
deleted file mode 100644
index 0e1c04f5a714..000000000000
--- a/arch/aarch64/include/asm/atomic.h
+++ /dev/null
@@ -1,98 +0,0 @@
-#ifndef __CR_ATOMIC_H__
-#define __CR_ATOMIC_H__
-
-typedef struct {
- int counter;
-} atomic_t;
-
-
-/* Copied from the Linux header arch/arm/include/asm/barrier.h */
-
-#define smp_mb() asm volatile("dmb ish" : : : "memory")
-
-
-/* Copied from the Linux kernel header arch/arm64/include/asm/atomic.h */
-
-static inline int atomic_read(const atomic_t *v)
-{
- return (*(volatile int *)&(v)->counter);
-}
-
-static inline void atomic_set(atomic_t *v, int i)
-{
- v->counter = i;
-}
-
-#define atomic_get atomic_read
-
-
-static inline int atomic_add_return(int i, atomic_t *v)
-{
- unsigned long tmp;
- int result;
-
- asm volatile(
-"1: ldxr %w0, %2\n"
-" add %w0, %w0, %w3\n"
-" stlxr %w1, %w0, %2\n"
-" cbnz %w1, 1b"
- : "=&r" (result), "=&r" (tmp), "+Q" (v->counter)
- : "Ir" (i)
- : "cc", "memory");
-
- smp_mb();
- return result;
-}
-
-static inline int atomic_sub_return(int i, atomic_t *v)
-{
- unsigned long tmp;
- int result;
-
- asm volatile(
-"1: ldxr %w0, %2\n"
-" sub %w0, %w0, %w3\n"
-" stlxr %w1, %w0, %2\n"
-" cbnz %w1, 1b"
- : "=&r" (result), "=&r" (tmp), "+Q" (v->counter)
- : "Ir" (i)
- : "cc", "memory");
-
- smp_mb();
- return result;
-}
-
-static inline int atomic_inc(atomic_t *v) { return atomic_add_return(1, v) - 1; }
-
-static inline int atomic_add(int val, atomic_t *v) { return atomic_add_return(val, v) - val; }
-
-static inline int atomic_dec(atomic_t *v) { return atomic_sub_return(1, v) + 1; }
-
-/* true if the result is 0, or false for all other cases. */
-#define atomic_dec_and_test(v) (atomic_sub_return(1, v) == 0)
-
-#define atomic_inc_return(v) (atomic_add_return(1, v))
-
-static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new)
-{
- unsigned long tmp;
- int oldval;
-
- smp_mb();
-
- asm volatile("// atomic_cmpxchg\n"
-"1: ldxr %w1, %2\n"
-" cmp %w1, %w3\n"
-" b.ne 2f\n"
-" stxr %w0, %w4, %2\n"
-" cbnz %w0, 1b\n"
-"2:"
- : "=&r" (tmp), "=&r" (oldval), "+Q" (ptr->counter)
- : "Ir" (old), "r" (new)
- : "cc");
-
- smp_mb();
- return oldval;
-}
-
-#endif /* __CR_ATOMIC_H__ */
diff --git a/arch/aarch64/include/asm/bitops.h b/arch/aarch64/include/asm/bitops.h
deleted file mode 100644
index 5a750447f25f..000000000000
--- a/arch/aarch64/include/asm/bitops.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef __CR_ASM_BITOPS_H__
-#define __CR_ASM_BITOPS_H__
-
-#include "compiler.h"
-#include "asm-generic/bitops.h"
-
-#endif /* __CR_ASM_BITOPS_H__ */
diff --git a/arch/aarch64/include/asm/bitsperlong.h b/arch/aarch64/include/asm/bitsperlong.h
deleted file mode 100644
index d95727d193e8..000000000000
--- a/arch/aarch64/include/asm/bitsperlong.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef __CR_BITSPERLONG_H__
-#define __CR_BITSPERLONG_H__
-
-#define BITS_PER_LONG 64
-
-#endif /* __CR_BITSPERLONG_H__ */
diff --git a/arch/aarch64/include/asm/cpu.h b/arch/aarch64/include/asm/cpu.h
deleted file mode 100644
index 59118c211d10..000000000000
--- a/arch/aarch64/include/asm/cpu.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <stdbool.h>
diff --git a/arch/aarch64/include/asm/dump.h b/arch/aarch64/include/asm/dump.h
deleted file mode 100644
index 671c424da9d7..000000000000
--- a/arch/aarch64/include/asm/dump.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#ifndef __CR_ASM_DUMP_H__
-#define __CR_ASM_DUMP_H__
-
-extern int get_task_regs(pid_t pid, user_regs_struct_t regs, CoreEntry *core);
-extern int arch_alloc_thread_info(CoreEntry *core);
-extern void arch_free_thread_info(CoreEntry *core);
-
-
-static inline void core_put_tls(CoreEntry *core, tls_t tls)
-{
- core->ti_aarch64->tls = tls;
-}
-
-#endif
diff --git a/arch/aarch64/include/asm/fpu.h b/arch/aarch64/include/asm/fpu.h
deleted file mode 100644
index 7f476d541a7d..000000000000
--- a/arch/aarch64/include/asm/fpu.h
+++ /dev/null
@@ -1,4 +0,0 @@
-#ifndef __CR_ASM_FPU_H__
-#define __CR_ASM_FPU_H__
-
-#endif /* __CR_ASM_FPU_H__ */
diff --git a/arch/aarch64/include/asm/int.h b/arch/aarch64/include/asm/int.h
deleted file mode 100644
index 642804e9b485..000000000000
--- a/arch/aarch64/include/asm/int.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef __CR_ASM_INT_H__
-#define __CR_ASM_INT_H__
-
-#include "asm-generic/int.h"
-
-#endif /* __CR_ASM_INT_H__ */
diff --git a/arch/aarch64/include/asm/linkage.h b/arch/aarch64/include/asm/linkage.h
deleted file mode 100644
index 7380642337a0..000000000000
--- a/arch/aarch64/include/asm/linkage.h
+++ /dev/null
@@ -1,24 +0,0 @@
-#ifndef __CR_LINKAGE_H__
-#define __CR_LINKAGE_H__
-
-#ifdef __ASSEMBLY__
-
-#define __ALIGN .align 4, 0x00
-#define __ALIGN_STR ".align 4, 0x00"
-
-#define GLOBAL(name) \
- .globl name; \
- name:
-
-#define ENTRY(name) \
- .globl name; \
- .type name, #function; \
- __ALIGN; \
- name:
-
-#define END(sym) \
- .size sym, . - sym
-
-#endif /* __ASSEMBLY__ */
-
-#endif /* __CR_LINKAGE_H__ */
diff --git a/arch/aarch64/include/asm/page.h b/arch/aarch64/include/asm/page.h
deleted file mode 100644
index de1fe5428c50..000000000000
--- a/arch/aarch64/include/asm/page.h
+++ /dev/null
@@ -1,21 +0,0 @@
-#ifndef __CR_ASM_PAGE_H__
-#define __CR_ASM_PAGE_H__
-
-#include <unistd.h>
-
-#ifndef PAGE_SHIFT
-# define PAGE_SHIFT 12
-#endif
-
-#ifndef PAGE_SIZE
-# define PAGE_SIZE (1UL << PAGE_SHIFT)
-#endif
-
-#ifndef PAGE_MASK
-# define PAGE_MASK (~(PAGE_SIZE - 1))
-#endif
-
-#define PAGE_PFN(addr) ((addr) / PAGE_SIZE)
-#define page_size() sysconf(_SC_PAGESIZE)
-
-#endif /* __CR_ASM_PAGE_H__ */
diff --git a/arch/aarch64/include/asm/parasite-syscall.h b/arch/aarch64/include/asm/parasite-syscall.h
deleted file mode 100644
index 0c07121da737..000000000000
--- a/arch/aarch64/include/asm/parasite-syscall.h
+++ /dev/null
@@ -1,18 +0,0 @@
-#ifndef __CR_ASM_PARASITE_SYSCALL_H__
-#define __CR_ASM_PARASITE_SYSCALL_H__
-
-struct parasite_ctl;
-
-#define ARCH_SI_TRAP TRAP_BRKPT
-
-
-extern const char code_syscall[];
-extern const int code_syscall_size;
-
-void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs);
-
-void *mmap_seized(struct parasite_ctl *ctl,
- void *addr, size_t length, int prot,
- int flags, int fd, off_t offset);
-
-#endif
diff --git a/arch/aarch64/include/asm/parasite.h b/arch/aarch64/include/asm/parasite.h
deleted file mode 100644
index 2a1e1c12e7d7..000000000000
--- a/arch/aarch64/include/asm/parasite.h
+++ /dev/null
@@ -1,11 +0,0 @@
-#ifndef __ASM_PARASITE_H__
-#define __ASM_PARASITE_H__
-
-static inline void arch_get_tls(tls_t *ptls)
-{
- tls_t tls;
- asm("mrs %0, tpidr_el0" : "=r" (tls));
- *ptls = tls;
-}
-
-#endif
diff --git a/arch/aarch64/include/asm/processor-flags.h b/arch/aarch64/include/asm/processor-flags.h
deleted file mode 100644
index c1888af36fa0..000000000000
--- a/arch/aarch64/include/asm/processor-flags.h
+++ /dev/null
@@ -1,4 +0,0 @@
-#ifndef __CR_PROCESSOR_FLAGS_H__
-#define __CR_PROCESSOR_FLAGS_H__
-
-#endif
diff --git a/arch/aarch64/include/asm/restore.h b/arch/aarch64/include/asm/restore.h
deleted file mode 100644
index 69404b0e815e..000000000000
--- a/arch/aarch64/include/asm/restore.h
+++ /dev/null
@@ -1,28 +0,0 @@
-#ifndef __CR_ASM_RESTORE_H__
-#define __CR_ASM_RESTORE_H__
-
-#include "asm/restorer.h"
-
-#include "protobuf/core.pb-c.h"
-
-#define JUMP_TO_RESTORER_BLOB(new_sp, restore_task_exec_start, \
- task_args) \
- asm volatile( \
- "and sp, %0, #~15 \n" \
- "mov x0, %2 \n" \
- "br %1 \n" \
- : \
- : "r"(new_sp), \
- "r"(restore_task_exec_start), \
- "r"(task_args) \
- : "sp", "x0", "memory")
-
-static inline void core_get_tls(CoreEntry *pcore, tls_t *ptls)
-{
- *ptls = pcore->ti_aarch64->tls;
-}
-
-
-int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core);
-
-#endif
diff --git a/arch/aarch64/include/asm/restorer.h b/arch/aarch64/include/asm/restorer.h
deleted file mode 100644
index 583f9583b836..000000000000
--- a/arch/aarch64/include/asm/restorer.h
+++ /dev/null
@@ -1,121 +0,0 @@
-#ifndef __CR_ASM_RESTORER_H__
-#define __CR_ASM_RESTORER_H__
-
-#include <asm/sigcontext.h>
-#include <sys/ucontext.h>
-
-#include "asm/types.h"
-#include "protobuf/core.pb-c.h"
-
-/* Copied from the kernel header arch/arm64/include/uapi/asm/sigcontext.h */
-
-#define FPSIMD_MAGIC 0x46508001
-
-typedef struct fpsimd_context fpu_state_t;
-
-
-struct aux_context {
- struct fpsimd_context fpsimd;
- /* additional context to be added before "end" */
- struct _aarch64_ctx end;
-};
-
-
-// XXX: the idetifier rt_sigcontext is expected to be struct by the CRIU code
-#define rt_sigcontext sigcontext
-
-
-#include "sigframe.h"
-
-
-/* Copied from the kernel source arch/arm64/kernel/signal.c */
-
-struct rt_sigframe {
- siginfo_t info;
- struct ucontext uc;
- u64 fp;
- u64 lr;
-};
-
-
-#define ARCH_RT_SIGRETURN(new_sp) \
- asm volatile( \
- "mov sp, %0 \n" \
- "mov x8, #"__stringify(__NR_rt_sigreturn)" \n" \
- "svc #0 \n" \
- : \
- : "r"(new_sp) \
- : "sp", "x8", "memory")
-
-#define RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, \
- thread_args, clone_restore_fn) \
- asm volatile( \
- "clone_emul: \n" \
- "ldr x1, %2 \n" \
- "and x1, x1, #~15 \n" \
- "sub x1, x1, #16 \n" \
- "stp %5, %6, [x1] \n" \
- "mov x0, %1 \n" \
- "mov x2, %3 \n" \
- "mov x3, %4 \n" \
- "mov x8, #"__stringify(__NR_clone)" \n" \
- "svc #0 \n" \
- \
- "cbz x0, thread_run \n" \
- \
- "mov %0, x0 \n" \
- "b clone_end \n" \
- \
- "thread_run: \n" \
- "ldp x1, x0, [sp] \n" \
- "br x1 \n" \
- \
- "clone_end: \n" \
- : "=r"(ret) \
- : "r"(clone_flags), \
- "m"(new_sp), \
- "r"(&parent_tid), \
- "r"(&thread_args[i].pid), \
- "r"(clone_restore_fn), \
- "r"(&thread_args[i]) \
- : "x0", "x1", "x2", "x3", "x8", "memory")
-
-#define ARCH_FAIL_CORE_RESTORE \
- asm volatile( \
- "mov sp, %0 \n" \
- "mov x0, #0 \n" \
- "b x0 \n" \
- : \
- : "r"(ret) \
- : "sp", "x0", "memory")
-
-
-#define RT_SIGFRAME_UC(rt_sigframe) rt_sigframe->uc
-#define RT_SIGFRAME_REGIP(rt_sigframe) ((long unsigned int)(rt_sigframe)->uc.uc_mcontext.pc)
-#define RT_SIGFRAME_HAS_FPU(rt_sigframe) (1)
-#define RT_SIGFRAME_FPU(rt_sigframe) ((struct aux_context*)&(rt_sigframe)->uc.uc_mcontext.__reserved)->fpsimd
-
-#define SIGFRAME_OFFSET 0
-
-
-int restore_gpregs(struct rt_sigframe *f, UserAarch64RegsEntry *r);
-int restore_nonsigframe_gpregs(UserAarch64RegsEntry *r);
-
-static inline int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, fpu_state_t *fpu_state) { return 0; }
-
-static inline void restore_tls(tls_t *ptls)
-{
- asm("msr tpidr_el0, %0" : : "r" (*ptls));
-}
-
-static inline int ptrace_set_breakpoint(pid_t pid, void *addr)
-{
- return 0;
-}
-
-static inline int ptrace_flush_breakpoints(pid_t pid)
-{
- return 0;
-}
-
-#endif
diff --git a/arch/aarch64/include/asm/string.h b/arch/aarch64/include/asm/string.h
deleted file mode 100644
index 2c3a34fbbd3f..000000000000
--- a/arch/aarch64/include/asm/string.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef __CR_ASM_STRING_H__
-#define __CR_ASM_STRING_H__
-
-#include "compiler.h"
-#include "asm-generic/string.h"
-
-#endif /* __CR_ASM_STRING_H__ */
diff --git a/arch/aarch64/include/asm/syscall-aux.S b/arch/aarch64/include/asm/syscall-aux.S
deleted file mode 100644
index 00ccf79c30e3..000000000000
--- a/arch/aarch64/include/asm/syscall-aux.S
+++ /dev/null
@@ -1,37 +0,0 @@
-/**
- * This source contains emulation of syscalls
- * that are not implemented in the AArch64 Linux kernel
- */
-
-ENTRY(sys_open)
- mov x3, x2
- mov x2, x1
- mov x1, x0
- mov x0, #-100
- b sys_openat
-END(sys_open)
-
-
-ENTRY(sys_mkdir)
- mov x3, x2
- mov x2, x1
- mov x1, x0
- mov x0, #-100
- b sys_mkdirat
-END(sys_mkdir)
-
-
-ENTRY(sys_rmdir)
- mov x2, #0x200 // flags = AT_REMOVEDIR
- mov x1, x0
- mov x0, #-100
- b sys_unlinkat
-END(sys_rmdir)
-
-
-ENTRY(sys_unlink)
- mov x2, #0 // flags = 0
- mov x1, x0
- mov x0, #-100
- b sys_unlinkat
-END(sys_unlink)
diff --git a/arch/aarch64/include/asm/syscall-aux.h b/arch/aarch64/include/asm/syscall-aux.h
deleted file mode 100644
index 814c7a9dddc0..000000000000
--- a/arch/aarch64/include/asm/syscall-aux.h
+++ /dev/null
@@ -1 +0,0 @@
-#define __NR_openat 56
diff --git a/arch/aarch64/include/asm/types.h b/arch/aarch64/include/asm/types.h
deleted file mode 100644
index d6c890dc0b16..000000000000
--- a/arch/aarch64/include/asm/types.h
+++ /dev/null
@@ -1,102 +0,0 @@
-#ifndef __CR_ASM_TYPES_H__
-#define __CR_ASM_TYPES_H__
-
-#include <stdbool.h>
-#include <signal.h>
-#include <asm/ptrace.h>
-#include "protobuf/core.pb-c.h"
-
-#include "asm/page.h"
-#include "asm/bitops.h"
-#include "asm/int.h"
-
-
-#define SIGMAX 64
-#define SIGMAX_OLD 31
-
-typedef void rt_signalfn_t(int, siginfo_t *, void *);
-typedef rt_signalfn_t *rt_sighandler_t;
-
-typedef void rt_restorefn_t(void);
-typedef rt_restorefn_t *rt_sigrestore_t;
-
-#define _KNSIG 64
-#define _NSIG_BPW 64
-
-#define _KNSIG_WORDS (_KNSIG / _NSIG_BPW)
-
-typedef struct {
- unsigned long sig[_KNSIG_WORDS];
-} k_rtsigset_t;
-
-static inline void ksigfillset(k_rtsigset_t *set)
-{
- int i;
- for (i = 0; i < _KNSIG_WORDS; i++)
- set->sig[i] = (unsigned long)-1;
-}
-
-#define SA_RESTORER 0x00000000
-
-typedef struct {
- rt_sighandler_t rt_sa_handler;
- unsigned long rt_sa_flags;
- rt_sigrestore_t rt_sa_restorer;
- k_rtsigset_t rt_sa_mask;
-} rt_sigaction_t;
-
-/*
- * Copied from the Linux kernel header arch/arm64/include/uapi/asm/ptrace.h
- *
- * A thread ARM CPU context
- */
-
-typedef struct user_pt_regs user_regs_struct_t;
-
-
-#define ASSIGN_TYPED(a, b) do { a = (typeof(a))b; } while (0)
-#define ASSIGN_MEMBER(a,b,m) do { ASSIGN_TYPED((a)->m, (b)->m); } while (0)
-
-#define REG_RES(regs) ((u64)(regs).regs[0])
-#define REG_IP(regs) ((u64)(regs).pc)
-#define REG_SYSCALL_NR(regs) ((u64)(regs).regs[8])
-
-/*
- * Range for task size calculated from the following Linux kernel files:
- * arch/arm64/include/asm/memory.h
- * arch/arm64/Kconfig
- *
- * TODO: handle 32 bit tasks
- */
-#define TASK_SIZE_MIN (1UL << 39)
-#define TASK_SIZE_MAX (1UL << 48)
-
-int munmap(void *addr, size_t length);
-
-static inline unsigned long task_size() {
- unsigned long task_size;
-
- for (task_size = TASK_SIZE_MIN; task_size < TASK_SIZE_MAX; task_size <<= 1)
- if (munmap((void *)task_size, page_size()))
- break;
-
- return task_size;
-}
-
-#define AT_VECTOR_SIZE 40
-
-typedef UserAarch64RegsEntry UserRegsEntry;
-
-#define CORE_ENTRY__MARCH CORE_ENTRY__MARCH__AARCH64
-
-#define CORE_THREAD_ARCH_INFO(core) core->ti_aarch64
-
-#define TI_SP(core) ((core)->ti_aarch64->gpregs->sp)
-
-typedef uint64_t auxv_t;
-typedef uint64_t tls_t;
-
-static inline void *decode_pointer(uint64_t v) { return (void*)v; }
-static inline uint64_t encode_pointer(void *p) { return (uint64_t)p; }
-
-#endif /* __CR_ASM_TYPES_H__ */
diff --git a/arch/aarch64/include/asm/vdso.h b/arch/aarch64/include/asm/vdso.h
deleted file mode 100644
index d015c63877e8..000000000000
--- a/arch/aarch64/include/asm/vdso.h
+++ /dev/null
@@ -1,26 +0,0 @@
-#ifndef __CR_ASM_VDSO_H__
-#define __CR_ASM_VDSO_H__
-
-#include "asm/int.h"
-#include "asm-generic/vdso.h"
-
-/*
- * This is a minimal amount of symbols
- * we should support at the moment.
- */
-#define VDSO_SYMBOL_MAX 4
-
-#define ARCH_VDSO_SYMBOLS \
- "__kernel_clock_getres", \
- "__kernel_clock_gettime", \
- "__kernel_gettimeofday", \
- "__kernel_rt_sigreturn"
-
-struct vdso_symtable;
-extern int vdso_redirect_calls(unsigned long base_to,
- unsigned long base_from,
- struct vdso_symtable *to,
- struct vdso_symtable *from);
-extern void write_intraprocedure_branch(unsigned long to, unsigned long from);
-
-#endif /* __CR_ASM_VDSO_H__ */
diff --git a/arch/aarch64/intraprocedure.S b/arch/aarch64/intraprocedure.S
deleted file mode 100644
index e139dc8b573b..000000000000
--- a/arch/aarch64/intraprocedure.S
+++ /dev/null
@@ -1,22 +0,0 @@
-.global write_intraprocedure_branch
-
-/* to is x0, from is x1 */
-write_intraprocedure_branch:
- /* load two 32-bit instructions */
- ldr x2, loadbranch
- /* store 64 bits of instructions and 64 bits of destination address */
- stp x2, x0, [x1]
- /* perform required cache maintenance and synronization operations */
- dc cvau, x1
- dsb ish
- ic ivau, x1
- dsb ish
- isb
- ret
-
-/* intraprocedure trampoline instructions */
-loadbranch:
- ldr x16, =destination
- br x16
-/* label to get relative position of literal pool */
-destination:
diff --git a/arch/aarch64/parasite-head.S b/arch/aarch64/parasite-head.S
deleted file mode 100644
index 7a359061c4b9..000000000000
--- a/arch/aarch64/parasite-head.S
+++ /dev/null
@@ -1,21 +0,0 @@
-#include "asm/linkage.h"
-#include "parasite.h"
-
- .section .head.text, "ax"
-ENTRY(__export_parasite_head_start)
- adr x2, __export_parasite_head_start // get the address of this instruction
-
- ldr x0, __export_parasite_cmd
-
- ldr x1, parasite_args_ptr
- add x1, x1, x2 // fixup __export_parasite_args
-
- bl parasite_service
- brk #0 // the instruction BRK #0 generates the signal SIGTRAP in Linux
-
-parasite_args_ptr:
- .quad __export_parasite_args
-
-__export_parasite_cmd:
- .quad 0
-END(__export_parasite_head_start)
diff --git a/arch/aarch64/restorer.c b/arch/aarch64/restorer.c
deleted file mode 100644
index 2c61e2d03109..000000000000
--- a/arch/aarch64/restorer.c
+++ /dev/null
@@ -1,15 +0,0 @@
-#include <unistd.h>
-
-#include "restorer.h"
-#include "asm/restorer.h"
-#include "asm/string.h"
-
-#include "syscall.h"
-#include "log.h"
-#include "asm/fpu.h"
-#include "cpu.h"
-
-int restore_nonsigframe_gpregs(UserRegsEntry *r)
-{
- return 0;
-}
diff --git a/arch/aarch64/syscall-common.S b/arch/aarch64/syscall-common.S
deleted file mode 100644
index 81ec20f5516c..000000000000
--- a/arch/aarch64/syscall-common.S
+++ /dev/null
@@ -1,19 +0,0 @@
-#include "asm/linkage.h"
-
-syscall_common:
- svc #0
- ret
-
-
-.macro syscall name, nr
- ENTRY(\name)
- mov x8, \nr
- b syscall_common
- END(\name)
-.endm
-
-
-ENTRY(__cr_restore_rt)
- mov x8, __NR_rt_sigreturn
- svc #0
-END(__cr_restore_rt)
diff --git a/arch/aarch64/vdso-pie.c b/arch/aarch64/vdso-pie.c
deleted file mode 100644
index 0f06c2d191d1..000000000000
--- a/arch/aarch64/vdso-pie.c
+++ /dev/null
@@ -1,35 +0,0 @@
-#include <unistd.h>
-
-#include "asm/string.h"
-#include "asm/types.h"
-
-#include "syscall.h"
-#include "parasite-vdso.h"
-#include "log.h"
-#include "bug.h"
-
-#ifdef LOG_PREFIX
-# undef LOG_PREFIX
-#endif
-#define LOG_PREFIX "vdso: "
-
-int vdso_redirect_calls(unsigned long base_to, unsigned long base_from,
- struct vdso_symtable *to,
- struct vdso_symtable *from)
-{
- unsigned int i;
-
- for (i = 0; i < ARRAY_SIZE(to->symbols); i++) {
- if (vdso_symbol_empty(&from->symbols[i]))
- continue;
-
- pr_debug("br: %lx/%lx -> %lx/%lx (index %d)\n",
- base_from, from->symbols[i].offset,
- base_to, to->symbols[i].offset, i);
-
- write_intraprocedure_branch(base_to + to->symbols[i].offset,
- base_from + from->symbols[i].offset);
- }
-
- return 0;
-}
diff --git a/arch/arm/Makefile b/arch/arm/Makefile
deleted file mode 100644
index 2359a2c0e7be..000000000000
--- a/arch/arm/Makefile
+++ /dev/null
@@ -1,59 +0,0 @@
-targets += syscalls
-targets += crtools
-
-SYS-ASM := syscalls.S
-
-syscalls-asm-y += $(SYS-ASM:.S=).o
-crtools-obj-y += crtools.o
-crtools-obj-y += cpu.o
-
-SYS-DEF := syscall.def
-SYS-ASM-COMMON := syscall-common.S
-SYS-TYPES := include/syscall-types.h
-
-SYS-CODES := include/syscall-codes.h
-SYS-PROTO := include/syscall.h
-
-SYS-GEN := ../scripts/arm/gen-syscalls.pl
-SYS-GEN-TBL := ../scripts/arm/gen-sys-exec-tbl.pl
-
-SYS-EXEC-TBL := sys-exec-tbl.c
-
-syscalls-asm-y-asmflags += -fpie -Wstrict-prototypes -Wa,--noexecstack
-syscalls-asm-y-asmflags += -nostdlib -fomit-frame-pointer -I$(obj)
-ASMFLAGS += -D__ASSEMBLY__
-
-ARCH_BITS := 32
-
-$(obj)/$(SYS-ASM): $(obj)/$(SYS-GEN) $(obj)/$(SYS-DEF) $(obj)/$(SYS-ASM-COMMON) $(SYS-TYPES)
- $(E) " GEN " $@
- $(Q) perl \
- $(obj)/$(SYS-GEN) \
- $(obj)/$(SYS-DEF) \
- $(SYS-CODES) \
- $(SYS-PROTO) \
- $(obj)/$(SYS-ASM) \
- $(SYS-ASM-COMMON) \
- $(SYS-TYPES) \
- $(ARCH_BITS)
-
-$(obj)/syscalls.o: $(obj)/$(SYS-ASM)
-
-$(obj)/$(SYS-EXEC-TBL): $(obj)/$(SYS-GEN-TBL) $(obj)/$(SYS-DEF)
- $(E) " GEN " $@
- $(Q) perl \
- $(obj)/$(SYS-GEN-TBL) \
- $(obj)/$(SYS-DEF) \
- $(obj)/$(SYS-EXEC-TBL) \
- $(ARCH_BITS)
-
-_all += $(obj)/$(SYS-EXEC-TBL)
-
-cleanup-y += $(obj)/$(SYS-EXEC-TBL) $(obj)/$(SYS-ASM)
-cleanup-y += $(SYS-CODES)
-cleanup-y += $(SYS-PROTO)
-
-ifneq ($(MAKECMDGOALS),clean)
-deps-after := $(obj)/$(SYS-ASM)
-incdeps := y
-endif
diff --git a/arch/arm/cpu.c b/arch/arm/cpu.c
deleted file mode 100644
index 040fe14fcfb7..000000000000
--- a/arch/arm/cpu.c
+++ /dev/null
@@ -1,45 +0,0 @@
-#undef LOG_PREFIX
-#define LOG_PREFIX "cpu: "
-
-#include <errno.h>
-#include "cpu.h"
-
-bool cpu_has_feature(unsigned int feature)
-{
- return false;
-}
-
-int cpu_init(void)
-{
- return 0;
-}
-
-int cpu_dump_cpuinfo(void)
-{
- return 0;
-}
-
-int cpu_validate_cpuinfo(void)
-{
- return 0;
-}
-
-int cpu_dump_cpuinfo_single(void)
-{
- return -ENOTSUP;
-}
-
-int cpu_validate_image_cpuinfo_single(void)
-{
- return -ENOTSUP;
-}
-
-int cpuinfo_dump(void)
-{
- return -ENOTSUP;
-}
-
-int cpuinfo_check(void)
-{
- return -ENOTSUP;
-}
diff --git a/arch/arm/crtools.c b/arch/arm/crtools.c
deleted file mode 100644
index 8ce889463f6a..000000000000
--- a/arch/arm/crtools.c
+++ /dev/null
@@ -1,248 +0,0 @@
-#include <string.h>
-#include <unistd.h>
-
-#include "asm/types.h"
-#include "asm/restorer.h"
-#include "compiler.h"
-#include "ptrace.h"
-#include "asm/processor-flags.h"
-#include "protobuf.h"
-#include "protobuf/core.pb-c.h"
-#include "protobuf/creds.pb-c.h"
-#include "parasite-syscall.h"
-#include "log.h"
-#include "util.h"
-#include "cpu.h"
-#include "elf.h"
-#include "parasite-syscall.h"
-#include "restorer.h"
-#include "errno.h"
-#include "kerndat.h"
-
-
-/*
- * Injected syscall instruction
- */
-const char code_syscall[] = {
- 0x00, 0x00, 0x00, 0xef, /* SVC #0 */
- 0xf0, 0x01, 0xf0, 0xe7 /* UDF #32 */
-};
-
-const int code_syscall_size = round_up(sizeof(code_syscall), sizeof(long));
-
-static inline __always_unused void __check_code_syscall(void)
-{
- BUILD_BUG_ON(sizeof(code_syscall) != BUILTIN_SYSCALL_SIZE);
- BUILD_BUG_ON(!is_log2(sizeof(code_syscall)));
-}
-
-void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs)
-{
- regs->ARM_pc = new_ip;
- if (stack)
- regs->ARM_sp = (unsigned long)stack;
-
- /* Make sure flags are in known state */
- regs->ARM_cpsr &= PSR_f | PSR_s | PSR_x | MODE32_BIT;
-}
-
-bool arch_can_dump_task(pid_t pid)
-{
- /*
- * TODO: Add proper check here
- */
- return true;
-}
-
-int syscall_seized(struct parasite_ctl *ctl, int nr, unsigned long *ret,
- unsigned long arg1,
- unsigned long arg2,
- unsigned long arg3,
- unsigned long arg4,
- unsigned long arg5,
- unsigned long arg6)
-{
- user_regs_struct_t regs = ctl->orig.regs;
- int err;
-
- regs.ARM_r7 = (unsigned long)nr;
- regs.ARM_r0 = arg1;
- regs.ARM_r1 = arg2;
- regs.ARM_r2 = arg3;
- regs.ARM_r3 = arg4;
- regs.ARM_r4 = arg5;
- regs.ARM_r5 = arg6;
-
- err = __parasite_execute_syscall(ctl, &regs);
-
- *ret = regs.ARM_r0;
- return err;
-}
-
-#define assign_reg(dst, src, e) dst->e = (__typeof__(dst->e))src.ARM_##e
-
-#define PTRACE_GETVFPREGS 27
-int get_task_regs(pid_t pid, user_regs_struct_t regs, CoreEntry *core)
-{
- struct user_vfp vfp;
- int ret = -1;
-
- pr_info("Dumping GP/FPU registers for %d\n", pid);
-
- if (ptrace(PTRACE_GETVFPREGS, pid, NULL, &vfp)) {
- pr_perror("Can't obtain FPU registers for %d", pid);
- goto err;
- }
-
- /* Did we come from a system call? */
- if ((int)regs.ARM_ORIG_r0 >= 0) {
- /* Restart the system call */
- switch ((long)(int)regs.ARM_r0) {
- case -ERESTARTNOHAND:
- case -ERESTARTSYS:
- case -ERESTARTNOINTR:
- regs.ARM_r0 = regs.ARM_ORIG_r0;
- regs.ARM_pc -= 4;
- break;
- case -ERESTART_RESTARTBLOCK:
- regs.ARM_r0 = __NR_restart_syscall;
- regs.ARM_pc -= 4;
- break;
- }
- }
-
-
- // Save the ARM CPU state
-
- assign_reg(core->ti_arm->gpregs, regs, r0);
- assign_reg(core->ti_arm->gpregs, regs, r1);
- assign_reg(core->ti_arm->gpregs, regs, r2);
- assign_reg(core->ti_arm->gpregs, regs, r3);
- assign_reg(core->ti_arm->gpregs, regs, r4);
- assign_reg(core->ti_arm->gpregs, regs, r5);
- assign_reg(core->ti_arm->gpregs, regs, r6);
- assign_reg(core->ti_arm->gpregs, regs, r7);
- assign_reg(core->ti_arm->gpregs, regs, r8);
- assign_reg(core->ti_arm->gpregs, regs, r9);
- assign_reg(core->ti_arm->gpregs, regs, r10);
- assign_reg(core->ti_arm->gpregs, regs, fp);
- assign_reg(core->ti_arm->gpregs, regs, ip);
- assign_reg(core->ti_arm->gpregs, regs, sp);
- assign_reg(core->ti_arm->gpregs, regs, lr);
- assign_reg(core->ti_arm->gpregs, regs, pc);
- assign_reg(core->ti_arm->gpregs, regs, cpsr);
- core->ti_arm->gpregs->orig_r0 = regs.ARM_ORIG_r0;
-
-
- // Save the VFP state
-
- memcpy(CORE_THREAD_ARCH_INFO(core)->fpstate->vfp_regs, &vfp.fpregs, sizeof(vfp.fpregs));
- CORE_THREAD_ARCH_INFO(core)->fpstate->fpscr = vfp.fpscr;
-
- ret = 0;
-
-err:
- return ret;
-}
-
-int arch_alloc_thread_info(CoreEntry *core)
-{
- ThreadInfoArm *ti_arm;
- UserArmRegsEntry *gpregs;
- UserArmVfpstateEntry *fpstate;
-
- ti_arm = xmalloc(sizeof(*ti_arm));
- if (!ti_arm)
- goto err;
- thread_info_arm__init(ti_arm);
- core->ti_arm = ti_arm;
-
- gpregs = xmalloc(sizeof(*gpregs));
- user_arm_regs_entry__init(gpregs);
- ti_arm->gpregs = gpregs;
-
- fpstate = xmalloc(sizeof(*fpstate));
- if (!fpstate)
- goto err;
- user_arm_vfpstate_entry__init(fpstate);
- ti_arm->fpstate = fpstate;
- fpstate->vfp_regs = xmalloc(32*sizeof(unsigned long long));
- fpstate->n_vfp_regs = 32;
- if (!fpstate->vfp_regs)
- goto err;
-
- return 0;
-err:
- return -1;
-}
-
-void arch_free_thread_info(CoreEntry *core)
-{
- if (CORE_THREAD_ARCH_INFO(core)) {
- if (CORE_THREAD_ARCH_INFO(core)->fpstate) {
- xfree(CORE_THREAD_ARCH_INFO(core)->fpstate->vfp_regs);
- xfree(CORE_THREAD_ARCH_INFO(core)->fpstate);
- }
- xfree(CORE_THREAD_ARCH_INFO(core)->gpregs);
- xfree(CORE_THREAD_ARCH_INFO(core));
- CORE_THREAD_ARCH_INFO(core) = NULL;
- }
-}
-
-int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core)
-{
- struct aux_sigframe *aux = (struct aux_sigframe *)&sigframe->sig.uc.uc_regspace;
-
- memcpy(&aux->vfp.ufp.fpregs, CORE_THREAD_ARCH_INFO(core)->fpstate->vfp_regs, sizeof(aux->vfp.ufp.fpregs));
- aux->vfp.ufp.fpscr = CORE_THREAD_ARCH_INFO(core)->fpstate->fpscr;
- aux->vfp.magic = VFP_MAGIC;
- aux->vfp.size = VFP_STORAGE_SIZE;
- return 0;
-}
-
-void *mmap_seized(struct parasite_ctl *ctl,
- void *addr, size_t length, int prot,
- int flags, int fd, off_t offset)
-{
- unsigned long map;
- int err;
-
- if (offset & ~PAGE_MASK)
- return 0;
-
- err = syscall_seized(ctl, __NR_mmap2, &map,
- (unsigned long)addr, length, prot, flags, fd, offset >> 12);
- if (err < 0 || map > kdat.task_size)
- map = 0;
-
- return (void *)map;
-}
-
-int restore_gpregs(struct rt_sigframe *f, UserArmRegsEntry *r)
-{
-#define CPREG1(d) f->sig.uc.uc_mcontext.arm_##d = r->d
-#define CPREG2(d, s) f->sig.uc.uc_mcontext.arm_##d = r->s
-
- CPREG1(r0);
- CPREG1(r1);
- CPREG1(r2);
- CPREG1(r3);
- CPREG1(r4);
- CPREG1(r5);
- CPREG1(r6);
- CPREG1(r7);
- CPREG1(r8);
- CPREG1(r9);
- CPREG1(r10);
- CPREG1(fp);
- CPREG1(ip);
- CPREG1(sp);
- CPREG1(lr);
- CPREG1(pc);
- CPREG1(cpsr);
-
-#undef CPREG1
-#undef CPREG2
-
- return 0;
-}
diff --git a/arch/arm/include/asm/atomic.h b/arch/arm/include/asm/atomic.h
deleted file mode 100644
index cd0df377245c..000000000000
--- a/arch/arm/include/asm/atomic.h
+++ /dev/null
@@ -1,131 +0,0 @@
-#ifndef __CR_ATOMIC_H__
-#define __CR_ATOMIC_H__
-
-#include "asm/processor.h"
-
-typedef struct {
- int counter;
-} atomic_t;
-
-
-/* Copied from the Linux kernel header arch/arm/include/asm/atomic.h */
-
-#if defined(CONFIG_ARMV7)
-
-#define smp_mb() __asm__ __volatile__ ("dmb" : : : "memory")
-
-static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new)
-{
- int oldval;
- unsigned long res;
-
- smp_mb();
- prefetchw(&ptr->counter);
-
- do {
- __asm__ __volatile__("@ atomic_cmpxchg\n"
- "ldrex %1, [%3]\n"
- "mov %0, #0\n"
- "teq %1, %4\n"
- "strexeq %0, %5, [%3]\n"
- : "=&r" (res), "=&r" (oldval), "+Qo" (ptr->counter)
- : "r" (&ptr->counter), "Ir" (old), "r" (new)
- : "cc");
- } while (res);
-
- smp_mb();
-
- return oldval;
-}
-
-#elif defined(CONFIG_ARMV6)
-
-/* SMP isn't supported for ARMv6 */
-
-#define smp_mb() __asm__ __volatile__ ("mcr p15, 0, %0, c7, c10, 5" : : "r" (0) : "memory")
-
-static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
-{
- int ret;
-
- ret = v->counter;
- if (ret == old)
- v->counter = new;
-
- return ret;
-}
-
-#else
-
-#error ARM architecture version (CONFIG_ARMV*) not set or unsupported.
-
-#endif
-
-static inline int atomic_read(const atomic_t *v)
-{
- return (*(volatile int *)&(v)->counter);
-}
-
-static inline void atomic_set(atomic_t *v, int i)
-{
- v->counter = i;
-}
-
-#define atomic_get atomic_read
-
-static inline int atomic_add_return(int i, atomic_t *v)
-{
- unsigned long tmp;
- int result;
-
- smp_mb();
-
- __asm__ __volatile__("@ atomic_add_return\n"
-"1: ldrex %0, [%3]\n"
-" add %0, %0, %4\n"
-" strex %1, %0, [%3]\n"
-" teq %1, #0\n"
-" bne 1b\n"
- : "=&r" (result), "=&r" (tmp), "+Qo" (v->counter)
- : "r" (&v->counter), "Ir" (i)
- : "cc");
-
- smp_mb();
-
- return result;
-}
-
-static inline int atomic_sub_return(int i, atomic_t *v)
-{
- unsigned long tmp;
- int result;
-
- smp_mb();
-
- __asm__ __volatile__("@ atomic_sub_return\n"
-"1: ldrex %0, [%3]\n"
-" sub %0, %0, %4\n"
-" strex %1, %0, [%3]\n"
-" teq %1, #0\n"
-" bne 1b\n"
- : "=&r" (result), "=&r" (tmp), "+Qo" (v->counter)
- : "r" (&v->counter), "Ir" (i)
- : "cc");
-
- smp_mb();
-
- return result;
-}
-
-static inline int atomic_inc(atomic_t *v) { return atomic_add_return(1, v) - 1; }
-
-static inline int atomic_add(int val, atomic_t *v) { return atomic_add_return(val, v) - val; }
-
-static inline int atomic_dec(atomic_t *v) { return atomic_sub_return(1, v) + 1; }
-
-/* true if the result is 0, or false for all other cases. */
-#define atomic_dec_and_test(v) (atomic_sub_return(1, v) == 0)
-
-#define atomic_inc_return(v) (atomic_add_return(1, v))
-
-#endif /* __CR_ATOMIC_H__ */
diff --git a/arch/arm/include/asm/bitops.h b/arch/arm/include/asm/bitops.h
deleted file mode 100644
index 5a750447f25f..000000000000
--- a/arch/arm/include/asm/bitops.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef __CR_ASM_BITOPS_H__
-#define __CR_ASM_BITOPS_H__
-
-#include "compiler.h"
-#include "asm-generic/bitops.h"
-
-#endif /* __CR_ASM_BITOPS_H__ */
diff --git a/arch/arm/include/asm/bitsperlong.h b/arch/arm/include/asm/bitsperlong.h
deleted file mode 100644
index 43858b765320..000000000000
--- a/arch/arm/include/asm/bitsperlong.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef __CR_BITSPERLONG_H__
-#define __CR_BITSPERLONG_H__
-
-#define BITS_PER_LONG 32
-
-#endif /* __CR_BITSPERLONG_H__ */
diff --git a/arch/arm/include/asm/cpu.h b/arch/arm/include/asm/cpu.h
deleted file mode 100644
index 59118c211d10..000000000000
--- a/arch/arm/include/asm/cpu.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <stdbool.h>
diff --git a/arch/arm/include/asm/dump.h b/arch/arm/include/asm/dump.h
deleted file mode 100644
index ae1588da8792..000000000000
--- a/arch/arm/include/asm/dump.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#ifndef __CR_ASM_DUMP_H__
-#define __CR_ASM_DUMP_H__
-
-extern int get_task_regs(pid_t pid, user_regs_struct_t regs, CoreEntry *core);
-extern int arch_alloc_thread_info(CoreEntry *core);
-extern void arch_free_thread_info(CoreEntry *core);
-
-
-static inline void core_put_tls(CoreEntry *core, tls_t tls)
-{
- core->ti_arm->tls = tls;
-}
-
-#endif
diff --git a/arch/arm/include/asm/fpu.h b/arch/arm/include/asm/fpu.h
deleted file mode 100644
index 7f476d541a7d..000000000000
--- a/arch/arm/include/asm/fpu.h
+++ /dev/null
@@ -1,4 +0,0 @@
-#ifndef __CR_ASM_FPU_H__
-#define __CR_ASM_FPU_H__
-
-#endif /* __CR_ASM_FPU_H__ */
diff --git a/arch/arm/include/asm/int.h b/arch/arm/include/asm/int.h
deleted file mode 100644
index 642804e9b485..000000000000
--- a/arch/arm/include/asm/int.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef __CR_ASM_INT_H__
-#define __CR_ASM_INT_H__
-
-#include "asm-generic/int.h"
-
-#endif /* __CR_ASM_INT_H__ */
diff --git a/arch/arm/include/asm/linkage.h b/arch/arm/include/asm/linkage.h
deleted file mode 100644
index 7380642337a0..000000000000
--- a/arch/arm/include/asm/linkage.h
+++ /dev/null
@@ -1,24 +0,0 @@
-#ifndef __CR_LINKAGE_H__
-#define __CR_LINKAGE_H__
-
-#ifdef __ASSEMBLY__
-
-#define __ALIGN .align 4, 0x00
-#define __ALIGN_STR ".align 4, 0x00"
-
-#define GLOBAL(name) \
- .globl name; \
- name:
-
-#define ENTRY(name) \
- .globl name; \
- .type name, #function; \
- __ALIGN; \
- name:
-
-#define END(sym) \
- .size sym, . - sym
-
-#endif /* __ASSEMBLY__ */
-
-#endif /* __CR_LINKAGE_H__ */
diff --git a/arch/arm/include/asm/page.h b/arch/arm/include/asm/page.h
deleted file mode 100644
index 134835556c62..000000000000
--- a/arch/arm/include/asm/page.h
+++ /dev/null
@@ -1,19 +0,0 @@
-#ifndef __CR_ASM_PAGE_H__
-#define __CR_ASM_PAGE_H__
-
-#ifndef PAGE_SHIFT
-# define PAGE_SHIFT 12
-#endif
-
-#ifndef PAGE_SIZE
-# define PAGE_SIZE (1UL << PAGE_SHIFT)
-#endif
-
-#ifndef PAGE_MASK
-# define PAGE_MASK (~(PAGE_SIZE - 1))
-#endif
-
-#define PAGE_PFN(addr) ((addr) / PAGE_SIZE)
-#define page_size() PAGE_SIZE
-
-#endif /* __CR_ASM_PAGE_H__ */
diff --git a/arch/arm/include/asm/parasite-syscall.h b/arch/arm/include/asm/parasite-syscall.h
deleted file mode 100644
index 0c66bf992cad..000000000000
--- a/arch/arm/include/asm/parasite-syscall.h
+++ /dev/null
@@ -1,18 +0,0 @@
-#ifndef __CR_ASM_PARASITE_SYSCALL_H__
-#define __CR_ASM_PARASITE_SYSCALL_H__
-
-
-#define ARCH_SI_TRAP TRAP_BRKPT
-
-
-extern const char code_syscall[];
-extern const int code_syscall_size;
-
-
-void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs);
-
-void *mmap_seized(struct parasite_ctl *ctl,
- void *addr, size_t length, int prot,
- int flags, int fd, off_t offset);
-
-#endif
diff --git a/arch/arm/include/asm/parasite.h b/arch/arm/include/asm/parasite.h
deleted file mode 100644
index 7f62bb9d27be..000000000000
--- a/arch/arm/include/asm/parasite.h
+++ /dev/null
@@ -1,9 +0,0 @@
-#ifndef __ASM_PARASITE_H__
-#define __ASM_PARASITE_H__
-
-static inline void arch_get_tls(tls_t *ptls)
-{
- *ptls = ((tls_t (*)())0xffff0fe0)();
-}
-
-#endif
diff --git a/arch/arm/include/asm/processor-flags.h b/arch/arm/include/asm/processor-flags.h
deleted file mode 100644
index fc00a9e64a2e..000000000000
--- a/arch/arm/include/asm/processor-flags.h
+++ /dev/null
@@ -1,42 +0,0 @@
-#ifndef __CR_PROCESSOR_FLAGS_H__
-#define __CR_PROCESSOR_FLAGS_H__
-
-/* Copied from the Linux kernel header arch/arm/include/uapi/asm/ptrace.h */
-
-/*
- * PSR bits
- */
-#define USR26_MODE 0x00000000
-#define FIQ26_MODE 0x00000001
-#define IRQ26_MODE 0x00000002
-#define SVC26_MODE 0x00000003
-#define USR_MODE 0x00000010
-#define FIQ_MODE 0x00000011
-#define IRQ_MODE 0x00000012
-#define SVC_MODE 0x00000013
-#define ABT_MODE 0x00000017
-#define UND_MODE 0x0000001b
-#define SYSTEM_MODE 0x0000001f
-#define MODE32_BIT 0x00000010
-#define MODE_MASK 0x0000001f
-#define PSR_T_BIT 0x00000020
-#define PSR_F_BIT 0x00000040
-#define PSR_I_BIT 0x00000080
-#define PSR_A_BIT 0x00000100
-#define PSR_E_BIT 0x00000200
-#define PSR_J_BIT 0x01000000
-#define PSR_Q_BIT 0x08000000
-#define PSR_V_BIT 0x10000000
-#define PSR_C_BIT 0x20000000
-#define PSR_Z_BIT 0x40000000
-#define PSR_N_BIT 0x80000000
-
-/*
- * Groups of PSR bits
- */
-#define PSR_f 0xff000000 /* Flags */
-#define PSR_s 0x00ff0000 /* Status */
-#define PSR_x 0x0000ff00 /* Extension */
-#define PSR_c 0x000000ff /* Control */
-
-#endif
diff --git a/arch/arm/include/asm/processor.h b/arch/arm/include/asm/processor.h
deleted file mode 100644
index a390cfd322ec..000000000000
--- a/arch/arm/include/asm/processor.h
+++ /dev/null
@@ -1,28 +0,0 @@
-#ifndef __CR_PROCESSOR_H__
-#define __CR_PROCESSOR_H__
-
-/* Copied from linux kernel arch/arm/include/asm/unified.h */
-
-#define WASM(instr) #instr
-
-/* Copied from linux kernel arch/arm/include/asm/processor.h */
-
-#define __ALT_SMP_ASM(smp, up) \
- "9998: " smp "\n" \
- " .pushsection \".alt.smp.init\", \"a\"\n" \
- " .long 9998b\n" \
- " " up "\n" \
- " .popsection\n"
-
-static inline void prefetchw(const void *ptr)
-{
- __asm__ __volatile__(
- ".arch_extension mp\n"
- __ALT_SMP_ASM(
- WASM(pldw) "\t%a0",
- WASM(pld) "\t%a0"
- )
- :: "p" (ptr));
-}
-
-#endif /* __CR_PROCESSOR_H__ */
diff --git a/arch/arm/include/asm/restore.h b/arch/arm/include/asm/restore.h
deleted file mode 100644
index a1e66a5d5aab..000000000000
--- a/arch/arm/include/asm/restore.h
+++ /dev/null
@@ -1,29 +0,0 @@
-#ifndef __CR_ASM_RESTORE_H__
-#define __CR_ASM_RESTORE_H__
-
-#include "asm/restorer.h"
-
-#include "protobuf/core.pb-c.h"
-
-#define JUMP_TO_RESTORER_BLOB(new_sp, restore_task_exec_start, \
- task_args) \
- asm volatile( \
- "mov %%sp, %%%0 \n" \
- "mov %%r1, %%%1 \n" \
- "mov %%r0, %%%2 \n" \
- "bx %%r1 \n" \
- : \
- : "r"(new_sp), \
- "r"(restore_task_exec_start), \
- "r"(task_args) \
- : "sp", "r0", "r1", "memory")
-
-static inline void core_get_tls(CoreEntry *pcore, tls_t *ptls)
-{
- *ptls = pcore->ti_arm->tls;
-}
-
-
-int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core);
-
-#endif
diff --git a/arch/arm/include/asm/restorer.h b/arch/arm/include/asm/restorer.h
deleted file mode 100644
index 8acb2d3e7db0..000000000000
--- a/arch/arm/include/asm/restorer.h
+++ /dev/null
@@ -1,163 +0,0 @@
-#ifndef __CR_ASM_RESTORER_H__
-#define __CR_ASM_RESTORER_H__
-
-#include "asm/types.h"
-#include "protobuf/core.pb-c.h"
-
-/* Copied from the Linux kernel header arch/arm/include/asm/sigcontext.h */
-
-struct rt_sigcontext {
- unsigned long trap_no;
- unsigned long error_code;
- unsigned long oldmask;
- unsigned long arm_r0;
- unsigned long arm_r1;
- unsigned long arm_r2;
- unsigned long arm_r3;
- unsigned long arm_r4;
- unsigned long arm_r5;
- unsigned long arm_r6;
- unsigned long arm_r7;
- unsigned long arm_r8;
- unsigned long arm_r9;
- unsigned long arm_r10;
- unsigned long arm_fp;
- unsigned long arm_ip;
- unsigned long arm_sp;
- unsigned long arm_lr;
- unsigned long arm_pc;
- unsigned long arm_cpsr;
- unsigned long fault_address;
-};
-
-/* Copied from the Linux kernel header arch/arm/include/asm/ucontext.h */
-
-#define VFP_MAGIC 0x56465001
-#define VFP_STORAGE_SIZE sizeof(struct vfp_sigframe)
-
-struct vfp_sigframe {
- unsigned long magic;
- unsigned long size;
- struct user_vfp ufp;
- struct user_vfp_exc ufp_exc;
-};
-
-typedef struct vfp_sigframe fpu_state_t;
-
-struct aux_sigframe {
- /*
- struct crunch_sigframe crunch;
- struct iwmmxt_sigframe iwmmxt;
- */
-
- struct vfp_sigframe vfp;
- unsigned long end_magic;
-} __attribute__((__aligned__(8)));
-
-#include "sigframe.h"
-
-struct sigframe {
- struct rt_ucontext uc;
- unsigned long retcode[2];
-};
-
-struct rt_sigframe {
- struct rt_siginfo info;
- struct sigframe sig;
-};
-
-
-#define ARCH_RT_SIGRETURN(new_sp) \
- asm volatile( \
- "mov %%sp, %0 \n" \
- "mov %%r7, #"__stringify(__NR_rt_sigreturn)" \n" \
- "svc #0 \n" \
- : \
- : "r"(new_sp) \
- : "sp","memory")
-
-#define RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, \
- thread_args, clone_restore_fn) \
- asm volatile( \
- "clone_emul: \n" \
- "ldr %%r1, %2 \n" \
- "sub %%r1, #16 \n" \
- "mov %%r0, %%%6 \n" \
- "str %%r0, [%%r1, #4] \n" \
- "mov %%r0, %%%5 \n" \
- "str %%r0, [%%r1] \n" \
- "mov %%r0, %%%1 \n" \
- "mov %%r2, %%%3 \n" \
- "mov %%r3, %%%4 \n" \
- "mov %%r7, #"__stringify(__NR_clone)" \n" \
- "svc #0 \n" \
- \
- "cmp %%r0, #0 \n" \
- "beq thread_run \n" \
- \
- "mov %%%0, %%r0 \n" \
- "b clone_end \n" \
- \
- "thread_run: \n" \
- "pop { %%r1 } \n" \
- "pop { %%r0 } \n" \
- "bx %%r1 \n" \
- \
- "clone_end: \n" \
- : "=r"(ret) \
- : "r"(clone_flags), \
- "m"(new_sp), \
- "r"(&parent_tid), \
- "r"(&thread_args[i].pid), \
- "r"(clone_restore_fn), \
- "r"(&thread_args[i]) \
- : "r0", "r1", "r2", "r3", "r7", "memory")
-
-#define ARCH_FAIL_CORE_RESTORE \
- asm volatile( \
- "mov %%sp, %0 \n" \
- "mov %%r0, #0 \n" \
- "bx %%r0 \n" \
- : \
- : "r"(ret) \
- : "memory")
-
-
-#define RT_SIGFRAME_UC(rt_sigframe) rt_sigframe->sig.uc
-#define RT_SIGFRAME_REGIP(rt_sigframe) (rt_sigframe)->sig.uc.uc_mcontext.arm_ip
-#define RT_SIGFRAME_HAS_FPU(rt_sigframe) 1
-#define RT_SIGFRAME_FPU(rt_sigframe) ((struct aux_sigframe *)&sigframe->sig.uc.uc_regspace)->vfp
-
-#define SIGFRAME_OFFSET 0
-
-
-int restore_gpregs(struct rt_sigframe *f, UserArmRegsEntry *r);
-int restore_nonsigframe_gpregs(UserArmRegsEntry *r);
-
-static inline int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, fpu_state_t *fpu_state) { return 0; }
-
-static inline void restore_tls(tls_t *ptls) {
- asm (
- "mov %%r7, #15 \n"
- "lsl %%r7, #16 \n"
- "mov %%r0, #5 \n"
- "add %%r7, %%r0 \n" /* r7 = 0xF005 */
- "ldr %%r0, [%0] \n"
- "svc #0 \n"
- :
- : "r"(ptls)
- : "r0", "r7"
- );
-}
-
-static inline int ptrace_set_breakpoint(pid_t pid, void *addr)
-{
- return 0;
-}
-
-static inline int ptrace_flush_breakpoints(pid_t pid)
-{
- return 0;
-}
-
-#endif
diff --git a/arch/arm/include/asm/string.h b/arch/arm/include/asm/string.h
deleted file mode 100644
index 2c3a34fbbd3f..000000000000
--- a/arch/arm/include/asm/string.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef __CR_ASM_STRING_H__
-#define __CR_ASM_STRING_H__
-
-#include "compiler.h"
-#include "asm-generic/string.h"
-
-#endif /* __CR_ASM_STRING_H__ */
diff --git a/arch/arm/include/asm/syscall-aux.S b/arch/arm/include/asm/syscall-aux.S
deleted file mode 100644
index 8bc01c3eccb2..000000000000
--- a/arch/arm/include/asm/syscall-aux.S
+++ /dev/null
@@ -1,13 +0,0 @@
-nr_sys_mmap:
- .long 192
-
-ENTRY(sys_mmap)
- push {%r4, %r5, %r7, %lr}
- ldr %r4, [%sp, #16]
- ldr %r5, [%sp, #20]
- lsr %r5, #12
- adr %r7, nr_sys_mmap
- ldr %r7, [%r7]
- svc 0x00000000
- pop {%r4, %r5, %r7, %pc}
-END(sys_mmap)
diff --git a/arch/arm/include/asm/syscall-aux.h b/arch/arm/include/asm/syscall-aux.h
deleted file mode 100644
index ec8c2d38352a..000000000000
--- a/arch/arm/include/asm/syscall-aux.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#define __NR_mmap2 192
-
-#define __ARM_NR_BASE 0x0f0000
-#define __ARM_NR_breakpoint (__ARM_NR_BASE+1)
-#define __ARM_NR_cacheflush (__ARM_NR_BASE+2)
-#define __ARM_NR_usr26 (__ARM_NR_BASE+3)
-#define __ARM_NR_usr32 (__ARM_NR_BASE+4)
-#define __ARM_NR_set_tls (__ARM_NR_BASE+5)
diff --git a/arch/arm/include/asm/types.h b/arch/arm/include/asm/types.h
deleted file mode 100644
index d1d6cc3a2fa5..000000000000
--- a/arch/arm/include/asm/types.h
+++ /dev/null
@@ -1,137 +0,0 @@
-#ifndef __CR_ASM_TYPES_H__
-#define __CR_ASM_TYPES_H__
-
-#include <stdbool.h>
-#include <signal.h>
-#include "protobuf/core.pb-c.h"
-
-#include "asm/page.h"
-#include "asm/bitops.h"
-#include "asm/int.h"
-
-#define SIGMAX 64
-#define SIGMAX_OLD 31
-
-#define MAJOR(dev) ((dev)>>8)
-#define MINOR(dev) ((dev) & 0xff)
-
-typedef void rt_signalfn_t(int, siginfo_t *, void *);
-typedef rt_signalfn_t *rt_sighandler_t;
-
-typedef void rt_restorefn_t(void);
-typedef rt_restorefn_t *rt_sigrestore_t;
-
-#define _KNSIG 64
-#define _NSIG_BPW 32
-
-#define _KNSIG_WORDS (_KNSIG / _NSIG_BPW)
-
-typedef struct {
- unsigned long sig[_KNSIG_WORDS];
-} k_rtsigset_t;
-
-static inline void ksigfillset(k_rtsigset_t *set)
-{
- int i;
- for (i = 0; i < _KNSIG_WORDS; i++)
- set->sig[i] = (unsigned long)-1;
-}
-
-#define SA_RESTORER 0x04000000
-
-typedef struct {
- rt_sighandler_t rt_sa_handler;
- unsigned long rt_sa_flags;
- rt_sigrestore_t rt_sa_restorer;
- k_rtsigset_t rt_sa_mask;
-} rt_sigaction_t;
-
-/*
- * Copied from the Linux kernel header arch/arm/include/asm/ptrace.h
- *
- * A thread ARM CPU context
- */
-
-typedef struct {
- long uregs[18];
-} user_regs_struct_t;
-
-#define ARM_cpsr uregs[16]
-#define ARM_pc uregs[15]
-#define ARM_lr uregs[14]
-#define ARM_sp uregs[13]
-#define ARM_ip uregs[12]
-#define ARM_fp uregs[11]
-#define ARM_r10 uregs[10]
-#define ARM_r9 uregs[9]
-#define ARM_r8 uregs[8]
-#define ARM_r7 uregs[7]
-#define ARM_r6 uregs[6]
-#define ARM_r5 uregs[5]
-#define ARM_r4 uregs[4]
-#define ARM_r3 uregs[3]
-#define ARM_r2 uregs[2]
-#define ARM_r1 uregs[1]
-#define ARM_r0 uregs[0]
-#define ARM_ORIG_r0 uregs[17]
-
-
-/* Copied from arch/arm/include/asm/user.h */
-
-struct user_vfp {
- unsigned long long fpregs[32];
- unsigned long fpscr;
-};
-
-struct user_vfp_exc {
- unsigned long fpexc;
- unsigned long fpinst;
- unsigned long fpinst2;
-};
-
-#define ASSIGN_TYPED(a, b) do { a = (typeof(a))b; } while (0)
-#define ASSIGN_MEMBER(a,b,m) do { ASSIGN_TYPED((a)->m, (b)->m); } while (0)
-
-#define REG_RES(regs) ((regs).ARM_r0)
-#define REG_IP(regs) ((regs).ARM_pc)
-#define REG_SYSCALL_NR(regs) ((regs).ARM_r7)
-
-/*
- * Range for task size calculated from the following Linux kernel files:
- * arch/arm/include/asm/memory.h
- * arch/arm/Kconfig (PAGE_OFFSET values in Memory split section)
- */
-#define TASK_SIZE_MIN 0x3f000000
-#define TASK_SIZE_MAX 0xbf000000
-#define SZ_1G 0x40000000
-
-int munmap(void *addr, size_t length);
-
-static inline unsigned long task_size(void)
-{
- unsigned long task_size;
-
- for (task_size = TASK_SIZE_MIN; task_size < TASK_SIZE_MAX; task_size += SZ_1G)
- if (munmap((void *)task_size, page_size()))
- break;
-
- return task_size;
-}
-
-#define AT_VECTOR_SIZE 40
-
-typedef UserArmRegsEntry UserRegsEntry;
-
-#define CORE_ENTRY__MARCH CORE_ENTRY__MARCH__ARM
-
-#define CORE_THREAD_ARCH_INFO(core) core->ti_arm
-
-#define TI_SP(core) ((core)->ti_arm->gpregs->sp)
-
-typedef u32 auxv_t;
-typedef u32 tls_t;
-
-static inline void *decode_pointer(u64 v) { return (void*)(u32)v; }
-static inline u64 encode_pointer(void *p) { return (u32)p; }
-
-#endif /* __CR_ASM_TYPES_H__ */
diff --git a/arch/arm/parasite-head.S b/arch/arm/parasite-head.S
deleted file mode 100644
index b15fcbae275b..000000000000
--- a/arch/arm/parasite-head.S
+++ /dev/null
@@ -1,23 +0,0 @@
-#include "asm/linkage.h"
-#include "parasite.h"
-
- .section .head.text, "ax"
-ENTRY(__export_parasite_head_start)
- sub %r2, %pc, #8 @ get the address of this instruction
-
- adr %r0, __export_parasite_cmd
- ldr %r0, [%r0]
-
- adr %r1, parasite_args_ptr
- ldr %r1, [%r1]
- add %r1, %r1, %r2 @ fixup __export_parasite_args
-
- bl parasite_service
- .byte 0xf0, 0x01, 0xf0, 0xe7 @ the instruction UDF #32 generates the signal SIGTRAP in Linux
-
-parasite_args_ptr:
- .long __export_parasite_args
-
-__export_parasite_cmd:
- .long 0
-END(__export_parasite_head_start)
diff --git a/arch/arm/restorer.c b/arch/arm/restorer.c
deleted file mode 100644
index 786feeeb31bd..000000000000
--- a/arch/arm/restorer.c
+++ /dev/null
@@ -1,15 +0,0 @@
-#include <unistd.h>
-
-#include "restorer.h"
-#include "asm/restorer.h"
-#include "asm/string.h"
-
-#include "syscall.h"
-#include "log.h"
-#include "asm/fpu.h"
-#include "cpu.h"
-
-int restore_nonsigframe_gpregs(UserArmRegsEntry *r)
-{
- return 0;
-}
diff --git a/arch/arm/syscall-common.S b/arch/arm/syscall-common.S
deleted file mode 100644
index c3cbf71050f3..000000000000
--- a/arch/arm/syscall-common.S
+++ /dev/null
@@ -1,34 +0,0 @@
-#include "asm/linkage.h"
-
-@ We use the register R8 unlike libc that uses R12.
-@ This avoids corruption of the register by the stub
-@ for the syscall sys_munmap() when syscalls are hooked
-@ by ptrace(). However we have to make sure that
-@ the compiler doesn't use the register on the route
-@ between parasite_service() and sys_munmap().
-
-syscall_common:
- ldr %r7, [%r7]
- add %r8, %sp, #24
- ldm %r8, {%r4, %r5, %r6}
- svc 0x00000000
- pop {%r4, %r5, %r6, %r7, %r8, %pc}
-
-
-.macro syscall name, nr
- .nr_\name :
- .long \nr
-
- ENTRY(\name)
- push {%r4, %r5, %r6, %r7, %r8, %lr}
- adr %r7, .nr_\name
- b syscall_common
- END(\name)
-.endm
-
-
-ENTRY(__cr_restore_rt)
- adr %r7, .nr_sys_rt_sigreturn
- ldr %r7, [%r7]
- svc #0
-END(__cr_restore_rt)
diff --git a/arch/arm/syscall.def b/arch/arm/syscall.def
deleted file mode 100644
index 5d57169acb14..000000000000
--- a/arch/arm/syscall.def
+++ /dev/null
@@ -1,107 +0,0 @@
-#
-# System calls table, please make sure the table consists only of the syscalls
-# really used somewhere in the project.
-#
-# The template is (name and arguments are optional if you need only __NR_x
-# defined, but no real entry point in the syscalls lib).
-#
-# name/alias code64 code32 arguments
-# -----------------------------------------------------------------------
-#
-read 63 3 (int fd, void *buf, unsigned long count)
-write 64 4 (int fd, const void *buf, unsigned long count)
-open ! 5 (const char *filename, unsigned long flags, unsigned long mode)
-close 57 6 (int fd)
-lseek 62 19 (int fd, unsigned long offset, unsigned long origin)
-mmap 222 ! (void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset)
-mprotect 226 125 (const void *addr, unsigned long len, unsigned long prot)
-munmap 215 91 (void *addr, unsigned long len)
-brk 214 45 (void *addr)
-rt_sigaction sigaction 134 174 (int signum, const rt_sigaction_t *act, rt_sigaction_t *oldact, size_t sigsetsize)
-rt_sigprocmask sigprocmask 135 175 (int how, k_rtsigset_t *set, k_rtsigset_t *old, size_t sigsetsize)
-rt_sigreturn 139 173 (void)
-ioctl 29 54 (unsigned int fd, unsigned int cmd, unsigned long arg)
-pread64 67 180 (unsigned int fd, char *buf, size_t count, loff_t pos)
-ptrace 117 26 (long request, pid_t pid, void *addr, void *data)
-mremap 216 163 (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flag, unsigned long new_addr)
-mincore 232 219 (void *addr, unsigned long size, unsigned char *vec)
-madvise 233 220 (unsigned long start, size_t len, int behavior)
-shmat 196 305 (int shmid, void *shmaddr, int shmflag)
-pause 1061 29 (void)
-nanosleep 101 162 (struct timespec *req, struct timespec *rem)
-getitimer 102 105 (int which, const struct itimerval *val)
-setitimer 103 104 (int which, const struct itimerval *val, struct itimerval *old)
-getpid 172 20 (void)
-socket 198 281 (int domain, int type, int protocol)
-connect 203 283 (int sockfd, struct sockaddr *addr, int addrlen)
-sendto 206 290 (int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len)
-recvfrom 207 292 (int sockfd, void *ubuf, size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len)
-sendmsg 211 296 (int sockfd, const struct msghdr *msg, int flags)
-recvmsg 212 297 (int sockfd, struct msghdr *msg, int flags)
-shutdown 210 293 (int sockfd, int how)
-bind 235 282 (int sockfd, const struct sockaddr *addr, int addrlen)
-setsockopt 208 294 (int sockfd, int level, int optname, const void *optval, socklen_t optlen)
-getsockopt 209 295 (int sockfd, int level, int optname, const void *optval, socklen_t *optlen)
-clone 220 120 (unsigned long flags, void *child_stack, void *parent_tid, void *child_tid)
-exit 93 1 (unsigned long error_code)
-wait4 260 114 (int pid, int *status, int options, struct rusage *ru)
-waitid 95 280 (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru)
-kill 129 37 (long pid, int sig)
-fcntl 25 55 (int fd, int type, long arg)
-flock 32 143 (int fd, unsigned long cmd)
-mkdir ! 39 (const char *name, int mode)
-rmdir ! 40 (const char *name)
-unlink ! 10 (char *pathname)
-readlinkat 78 332 (int fd, const char *path, char *buf, int bufsize)
-umask 166 60 (int mask)
-getgroups 158 205 (int gsize, unsigned int *groups)
-setgroups 159 206 (int gsize, unsigned int *groups)
-setresuid 147 164 (int uid, int euid, int suid)
-getresuid 148 165 (int *uid, int *euid, int *suid)
-setresgid 149 170 (int gid, int egid, int sgid)
-getresgid 150 171 (int *gid, int *egid, int *sgid)
-getpgid 155 132 (pid_t pid)
-setfsuid 151 138 (int fsuid)
-setfsgid 152 139 (int fsgid)
-getsid 156 147 (void)
-capget 90 184 (struct cap_header *h, struct cap_data *d)
-capset 91 185 (struct cap_header *h, struct cap_data *d)
-rt_sigqueueinfo 138 178 (pid_t pid, int sig, siginfo_t *info)
-setpriority 140 97 (int which, int who, int nice)
-sched_setscheduler 119 156 (int pid, int policy, struct sched_param *p)
-sigaltstack 132 186 (const void *uss, void *uoss)
-personality 92 136 (unsigned int personality)
-prctl 167 172 (int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5)
-arch_prctl ! 17 (int option, unsigned long addr)
-setrlimit 164 75 (int resource, struct krlimit *rlim)
-mount 40 21 (char *dev_nmae, char *dir_name, char *type, unsigned long flags, void *data)
-umount2 39 52 (char *name, int flags)
-gettid 178 224 (void)
-futex 98 240 (u32 *uaddr, int op, u32 val, struct timespec *utime, u32 *uaddr2, u32 val3)
-set_tid_address 96 256 (int *tid_addr)
-restart_syscall 128 0 (void)
-timer_create 107 257 (clockid_t which_clock, struct sigevent *timer_event_spec, kernel_timer_t *created_timer_id)
-timer_settime 110 258 (kernel_timer_t timer_id, int flags, const struct itimerspec *new_setting, struct itimerspec *old_setting)
-timer_gettime 108 259 (int timer_id, const struct itimerspec *setting)
-timer_getoverrun 109 260 (int timer_id)
-timer_delete 111 261 (kernel_timer_t timer_id)
-clock_gettime 113 263 (const clockid_t which_clock, const struct timespec *tp)
-exit_group 94 248 (int error_code)
-set_robust_list 99 338 (struct robust_list_head *head, size_t len)
-get_robust_list 100 339 (int pid, struct robust_list_head **head_ptr, size_t *len_ptr)
-signalfd4 74 355 (int fd, k_rtsigset_t *mask, size_t sizemask, int flags)
-rt_tgsigqueueinfo 240 363 (pid_t tgid, pid_t pid, int sig, siginfo_t *info)
-vmsplice 75 343 (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags)
-timerfd_settime 86 353 (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr)
-fanotify_init 262 367 (unsigned int flags, unsigned int event_f_flags)
-fanotify_mark 263 368 (int fanotify_fd, unsigned int flags, u64 mask, int dfd, const char *pathname)
-open_by_handle_at 265 371 (int mountdirfd, struct file_handle *handle, int flags)
-setns 268 375 (int fd, int nstype)
-kcmp 272 378 (pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2)
-openat 56 322 (int dirfd, const char *pathname, int flags, mode_t mode)
-mkdirat 34 323 (int dirfd, const char *pathname, mode_t mode)
-unlinkat 35 328 (int dirfd, const char *pathname, int flags)
-memfd_create 279 385 (const char *name, unsigned int flags)
-io_setup 0 243 (unsigned nr_events, aio_context_t *ctx)
-io_getevents 4 245 (aio_context_t ctx, long min_nr, long nr, struct io_event *evs, struct timespec *tmo)
-seccomp 277 383 (unsigned int op, unsigned int flags, const char *uargs)
diff --git a/arch/arm/uidiv.S b/arch/arm/uidiv.S
deleted file mode 100644
index e77f6100c784..000000000000
--- a/arch/arm/uidiv.S
+++ /dev/null
@@ -1,186 +0,0 @@
-.globl __aeabi_uidiv
-
-work .req r4 @ XXXX is this safe ?
-dividend .req r0
-divisor .req r1
-overdone .req r2
-result .req r2
-curbit .req r3
-
-#define LSYM(x) x
-
-.macro THUMB_DIV_MOD_BODY modulo
- @ Load the constant 0x10000000 into our work register.
- mov work, #1
- lsl work, #28
-LSYM(Loop1):
- @ Unless the divisor is very big, shift it up in multiples of
- @ four bits, since this is the amount of unwinding in the main
- @ division loop. Continue shifting until the divisor is
- @ larger than the dividend.
- cmp divisor, work
- bhs LSYM(Lbignum)
- cmp divisor, dividend
- bhs LSYM(Lbignum)
- lsl divisor, #4
- lsl curbit, #4
- b LSYM(Loop1)
-LSYM(Lbignum):
- @ Set work to 0x80000000
- lsl work, #3
-LSYM(Loop2):
- @ For very big divisors, we must shift it a bit at a time, or
- @ we will be in danger of overflowing.
- cmp divisor, work
- bhs LSYM(Loop3)
- cmp divisor, dividend
- bhs LSYM(Loop3)
- lsl divisor, #1
- lsl curbit, #1
- b LSYM(Loop2)
-LSYM(Loop3):
- @ Test for possible subtractions ...
- .if \modulo
- @ ... On the final pass, this may subtract too much from the dividend,
- @ so keep track of which subtractions are done, we can fix them up
- @ afterwards.
- mov overdone, #0
- cmp dividend, divisor
- blo LSYM(Lover1)
- sub dividend, dividend, divisor
-LSYM(Lover1):
- lsr work, divisor, #1
- cmp dividend, work
- blo LSYM(Lover2)
- sub dividend, dividend, work
- mov ip, curbit
- mov work, #1
- ror curbit, work
- orr overdone, curbit
- mov curbit, ip
-LSYM(Lover2):
- lsr work, divisor, #2
- cmp dividend, work
- blo LSYM(Lover3)
- sub dividend, dividend, work
- mov ip, curbit
- mov work, #2
- ror curbit, work
- orr overdone, curbit
- mov curbit, ip
-LSYM(Lover3):
- lsr work, divisor, #3
- cmp dividend, work
- blo LSYM(Lover4)
- sub dividend, dividend, work
- mov ip, curbit
- mov work, #3
- ror curbit, work
- orr overdone, curbit
- mov curbit, ip
-LSYM(Lover4):
- mov ip, curbit
- .else
- @ ... and note which bits are done in the result. On the final pass,
- @ this may subtract too much from the dividend, but the result will be ok,
- @ since the "bit" will have been shifted out at the bottom.
- cmp dividend, divisor
- blo LSYM(Lover1)
- sub dividend, dividend, divisor
- orr result, result, curbit
-LSYM(Lover1):
- lsr work, divisor, #1
- cmp dividend, work
- blo LSYM(Lover2)
- sub dividend, dividend, work
- lsr work, curbit, #1
- orr result, work
-LSYM(Lover2):
- lsr work, divisor, #2
- cmp dividend, work
- blo LSYM(Lover3)
- sub dividend, dividend, work
- lsr work, curbit, #2
- orr result, work
-LSYM(Lover3):
- lsr work, divisor, #3
- cmp dividend, work
- blo LSYM(Lover4)
- sub dividend, dividend, work
- lsr work, curbit, #3
- orr result, work
-LSYM(Lover4):
- .endif
-
- cmp dividend, #0 @ Early termination?
- beq LSYM(Lover5)
- lsr curbit, #4 @ No, any more bits to do?
- beq LSYM(Lover5)
- lsr divisor, #4
- b LSYM(Loop3)
-LSYM(Lover5):
- .if \modulo
- @ Any subtractions that we should not have done will be recorded in
- @ the top three bits of "overdone". Exactly which were not needed
- @ are governed by the position of the bit, stored in ip.
- mov work, #0xe
- lsl work, #28
- and overdone, work
- beq LSYM(Lgot_result)
-
- @ If we terminated early, because dividend became zero, then the
- @ bit in ip will not be in the bottom nibble, and we should not
- @ perform the additions below. We must test for this though
- @ (rather relying upon the TSTs to prevent the additions) since
- @ the bit in ip could be in the top two bits which might then match
- @ with one of the smaller RORs.
- mov curbit, ip
- mov work, #0x7
- tst curbit, work
- beq LSYM(Lgot_result)
-
- mov curbit, ip
- mov work, #3
- ror curbit, work
- tst overdone, curbit
- beq LSYM(Lover6)
- lsr work, divisor, #3
- add dividend, work
-LSYM(Lover6):
- mov curbit, ip
- mov work, #2
- ror curbit, work
- tst overdone, curbit
- beq LSYM(Lover7)
- lsr work, divisor, #2
- add dividend, work
-LSYM(Lover7):
- mov curbit, ip
- mov work, #1
- ror curbit, work
- tst overdone, curbit
- beq LSYM(Lgot_result)
- lsr work, divisor, #1
- add dividend, work
- .endif
-LSYM(Lgot_result):
-.endm
-
-
- .thumb
- .text
-
-__aeabi_uidiv:
- mov curbit, #1
- mov result, #0
-
- push { work }
- cmp dividend, divisor
- blo LSYM(Lgot_result)
-
- THUMB_DIV_MOD_BODY 0
-
- mov r0, result
- pop { work }
-
- bx lr
diff --git a/arch/ppc64/Makefile b/arch/ppc64/Makefile
deleted file mode 100644
index c5d332364aa2..000000000000
--- a/arch/ppc64/Makefile
+++ /dev/null
@@ -1,55 +0,0 @@
-targets += syscalls
-targets += crtools
-
-SYS-ASM := syscalls.S
-
-syscalls-asm-y += $(SYS-ASM:.S=).o
-crtools-obj-y += crtools.o
-crtools-obj-y += cpu.o
-
-SYS-DEF := syscall-ppc64.def
-SYS-ASM-COMMON := syscall-common-ppc64.S
-
-SYS-TYPES := include/syscall-types.h
-SYS-CODES := include/syscall-codes.h
-SYS-PROTO := include/syscall.h
-
-SYS-GEN := syscalls-ppc64.sh
-
-SYS-EXEC-TBL := sys-exec-tbl.c
-
-syscalls-asm-y-asmflags := -fpie -Wstrict-prototypes -Wa,--noexecstack
-syscalls-asm-y-asmflags += -nostdlib -fomit-frame-pointer -I$(obj)
-
-ASMFLAGS += -D__ASSEMBLY__
-
-$(obj)/$(SYS-ASM): $(obj)/$(SYS-GEN) $(obj)/$(SYS-DEF) $(obj)/$(SYS-ASM-COMMON) $(SYS-TYPES)
- $(E) " GEN " $@
- $(Q) $(SH) \
- $(obj)/$(SYS-GEN) --asm \
- $(obj)/$(SYS-DEF) \
- $(SYS-CODES) \
- $(SYS-PROTO) \
- $(obj)/$(SYS-ASM) \
- $(SYS-ASM-COMMON) \
- $(SYS-TYPES)
-
-$(obj)/syscalls.o: $(obj)/$(SYS-ASM)
-
-$(obj)/$(SYS-EXEC-TBL): $(obj)/$(SYS-GEN) $(obj)/$(SYS-DEF)
- $(E) " GEN " $@
- $(Q) $(SH) \
- $(obj)/$(SYS-GEN) --exec \
- $(obj)/$(SYS-DEF) \
- $(obj)/$(SYS-EXEC-TBL)
-
-_all += $(obj)/$(SYS-EXEC-TBL)
-
-cleanup-y += $(obj)/$(SYS-EXEC-TBL) $(obj)/$(SYS-ASM)
-cleanup-y += $(SYS-CODES)
-cleanup-y += $(SYS-PROTO)
-
-ifneq ($(MAKECMDGOALS),clean)
-deps-after := $(obj)/$(SYS-ASM)
-incdeps := y
-endif
diff --git a/arch/ppc64/cpu.c b/arch/ppc64/cpu.c
deleted file mode 100644
index d84a782c9047..000000000000
--- a/arch/ppc64/cpu.c
+++ /dev/null
@@ -1,149 +0,0 @@
-#undef LOG_PREFIX
-#define LOG_PREFIX "cpu: "
-
-#include <sys/auxv.h>
-#include <errno.h>
-#include <asm/cputable.h>
-
-#include "asm/types.h"
-#include "asm/cpu.h"
-
-#include "cr_options.h"
-#include "proc_parse.h"
-#include "util.h"
-#include "log.h"
-#include "cpu.h"
-
-#include "protobuf.h"
-#include "protobuf/cpuinfo.pb-c.h"
-
-static uint64_t hwcap[2];
-
-#ifdef __LITTLE_ENDIAN__
-#define CURRENT_ENDIANNESS CPUINFO_PPC64_ENTRY__ENDIANNESS__LITTLEENDIAN
-#else
-#define CURRENT_ENDIANNESS CPUINFO_PPC64_ENTRY__ENDIANESS__BIGENDIAN
-#endif
-
-int cpu_init(void)
-{
- hwcap[0] = getauxval(AT_HWCAP);
- hwcap[1] = getauxval(AT_HWCAP2);
-
- if (!hwcap[0] || !hwcap[1]) {
- pr_err("Can't read the hardware capabilities");
- return -1;
- }
- return 0;
-}
-
-int cpu_dump_cpuinfo(void)
-{
- CpuinfoEntry cpu_info = CPUINFO_ENTRY__INIT;
- CpuinfoPpc64Entry cpu_ppc64_info = CPUINFO_PPC64_ENTRY__INIT;
- CpuinfoPpc64Entry *cpu_ppc64_info_ptr = &cpu_ppc64_info;
- struct cr_img *img;
- int ret = -1;
-
- img = open_image(CR_FD_CPUINFO, O_DUMP);
- if (!img)
- return -1;
-
- cpu_info.ppc64_entry = &cpu_ppc64_info_ptr;
- cpu_info.n_ppc64_entry = 1;
-
- cpu_ppc64_info.endian = CURRENT_ENDIANNESS;
- cpu_ppc64_info.n_hwcap = 2;
- cpu_ppc64_info.hwcap = hwcap;
-
- ret = pb_write_one(img, &cpu_info, PB_CPUINFO);
-
- close_image(img);
- return ret;
-}
-
-int cpu_validate_cpuinfo(void)
-{
- CpuinfoEntry *cpu_info;
- CpuinfoPpc64Entry *cpu_ppc64_entry;
- struct cr_img *img;
- int ret = -1;
- img = open_image(CR_FD_CPUINFO, O_RSTR);
- if (!img)
- return -1;
-
- if (pb_read_one(img, &cpu_info, PB_CPUINFO) < 0)
- goto error;
-
- if (cpu_info->n_ppc64_entry != 1) {
- pr_err("No PPC64 related entry in image");
- goto error;
- }
- cpu_ppc64_entry = cpu_info->ppc64_entry[0];
-
- if (cpu_ppc64_entry->endian != CURRENT_ENDIANNESS) {
- pr_err("Bad endianness");
- goto error;
- }
-
- if (cpu_ppc64_entry->n_hwcap != 2) {
- pr_err("Hardware capabilities information missing\n");
- goto error;
- }
-
-#define CHECK_FEATURE(s,f) do { \
- if ((cpu_ppc64_entry->hwcap[s] & f) && !(hwcap[s] & f)) { \
- pr_err("CPU Feature %s required by image " \
- "is not supported on host.\n", #f); \
- goto error; \
- } \
- } while(0)
-
-#define REQUIRE_FEATURE(s,f) do { \
- if (!(cpu_ppc64_entry->hwcap[s] & f)) { \
- pr_err("CPU Feature %s missing in image.\n", #f); \
- goto error; \
- } \
- } while(0)
-
- REQUIRE_FEATURE(0, PPC_FEATURE_64);
- REQUIRE_FEATURE(0, PPC_FEATURE_HAS_FPU);
- REQUIRE_FEATURE(0, PPC_FEATURE_HAS_MMU);
- REQUIRE_FEATURE(0, PPC_FEATURE_HAS_VSX);
- REQUIRE_FEATURE(1, PPC_FEATURE2_ARCH_2_07);
-
- CHECK_FEATURE(0, PPC_FEATURE_TRUE_LE);
- CHECK_FEATURE(1, PPC_FEATURE2_HTM);
- CHECK_FEATURE(1, PPC_FEATURE2_DSCR);
- CHECK_FEATURE(1, PPC_FEATURE2_EBB);
- CHECK_FEATURE(1, PPC_FEATURE2_ISEL);
- CHECK_FEATURE(1, PPC_FEATURE2_TAR);
- CHECK_FEATURE(1, PPC_FEATURE2_VEC_CRYPTO);
-
- ret = 0;
-error:
- close_image(img);
- return ret;
-}
-
-int cpuinfo_dump(void)
-{
- if (cpu_init())
- return -1;
-
- if (cpu_dump_cpuinfo())
- return -1;
-
- return 0;
-}
-
-int cpuinfo_check(void)
-{
- if (cpu_init())
- return -1;
-
- if (cpu_validate_cpuinfo())
- return 1;
-
- return 0;
-}
diff --git a/arch/ppc64/crtools.c b/arch/ppc64/crtools.c
deleted file mode 100644
index b1f4f6381591..000000000000
--- a/arch/ppc64/crtools.c
+++ /dev/null
@@ -1,524 +0,0 @@
-#include <string.h>
-#include <unistd.h>
-#include <elf.h>
-#include <sys/user.h>
-#include <asm/unistd.h>
-
-#include "asm/types.h"
-#include "asm/fpu.h"
-#include "asm/restorer.h"
-
-#include "cr_options.h"
-#include "compiler.h"
-#include "ptrace.h"
-#include "parasite-syscall.h"
-#include "log.h"
-#include "util.h"
-#include "cpu.h"
-#include "errno.h"
-
-#include "protobuf.h"
-#include "protobuf/core.pb-c.h"
-#include "protobuf/creds.pb-c.h"
-
-#define MSR_VEC (1<<25)
-#define MSR_VSX (1<<23)
-
-/*
- * Injected syscall instruction
- */
-const u32 code_syscall[] = {
- 0x44000002, /* sc */
- 0x0fe00000 /* twi 31,0,0 */
-};
-
-const int code_syscall_size = sizeof(code_syscall);
-
-static inline void __check_code_syscall(void)
-{
- BUILD_BUG_ON(sizeof(code_syscall) != BUILTIN_SYSCALL_SIZE);
- BUILD_BUG_ON(!is_log2(sizeof(code_syscall)));
-}
-
-void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs)
-{
- /*
- * OpenPOWER ABI requires that r12 is set to the calling function address
- * to compute the TOC pointer.
- */
- regs->gpr[12] = new_ip;
- regs->nip = new_ip;
- if (stack)
- regs->gpr[1] = (unsigned long) stack;
- regs->trap = 0;
-}
-
-bool arch_can_dump_task(pid_t pid)
-{
- /*
- * TODO: We should detect 32bit task when BE support is done.
- */
- return true;
-}
-
-int syscall_seized(struct parasite_ctl *ctl, int nr, unsigned long *ret,
- unsigned long arg1,
- unsigned long arg2,
- unsigned long arg3,
- unsigned long arg4,
- unsigned long arg5,
- unsigned long arg6)
-{
- user_regs_struct_t regs = ctl->orig.regs;
- int err;
-
- regs.gpr[0] = (unsigned long)nr;
- regs.gpr[3] = arg1;
- regs.gpr[4] = arg2;
- regs.gpr[5] = arg3;
- regs.gpr[6] = arg4;
- regs.gpr[7] = arg5;
- regs.gpr[8] = arg6;
-
- err = __parasite_execute_syscall(ctl, ®s);
-
- *ret = regs.gpr[3];
- return err;
-}
-
-/* This is the layout of the POWER7 VSX registers and the way they
- * overlap with the existing FPR and VMX registers.
- *
- * VSR doubleword 0 VSR doubleword 1
- * ----------------------------------------------------------------
- * VSR[0] | FPR[0] | |
- * ----------------------------------------------------------------
- * VSR[1] | FPR[1] | |
- * ----------------------------------------------------------------
- * | ... | |
- * ----------------------------------------------------------------
- * VSR[30] | FPR[30] | |
- * ----------------------------------------------------------------
- * VSR[31] | FPR[31] | |
- * ----------------------------------------------------------------
- * VSR[32] | VR[0] |
- * ----------------------------------------------------------------
- * VSR[33] | VR[1] |
- * ----------------------------------------------------------------
- * | ... |
- * ----------------------------------------------------------------
- * VSR[62] | VR[30] |
- * ----------------------------------------------------------------
- * VSR[63] | VR[31] |
- * ----------------------------------------------------------------
- *
- * PTRACE_GETFPREGS returns FPR[0..31] + FPSCR
- * PTRACE_GETVRREGS returns VR[0..31] + VSCR + VRSAVE
- * PTRACE_GETVSRREGS returns VSR[0..31]
- *
- * PTRACE_GETVSRREGS and PTRACE_GETFPREGS are required since we need
- * to save FPSCR too.
- */
-static int get_fpu_regs(pid_t pid, CoreEntry *core)
-{
- uint64_t fpregs[NFPREG];
- UserPpc64FpstateEntry *fpe;
- int i;
-
- if (ptrace(PTRACE_GETFPREGS, pid, 0, (void *)&fpregs) < 0) {
- pr_perror("Couldn't get floating-point registers");
- return -1;
- }
-
- fpe = xmalloc(sizeof(UserPpc64FpstateEntry));
- if (!fpe)
- return -1;
- user_ppc64_fpstate_entry__init(fpe);
-
- fpe->n_fpregs = NFPREG;
- fpe->fpregs = xmalloc(fpe->n_fpregs * sizeof(fpe->fpregs[0]));
- if (!fpe->fpregs) {
- xfree(fpe);
- return -1;
- }
-
- /* FPSCR is the last (33rd) register in the set */
- for (i = 0; i < NFPREG; i++)
- fpe->fpregs[i] = fpregs[i];
-
- core->ti_ppc64->fpstate = fpe;
- return 0;
-}
-
-static void put_fpu_regs(mcontext_t *mc, UserPpc64FpstateEntry *fpe)
-{
- int i;
- uint64_t *mcfp = (uint64_t *)mc->fp_regs;
-
- for (i = 0; i < fpe->n_fpregs; i++)
- mcfp[i] = fpe->fpregs[i];
-}
-
-static int get_altivec_regs(pid_t pid, CoreEntry *core)
-{
- /* The kernel returns :
- * 32 Vector registers (128bit)
- * VSCR (32bit) stored in a 128bit entry (odd)
- * VRSAVE (32bit) stored at the end.
- *
- * Kernel setup_sigcontext's comment mentions:
- * "Userland shall check AT_HWCAP to know whether it can rely on the
- * v_regs pointer or not"
- */
- unsigned char vrregs[33 * 16 + 4];
- UserPpc64VrstateEntry *vse;
- uint64_t *p64;
- uint32_t *p32;
- int i;
-
- if (ptrace(PTRACE_GETVRREGS, pid, 0, (void*)&vrregs) < 0) {
- /* PTRACE_GETVRREGS returns EIO if Altivec is not supported.
- * This should not happen if msr_vec is set. */
- if (errno != EIO) {
- pr_perror("Couldn't get Altivec registers");
- return -1;
- }
- pr_debug("Altivec not supported\n");
- return 0;
- }
-
- pr_debug("Dumping Altivec registers\n");
-
- vse = xmalloc(sizeof(*vse));
- if (!vse)
- return -1;
- user_ppc64_vrstate_entry__init(vse);
-
- vse->n_vrregs = 33 * 2; /* protocol buffer store 64bit entries */
- vse->vrregs = xmalloc(vse->n_vrregs * sizeof(vse->vrregs[0]));
- if (!vse->vrregs) {
- xfree(vse);
- return -1;
- }
-
- /* Vectors are 2*64bits entries */
- for (i = 0; i < 33; i++) {
- p64 = (uint64_t*) &vrregs[i * 2 * sizeof(uint64_t)];
- vse->vrregs[i*2] = p64[0];
- vse->vrregs[i*2 + 1] = p64[1];
- }
-
- p32 = (uint32_t*) &vrregs[33 * 2 * sizeof(uint64_t)];
- vse->vrsave = *p32;
-
- core->ti_ppc64->vrstate = vse;
-
- /*
- * Force the MSR_VEC bit of the restored MSR otherwise the kernel
- * will not restore them from the signal frame.
- */
- core->ti_ppc64->gpregs->msr |= MSR_VEC;
-
- return 0;
-}
-
-static int put_altivec_regs(mcontext_t *mc, UserPpc64VrstateEntry *vse)
-{
- vrregset_t *v_regs = (vrregset_t *)(((unsigned long)mc->vmx_reserve + 15) & ~0xful);
-
- pr_debug("Restoring Altivec registers\n");
-
- if (vse->n_vrregs != 33*2) {
- pr_err("Corrupted Altivec dump data\n");
- return -1;
- }
-
- /* Note that this should only be done in the case MSR_VEC is set but
- * this is not a big deal to do that in all cases.
- */
- memcpy(&v_regs->vrregs[0][0], vse->vrregs, sizeof(uint64_t) * 2 * 33);
- /* vscr has been restored with the previous memcpy which copied 32
- * 128bits registers + a 128bits field containing the vscr value in
- * the low part.
- */
-
- v_regs->vrsave = vse->vrsave;
- mc->v_regs = v_regs;
-
- return 0;
-}
-
-/*
- * Since the FPR[0-31] is stored in the first double word of VSR[0-31] and
- * FPR are saved through the FP state, there is no need to save the upper part
- * of the first 32 VSX registers.
- * Furthermore, the 32 last VSX registers are also the 32 Altivec registers
- * already saved, so no need to save them.
- * As a consequence, only the doubleword 1 of the 32 first VSX registers have
- * to be saved (the ones returned by PTRACE_GETVSRREGS).
- */
-static int get_vsx_regs(pid_t pid, CoreEntry *core)
-{
- UserPpc64VsxstateEntry *vse;
- uint64_t vsregs[32];
- int i;
-
- if (ptrace(PTRACE_GETVSRREGS, pid, 0, (void*)&vsregs) < 0) {
- /*
- * EIO is returned in the case PTRACE_GETVSRREGS is not
- * supported.
- */
- if (errno == EIO) {
- pr_debug("VSX register's dump not supported.\n");
- return 0;
- }
- pr_perror("Couldn't get VSX registers");
- return -1;
- }
-
- pr_debug("Dumping VSX registers\n");
-
- vse = xmalloc(sizeof(*vse));
- if (!vse)
- return -1;
- user_ppc64_vsxstate_entry__init(vse);
-
- vse->n_vsxregs = 32;
- vse->vsxregs = xmalloc(vse->n_vsxregs * sizeof(vse->vsxregs[0]));
- if (!vse->vsxregs) {
- xfree(vse);
- return -1;
- }
-
- for (i = 0; i < vse->n_vsxregs; i++)
- vse->vsxregs[i] = vsregs[i];
-
- core->ti_ppc64->vsxstate = vse;
-
- /*
- * Force the MSR_VSX bit of the restored MSR otherwise the kernel
- * will not restore them from the signal frame.
- */
- core->ti_ppc64->gpregs->msr |= MSR_VSX;
- return 0;
-}
-
-static int put_vsx_regs(mcontext_t *mc, UserPpc64VsxstateEntry *vse)
-{
- uint64_t *buf;
- int i;
-
- pr_debug("Restoring VSX registers\n");
- if (!mc->v_regs) {
- /* VSX implies Altivec so v_regs should be set */
- pr_err("Internal error\n");
- return -1;
- }
-
- /* point after the Altivec registers */
- buf = (uint64_t*) (mc->v_regs + 1);
-
- /* Copy the value saved by get_vsx_regs in the sigframe */
- for (i=0; i<vse->n_vsxregs; i++)
- buf[i] = vse->vsxregs[i];
-
- return 0;
-}
-
-int get_task_regs(pid_t pid, user_regs_struct_t regs, CoreEntry *core)
-{
- int i;
-
- pr_info("Dumping GP/FPU registers for %d\n", pid);
-
- /*
- * This is inspired by kernel function check_syscall_restart in
- * arch/powerpc/kernel/signal.c
- */
-#ifndef TRAP
-#define TRAP(r) ((r).trap & ~0xF)
-#endif
-
- if (TRAP(regs) == 0x0C00 && regs.ccr & 0x10000000) {
- /* Restart the system call */
- switch (regs.gpr[3]) {
- case ERESTARTNOHAND:
- case ERESTARTSYS:
- case ERESTARTNOINTR:
- regs.gpr[3] = regs.orig_gpr3;
- regs.nip -= 4;
- break;
- case ERESTART_RESTARTBLOCK:
- regs.gpr[0] = __NR_restart_syscall;
- regs.nip -= 4;
- break;
- }
- }
-
- /* Resetting trap since we are now coming from user space. */
- regs.trap = 0;
-
-#define assign_reg(dst, src, e) do { \
- dst->e = (__typeof__(dst->e))src.e; \
-} while (0)
-
- for (i=0; i<32; i++)
- assign_reg(core->ti_ppc64->gpregs, regs, gpr[i]);
-
- assign_reg(core->ti_ppc64->gpregs, regs, nip);
- assign_reg(core->ti_ppc64->gpregs, regs, msr);
- assign_reg(core->ti_ppc64->gpregs, regs, orig_gpr3);
- assign_reg(core->ti_ppc64->gpregs, regs, ctr);
- assign_reg(core->ti_ppc64->gpregs, regs, link);
- assign_reg(core->ti_ppc64->gpregs, regs, xer);
- assign_reg(core->ti_ppc64->gpregs, regs, ccr);
- assign_reg(core->ti_ppc64->gpregs, regs, trap);
-#undef assign_reg
-
- if (get_fpu_regs(pid, core))
- return -1;
-
- if (get_altivec_regs(pid, core))
- return -1;
-
- /*
- * Don't save the VSX registers if Altivec registers are not
- * supported
- */
- if (CORE_THREAD_ARCH_INFO(core)->vrstate && get_vsx_regs(pid, core))
- return -1;
-
- return 0;
-}
-
-int arch_alloc_thread_info(CoreEntry *core)
-{
- ThreadInfoPpc64 *ti_ppc64;
- UserPpc64RegsEntry *regs;
-
- ti_ppc64 = xmalloc(sizeof(*ti_ppc64));
- if(!ti_ppc64)
- goto err;
- thread_info_ppc64__init(ti_ppc64);
- CORE_THREAD_ARCH_INFO(core) = ti_ppc64;
-
- /* user_ppc64_regs_entry */
- regs = xmalloc(sizeof(*regs));
- if (!regs)
- goto err;
- user_ppc64_regs_entry__init(regs);
-
- regs->gpr = xmalloc(32*sizeof(uint64_t));
- if (!regs->gpr)
- goto err;
- regs->n_gpr = 32;
-
- ti_ppc64->gpregs = regs;
-
- return 0;
-err:
- return -1;
-}
-
-void arch_free_thread_info(CoreEntry *core)
-{
- if (CORE_THREAD_ARCH_INFO(core)) {
- if (CORE_THREAD_ARCH_INFO(core)->fpstate) {
- xfree(CORE_THREAD_ARCH_INFO(core)->fpstate->fpregs);
- xfree(CORE_THREAD_ARCH_INFO(core)->fpstate);
- }
- if (CORE_THREAD_ARCH_INFO(core)->vrstate) {
- xfree(CORE_THREAD_ARCH_INFO(core)->vrstate->vrregs);
- xfree(CORE_THREAD_ARCH_INFO(core)->vrstate);
- }
- if (CORE_THREAD_ARCH_INFO(core)->vsxstate) {
- xfree(CORE_THREAD_ARCH_INFO(core)->vsxstate->vsxregs);
- xfree(CORE_THREAD_ARCH_INFO(core)->vsxstate);
- }
- xfree(CORE_THREAD_ARCH_INFO(core)->gpregs->gpr);
- xfree(CORE_THREAD_ARCH_INFO(core)->gpregs);
- xfree(CORE_THREAD_ARCH_INFO(core));
- CORE_THREAD_ARCH_INFO(core) = NULL;
- }
-}
-
-int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core)
-{
- int ret = 0;
- if (CORE_THREAD_ARCH_INFO(core)->fpstate)
- put_fpu_regs(&sigframe->uc.uc_mcontext,
- CORE_THREAD_ARCH_INFO(core)->fpstate);
-
- if (CORE_THREAD_ARCH_INFO(core)->vrstate)
- ret = put_altivec_regs(&sigframe->uc.uc_mcontext,
- CORE_THREAD_ARCH_INFO(core)->vrstate);
- else if (core->ti_ppc64->gpregs->msr & MSR_VEC) {
- pr_err("Internal error\n");
- ret = -1;
- }
-
- if (!ret && CORE_THREAD_ARCH_INFO(core)->vsxstate)
- ret = put_vsx_regs(&sigframe->uc.uc_mcontext,
- CORE_THREAD_ARCH_INFO(core)->vsxstate);
- else if (core->ti_ppc64->gpregs->msr & MSR_VSX) {
- pr_err("Internal error\n");
- ret = -1;
- }
-
- return ret;
-}
-
-/*
- * The signal frame has been built using local addresses. Since it has to be
- * used in the context of the checkpointed process, the v_regs pointer in the
- * signal frame must be updated to match the address in the remote stack.
- */
-int sigreturn_prep_fpu_frame(struct rt_sigframe *frame, mcontext_t *rcontext)
-{
- mcontext_t *lcontext = &frame->uc.uc_mcontext;
-
- if (lcontext->v_regs) {
- uint64_t offset = (uint64_t)(lcontext->v_regs) - (uint64_t)lcontext;
- lcontext->v_regs = (vrregset_t *)((uint64_t)rcontext + offset);
-
- pr_debug("Updated v_regs:%llx (rcontext:%llx)\n",
- (unsigned long long) lcontext->v_regs,
- (unsigned long long) rcontext);
- }
- return 0;
-}
-
-int restore_gpregs(struct rt_sigframe *f, UserPpc64RegsEntry *r)
-{
- int i;
-
- /* r0 to r31 */
- for (i=0; i<32; i++)
- f->uc.uc_mcontext.gp_regs[i] = r->gpr[i];
-
- f->uc.uc_mcontext.gp_regs[PT_NIP] = r->nip;
- f->uc.uc_mcontext.gp_regs[PT_MSR] = r->msr;
- f->uc.uc_mcontext.gp_regs[PT_ORIG_R3] = r->orig_gpr3;
- f->uc.uc_mcontext.gp_regs[PT_CTR] = r->ctr;
- f->uc.uc_mcontext.gp_regs[PT_LNK] = r->link;
- f->uc.uc_mcontext.gp_regs[PT_XER] = r->xer;
- f->uc.uc_mcontext.gp_regs[PT_CCR] = r->ccr;
- f->uc.uc_mcontext.gp_regs[PT_TRAP] = r->trap;
-
- return 0;
-}
-
-void *mmap_seized(struct parasite_ctl *ctl,
- void *addr, size_t length, int prot,
- int flags, int fd, off_t offset)
-{
- unsigned long map = 0;
- int err;
-
- err = syscall_seized(ctl, __NR_mmap, &map,
- (unsigned long)addr, length, prot, flags, fd, offset);
- if (err < 0 || (long)map < 0)
- map = 0;
-
- return (void *)map;
-}
diff --git a/arch/ppc64/include/asm/atomic.h b/arch/ppc64/include/asm/atomic.h
deleted file mode 100644
index 4fa33b1c7005..000000000000
--- a/arch/ppc64/include/asm/atomic.h
+++ /dev/null
@@ -1,112 +0,0 @@
-#ifndef __CR_ATOMIC_H__
-#define __CR_ATOMIC_H__
-
-/*
- * PowerPC atomic operations
- *
- * Copied from kernel header file arch/powerpc/include/asm/atomic.h
- */
-
-typedef struct {
- int counter;
-} atomic_t;
-
-#include "asm/cmpxchg.h"
-
-#define PPC_ATOMIC_ENTRY_BARRIER "lwsync \n"
-#define PPC_ATOMIC_EXIT_BARRIER "sync \n"
-
-#define ATOMIC_INIT(i) { (i) }
-
-static __inline__ int atomic_read(const atomic_t *v)
-{
- int t;
-
- __asm__ __volatile__("lwz%U1%X1 %0,%1" : "=r"(t) : "m"(v->counter));
-
- return t;
-}
-
-static __inline__ void atomic_set(atomic_t *v, int i)
-{
- __asm__ __volatile__("stw%U0%X0 %1,%0" : "=m"(v->counter) : "r"(i));
-}
-
-#define ATOMIC_OP(op, asm_op) \
-static __inline__ void atomic_##op(int a, atomic_t *v) \
-{ \
- int t; \
- \
- __asm__ __volatile__( \
-"1: lwarx %0,0,%3 # atomic_" #op "\n" \
- #asm_op " %0,%2,%0\n" \
-" stwcx. %0,0,%3 \n" \
-" bne- 1b\n" \
- : "=&r" (t), "+m" (v->counter) \
- : "r" (a), "r" (&v->counter) \
- : "cc"); \
-} \
-
-ATOMIC_OP(add, add)
-ATOMIC_OP(sub, subf)
-
-#undef ATOMIC_OP
-
-static __inline__ void atomic_inc(atomic_t *v)
-{
- int t;
-
- __asm__ __volatile__(
-"1: lwarx %0,0,%2 # atomic_inc\n\
- addic %0,%0,1\n"
-" stwcx. %0,0,%2 \n\
- bne- 1b"
- : "=&r" (t), "+m" (v->counter)
- : "r" (&v->counter)
- : "cc", "xer");
-}
-
-static __inline__ int atomic_inc_return(atomic_t *v)
-{
- int t;
-
- __asm__ __volatile__(
- PPC_ATOMIC_ENTRY_BARRIER \
-"1: lwarx %0,0,%1 # atomic_inc_return\n\
- addic %0,%0,1\n"
-" stwcx. %0,0,%1 \n\
- bne- 1b \n" \
- PPC_ATOMIC_EXIT_BARRIER
- : "=&r" (t)
- : "r" (&v->counter)
- : "cc", "xer", "memory");
-
- return t;
-}
-
-/*
- * atomic_inc_and_test - increment and test
- * @v: pointer of type atomic_t
- *
- * Atomically increments @v by 1
- * and returns true if the result is zero, or false for all
- * other cases.
- */
-
-static __inline__ void atomic_dec(atomic_t *v)
-{
- int t;
-
- __asm__ __volatile__(
-"1: lwarx %0,0,%2 # atomic_dec\n\
- addic %0,%0,-1\n"
-" stwcx. %0,0,%2\n\
- bne- 1b"
- : "=&r" (t), "+m" (v->counter)
- : "r" (&v->counter)
- : "cc", "xer");
-}
-
-#define atomic_cmpxchg(v, o, n) (cmpxchg(&((v)->counter), (o), (n)))
-
-#endif /* __CR_ATOMIC_H__ */
diff --git a/arch/ppc64/include/asm/bitops.h b/arch/ppc64/include/asm/bitops.h
deleted file mode 100644
index 910971981ca9..000000000000
--- a/arch/ppc64/include/asm/bitops.h
+++ /dev/null
@@ -1,174 +0,0 @@
-#ifndef __CR_BITOPS_H__
-#define __CR_BITOPS_H__
-/*
- * PowerPC atomic bit operations.
- *
- * Merged version by David Gibson <david at gibson.dropbear.id.au>.
- * Based on ppc64 versions by: Dave Engebretsen, Todd Inglett, Don
- * Reed, Pat McCarthy, Peter Bergner, Anton Blanchard. They
- * originally took it from the ppc32 code.
- *
- * Within a word, bits are numbered LSB first. Lot's of places make
- * this assumption by directly testing bits with (val & (1<<nr)).
- * This can cause confusion for large (> 1 word) bitmaps on a
- * big-endian system because, unlike little endian, the number of each
- * bit depends on the word size.
- *
- * The bitop functions are defined to work on unsigned longs, so for a
- * ppc64 system the bits end up numbered:
- * |63..............0|127............64|191...........128|255...........192|
- * and on ppc32:
- * |31.....0|63....32|95....64|127...96|159..128|191..160|223..192|255..224|
- *
- * There are a few little-endian macros used mostly for filesystem
- * bitmaps, these work on similar bit arrays layouts, but
- * byte-oriented:
- * |7...0|15...8|23...16|31...24|39...32|47...40|55...48|63...56|
- *
- * The main difference is that bit 3-5 (64b) or 3-4 (32b) in the bit
- * number field needs to be reversed compared to the big-endian bit
- * fields. This can be achieved by XOR with 0x38 (64b) or 0x18 (32b).
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * --
- * Copied from the kernel file arch/powerpc/include/asm/bitops.h
- */
-
-#include "compiler.h"
-
-#include "asm/bitsperlong.h"
-
-#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
-#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_LONG)
-
-#define DECLARE_BITMAP(name,bits) \
- unsigned long name[BITS_TO_LONGS(bits)]
-
-#define __stringify_in_c(...) #__VA_ARGS__
-#define stringify_in_c(...) __stringify_in_c(__VA_ARGS__) " "
-
-#define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG))
-#define BIT_WORD(nr) ((nr) / BITS_PER_LONG)
-
-/* PPC bit number conversion */
-#define PPC_BITLSHIFT(be) (BITS_PER_LONG - 1 - (be))
-#define PPC_BIT(bit) (1UL << PPC_BITLSHIFT(bit))
-#define PPC_BITMASK(bs, be) ((PPC_BIT(bs) - PPC_BIT(be)) | PPC_BIT(bs))
-
-
-/* Macro for generating the ***_bits() functions */
-#define DEFINE_BITOP(fn, op) \
-static __inline__ void fn(unsigned long mask, \
- volatile unsigned long *_p) \
-{ \
- unsigned long old; \
- unsigned long *p = (unsigned long *)_p; \
- __asm__ __volatile__ ( \
-"1: ldarx %0,0,%3,0\n" \
- stringify_in_c(op) "%0,%0,%2\n" \
- "stdcx. %0,0,%3\n" \
- "bne- 1b\n" \
- : "=&r" (old), "+m" (*p) \
- : "r" (mask), "r" (p) \
- : "cc", "memory"); \
-}
-
-DEFINE_BITOP(set_bits, or)
-DEFINE_BITOP(clear_bits, andc)
-DEFINE_BITOP(change_bits, xor)
-
-static __inline__ void set_bit(int nr, volatile unsigned long *addr)
-{
- set_bits(BIT_MASK(nr), addr + BIT_WORD(nr));
-}
-
-static __inline__ void clear_bit(int nr, volatile unsigned long *addr)
-{
- clear_bits(BIT_MASK(nr), addr + BIT_WORD(nr));
-}
-
-static __inline__ void change_bit(int nr, volatile unsigned long *addr)
-{
- change_bits(BIT_MASK(nr), addr + BIT_WORD(nr));
-}
-
-static inline int test_bit(int nr, const volatile unsigned long *addr)
-{
- return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1)));
-}
-
-/*
- * Return the zero-based bit position (LE, not IBM bit numbering) of
- * the most significant 1-bit in a double word.
- */
-static __inline__ __attribute__((const))
-int __ilog2(unsigned long x)
-{
- int lz;
-
- asm ("cntlzd %0,%1" : "=r" (lz) : "r" (x));
- return BITS_PER_LONG - 1 - lz;
-}
-
-
-static __inline__ unsigned long __ffs(unsigned long x)
-{
- return __ilog2(x & -x);
-}
-
-
-#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG)
-/*
- * Find the next set bit in a memory region.
- */
-static inline
-unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
- unsigned long offset)
-{
- const unsigned long *p = addr + BITOP_WORD(offset);
- unsigned long result = offset & ~(BITS_PER_LONG-1);
- unsigned long tmp;
-
- if (offset >= size)
- return size;
- size -= result;
- offset %= BITS_PER_LONG;
- if (offset) {
- tmp = *(p++);
- tmp &= (~0UL << offset);
- if (size < BITS_PER_LONG)
- goto found_first;
- if (tmp)
- goto found_middle;
- size -= BITS_PER_LONG;
- result += BITS_PER_LONG;
- }
- while (size & ~(BITS_PER_LONG-1)) {
- if ((tmp = *(p++)))
- goto found_middle;
- result += BITS_PER_LONG;
- size -= BITS_PER_LONG;
- }
- if (!size)
- return result;
- tmp = *p;
-
-found_first:
- tmp &= (~0UL >> (BITS_PER_LONG - size));
- if (tmp == 0UL) /* Are any bits set? */
- return result + size; /* Nope. */
-found_middle:
- return result + __ffs(tmp);
-}
-
-#define for_each_bit(i, bitmask) \
- for (i = find_next_bit(bitmask, sizeof(bitmask), 0); \
- i < sizeof(bitmask); \
- i = find_next_bit(bitmask, sizeof(bitmask), i + 1))
-
-
-#endif /* __CR_BITOPS_H__ */
diff --git a/arch/ppc64/include/asm/bitsperlong.h b/arch/ppc64/include/asm/bitsperlong.h
deleted file mode 100644
index d95727d193e8..000000000000
--- a/arch/ppc64/include/asm/bitsperlong.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef __CR_BITSPERLONG_H__
-#define __CR_BITSPERLONG_H__
-
-#define BITS_PER_LONG 64
-
-#endif /* __CR_BITSPERLONG_H__ */
diff --git a/arch/ppc64/include/asm/cmpxchg.h b/arch/ppc64/include/asm/cmpxchg.h
deleted file mode 100644
index b93fbdef06c7..000000000000
--- a/arch/ppc64/include/asm/cmpxchg.h
+++ /dev/null
@@ -1,96 +0,0 @@
-#ifndef __CR_CMPXCHG_H__
-#define __CR_CMPXCHG_H__
-
-/*
- * Copied from kernel header file arch/powerpc/include/asm/cmpxchg.h
- */
-
-#define PPC_ACQUIRE_BARRIER "isync \n"
-#define PPC_RELEASE_BARRIER "lwsync \n"
-
-/*
- * Compare and exchange - if *p == old, set it to new,
- * and return the old value of *p.
- */
-
-static __always_inline unsigned long
-__cmpxchg_u32(volatile unsigned int *p, unsigned long old, unsigned long new)
-{
- unsigned int prev;
-
- __asm__ __volatile__ (
- PPC_RELEASE_BARRIER \
-"1: lwarx %0,0,%2 # __cmpxchg_u32\n\
- cmpw 0,%0,%3\n\
- bne- 2f\n"
-" stwcx. %4,0,%2\n\
- bne- 1b \n" \
- PPC_ACQUIRE_BARRIER
- "\n\
-2:"
- : "=&r" (prev), "+m" (*p)
- : "r" (p), "r" (old), "r" (new)
- : "cc", "memory");
-
- return prev;
-}
-
-static __always_inline unsigned long
-__cmpxchg_u64(volatile unsigned long *p, unsigned long old, unsigned long new)
-{
- unsigned long prev;
-
- __asm__ __volatile__ (
- PPC_RELEASE_BARRIER \
-"1: ldarx %0,0,%2 # __cmpxchg_u64\n\
- cmpd 0,%0,%3\n\
- bne- 2f\n\
- stdcx. %4,0,%2\n\
- bne- 1b \n" \
- PPC_ACQUIRE_BARRIER
- "\n\
-2:"
- : "=&r" (prev), "+m" (*p)
- : "r" (p), "r" (old), "r" (new)
- : "cc", "memory");
-
- return prev;
-}
-
-/* This function doesn't exist, so you'll get a linker error
- if something tries to do an invalid cmpxchg(). */
-#ifdef CR_DEBUG
-static inline void __cmpxchg_called_with_bad_pointer(void)
-{
- __asm__ __volatile__ (
- "1: twi 31,0,0 # trap\n"
- " b 1b"
- : : : "memory");
-}
-#else
-extern void __cmpxchg_called_with_bad_pointer(void);
-#endif
-
-static __always_inline unsigned long
-__cmpxchg(volatile void *ptr, unsigned long old, unsigned long new,
- unsigned int size)
-{
- switch (size) {
- case 4:
- return __cmpxchg_u32(ptr, old, new);
- case 8:
- return __cmpxchg_u64(ptr, old, new);
- }
- __cmpxchg_called_with_bad_pointer();
- return old;
-}
-
-#define cmpxchg(ptr, o, n) \
- ({ \
- __typeof__(*(ptr)) _o_ = (o); \
- __typeof__(*(ptr)) _n_ = (n); \
- (__typeof__(*(ptr))) __cmpxchg((ptr), (unsigned long)_o_, \
- (unsigned long)_n_, sizeof(*(ptr))); \
- })
-
-#endif /* __CR_CMPXCHG_H__ */
diff --git a/arch/ppc64/include/asm/cpu.h b/arch/ppc64/include/asm/cpu.h
deleted file mode 100644
index 59118c211d10..000000000000
--- a/arch/ppc64/include/asm/cpu.h
+++ /dev/null
@@ -1 +0,0 @@
-#include <stdbool.h>
diff --git a/arch/ppc64/include/asm/dump.h b/arch/ppc64/include/asm/dump.h
deleted file mode 100644
index 1505fd2983b0..000000000000
--- a/arch/ppc64/include/asm/dump.h
+++ /dev/null
@@ -1,11 +0,0 @@
-#ifndef __CR_ASM_DUMP_H__
-#define __CR_ASM_DUMP_H__
-
-extern int get_task_regs(pid_t pid, user_regs_struct_t regs, CoreEntry *core);
-extern int arch_alloc_thread_info(CoreEntry *core);
-extern void arch_free_thread_info(CoreEntry *core);
-
-
-#define core_put_tls(core, tls)
-
-#endif
diff --git a/arch/ppc64/include/asm/fpu.h b/arch/ppc64/include/asm/fpu.h
deleted file mode 100644
index 7f476d541a7d..000000000000
--- a/arch/ppc64/include/asm/fpu.h
+++ /dev/null
@@ -1,4 +0,0 @@
-#ifndef __CR_ASM_FPU_H__
-#define __CR_ASM_FPU_H__
-
-#endif /* __CR_ASM_FPU_H__ */
diff --git a/arch/ppc64/include/asm/int.h b/arch/ppc64/include/asm/int.h
deleted file mode 100644
index 642804e9b485..000000000000
--- a/arch/ppc64/include/asm/int.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef __CR_ASM_INT_H__
-#define __CR_ASM_INT_H__
-
-#include "asm-generic/int.h"
-
-#endif /* __CR_ASM_INT_H__ */
diff --git a/arch/ppc64/include/asm/linkage.h b/arch/ppc64/include/asm/linkage.h
deleted file mode 100644
index 506edc7114d4..000000000000
--- a/arch/ppc64/include/asm/linkage.h
+++ /dev/null
@@ -1,301 +0,0 @@
-/*
- * Various PowerPc assembly definitions
- *
- * Copied from the kernel file arch/powerpc/include/asm/ppc_asm.h
- *
- * Copyright (C) 1995-1999 Gary Thomas, Paul Mackerras, Cort Dougan.
- */
-#ifndef __CR_LINKAGE_H__
-#define __CR_LINKAGE_H__
-
-#ifdef __ASSEMBLY__
-
-#define GLOBAL(name) \
- .globl name; \
- name:
-
-#define ENTRY(name) \
- .globl name; \
- .type name, @function; \
- name:
-
-#define END(sym) \
- .size sym, . - sym
-
-
-#define STACKFRAMESIZE 256
-#define __STK_REG(i) (112 + ((i)-14)*8)
-#define STK_REG(i) __STK_REG(__REG_##i)
-
-/* The boring bits... */
-
-/* Condition Register Bit Fields */
-
-#define cr0 0
-#define cr1 1
-#define cr2 2
-#define cr3 3
-#define cr4 4
-#define cr5 5
-#define cr6 6
-#define cr7 7
-
-
-/*
- * General Purpose Registers (GPRs)
- *
- * The lower case r0-r31 should be used in preference to the upper
- * case R0-R31 as they provide more error checking in the assembler.
- * Use R0-31 only when really nessesary.
- */
-
-#define r0 %r0
-#define r1 %r1
-#define r2 %r2
-#define r3 %r3
-#define r4 %r4
-#define r5 %r5
-#define r6 %r6
-#define r7 %r7
-#define r8 %r8
-#define r9 %r9
-#define r10 %r10
-#define r11 %r11
-#define r12 %r12
-#define r13 %r13
-#define r14 %r14
-#define r15 %r15
-#define r16 %r16
-#define r17 %r17
-#define r18 %r18
-#define r19 %r19
-#define r20 %r20
-#define r21 %r21
-#define r22 %r22
-#define r23 %r23
-#define r24 %r24
-#define r25 %r25
-#define r26 %r26
-#define r27 %r27
-#define r28 %r28
-#define r29 %r29
-#define r30 %r30
-#define r31 %r31
-
-
-/* Floating Point Registers (FPRs) */
-
-#define fr0 0
-#define fr1 1
-#define fr2 2
-#define fr3 3
-#define fr4 4
-#define fr5 5
-#define fr6 6
-#define fr7 7
-#define fr8 8
-#define fr9 9
-#define fr10 10
-#define fr11 11
-#define fr12 12
-#define fr13 13
-#define fr14 14
-#define fr15 15
-#define fr16 16
-#define fr17 17
-#define fr18 18
-#define fr19 19
-#define fr20 20
-#define fr21 21
-#define fr22 22
-#define fr23 23
-#define fr24 24
-#define fr25 25
-#define fr26 26
-#define fr27 27
-#define fr28 28
-#define fr29 29
-#define fr30 30
-#define fr31 31
-
-/* AltiVec Registers (VPRs) */
-
-#define vr0 0
-#define vr1 1
-#define vr2 2
-#define vr3 3
-#define vr4 4
-#define vr5 5
-#define vr6 6
-#define vr7 7
-#define vr8 8
-#define vr9 9
-#define vr10 10
-#define vr11 11
-#define vr12 12
-#define vr13 13
-#define vr14 14
-#define vr15 15
-#define vr16 16
-#define vr17 17
-#define vr18 18
-#define vr19 19
-#define vr20 20
-#define vr21 21
-#define vr22 22
-#define vr23 23
-#define vr24 24
-#define vr25 25
-#define vr26 26
-#define vr27 27
-#define vr28 28
-#define vr29 29
-#define vr30 30
-#define vr31 31
-
-/* VSX Registers (VSRs) */
-
-#define vsr0 0
-#define vsr1 1
-#define vsr2 2
-#define vsr3 3
-#define vsr4 4
-#define vsr5 5
-#define vsr6 6
-#define vsr7 7
-#define vsr8 8
-#define vsr9 9
-#define vsr10 10
-#define vsr11 11
-#define vsr12 12
-#define vsr13 13
-#define vsr14 14
-#define vsr15 15
-#define vsr16 16
-#define vsr17 17
-#define vsr18 18
-#define vsr19 19
-#define vsr20 20
-#define vsr21 21
-#define vsr22 22
-#define vsr23 23
-#define vsr24 24
-#define vsr25 25
-#define vsr26 26
-#define vsr27 27
-#define vsr28 28
-#define vsr29 29
-#define vsr30 30
-#define vsr31 31
-#define vsr32 32
-#define vsr33 33
-#define vsr34 34
-#define vsr35 35
-#define vsr36 36
-#define vsr37 37
-#define vsr38 38
-#define vsr39 39
-#define vsr40 40
-#define vsr41 41
-#define vsr42 42
-#define vsr43 43
-#define vsr44 44
-#define vsr45 45
-#define vsr46 46
-#define vsr47 47
-#define vsr48 48
-#define vsr49 49
-#define vsr50 50
-#define vsr51 51
-#define vsr52 52
-#define vsr53 53
-#define vsr54 54
-#define vsr55 55
-#define vsr56 56
-#define vsr57 57
-#define vsr58 58
-#define vsr59 59
-#define vsr60 60
-#define vsr61 61
-#define vsr62 62
-#define vsr63 63
-
-/* SPE Registers (EVPRs) */
-
-#define evr0 0
-#define evr1 1
-#define evr2 2
-#define evr3 3
-#define evr4 4
-#define evr5 5
-#define evr6 6
-#define evr7 7
-#define evr8 8
-#define evr9 9
-#define evr10 10
-#define evr11 11
-#define evr12 12
-#define evr13 13
-#define evr14 14
-#define evr15 15
-#define evr16 16
-#define evr17 17
-#define evr18 18
-#define evr19 19
-#define evr20 20
-#define evr21 21
-#define evr22 22
-#define evr23 23
-#define evr24 24
-#define evr25 25
-#define evr26 26
-#define evr27 27
-#define evr28 28
-#define evr29 29
-#define evr30 30
-#define evr31 31
-
-/* some stab codes */
-#define N_FUN 36
-#define N_RSYM 64
-#define N_SLINE 68
-#define N_SO 100
-
-#define __REG_R0 0
-#define __REG_R1 1
-#define __REG_R2 2
-#define __REG_R3 3
-#define __REG_R4 4
-#define __REG_R5 5
-#define __REG_R6 6
-#define __REG_R7 7
-#define __REG_R8 8
-#define __REG_R9 9
-#define __REG_R10 10
-#define __REG_R11 11
-#define __REG_R12 12
-#define __REG_R13 13
-#define __REG_R14 14
-#define __REG_R15 15
-#define __REG_R16 16
-#define __REG_R17 17
-#define __REG_R18 18
-#define __REG_R19 19
-#define __REG_R20 20
-#define __REG_R21 21
-#define __REG_R22 22
-#define __REG_R23 23
-#define __REG_R24 24
-#define __REG_R25 25
-#define __REG_R26 26
-#define __REG_R27 27
-#define __REG_R28 28
-#define __REG_R29 29
-#define __REG_R30 30
-#define __REG_R31 31
-
-
-
-#endif /* __ASSEMBLY__ */
-
-#endif /* __CR_LINKAGE_H__ */
diff --git a/arch/ppc64/include/asm/page.h b/arch/ppc64/include/asm/page.h
deleted file mode 100644
index 9d10455f1c47..000000000000
--- a/arch/ppc64/include/asm/page.h
+++ /dev/null
@@ -1,25 +0,0 @@
-#ifndef __CR_ASM_PAGE_H__
-#define __CR_ASM_PAGE_H__
-
-#include <unistd.h>
-
-/*
- * Default config for Pseries is to use 64K pages.
- * See kernel file arch/powerpc/configs/pseries_*defconfig
- */
-#ifndef PAGE_SHIFT
-# define PAGE_SHIFT 16
-#endif
-
-#ifndef PAGE_SIZE
-# define PAGE_SIZE (1UL << PAGE_SHIFT)
-#endif
-
-#ifndef PAGE_MASK
-# define PAGE_MASK (~(PAGE_SIZE - 1))
-#endif
-
-#define PAGE_PFN(addr) ((addr) / PAGE_SIZE)
-#define page_size() sysconf(_SC_PAGESIZE)
-
-#endif /* __CR_ASM_PAGE_H__ */
diff --git a/arch/ppc64/include/asm/parasite-syscall.h b/arch/ppc64/include/asm/parasite-syscall.h
deleted file mode 100644
index 7665e207b75e..000000000000
--- a/arch/ppc64/include/asm/parasite-syscall.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#ifndef __CR_ASM_PARASITE_SYSCALL_H__
-#define __CR_ASM_PARASITE_SYSCALL_H__
-
-struct parasite_ctl;
-
-#define ARCH_SI_TRAP TRAP_BRKPT
-
-extern const char code_syscall[];
-extern const int code_syscall_size;
-
-void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs);
-
-void *mmap_seized(struct parasite_ctl *ctl,
- void *addr, size_t length, int prot,
- int flags, int fd, off_t offset);
-
-#endif
diff --git a/arch/ppc64/include/asm/parasite.h b/arch/ppc64/include/asm/parasite.h
deleted file mode 100644
index fdbc340b05e2..000000000000
--- a/arch/ppc64/include/asm/parasite.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef __ASM_PARASITE_H__
-#define __ASM_PARASITE_H__
-
-/* TLS is accessed through r13, which is already processed */
-static inline void arch_get_tls(tls_t *ptls) { (void)ptls; }
-
-#endif
diff --git a/arch/ppc64/include/asm/prlimit.h b/arch/ppc64/include/asm/prlimit.h
deleted file mode 100644
index 6746ba0e6f19..000000000000
--- a/arch/ppc64/include/asm/prlimit.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#ifndef __CR_PRLIMIT_H__
-#define __CR_PRLIMIT_H__
-
-#include <sys/types.h>
-#include <sys/time.h>
-#include <sys/resource.h>
-
-#include "config.h"
-
-#ifndef CONFIG_HAS_PRLIMIT
-extern int prlimit(pid_t pid, int resource, const struct rlimit *new_rlimit, struct rlimit *old_rlimit);
-#endif
-
-#endif /* __CR_PRLIMIT_H__ */
diff --git a/arch/ppc64/include/asm/processor-flags.h b/arch/ppc64/include/asm/processor-flags.h
deleted file mode 100644
index c1888af36fa0..000000000000
--- a/arch/ppc64/include/asm/processor-flags.h
+++ /dev/null
@@ -1,4 +0,0 @@
-#ifndef __CR_PROCESSOR_FLAGS_H__
-#define __CR_PROCESSOR_FLAGS_H__
-
-#endif
diff --git a/arch/ppc64/include/asm/restore.h b/arch/ppc64/include/asm/restore.h
deleted file mode 100644
index 325ff96e1018..000000000000
--- a/arch/ppc64/include/asm/restore.h
+++ /dev/null
@@ -1,31 +0,0 @@
-#ifndef __CR_ASM_RESTORE_H__
-#define __CR_ASM_RESTORE_H__
-
-#include "asm/restorer.h"
-
-#include "protobuf/core.pb-c.h"
-
-/*
- * Set R2 to blob + 8000 which is the default value
- * Jump to restore_task_exec_start + 8 since R2 is already set (local call)
- */
-#define JUMP_TO_RESTORER_BLOB(new_sp, restore_task_exec_start, \
- task_args) \
- asm volatile( \
- "mr 1,%0 \n" \
- "mr 12,%1 \n" \
- "mtctr 12 \n" \
- "mr 3,%2 \n" \
- "bctr \n" \
- : \
- : "r"(new_sp), \
- "r"((unsigned long)restore_task_exec_start), \
- "r"(task_args) \
- : "sp", "1", "2", "3", "12", "memory")
-
-/* There is nothing to do since TLS is accessed through r13 */
-#define core_get_tls(pcore, ptls)
-
-int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core);
-
-#endif /* __CR_ASM_RESTORE_H__ */
diff --git a/arch/ppc64/include/asm/restorer.h b/arch/ppc64/include/asm/restorer.h
deleted file mode 100644
index e728f133535e..000000000000
--- a/arch/ppc64/include/asm/restorer.h
+++ /dev/null
@@ -1,133 +0,0 @@
-#ifndef __CR_ASM_RESTORER_H__
-#define __CR_ASM_RESTORER_H__
-
-#include <asm/ptrace.h>
-#include <asm/elf.h>
-#include <asm/types.h>
-
-/*
- * sigcontext structure defined in file
- * /usr/include/powerpc64le-linux-gnu/bits/sigcontext.h,
- * included from /usr/include/signal.h
- *
- * Kernel definition can be found in arch/powerpc/include/uapi/asm/sigcontext.h
- */
-#include <signal.h>
-
-// XXX: the idetifier rt_sigcontext is expected to be struct by the CRIU code
-#define rt_sigcontext sigcontext
-
-#include "sigframe.h"
-#define SIGFRAME_OFFSET 0
-
-/* Copied from the Linux kernel header arch/powerpc/include/asm/ptrace.h */
-#define USER_REDZONE_SIZE 512
-
-/* Copied from the Linux kernel source file arch/powerpc/kernel/signal_64.c */
-#define TRAMP_SIZE 6
-
-/*
- * ucontext defined in /usr/include/powerpc64le-linux-gnu/sys/ucontext.h
- */
-struct rt_sigframe {
- /* sys_rt_sigreturn requires the ucontext be the first field */
- struct ucontext uc;
-#if 1
- /*
- * XXX: Assuming that transactional is turned on by default in
- * most of the Linux distribution.
- */
- struct ucontext uc_transact;
-#endif
- unsigned long _unused[2];
- unsigned int tramp[TRAMP_SIZE];
- struct rt_siginfo *pinfo;
- void *puc;
- struct rt_siginfo info;
- /* New 64 bit little-endian ABI allows redzone of 512 bytes below sp */
- char abigap[USER_REDZONE_SIZE];
-} __attribute__ ((aligned (16)));
-
-#define ARCH_RT_SIGRETURN(new_sp) \
- asm volatile( \
- "mr 1, %0 \n" \
- "li 0, "__stringify(__NR_rt_sigreturn)" \n" \
- "sc \n" \
- : \
- : "r"(new_sp) \
- : "1", "memory")
-
-/*
- * Clone trampoline
- *
- * See glibc sysdeps/powerpc/powerpc64/sysdep.h for FRAME_MIN_SIZE defines
- */
-#if _CALL_ELF != 2
-#error Only supporting ABIv2.
-#else
-#define FRAME_MIN_SIZE_PARM 96
-#endif
-#define RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, \
- thread_args, clone_restore_fn) \
- asm volatile( \
- "clone_emul: \n" \
- "/* Save fn, args, stack across syscall. */ \n" \
- "mr 14, %5 /* clone_restore_fn in r14 */ \n" \
- "mr 15, %6 /* &thread_args[i] in r15 */ \n" \
- "mr 3, %1 /* clone_flags */ \n" \
- "ld 4, %2 /* new_sp */ \n" \
- "mr 5, %3 /* &parent_tid */ \n" \
- "li 6, 0 /* tls = 0 ? */ \n" \
- "mr 7, %4 /* &thread_args[i].pid */ \n" \
- "li 0,"__stringify(__NR_clone)" \n" \
- "sc \n" \
- "/* Check for child process. */ \n" \
- "cmpdi cr1,3,0 \n" \
- "crandc cr1*4+eq,cr1*4+eq,cr0*4+so \n" \
- "bne- cr1,clone_end \n" \
- "/* child */ \n" \
- "addi 14, 14, 8 /* jump over r2 fixup */ \n" \
- "mtctr 14 \n" \
- "mr 3,15 \n" \
- "bctr \n" \
- "clone_end: \n" \
- "mr %0,3 \n" \
- : "=r"(ret) /* %0 */ \
- : "r"(clone_flags), /* %1 */ \
- "m"(new_sp), /* %2 */ \
- "r"(&parent_tid), /* %3 */ \
- "r"(&thread_args[i].pid), /* %4 */ \
- "r"(clone_restore_fn), /* %5 */ \
- "r"(&thread_args[i]) /* %6 */ \
- : "memory","0","3","4","5","6","7","14","15")
-
-#define RT_SIGFRAME_UC(rt_sigframe) rt_sigframe->uc
-#define RT_SIGFRAME_REGIP(rt_sigframe) ((long unsigned int)(rt_sigframe)->uc.uc_mcontext.gp_regs[PT_NIP])
-#define RT_SIGFRAME_HAS_FPU(rt_sigframe) (1)
-#define RT_SIGFRAME_FPU(rt_sigframe) ((rt_sigframe)->uc.uc_mcontext)
-
-int restore_gpregs(struct rt_sigframe *f, UserPpc64RegsEntry *r);
-int restore_nonsigframe_gpregs(UserPpc64RegsEntry *r);
-
-/* Nothing to do, TLS is accessed through r13 */
-static inline void restore_tls(tls_t *ptls) { (void)ptls; }
-
-static inline int ptrace_set_breakpoint(pid_t pid, void *addr)
-{
- return 0;
-}
-
-static inline int ptrace_flush_breakpoints(pid_t pid)
-{
- return 0;
-}
-
-int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe,
- mcontext_t *sigcontext);
-
-/*
- * Defined in arch/ppc64/syscall-common-ppc64.S
- */
-unsigned long sys_shmat(int shmid, const void *shmaddr, int shmflg);
-
-#endif /*__CR_ASM_RESTORER_H__*/
diff --git a/arch/ppc64/include/asm/string.h b/arch/ppc64/include/asm/string.h
deleted file mode 100644
index 4531b3ba6c26..000000000000
--- a/arch/ppc64/include/asm/string.h
+++ /dev/null
@@ -1,28 +0,0 @@
-#ifndef __CR_ASM_STRING_H__
-#define __CR_ASM_STRING_H__
-
-#include "compiler.h"
-
-#define HAS_BUILTIN_MEMCPY
-#define HAS_BUILTIN_MEMCMP
-
-#include "asm-generic/string.h"
-
-#ifdef CR_NOGLIBC
-extern void memcpy_power7(void *to, const void *from, unsigned long n);
-static inline void *builtin_memcpy(void *to, const void *from, unsigned long n)
-{
- if (n)
- memcpy_power7(to, from, n);
- return to;
-}
-extern int builtin_memcmp(const void *cs, const void *ct, size_t count);
-#else
-/*
- * When building with the C library, call its services
- */
-#define builtin_memcpy memcpy
-#define builtin_memcmp memcmp
-#endif
-
-#endif /* __CR_ASM_STRING_H__ */
diff --git a/arch/ppc64/include/asm/types.h b/arch/ppc64/include/asm/types.h
deleted file mode 100644
index 3412dc75d6a0..000000000000
--- a/arch/ppc64/include/asm/types.h
+++ /dev/null
@@ -1,113 +0,0 @@
-#ifndef __CR_ASM_TYPES_H__
-#define __CR_ASM_TYPES_H__
-
-#include <stdbool.h>
-#include <signal.h>
-#include "protobuf/core.pb-c.h"
-
-#include "asm/page.h"
-#include "asm/bitops.h"
-#include "asm/int.h"
-
-/*
- * Copied from kernel header include/uapi/asm-generic/signal-defs.h
- */
-typedef void rt_signalfn_t(int, siginfo_t *, void *);
-typedef rt_signalfn_t *rt_sighandler_t;
-
-typedef void rt_restorefn_t(void);
-typedef rt_restorefn_t *rt_sigrestore_t;
-
-#define SIGMAX_OLD 31
-#define SIGMAX 64
-
-/*Copied from the Linux kernel arch/powerpc/include/uapi/asm/signal.h */
-#define _KNSIG 64
-#define _NSIG_BPW 64
-#define _KNSIG_WORDS (_KNSIG / _NSIG_BPW)
-
-typedef struct {
- uint64_t sig[_KNSIG_WORDS];
-} k_rtsigset_t;
-
-static inline void ksigfillset(k_rtsigset_t *set)
-{
- int i;
- for (i = 0; i < _KNSIG_WORDS; i++)
- set->sig[i] = (unsigned long)-1;
-}
-
-/* Copied from the Linux kernel arch/powerpc/include/uapi/asm/signal.h */
-#define SA_RESTORER 0x04000000U
-
-typedef struct {
- rt_sighandler_t rt_sa_handler;
- unsigned long rt_sa_flags;
- rt_sigrestore_t rt_sa_restorer;
- k_rtsigset_t rt_sa_mask; /* mask last for extensibility */
-} rt_sigaction_t;
-
-/*
- * Copied from kernel header arch/powerpc/include/uapi/asm/ptrace.h
- */
-typedef struct {
- unsigned long gpr[32];
- unsigned long nip;
- unsigned long msr;
- unsigned long orig_gpr3; /* Used for restarting system calls */
- unsigned long ctr;
- unsigned long link;
- unsigned long xer;
- unsigned long ccr;
- unsigned long softe; /* Soft enabled/disabled */
- unsigned long trap; /* Reason for being here */
- /* N.B. for critical exceptions on 4xx, the dar and dsisr
- fields are overloaded to hold srr0 and srr1. */
- unsigned long dar; /* Fault registers */
- unsigned long dsisr; /* on 4xx/Book-E used for ESR */
- unsigned long result; /* Result of a system call */
-} user_regs_struct_t;
-
-typedef UserPpc64RegsEntry UserRegsEntry;
-
-#define CORE_ENTRY__MARCH CORE_ENTRY__MARCH__PPC64
-
-#define ASSIGN_TYPED(a, b) do { a = (typeof(a))b; } while (0)
-#define ASSIGN_MEMBER(a,b,m) do { ASSIGN_TYPED((a)->m, (b)->m); } while (0)
-
-#define REG_RES(regs) ((u64)(regs).gpr[3])
-#define REG_IP(regs) ((u64)(regs).nip)
-#define REG_SYSCALL_NR(regs) ((u64)(regs).gpr[0])
-
-
-#define CORE_THREAD_ARCH_INFO(core) core->ti_ppc64
-
-/*
- * Copied from the following kernel header files :
- * include/linux/auxvec.h
- * arch/powerpc/include/uapi/asm/auxvec.h
- * include/linux/mm_types.h
- */
-#define AT_VECTOR_SIZE_BASE 20
-#define AT_VECTOR_SIZE_ARCH 6
-#define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1))
-
-typedef uint64_t auxv_t;
-
-/* Not used but the structure parasite_dump_thread needs a tls_t field */
-typedef uint64_t tls_t;
-
-/*
- * Copied for the Linux kernel arch/powerpc/include/asm/processor.h
- *
- * NOTE: 32bit tasks are not supported.
- */
-#define TASK_SIZE_USER64 (0x0000400000000000UL)
-#define TASK_SIZE TASK_SIZE_USER64
-
-static inline unsigned long task_size() { return TASK_SIZE; }
-
-static inline void *decode_pointer(uint64_t v) { return (void*)v; }
-static inline uint64_t encode_pointer(void *p) { return (uint64_t)p; }
-
-#endif /* __CR_ASM_TYPES_H__ */
diff --git a/arch/ppc64/include/asm/vdso.h b/arch/ppc64/include/asm/vdso.h
deleted file mode 100644
index ed94e4cf0160..000000000000
--- a/arch/ppc64/include/asm/vdso.h
+++ /dev/null
@@ -1,34 +0,0 @@
-#ifndef __CR_ASM_VDSO_H__
-#define __CR_ASM_VDSO_H__
-
-#include "asm/int.h"
-#include "asm-generic/vdso.h"
-
-/* This definition is used in pie/util-vdso.c to initialize the vdso symbol
- * name string table 'vdso_symbols'
- *
- * Poke from kernel file arch/powerpc/kernel/vdso64/vdso64.lds.S
- *
- * Note that '__kernel_datapage_offset' is not a service but mostly a data
- * inside the text page which should not be used as is from user space.
- */
-#define VDSO_SYMBOL_MAX 10
-#define ARCH_VDSO_SYMBOLS \
- "__kernel_clock_getres", \
- "__kernel_clock_gettime", \
- "__kernel_get_syscall_map", \
- "__kernel_get_tbfreq", \
- "__kernel_getcpu", \
- "__kernel_gettimeofday", \
- "__kernel_sigtramp_rt64", \
- "__kernel_sync_dicache", \
- "__kernel_sync_dicache_p5", \
- "__kernel_time"
-
-struct vdso_symtable;
-extern int vdso_redirect_calls(unsigned long base_to,
- unsigned long base_from,
- struct vdso_symtable *to,
- struct vdso_symtable *from);
-
-#endif /* __CR_ASM_VDSO_H__ */
diff --git a/arch/ppc64/memcmp_64.S b/arch/ppc64/memcmp_64.S
deleted file mode 100644
index 16c2b0cd8280..000000000000
--- a/arch/ppc64/memcmp_64.S
+++ /dev/null
@@ -1,236 +0,0 @@
-/*
- * Author: Anton Blanchard <anton at au.ibm.com>
- * Copyright 2015 IBM Corporation.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- * --
- * Copied form the linux file arch/powerpc/lib/memcmp_64.S
- */
-#include "asm/linkage.h"
-
-#define off8 r6
-#define off16 r7
-#define off24 r8
-
-#define rA r9
-#define rB r10
-#define rC r11
-#define rD r27
-#define rE r28
-#define rF r29
-#define rG r30
-#define rH r31
-
-#ifdef __LITTLE_ENDIAN__
-#define LD ldbrx
-#else
-#define LD ldx
-#endif
-
-ENTRY(builtin_memcmp)
- cmpdi cr1,r5,0
-
- /* Use the short loop if both strings are not 8B aligned */
- or r6,r3,r4
- andi. r6,r6,7
-
- /* Use the short loop if length is less than 32B */
- cmpdi cr6,r5,31
-
- beq cr1,.Lzero
- bne .Lshort
- bgt cr6,.Llong
-
-.Lshort:
- mtctr r5
-
-1: lbz rA,0(r3)
- lbz rB,0(r4)
- subf. rC,rB,rA
- bne .Lnon_zero
- bdz .Lzero
-
- lbz rA,1(r3)
- lbz rB,1(r4)
- subf. rC,rB,rA
- bne .Lnon_zero
- bdz .Lzero
-
- lbz rA,2(r3)
- lbz rB,2(r4)
- subf. rC,rB,rA
- bne .Lnon_zero
- bdz .Lzero
-
- lbz rA,3(r3)
- lbz rB,3(r4)
- subf. rC,rB,rA
- bne .Lnon_zero
-
- addi r3,r3,4
- addi r4,r4,4
-
- bdnz 1b
-
-.Lzero:
- li r3,0
- blr
-
-.Lnon_zero:
- mr r3,rC
- blr
-
-.Llong:
- li off8,8
- li off16,16
- li off24,24
-
- std r31,-8(r1)
- std r30,-16(r1)
- std r29,-24(r1)
- std r28,-32(r1)
- std r27,-40(r1)
-
- srdi r0,r5,5
- mtctr r0
- andi. r5,r5,31
-
- LD rA,0,r3
- LD rB,0,r4
-
- LD rC,off8,r3
- LD rD,off8,r4
-
- LD rE,off16,r3
- LD rF,off16,r4
-
- LD rG,off24,r3
- LD rH,off24,r4
- cmpld cr0,rA,rB
-
- addi r3,r3,32
- addi r4,r4,32
-
- bdz .Lfirst32
-
- LD rA,0,r3
- LD rB,0,r4
- cmpld cr1,rC,rD
-
- LD rC,off8,r3
- LD rD,off8,r4
- cmpld cr6,rE,rF
-
- LD rE,off16,r3
- LD rF,off16,r4
- cmpld cr7,rG,rH
- bne cr0,.LcmpAB
-
- LD rG,off24,r3
- LD rH,off24,r4
- cmpld cr0,rA,rB
- bne cr1,.LcmpCD
-
- addi r3,r3,32
- addi r4,r4,32
-
- bdz .Lsecond32
-
- .balign 16
-
-1: LD rA,0,r3
- LD rB,0,r4
- cmpld cr1,rC,rD
- bne cr6,.LcmpEF
-
- LD rC,off8,r3
- LD rD,off8,r4
- cmpld cr6,rE,rF
- bne cr7,.LcmpGH
-
- LD rE,off16,r3
- LD rF,off16,r4
- cmpld cr7,rG,rH
- bne cr0,.LcmpAB
-
- LD rG,off24,r3
- LD rH,off24,r4
- cmpld cr0,rA,rB
- bne cr1,.LcmpCD
-
- addi r3,r3,32
- addi r4,r4,32
-
- bdnz 1b
-
-.Lsecond32:
- cmpld cr1,rC,rD
- bne cr6,.LcmpEF
-
- cmpld cr6,rE,rF
- bne cr7,.LcmpGH
-
- cmpld cr7,rG,rH
- bne cr0,.LcmpAB
-
- bne cr1,.LcmpCD
- bne cr6,.LcmpEF
- bne cr7,.LcmpGH
-
-.Ltail:
- ld r31,-8(r1)
- ld r30,-16(r1)
- ld r29,-24(r1)
- ld r28,-32(r1)
- ld r27,-40(r1)
-
- cmpdi r5,0
- beq .Lzero
- b .Lshort
-
-.Lfirst32:
- cmpld cr1,rC,rD
- cmpld cr6,rE,rF
- cmpld cr7,rG,rH
-
- bne cr0,.LcmpAB
- bne cr1,.LcmpCD
- bne cr6,.LcmpEF
- bne cr7,.LcmpGH
-
- b .Ltail
-
-.LcmpAB:
- li r3,1
- bgt cr0,.Lout
- li r3,-1
- b .Lout
-
-.LcmpCD:
- li r3,1
- bgt cr1,.Lout
- li r3,-1
- b .Lout
-
-.LcmpEF:
- li r3,1
- bgt cr6,.Lout
- li r3,-1
- b .Lout
-
-.LcmpGH:
- li r3,1
- bgt cr7,.Lout
- li r3,-1
-
-.Lout:
- ld r31,-8(r1)
- ld r30,-16(r1)
- ld r29,-24(r1)
- ld r28,-32(r1)
- ld r27,-40(r1)
- blr
diff --git a/arch/ppc64/memcpy_power7.S b/arch/ppc64/memcpy_power7.S
deleted file mode 100644
index a29d0e8f2ada..000000000000
--- a/arch/ppc64/memcpy_power7.S
+++ /dev/null
@@ -1,213 +0,0 @@
-/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
- *
- * Copyright (C) IBM Corporation, 2012
- *
- * Author: Anton Blanchard <anton at au.ibm.com>
- *
- * --
- * Copied from the kernel file arch/powerpc/lib/memcpy_power7.S
- * Altivec support has been removed so we don't taint restored process.
- */
-#include "asm/linkage.h"
-
-/*
- * When building the parasite code, the compiler may rely on the C library
- * service memcpy to initialise big local variable in the stack.
- */
-ENTRY(memcpy)
-ENTRY(memcpy_power7)
- cmpldi r5,16
- std r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
- blt .Lshort_copy
-
-.Lnonvmx_copy:
- /* Get the source 8B aligned */
- neg r6,r4
- mtocrf 0x01,r6
- clrldi r6,r6,(64-3)
-
- bf cr7*4+3,1f
- lbz r0,0(r4)
- addi r4,r4,1
- stb r0,0(r3)
- addi r3,r3,1
-
-1: bf cr7*4+2,2f
- lhz r0,0(r4)
- addi r4,r4,2
- sth r0,0(r3)
- addi r3,r3,2
-
-2: bf cr7*4+1,3f
- lwz r0,0(r4)
- addi r4,r4,4
- stw r0,0(r3)
- addi r3,r3,4
-
-3: sub r5,r5,r6
- cmpldi r5,128
- blt 5f
-
- mflr r0
- stdu r1,-STACKFRAMESIZE(r1)
- std r14,STK_REG(R14)(r1)
- std r15,STK_REG(R15)(r1)
- std r16,STK_REG(R16)(r1)
- std r17,STK_REG(R17)(r1)
- std r18,STK_REG(R18)(r1)
- std r19,STK_REG(R19)(r1)
- std r20,STK_REG(R20)(r1)
- std r21,STK_REG(R21)(r1)
- std r22,STK_REG(R22)(r1)
- std r0,STACKFRAMESIZE+16(r1)
-
- srdi r6,r5,7
- mtctr r6
-
- /* Now do cacheline (128B) sized loads and stores. */
- .align 5
-4:
- ld r0,0(r4)
- ld r6,8(r4)
- ld r7,16(r4)
- ld r8,24(r4)
- ld r9,32(r4)
- ld r10,40(r4)
- ld r11,48(r4)
- ld r12,56(r4)
- ld r14,64(r4)
- ld r15,72(r4)
- ld r16,80(r4)
- ld r17,88(r4)
- ld r18,96(r4)
- ld r19,104(r4)
- ld r20,112(r4)
- ld r21,120(r4)
- addi r4,r4,128
- std r0,0(r3)
- std r6,8(r3)
- std r7,16(r3)
- std r8,24(r3)
- std r9,32(r3)
- std r10,40(r3)
- std r11,48(r3)
- std r12,56(r3)
- std r14,64(r3)
- std r15,72(r3)
- std r16,80(r3)
- std r17,88(r3)
- std r18,96(r3)
- std r19,104(r3)
- std r20,112(r3)
- std r21,120(r3)
- addi r3,r3,128
- bdnz 4b
-
- clrldi r5,r5,(64-7)
-
- ld r14,STK_REG(R14)(r1)
- ld r15,STK_REG(R15)(r1)
- ld r16,STK_REG(R16)(r1)
- ld r17,STK_REG(R17)(r1)
- ld r18,STK_REG(R18)(r1)
- ld r19,STK_REG(R19)(r1)
- ld r20,STK_REG(R20)(r1)
- ld r21,STK_REG(R21)(r1)
- ld r22,STK_REG(R22)(r1)
- addi r1,r1,STACKFRAMESIZE
-
- /* Up to 127B to go */
-5: srdi r6,r5,4
- mtocrf 0x01,r6
-
-6: bf cr7*4+1,7f
- ld r0,0(r4)
- ld r6,8(r4)
- ld r7,16(r4)
- ld r8,24(r4)
- ld r9,32(r4)
- ld r10,40(r4)
- ld r11,48(r4)
- ld r12,56(r4)
- addi r4,r4,64
- std r0,0(r3)
- std r6,8(r3)
- std r7,16(r3)
- std r8,24(r3)
- std r9,32(r3)
- std r10,40(r3)
- std r11,48(r3)
- std r12,56(r3)
- addi r3,r3,64
-
- /* Up to 63B to go */
-7: bf cr7*4+2,8f
- ld r0,0(r4)
- ld r6,8(r4)
- ld r7,16(r4)
- ld r8,24(r4)
- addi r4,r4,32
- std r0,0(r3)
- std r6,8(r3)
- std r7,16(r3)
- std r8,24(r3)
- addi r3,r3,32
-
- /* Up to 31B to go */
-8: bf cr7*4+3,9f
- ld r0,0(r4)
- ld r6,8(r4)
- addi r4,r4,16
- std r0,0(r3)
- std r6,8(r3)
- addi r3,r3,16
-
-9: clrldi r5,r5,(64-4)
-
- /* Up to 15B to go */
-.Lshort_copy:
- mtocrf 0x01,r5
- bf cr7*4+0,12f
- lwz r0,0(r4) /* Less chance of a reject with word ops */
- lwz r6,4(r4)
- addi r4,r4,8
- stw r0,0(r3)
- stw r6,4(r3)
- addi r3,r3,8
-
-12: bf cr7*4+1,13f
- lwz r0,0(r4)
- addi r4,r4,4
- stw r0,0(r3)
- addi r3,r3,4
-
-13: bf cr7*4+2,14f
- lhz r0,0(r4)
- addi r4,r4,2
- sth r0,0(r3)
- addi r3,r3,2
-
-14: bf cr7*4+3,15f
- lbz r0,0(r4)
- stb r0,0(r3)
-
-15: ld r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
- blr
-
-.Lunwind_stack_nonvmx_copy:
- addi r1,r1,STACKFRAMESIZE
- b .Lnonvmx_copy
-
diff --git a/arch/ppc64/misc.S b/arch/ppc64/misc.S
deleted file mode 100644
index 4ee188d554d3..000000000000
--- a/arch/ppc64/misc.S
+++ /dev/null
@@ -1,197 +0,0 @@
-/*
- * This is from linux/arch/powerpc/lib/crtsavres.S:
- *
- * Special support for eabi and SVR4
- *
- * Copyright (C) 1995, 1996, 1998, 2000, 2001 Free Software Foundation, Inc.
- * Copyright 2008 Freescale Semiconductor, Inc.
- * Written By Michael Meissner
- *
- * Based on gcc/config/rs6000/crtsavres.asm from gcc
- * 64 bit additions from reading the PPC elf64abi document.
- *
- * This file is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the
- * Free Software Foundation; either version 2, or (at your option) any
- * later version.
- *
- * In addition to the permissions in the GNU General Public License, the
- * Free Software Foundation gives you unlimited permission to link the
- * compiled version of this file with other programs, and to distribute
- * those programs without any restriction coming from the use of this
- * file. (The General Public License restrictions do apply in other
- * respects; for example, they cover modification of the file, and
- * distribution when not linked into another program.)
- *
- * This file is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; see the file COPYING. If not, write to
- * the Free Software Foundation, 51 Franklin Street, Fifth Floor,
- * Boston, MA 02110-1301, USA.
- *
- * As a special exception, if you link this library with files
- * compiled with GCC to produce an executable, this does not cause
- * the resulting executable to be covered by the GNU General Public License.
- * This exception does not however invalidate any other reasons why
- * the executable file might be covered by the GNU General Public License.
- */
-
-#define r0 0
-#define r1 1
-#define r2 2
-#define r3 3
-#define r4 4
-#define r5 5
-#define r6 6
-#define r7 7
-#define r8 8
-#define r9 9
-#define r10 10
-#define r11 11
-#define r12 12
-#define r13 13
-#define r14 14
-#define r15 15
-#define r16 16
-#define r17 17
-#define r18 18
-#define r19 19
-#define r20 20
-#define r21 21
-#define r22 22
-#define r23 23
-#define r24 24
-#define r25 25
-#define r26 26
-#define r27 27
-#define r28 28
-#define r29 29
-#define r30 30
-#define r31 31
-
- .text
-
-.globl _savegpr0_14
-_savegpr0_14:
- std r14,-144(r1)
-.globl _savegpr0_15
-_savegpr0_15:
- std r15,-136(r1)
-.globl _savegpr0_16
-_savegpr0_16:
- std r16,-128(r1)
-.globl _savegpr0_17
-_savegpr0_17:
- std r17,-120(r1)
-.globl _savegpr0_18
-_savegpr0_18:
- std r18,-112(r1)
-.globl _savegpr0_19
-_savegpr0_19:
- std r19,-104(r1)
-.globl _savegpr0_20
-_savegpr0_20:
- std r20,-96(r1)
-.globl _savegpr0_21
-_savegpr0_21:
- std r21,-88(r1)
-.globl _savegpr0_22
-_savegpr0_22:
- std r22,-80(r1)
-.globl _savegpr0_23
-_savegpr0_23:
- std r23,-72(r1)
-.globl _savegpr0_24
-_savegpr0_24:
- std r24,-64(r1)
-.globl _savegpr0_25
-_savegpr0_25:
- std r25,-56(r1)
-.globl _savegpr0_26
-_savegpr0_26:
- std r26,-48(r1)
-.globl _savegpr0_27
-_savegpr0_27:
- std r27,-40(r1)
-.globl _savegpr0_28
-_savegpr0_28:
- std r28,-32(r1)
-.globl _savegpr0_29
-_savegpr0_29:
- std r29,-24(r1)
-.globl _savegpr0_30
-_savegpr0_30:
- std r30,-16(r1)
-.globl _savegpr0_31
-_savegpr0_31:
- std r31,-8(r1)
- std r0,16(r1)
- blr
-
-.globl _restgpr0_14
-_restgpr0_14:
- ld r14,-144(r1)
-.globl _restgpr0_15
-_restgpr0_15:
- ld r15,-136(r1)
-.globl _restgpr0_16
-_restgpr0_16:
- ld r16,-128(r1)
-.globl _restgpr0_17
-_restgpr0_17:
- ld r17,-120(r1)
-.globl _restgpr0_18
-_restgpr0_18:
- ld r18,-112(r1)
-.globl _restgpr0_19
-_restgpr0_19:
- ld r19,-104(r1)
-.globl _restgpr0_20
-_restgpr0_20:
- ld r20,-96(r1)
-.globl _restgpr0_21
-_restgpr0_21:
- ld r21,-88(r1)
-.globl _restgpr0_22
-_restgpr0_22:
- ld r22,-80(r1)
-.globl _restgpr0_23
-_restgpr0_23:
- ld r23,-72(r1)
-.globl _restgpr0_24
-_restgpr0_24:
- ld r24,-64(r1)
-.globl _restgpr0_25
-_restgpr0_25:
- ld r25,-56(r1)
-.globl _restgpr0_26
-_restgpr0_26:
- ld r26,-48(r1)
-.globl _restgpr0_27
-_restgpr0_27:
- ld r27,-40(r1)
-.globl _restgpr0_28
-_restgpr0_28:
- ld r28,-32(r1)
-.globl _restgpr0_29
-_restgpr0_29:
- ld r0,16(r1)
- ld r29,-24(r1)
- mtlr r0
- ld r30,-16(r1)
- ld r31,-8(r1)
- blr
-
-.globl _restgpr0_30
-_restgpr0_30:
- ld r30,-16(r1)
-.globl _restgpr0_31
-_restgpr0_31:
- ld r0,16(r1)
- ld r31,-8(r1)
- mtlr r0
- blr
diff --git a/arch/ppc64/parasite-head.S b/arch/ppc64/parasite-head.S
deleted file mode 100644
index a1c189fe94ea..000000000000
--- a/arch/ppc64/parasite-head.S
+++ /dev/null
@@ -1,46 +0,0 @@
-#include "asm/linkage.h"
-#include "parasite.h"
-
- .section .head.text
- .align 8
-
-ENTRY(__export_parasite_head_start)
-
- // int __used parasite_service(unsigned int cmd, void *args)
- // cmd = r3 = *__export_parasite_cmd (u32 ?)
- // args = r4 = @parasite_args_ptr + @pc
- bl 0f
-0: mflr r2
-
-#define LOAD_REG_ADDR(reg, name) \
- addis reg,r2,(name - 0b)@ha; \
- addi reg,r2,(name - 0b)@l;
-
- LOAD_REG_ADDR(r3,__export_parasite_cmd)
- lwz r3,0(r3)
-
- LOAD_REG_ADDR(r4,parasite_args_ptr)
- ld r4,0(r4)
-
- LOAD_REG_ADDR(r12,parasite_service_ptr)
- ld r12,0(r12)
- mtctr r12
-
- bctrl // call parasite_service
- twi 31,0,0 // Should generate SIGTRAP
-
-parasite_args_ptr:
- .quad __export_parasite_args
-
-parasite_service_ptr:
- // We want to run the function prototype to set r2.
- // Since the relocation will prefer the local entry
- // point, we force it to the global one which is 2
- // instructions above the local one.
- // FIXME: There should be a way to specify the global entry here.
- .quad parasite_service - 8
-
-__export_parasite_cmd:
- .long 0
-
-END(__export_parasite_head_start)
diff --git a/arch/ppc64/restorer.c b/arch/ppc64/restorer.c
deleted file mode 100644
index 665676045d3f..000000000000
--- a/arch/ppc64/restorer.c
+++ /dev/null
@@ -1,31 +0,0 @@
-#include <unistd.h>
-
-#include "restorer.h"
-#include "asm/restorer.h"
-#include "asm/fpu.h"
-
-#include "syscall.h"
-#include "log.h"
-
-int restore_nonsigframe_gpregs(UserPpc64RegsEntry *r)
-{
- return 0;
-}
-
-unsigned long sys_shmat(int shmid, const void *shmaddr, int shmflg)
-{
- unsigned long raddr;
- int ret;
-
- ret = sys_ipc(21 /*SHMAT */,
- shmid, /* first */
- shmflg, /* second */
- (unsigned long)&raddr, /* third */
- shmaddr, /* ptr */
- 0 /* fifth not used */);
-
- if (ret)
- raddr = (unsigned long) ret;
-
- return raddr;
-}
diff --git a/arch/ppc64/syscall-common-ppc64.S b/arch/ppc64/syscall-common-ppc64.S
deleted file mode 100644
index e18d6adf419e..000000000000
--- a/arch/ppc64/syscall-common-ppc64.S
+++ /dev/null
@@ -1,24 +0,0 @@
-#include "asm/linkage.h"
-#include <asm/unistd.h> /* for __NR_ipc */
-
-#define SYSCALL(name, opcode) \
- ENTRY(name); \
- li r0, opcode; \
- b __syscall_common; \
- END(name)
-
- .text
- .align 4
-
-ENTRY(__syscall_common)
- sc
- bnslr+ /* if no error return to LR */
- neg r3,r3 /* r3 = -r3 to return -errno value */
- blr
-END(__syscall_common)
-
-ENTRY(__cr_restore_rt)
- li r0, __NR_rt_sigreturn
- b __syscall_common
-END(__cr_restore_rt)
-
diff --git a/arch/ppc64/syscall-ppc64.def b/arch/ppc64/syscall-ppc64.def
deleted file mode 100644
index 331937973f72..000000000000
--- a/arch/ppc64/syscall-ppc64.def
+++ /dev/null
@@ -1,105 +0,0 @@
-#
-# System calls table, please make sure the table consist only the syscalls
-# really used somewhere in project.
-#
-# The template is (name and srguments are optinal if you need only __NR_x
-# defined, but no realy entry point in syscalls lib).
-#
-# name code name arguments
-# -----------------------------------------------------------------------
-#
-__NR_read 3 sys_read (int fd, void *buf, unsigned long count)
-__NR_write 4 sys_write (int fd, const void *buf, unsigned long count)
-__NR_open 5 sys_open (const char *filename, unsigned long flags, unsigned long mode)
-__NR_close 6 sys_close (int fd)
-__NR_lseek 19 sys_lseek (int fd, unsigned long offset, unsigned long origin)
-__NR_mmap 90 sys_mmap (void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset)
-__NR_mprotect 125 sys_mprotect (const void *addr, unsigned long len, unsigned long prot)
-__NR_munmap 91 sys_munmap (void *addr, unsigned long len)
-__NR_brk 45 sys_brk (void *addr)
-__NR_rt_sigaction 173 sys_sigaction (int signum, const rt_sigaction_t *act, rt_sigaction_t *oldact, size_t sigsetsize)
-__NR_rt_sigprocmask 174 sys_sigprocmask (int how, k_rtsigset_t *set, k_rtsigset_t *old, size_t sigsetsize)
-__NR_rt_sigreturn 172 sys_rt_sigreturn (void)
-__NR_ioctl 54 sys_ioctl (unsigned int fd, unsigned int cmd, unsigned long arg)
-__NR_pread64 179 sys_pread (unsigned int fd, char *buf, size_t count, loff_t pos)
-__NR_ptrace 26 sys_ptrace (long request, pid_t pid, void *addr, void *data)
-__NR_mremap 163 sys_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr)
-__NR_mincore 206 sys_mincore (void *addr, unsigned long size, unsigned char *vec)
-__NR_madvise 205 sys_madvise (unsigned long start, size_t len, int behavior)
-__NR_pause 29 sys_pause (void)
-__NR_nanosleep 162 sys_nanosleep (struct timespec *req, struct timespec *rem)
-__NR_getitimer 105 sys_getitimer (int which, const struct itimerval *val)
-__NR_setitimer 104 sys_setitimer (int which, const struct itimerval *val, struct itimerval *old)
-__NR_getpid 20 sys_getpid (void)
-__NR_socket 326 sys_socket (int domain, int type, int protocol)
-__NR_connect 328 sys_connect (int sockfd, struct sockaddr *addr, int addrlen)
-__NR_sendto 335 sys_sendto (int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len)
-__NR_recvfrom 337 sys_recvfrom (int sockfd, void *ubuf, size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len)
-__NR_sendmsg 341 sys_sendmsg (int sockfd, const struct msghdr *msg, int flags)
-__NR_recvmsg 342 sys_recvmsg (int sockfd, struct msghdr *msg, int flags)
-__NR_shutdown 338 sys_shutdown (int sockfd, int how)
-__NR_bind 327 sys_bind (int sockfd, const struct sockaddr *addr, int addrlen)
-__NR_setsockopt 339 sys_setsockopt (int sockfd, int level, int optname, const void *optval, socklen_t optlen)
-__NR_getsockopt 340 sys_getsockopt (int sockfd, int level, int optname, const void *optval, socklen_t *optlen)
-__NR_clone 120 sys_clone (unsigned long flags, void *child_stack, void *parent_tid, void *child_tid)
-__NR_exit 1 sys_exit (unsigned long error_code)
-__NR_wait4 114 sys_wait4 (int pid, int *status, int options, struct rusage *ru)
-__NR_kill 37 sys_kill (long pid, int sig)
-__NR_fcntl 55 sys_fcntl (int fd, int type, long arg)
-__NR_flock 143 sys_flock (int fd, unsigned long cmd)
-__NR_mkdir 39 sys_mkdir (const char *name, int mode)
-__NR_rmdir 40 sys_rmdir (const char *name)
-__NR_unlink 10 sys_unlink (char *pathname)
-__NR_readlinkat 296 sys_readlinkat (int fd, const char *path, char *buf, int bufsize)
-__NR_umask 60 sys_umask (int mask)
-__NR_getgroups 80 sys_getgroups (int gsize, unsigned int *groups)
-__NR_setgroups 81 sys_setgroups (int gsize, unsigned int *groups)
-__NR_setresuid 164 sys_setresuid (int uid, int euid, int suid)
-__NR_getresuid 165 sys_getresuid (int *uid, int *euid, int *suid)
-__NR_setresgid 169 sys_setresgid (int gid, int egid, int sgid)
-__NR_getresgid 170 sys_getresgid (int *gid, int *egid, int *sgid)
-__NR_getpgid 132 sys_getpgid (pid_t pid)
-__NR_setfsuid 138 sys_setfsuid (int fsuid)
-__NR_setfsgid 139 sys_setfsgid (int fsgid)
-__NR_getsid 147 sys_getsid (void)
-__NR_capget 183 sys_capget (struct cap_header *h, struct cap_data *d)
-__NR_capset 184 sys_capset (struct cap_header *h, struct cap_data *d)
-__NR_rt_sigqueueinfo 177 sys_rt_sigqueueinfo (pid_t pid, int sig, siginfo_t *info)
-__NR_sigaltstack 185 sys_sigaltstack (const void *uss, void *uoss)
-__NR_personality 136 sys_personality (unsigned int personality)
-__NR_setpriority 97 sys_setpriority (int which, int who, int nice)
-__NR_sched_setscheduler 156 sys_sched_setscheduler (int pid, int policy, struct sched_param *p)
-__NR_prctl 171 sys_prctl (int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5)
-__NR_setrlimit 75 sys_setrlimit (int resource, struct krlimit *rlim)
-__NR_mount 21 sys_mount (char *dev_nmae, char *dir_name, char *type, unsigned long flags, void *data)
-__NR_umount2 52 sys_umount2 (char *name, int flags)
-__NR_gettid 207 sys_gettid (void)
-__NR_futex 221 sys_futex (u32 *uaddr, int op, u32 val, struct timespec *utime, u32 *uaddr2, u32 val3)
-__NR_set_tid_address 232 sys_set_tid_address (int *tid_addr)
-__NR_restart_syscall 0 sys_restart_syscall (void)
-__NR_sys_timer_create 240 sys_timer_create (clockid_t which_clock, struct sigevent *timer_event_spec, kernel_timer_t *created_timer_id)
-__NR_sys_timer_settime 241 sys_timer_settime (kernel_timer_t timer_id, int flags, const struct itimerspec *new_setting, struct itimerspec *old_setting)
-__NR_sys_timer_gettime 242 sys_timer_gettime (int timer_id, const struct itimerspec *setting)
-__NR_sys_timer_getoverrun 243 sys_timer_getoverrun (int timer_id)
-__NR_sys_timer_delete 244 sys_timer_delete (kernel_timer_t timer_id)
-__NR_clock_gettime 246 sys_clock_gettime (const clockid_t which_clock, const struct timespec *tp)
-__NR_exit_group 234 sys_exit_group (int error_code)
-__NR_waitid 272 sys_waitid (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru)
-__NR_set_robust_list 300 sys_set_robust_list (struct robust_list_head *head, size_t len)
-__NR_get_robust_list 299 sys_get_robust_list (int pid, struct robust_list_head **head_ptr, size_t *len_ptr)
-__NR_vmsplice 285 sys_vmsplice (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags)
-__NR_openat 286 sys_openat (int dfd, const char *filename, int flags, int mode)
-__NR_timerfd_settime 311 sys_timerfd_settime (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr)
-__NR_signalfd4 313 sys_signalfd (int fd, k_rtsigset_t *mask, size_t sizemask, int flags)
-__NR_rt_tgsigqueueinfo 322 sys_rt_tgsigqueueinfo (pid_t tgid, pid_t pid, int sig, siginfo_t *info)
-__NR_fanotify_init 323 sys_fanotify_init (unsigned int flags, unsigned int event_f_flags)
-__NR_fanotify_mark 324 sys_fanotify_mark (int fanotify_fd, unsigned int flags, u64 mask, int dfd, const char *pathname)
-__NR_prlimit64 325 sys_prlimit64 (pid_t pid, unsigned int resource, const struct rlimit64 *new_rlim, struct rlimit64 *old_rlim)
-__NR_open_by_handle_at 346 sys_open_by_handle_at (int mountdirfd, struct file_handle *handle, int flags)
-__NR_setns 350 sys_setns (int fd, int nstype)
-__NR_kcmp 354 sys_kcmp (pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2)
-__NR_seccomp 358 sys_seccomp (unsigned int op, unsigned int flags, const char *uargs)
-__NR_memfd_create 360 sys_memfd_create (const char *name, unsigned int flags)
-__NR_io_setup 227 sys_io_setup (unsigned nr_events, aio_context_t *ctx_idp)
-__NR_io_getevents 229 sys_io_getevents (aio_context_t ctx_id, long min_nr, long nr, struct io_event *events, struct timespec *timeout)
-__NR_ipc 117 sys_ipc (unsigned int call, int first, unsigned long second, unsigned long third, const void *ptr, long fifth)
diff --git a/arch/ppc64/syscalls-ppc64.sh b/arch/ppc64/syscalls-ppc64.sh
deleted file mode 100644
index 871895efaf7a..000000000000
--- a/arch/ppc64/syscalls-ppc64.sh
+++ /dev/null
@@ -1,57 +0,0 @@
-#!/bin/sh
-
-gen_asm() {
- in=$1
- codesout=$2
- codesinc=`echo $2 | sed -e 's/.*include\///g'`
- protosout=$3
- asmout=$4
- asmcommon=`echo $5 | sed -e 's/.*include\///g'`
- prototypes=`echo $6 | sed -e 's/.*include\///g'`
-
- codesdef=`echo $codesout | sed -e 's/.*include\///g' | tr "[[:space:]].-" _`
- protosdef=`echo $protosout | sed -e 's/.*include\///g' | tr "[[:space:]].-" _`
-
- echo "/* Autogenerated, don't edit */" > $codesout
- echo "#ifndef $codesdef" >> $codesout
- echo "#define $codesdef" >> $codesout
-
- echo "/* Autogenerated, don't edit */" > $protosout
- echo "#ifndef $protosdef" >> $protosout
- echo "#define $protosdef" >> $protosout
- echo "#ifndef CR_NOGLIBC" >> $protosout
- echo "#error This file should only be used in the parasite code" >> $protosout
- echo "#endif" >> $protosout
- echo "#include \"$prototypes\"" >> $protosout
- echo "#include \"$codesinc\"" >> $protosout
-
- echo "/* Autogenerated, don't edit */" > $asmout
- echo "#include \"$codesinc\"" >> $asmout
- echo "#include \"$asmcommon\"" >> $asmout
-
- cat $in | egrep -v '^#' | sed -e 's/\t\{1,\}/|/g' | awk -F '|' '{print "#define", $1, $2}' >> $codesout
- cat $in | egrep -v '^#' | sed -e 's/\t\{1,\}/|/g' | awk -F '|' '{print "extern long ", $3, $4, ";"}' >> $protosout
- cat $in | egrep -v '^#' | sed -e 's/\t\{1,\}/|/g' | awk -F '|' '{print "SYSCALL(", $3, ",", $2, ")"}' >> $asmout
-
- echo "#endif /* $codesdef */" >> $codesout
- echo "#endif /* $protosdef */" >> $protosout
-}
-
-gen_exec() {
- in=$1
- codecout=$2
-
- echo "/* Autogenerated, don't edit */" > $codecout
-
- cat $in | egrep -v '^#' | sed -e 's/\t\{1,\}/|/g' | awk -F '|' '{print "SYSCALL(", substr($3, 5), ",", $2, ")"}' >> $codecout
-}
-
-if [ "$1" = "--asm" ]; then
- shift
- gen_asm $@
-fi
-
-if [ "$1" = "--exec" ]; then
- shift
- gen_exec $@
-fi
diff --git a/arch/ppc64/vdso-pie.c b/arch/ppc64/vdso-pie.c
deleted file mode 100644
index 30437d5cc686..000000000000
--- a/arch/ppc64/vdso-pie.c
+++ /dev/null
@@ -1,155 +0,0 @@
-#include <unistd.h>
-
-#include "asm/string.h"
-#include "asm/types.h"
-
-#include "syscall.h"
-#include "parasite-vdso.h"
-#include "log.h"
-#include "bug.h"
-
-#ifdef LOG_PREFIX
-# undef LOG_PREFIX
-#endif
-#define LOG_PREFIX "vdso: "
-
-/* This symbols are defined in vdso-trampoline.S */
-extern char *vdso_trampoline, *vdso_trampoline_end;
-
-static inline void invalidate_caches(unsigned long at)
-{
- asm volatile("isync \n" \
- "li 3,0 \n" \
- "dcbf 3,%0 \n" \
- "sync \n" \
- "icbi 3,%0 \n" \
- "isync \n" \
- : /* no output */ \
- : "r"(at) \
- :"memory", "r3");
-}
-
-/* This is the size of the trampoline call :
- * mlfr r0
- * bl trampoline
- * <64 bit address>
- */
-#define TRAMP_CALL_SIZE (2*sizeof(uint32_t) + sizeof(uint64_t))
-
-/*
- * put_trampoline does 2 things :
- *
- * 1. it looks for a place in the checkpointed vDSO where to put the
- * trampoline code (see vdso-trampoline.S).
- *
- * 2. for each symbol from the checkpointed vDSO, it checks that there are
- * enough place to put the call to the vDSO trampoline (see
- * TRAMP_CALL_SIZE's comment above).
- * This done by checking that there is no interesting symbols in the range
- * of current one's offset -> (current one's offset + TRAMP_CALL_SIZE).
- * Unfortunately the symbols are not sorted by address so we have to look
- * for the complete table all the time. Since the vDSO is small, this is
- * not a big issue.
- */
-static unsigned long put_trampoline(unsigned long at, struct vdso_symtable *sym)
-{
- int i,j;
- unsigned long size;
- unsigned long trampoline = 0;
-
- /* First of all we have to find a place where to put the trampoline
- * code.
- */
- size = (unsigned long)&vdso_trampoline_end
- - (unsigned long)&vdso_trampoline;
-
- for (i = 0; i < ARRAY_SIZE(sym->symbols); i++) {
- if (vdso_symbol_empty(&sym->symbols[i]))
- continue;
-
- pr_debug("Checking '%s' at %lx\n", sym->symbols[i].name,
- sym->symbols[i].offset);
-
- /* find the nearest followin symbol we are interested in */
- for (j=0; j < ARRAY_SIZE(sym->symbols); j++) {
- if (i==j || vdso_symbol_empty(&sym->symbols[j]))
- continue;
-
- if (sym->symbols[j].offset <= sym->symbols[i].offset)
- /* this symbol is above the current one */
- continue;
-
- if ((sym->symbols[i].offset+TRAMP_CALL_SIZE) >
- sym->symbols[j].offset) {
- /* we have a major issue here since we cannot
- * even put the trampoline call for this symbol
- */
- pr_err("Can't handle small vDSO symbol %s\n",
- sym->symbols[i].name);
- return 0;
- }
-
- if (trampoline)
- /* no need to put it twice */
- continue;
-
- if ((sym->symbols[j].offset -
- (sym->symbols[i].offset+TRAMP_CALL_SIZE)) <= size)
- /* not enough place */
- continue;
-
- /* We can put the trampoline there */
- trampoline = at + sym->symbols[i].offset;
- trampoline += TRAMP_CALL_SIZE;
-
- pr_debug("Puting vDSO trampoline in %s at %lx\n",
- sym->symbols[i].name, trampoline);
- builtin_memcpy((void *)trampoline, &vdso_trampoline,
- size);
- invalidate_caches(trampoline);
- }
- }
-
- return trampoline;
-}
-
-static inline void put_trampoline_call(unsigned long at, unsigned long to,
- unsigned long tr)
-{
- uint32_t *addr = (uint32_t *)at;;
-
- *addr++ = 0x7C0802a6; /* mflr r0 */
- *addr++ = 0x48000001 | ((long)(tr-at-4) & 0x3fffffc); /* bl tr */
- *(uint64_t *)addr = to; /* the address to read by the trampoline */
-
- invalidate_caches(at);
-}
-
-int vdso_redirect_calls(unsigned long base_to,
- unsigned long base_from,
- struct vdso_symtable *to,
- struct vdso_symtable *from)
-{
- unsigned int i;
- unsigned long trampoline;
-
- trampoline = (unsigned long)put_trampoline(base_from, from);
- if (!trampoline)
- return 1;
-
- for (i = 0; i < ARRAY_SIZE(to->symbols); i++) {
- if (vdso_symbol_empty(&from->symbols[i]))
- continue;
-
- pr_debug("br: %lx/%lx -> %lx/%lx (index %d) '%s'\n",
- base_from, from->symbols[i].offset,
- base_to, to->symbols[i].offset, i,
- from->symbols[i].name);
-
- put_trampoline_call(base_from + from->symbols[i].offset,
- base_to + to->symbols[i].offset,
- trampoline);
- }
-
- return 0;
-}
diff --git a/arch/ppc64/vdso-trampoline.S b/arch/ppc64/vdso-trampoline.S
deleted file mode 100644
index e910e7ab99a4..000000000000
--- a/arch/ppc64/vdso-trampoline.S
+++ /dev/null
@@ -1,11 +0,0 @@
-#include "asm/linkage.h"
-
- .section .text
-
-GLOBAL(vdso_trampoline)
- mflr r12 /* r12 vdso_ptr's address */
- mtlr r0 /* restore lr */
- ld r12,0(r12) /* read value store in vdso_ptr */
- mtctr r12 /* branch to it */
- bctr
-GLOBAL(vdso_trampoline_end)
diff --git a/arch/scripts/arm/gen-sys-exec-tbl.pl b/arch/scripts/arm/gen-sys-exec-tbl.pl
deleted file mode 100755
index a3037b78c34e..000000000000
--- a/arch/scripts/arm/gen-sys-exec-tbl.pl
+++ /dev/null
@@ -1,39 +0,0 @@
-#!/usr/bin/perl
-
-use strict;
-use warnings;
-
-my $in = $ARGV[0];
-my $tblout = $ARGV[1];
-my $bits = $ARGV[2];
-
-my $code = "code$bits";
-
-open TBLOUT, ">", $tblout or die $!;
-open IN, "<", $in or die $!;
-
-print TBLOUT "/* Autogenerated, don't edit */\n";
-
-for (<IN>) {
- if ($_ =~ /\#/) {
- next;
- }
-
- my $sys_name;
- my $sys_num;
-
- if (/(?<name>\S+)\s+(?<alias>\S+)\s+(?<code64>\d+|\!)\s+(?<code32>(?:\d+|\!))\s+\((?<args>.+)\)/) {
- $sys_name = $+{alias};
- } elsif (/(?<name>\S+)\s+(?<code64>\d+|\!)\s+(?<code32>(?:\d+|\!))\s+\((?<args>.+)\)/) {
- $sys_name = $+{name};
- } else {
- unlink $tblout;
- die "Invalid syscall definition file: invalid entry $_\n";
- }
-
- $sys_num = $+{$code};
-
- if ($sys_num ne "!") {
- print TBLOUT "SYSCALL($sys_name, $sys_num)\n";
- }
-}
diff --git a/arch/scripts/arm/gen-syscalls.pl b/arch/scripts/arm/gen-syscalls.pl
deleted file mode 100755
index 6fb8f3bf2071..000000000000
--- a/arch/scripts/arm/gen-syscalls.pl
+++ /dev/null
@@ -1,95 +0,0 @@
-#!/usr/bin/perl
-
-use strict;
-use warnings;
-
-my $in = $ARGV[0];
-my $codesout = $ARGV[1];
-my $codes = $ARGV[1];
-$codes =~ s/.*include\///g;
-my $protosout = $ARGV[2];
-my $protos = $ARGV[2];
-$protos =~ s/.*include\///g;
-my $asmout = $ARGV[3];
-my $asmcommon = $ARGV[4];
-my $prototypes = $ARGV[5];
-$prototypes =~ s/.*include\///g;
-my $bits = $ARGV[6];
-
-my $codesdef = $codes;
-$codesdef =~ tr/.-/_/;
-my $protosdef = $protos;
-$protosdef =~ tr/.-/_/;
-my $code = "code$bits";
-my $need_aux = 0;
-
-unlink $codesout;
-unlink $protosout;
-unlink $asmout;
-
-open CODESOUT, ">", $codesout or die $!;
-open PROTOSOUT, ">", $protosout or die $!;
-open ASMOUT, ">", $asmout or die $!;
-open IN, "<", $in or die $!;
-
-print CODESOUT <<"END";
-/* Autogenerated, don't edit */
-#ifndef $codesdef
-#define $codesdef
-END
-
-print PROTOSOUT <<"END";
-/* Autogenerated, don't edit */
-#ifndef $protosdef
-#define $protosdef
-#include "$prototypes"
-#include "$codes"
-END
-
-print ASMOUT <<"END";
-/* Autogenerated, don't edit */
-#include "$codes"
-#include "$asmcommon"
-END
-
-
-for (<IN>) {
- if ($_ =~ /\#/) {
- next;
- }
-
- my $code_macro;
- my $sys_name;
-
- if (/(?<name>\S+)\s+(?<alias>\S+)\s+(?<code64>\d+|\!)\s+(?<code32>(?:\d+|\!))\s+\((?<args>.+)\)/) {
- $code_macro = "__NR_$+{name}";
- $sys_name = "sys_$+{alias}";
- } elsif (/(?<name>\S+)\s+(?<code64>\d+|\!)\s+(?<code32>(?:\d+|\!))\s+\((?<args>.+)\)/) {
- $code_macro = "__NR_$+{name}";
- $sys_name = "sys_$+{name}";
- } else {
- unlink $codesout;
- unlink $protosout;
- unlink $asmout;
-
- die "Invalid syscall definition file: invalid entry $_\n";
- }
-
- if ($+{$code} ne "!") {
- print CODESOUT "#define $code_macro $+{$code}\n";
- print ASMOUT "syscall $sys_name, $code_macro\n";
-
- } else {
- $need_aux = 1;
- }
-
- print PROTOSOUT "extern long $sys_name($+{args});\n";
-}
-
-if ($need_aux == 1) {
- print ASMOUT "#include \"asm/syscall-aux.S\"\n";
- print CODESOUT "#include \"asm/syscall-aux.h\"\n";
-}
-
-print CODESOUT "#endif /* $codesdef */";
-print PROTOSOUT "#endif /* $protosdef */";
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
deleted file mode 100644
index 2304915c7cc4..000000000000
--- a/arch/x86/Makefile
+++ /dev/null
@@ -1,91 +0,0 @@
-targets += syscalls
-targets += crtools
-
-SYS-ASM := syscalls.S
-
-syscalls-asm-y += $(SYS-ASM:.S=).o
-crtools-obj-y += crtools.o
-crtools-obj-y += cpu.o
-crtools-obj-y += prlimit.o
-
-ifeq ($(ARCH),x86)
-SYS-DEF := syscall_64.tbl
-SYS-ASM-COMMON := syscall-common-x86-64.S
-else
-SYS-DEF := syscall_32.tbl
-SYS-ASM-COMMON := syscall-common-x86-32.S
-endif
-
-SYS-TYPES := ../../include/syscall-types.h
-SYS-CODES := ../../include/syscall-codes.h
-SYS-PROTO := ../../include/syscall.h
-
-SYS-EXEC-TBL := sys-exec-tbl.c
-
-ifeq ($(ARCH),x86)
-syscalls-asm-y-asmflags := -fpie -Wstrict-prototypes -Wa,--noexecstack
-else
-syscalls-asm-y-asmflags := -fno-pic -Wstrict-prototypes -Wa,--noexecstack
-endif
-syscalls-asm-y-asmflags += -nostdlib -fomit-frame-pointer -I$(obj)
-
-ifneq ($(ARCH),x86)
-syscalls-obj-y += syscalls/syscall32.o
-$(obj)/syscalls/syscall32.o: $(obj)/$(SYS-CODES) $(obj)/$(SYS-PROTO)
-endif
-cleanup-y += $(obj)/syscalls/*.o $(obj)/syscalls/*.d
-
-ASMFLAGS += -D__ASSEMBLY__
-
-$(obj)/$(SYS-CODES): $(obj)/syscalls/$(SYS-DEF)
- $(E) " GEN " $@
- $(Q) echo "/* Autogenerated, don't edit */" > $@
- $(Q) echo "#ifndef __ASM_CR_SYSCALL_CODES_H__" >> $@
- $(Q) echo "#define __ASM_CR_SYSCALL_CODES_H__" >> $@
- $(Q) cat $< | awk '/^__NR/{print "#define", $$1, $$2}' >> $@
- $(Q) echo "#endif /* __ASM_CR_SYSCALL_CODES_H__ */" >> $@
-_all += $(obj)/$(SYS-CODES)
-cleanup-y += $(obj)/$(SYS-CODES)
-
-$(obj)/$(SYS-PROTO): $(obj)/syscalls/$(SYS-DEF)
- $(E) " GEN " $@
- $(Q) echo "/* Autogenerated, don't edit */" > $@
- $(Q) echo "#ifndef __ASM_CR_SYSCALL_PROTO_H__" >> $@
- $(Q) echo "#define __ASM_CR_SYSCALL_PROTO_H__" >> $@
- $(Q) echo "#ifndef CR_NOGLIBC" >> $@
- $(Q) echo "#error This file should only be used in the parasite code" >> $@
- $(Q) echo "#endif" >> $@
- $(Q) echo "#include \"syscall-codes.h\"" >> $@
- $(Q) echo "#include \"syscall-types.h\"" >> $@
-ifneq ($(ARCH),x86)
- $(Q) echo "#include \"asm/syscall32.h\"" >> $@
-endif
- $(Q) cat $< | awk '/^__NR/{print "extern long", $$3, substr($$0, index($$0,$$4)), ";"}' >> $@
- $(Q) echo "#endif /* __ASM_CR_SYSCALL_PROTO_H__ */" >> $@
-_all += $(obj)/$(SYS-PROTO)
-cleanup-y += $(obj)/$(SYS-PROTO)
-
-$(obj)/$(SYS-ASM): $(obj)/syscalls/$(SYS-DEF) $(obj)/syscalls/$(SYS-ASM-COMMON) $(obj)/$(SYS-CODES) $(obj)/$(SYS-PROTO)
- $(E) " GEN " $@
- $(Q) echo "/* Autogenerated, don't edit */" > $@
- $(Q) echo "#include \"syscall-codes.h\"" >> $@
- $(Q) echo "#include \"syscalls/$(SYS-ASM-COMMON)\"" >> $@
- $(Q) cat $< | awk '/^__NR/{print "SYSCALL(", $$3, ",", $$2, ")"}' >> $@
-_all += $(obj)/$(SYS-ASM)
-cleanup-y += $(obj)/$(SYS-ASM)
-
-$(obj)/syscalls.o: $(obj)/$(SYS-ASM)
-
-$(obj)/$(SYS-EXEC-TBL): $(obj)/syscalls/$(SYS-DEF) $(obj)/$(SYS-CODES) $(obj)/$(SYS-PROTO)
- $(E) " GEN " $@
- $(Q) echo "/* Autogenerated, don't edit */" > $@
- $(Q) cat $< | awk '/^__NR/{print "SYSCALL(", substr($$3, 5), ",", $$2, ")"}' >> $@
-_all += $(obj)/$(SYS-EXEC-TBL)
-cleanup-y += $(obj)/$(SYS-EXEC-TBL)
-
-$(obj)/crtools.built-in.o: | $(obj)/$(SYS-CODES) $(obj)/$(SYS-PROTO)
-
-ifneq ($(MAKECMDGOALS),clean)
-deps-after := $(obj)/$(SYS-ASM)
-incdeps := y
-endif
diff --git a/arch/x86/cpu.c b/arch/x86/cpu.c
deleted file mode 100644
index d703e68e3621..000000000000
--- a/arch/x86/cpu.c
+++ /dev/null
@@ -1,491 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <errno.h>
-#include <string.h>
-
-#include <sys/types.h>
-
-#include "asm/bitops.h"
-#include "asm/types.h"
-#include "asm/cpu.h"
-#include "asm/fpu.h"
-
-#include "compiler.h"
-
-#include "cr_options.h"
-#include "proc_parse.h"
-#include "util.h"
-#include "log.h"
-
-#include "cpu.h"
-
-#include "protobuf.h"
-#include "protobuf/cpuinfo.pb-c.h"
-
-#undef LOG_PREFIX
-#define LOG_PREFIX "cpu: "
-
-static struct cpuinfo_x86 rt_cpu_info;
-
-static void set_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature)
-{
- if (likely(feature < NCAPINTS_BITS))
- set_bit(feature, (unsigned long *)c->x86_capability);
-}
-
-static void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature)
-{
- if (likely(feature < NCAPINTS_BITS))
- clear_bit(feature, (unsigned long *)c->x86_capability);
-}
-
-static int test_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature)
-{
- if (likely(feature < NCAPINTS_BITS))
- return test_bit(feature, (unsigned long *)c->x86_capability);
- return 0;
-}
-
-bool cpu_has_feature(unsigned int feature)
-{
- return test_cpu_cap(&rt_cpu_info, feature);
-}
-
-static int cpu_init_cpuid(struct cpuinfo_x86 *c)
-{
- /*
- * See cpu_detect() in the kernel, also
- * read cpuid specs not only from general
- * SDM but for extended instructions set
- * reference.
- */
-
- /* Get vendor name */
- cpuid(0x00000000,
- (unsigned int *)&c->cpuid_level,
- (unsigned int *)&c->x86_vendor_id[0],
- (unsigned int *)&c->x86_vendor_id[8],
- (unsigned int *)&c->x86_vendor_id[4]);
-
- if (!strcmp(c->x86_vendor_id, "GenuineIntel")) {
- c->x86_vendor = X86_VENDOR_INTEL;
- } else if (!strcmp(c->x86_vendor_id, "AuthenticAMD")) {
- c->x86_vendor = X86_VENDOR_AMD;
- } else {
- pr_err("Unsupported CPU vendor %s\n",
- c->x86_vendor_id);
- return -1;
- }
-
- c->x86_family = 4;
-
- /* Intel-defined flags: level 0x00000001 */
- if (c->cpuid_level >= 0x00000001) {
- u32 eax, ebx, ecx, edx;
-
- cpuid(0x00000001, &eax, &ebx, &ecx, &edx);
- c->x86_family = (eax >> 8) & 0xf;
- c->x86_model = (eax >> 4) & 0xf;
- c->x86_mask = eax & 0xf;
-
- if (c->x86_family == 0xf)
- c->x86_family += (eax >> 20) & 0xff;
- if (c->x86_family >= 0x6)
- c->x86_model += ((eax >> 16) & 0xf) << 4;
-
- c->x86_capability[0] = edx;
- c->x86_capability[4] = ecx;
- }
-
- /* Additional Intel-defined flags: level 0x00000007 */
- if (c->cpuid_level >= 0x00000007) {
- u32 eax, ebx, ecx, edx;
-
- cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);
- c->x86_capability[9] = ebx;
- c->x86_capability[11] = ecx;
- }
-
- /* Extended state features: level 0x0000000d */
- if (c->cpuid_level >= 0x0000000d) {
- u32 eax, ebx, ecx, edx;
-
- cpuid_count(0x0000000d, 1, &eax, &ebx, &ecx, &edx);
- c->x86_capability[10] = eax;
- }
-
- /* AMD-defined flags: level 0x80000001 */
- c->extended_cpuid_level = cpuid_eax(0x80000000);
-
- if ((c->extended_cpuid_level & 0xffff0000) == 0x80000000) {
- if (c->extended_cpuid_level >= 0x80000001) {
- c->x86_capability[1] = cpuid_edx(0x80000001);
- c->x86_capability[6] = cpuid_ecx(0x80000001);
- }
- }
-
- /*
- * We're don't care about scattered features for now,
- * otherwise look into init_scattered_cpuid_features()
- * in kernel.
- */
-
- if (c->extended_cpuid_level >= 0x80000004) {
- unsigned int *v;
- char *p, *q;
- v = (unsigned int *)c->x86_model_id;
- cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
- cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
- cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
- c->x86_model_id[48] = 0;
-
- /*
- * Intel chips right-justify this string for some dumb reason;
- * undo that brain damage:
- */
- p = q = &c->x86_model_id[0];
- while (*p == ' ')
- p++;
- if (p != q) {
- while (*p)
- *q++ = *p++;
- while (q <= &c->x86_model_id[48])
- *q++ = '\0'; /* Zero-pad the rest */
- }
- }
-
- /* On x86-64 NOP is always present */
- set_cpu_cap(c, X86_FEATURE_NOPL);
-
- switch (c->x86_vendor) {
- case X86_VENDOR_INTEL:
- /*
- * Strictly speaking we need to read MSR_IA32_MISC_ENABLE
- * here but on ring3 it's impossible.
- */
- if (c->x86_family == 15) {
- clear_cpu_cap(c, X86_FEATURE_REP_GOOD);
- clear_cpu_cap(c, X86_FEATURE_ERMS);
- } else if (c->x86_family == 6) {
- /* On x86-64 rep is fine */
- set_cpu_cap(c, X86_FEATURE_REP_GOOD);
- }
-
- /* See filter_cpuid_features in kernel */
- if ((s32)c->cpuid_level < (s32)0x0000000d)
- clear_cpu_cap(c, X86_FEATURE_XSAVE);
- break;
- case X86_VENDOR_AMD:
- /*
- * Bit 31 in normal CPUID used for nonstandard 3DNow ID;
- * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
- */
- clear_cpu_cap(c, 0 * 32 + 31);
- if (c->x86_family >= 0x10)
- set_cpu_cap(c, X86_FEATURE_REP_GOOD);
- if (c->x86_family == 0xf) {
- u32 level;
-
- /* On C+ stepping K8 rep microcode works well for copy/memset */
- level = cpuid_eax(1);
- if ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
- set_cpu_cap(c, X86_FEATURE_REP_GOOD);
- }
- break;
- }
-
- return 0;
-}
-
-int cpu_init(void)
-{
- if (cpu_init_cpuid(&rt_cpu_info))
- return -1;
-
- BUILD_BUG_ON(sizeof(struct xsave_struct) != XSAVE_SIZE);
- BUILD_BUG_ON(sizeof(struct i387_fxsave_struct) != FXSAVE_SIZE);
-
- /*
- * Make sure that at least FPU is onboard
- * and fxsave is supported.
- */
- if (cpu_has_feature(X86_FEATURE_FPU)) {
- if (!cpu_has_feature(X86_FEATURE_FXSR)) {
- pr_err("missing support fxsave/restore insns\n");
- return -1;
- }
- }
-
- pr_debug("fpu:%d fxsr:%d xsave:%d\n",
- !!cpu_has_feature(X86_FEATURE_FPU),
- !!cpu_has_feature(X86_FEATURE_FXSR),
- !!cpu_has_feature(X86_FEATURE_XSAVE));
-
- return 0;
-}
-
-int cpu_dump_cpuinfo(void)
-{
- CpuinfoEntry cpu_info = CPUINFO_ENTRY__INIT;
- CpuinfoX86Entry cpu_x86_info = CPUINFO_X86_ENTRY__INIT;
- CpuinfoX86Entry *cpu_x86_info_ptr = &cpu_x86_info;
- struct cr_img *img;
-
- img = open_image(CR_FD_CPUINFO, O_DUMP);
- if (!img)
- return -1;
-
- cpu_info.x86_entry = &cpu_x86_info_ptr;
- cpu_info.n_x86_entry = 1;
-
- cpu_x86_info.vendor_id = (rt_cpu_info.x86_vendor == X86_VENDOR_INTEL) ?
- CPUINFO_X86_ENTRY__VENDOR__INTEL :
- CPUINFO_X86_ENTRY__VENDOR__AMD;
- cpu_x86_info.cpu_family = rt_cpu_info.x86_family;
- cpu_x86_info.model = rt_cpu_info.x86_model;
- cpu_x86_info.stepping = rt_cpu_info.x86_mask;
- cpu_x86_info.capability_ver = 1;
- cpu_x86_info.n_capability = ARRAY_SIZE(rt_cpu_info.x86_capability);
- cpu_x86_info.capability = (void *)rt_cpu_info.x86_capability;
-
- if (rt_cpu_info.x86_model_id[0])
- cpu_x86_info.model_id = rt_cpu_info.x86_model_id;
-
- if (pb_write_one(img, &cpu_info, PB_CPUINFO) < 0) {
- close_image(img);
- return -1;
- }
-
- close_image(img);
- return 0;
-}
-
-#define __ins_bit(__l, __v) (1u << ((__v) - 32u * (__l)))
-
-static u32 x86_ins_capability_mask[NCAPINTS] = {
- [0] =
- __ins_bit(0, X86_FEATURE_FPU) |
- __ins_bit(0, X86_FEATURE_TSC) |
- __ins_bit(0, X86_FEATURE_CX8) |
- __ins_bit(0, X86_FEATURE_SEP) |
- __ins_bit(0, X86_FEATURE_CMOV) |
- __ins_bit(0, X86_FEATURE_CLFLUSH) |
- __ins_bit(0, X86_FEATURE_MMX) |
- __ins_bit(0, X86_FEATURE_FXSR) |
- __ins_bit(0, X86_FEATURE_XMM) |
- __ins_bit(0, X86_FEATURE_XMM2),
-
- [1] =
- __ins_bit(1, X86_FEATURE_SYSCALL) |
- __ins_bit(1, X86_FEATURE_MMXEXT) |
- __ins_bit(1, X86_FEATURE_RDTSCP) |
- __ins_bit(1, X86_FEATURE_3DNOWEXT) |
- __ins_bit(1, X86_FEATURE_3DNOW),
-
- [3] =
- __ins_bit(3, X86_FEATURE_REP_GOOD) |
- __ins_bit(3, X86_FEATURE_NOPL),
-
- [4] =
- __ins_bit(4, X86_FEATURE_XMM3) |
- __ins_bit(4, X86_FEATURE_PCLMULQDQ) |
- __ins_bit(4, X86_FEATURE_MWAIT) |
- __ins_bit(4, X86_FEATURE_SSSE3) |
- __ins_bit(4, X86_FEATURE_CX16) |
- __ins_bit(4, X86_FEATURE_XMM4_1) |
- __ins_bit(4, X86_FEATURE_XMM4_2) |
- __ins_bit(4, X86_FEATURE_MOVBE) |
- __ins_bit(4, X86_FEATURE_POPCNT) |
- __ins_bit(4, X86_FEATURE_AES) |
- __ins_bit(4, X86_FEATURE_XSAVE) |
- __ins_bit(4, X86_FEATURE_OSXSAVE) |
- __ins_bit(4, X86_FEATURE_AVX) |
- __ins_bit(4, X86_FEATURE_F16C) |
- __ins_bit(4, X86_FEATURE_RDRAND),
-
- [6] =
- __ins_bit(6, X86_FEATURE_ABM) |
- __ins_bit(6, X86_FEATURE_SSE4A) |
- __ins_bit(6, X86_FEATURE_MISALIGNSSE) |
- __ins_bit(6, X86_FEATURE_3DNOWPREFETCH) |
- __ins_bit(6, X86_FEATURE_XOP) |
- __ins_bit(6, X86_FEATURE_FMA4) |
- __ins_bit(6, X86_FEATURE_TBM),
-
- [9] =
- __ins_bit(9, X86_FEATURE_FSGSBASE) |
- __ins_bit(9, X86_FEATURE_BMI1) |
- __ins_bit(9, X86_FEATURE_HLE) |
- __ins_bit(9, X86_FEATURE_AVX2) |
- __ins_bit(9, X86_FEATURE_BMI2) |
- __ins_bit(9, X86_FEATURE_ERMS) |
- __ins_bit(9, X86_FEATURE_RTM) |
- __ins_bit(9, X86_FEATURE_MPX) |
- __ins_bit(9, X86_FEATURE_AVX512F) |
- __ins_bit(9, X86_FEATURE_AVX512DQ) |
- __ins_bit(9, X86_FEATURE_RDSEED) |
- __ins_bit(9, X86_FEATURE_ADX) |
- __ins_bit(9, X86_FEATURE_CLFLUSHOPT) |
- __ins_bit(9, X86_FEATURE_AVX512PF) |
- __ins_bit(9, X86_FEATURE_AVX512ER) |
- __ins_bit(9, X86_FEATURE_AVX512CD) |
- __ins_bit(9, X86_FEATURE_SHA) |
- __ins_bit(9, X86_FEATURE_AVX512BW) |
- __ins_bit(9, X86_FEATURE_AVXVL),
-
- [10] =
- __ins_bit(10, X86_FEATURE_XSAVEOPT) |
- __ins_bit(10, X86_FEATURE_XSAVEC) |
- __ins_bit(10, X86_FEATURE_XGETBV1) |
- __ins_bit(10, X86_FEATURE_XSAVES),
-
- [11] =
- __ins_bit(11, X86_FEATURE_PREFETCHWT1),
-};
-
-#undef __ins_bit
-
-static int cpu_validate_ins_features(CpuinfoX86Entry *img_x86_entry)
-{
- size_t i;
-
- for (i = 0; i < ARRAY_SIZE(rt_cpu_info.x86_capability); i++) {
- u32 s = img_x86_entry->capability[i] & x86_ins_capability_mask[i];
- u32 d = rt_cpu_info.x86_capability[i] & x86_ins_capability_mask[i];
-
- /*
- * Destination might be more feature rich
- * but not the reverse.
- */
- if (s & ~d) {
- pr_err("CPU instruction capabilities do not match run time\n");
- return -1;
- }
- }
-
- return 0;
-}
-
-static int cpu_validate_features(CpuinfoX86Entry *img_x86_entry)
-{
- if (img_x86_entry->n_capability != ARRAY_SIZE(rt_cpu_info.x86_capability)) {
- /*
- * Image carries different number of bits.
- * Simply reject, we can't guarantee anything
- * in such case.
- */
- pr_err("Size of features in image mismatch "
- "one provided by run time CPU (%d:%d)\n",
- (unsigned)img_x86_entry->n_capability,
- (unsigned)ARRAY_SIZE(rt_cpu_info.x86_capability));
- return -1;
- }
-
- if (opts.cpu_cap == CPU_CAP_FPU) {
- /*
- * If we're requested to check FPU only ignore
- * any other bit. It's up to a user if the
- * rest of mismatches won't cause problems.
- */
-
-#define __mismatch_fpu_bit(__bit) \
- (test_bit(__bit, (void *)img_x86_entry->capability) && \
- !cpu_has_feature(__bit))
- if (__mismatch_fpu_bit(X86_FEATURE_FPU) ||
- __mismatch_fpu_bit(X86_FEATURE_FXSR) ||
- __mismatch_fpu_bit(X86_FEATURE_XSAVE)) {
- pr_err("FPU feature required by image "
- "is not supported on host.\n");
- return -1;
- } else
- return 0;
-#undef __mismatch_fpu_bit
- }
-
- /*
- * Capability on instructions level only.
- */
- if (opts.cpu_cap == CPU_CAP_INS)
- return cpu_validate_ins_features(img_x86_entry);
-
- /*
- * Strict capability mode. Everything must match.
- */
- if (memcmp(img_x86_entry->capability, rt_cpu_info.x86_capability,
- sizeof(rt_cpu_info.x86_capability))) {
- pr_err("CPU capabilites do not match run time\n");
- return -1;
- }
-
- return 0;
-}
-
-int cpu_validate_cpuinfo(void)
-{
- CpuinfoX86Entry *img_x86_entry;
- CpuinfoEntry *img_cpu_info;
- struct cr_img *img;
- int ret = -1;
-
- img = open_image(CR_FD_CPUINFO, O_RSTR);
- if (!img)
- return -1;
-
- if (pb_read_one(img, &img_cpu_info, PB_CPUINFO) < 0)
- goto err;
-
- if (img_cpu_info->n_x86_entry != 1) {
- pr_err("No x86 related cpuinfo in image, "
- "corruption (n_x86_entry = %zi)\n",
- img_cpu_info->n_x86_entry);
- goto err;
- }
-
- img_x86_entry = img_cpu_info->x86_entry[0];
- if (img_x86_entry->vendor_id != CPUINFO_X86_ENTRY__VENDOR__INTEL &&
- img_x86_entry->vendor_id != CPUINFO_X86_ENTRY__VENDOR__AMD) {
- pr_err("Unknown cpu vendor %d\n", img_x86_entry->vendor_id);
- goto err;
- }
-
- if (img_x86_entry->n_capability != ARRAY_SIZE(rt_cpu_info.x86_capability)) {
- pr_err("Image carries %u words while %u expected\n",
- (unsigned)img_x86_entry->n_capability,
- (unsigned)ARRAY_SIZE(rt_cpu_info.x86_capability));
- goto err;
- }
-
- ret = cpu_validate_features(img_x86_entry);
-err:
- close_image(img);
- return ret;
-}
-
-int cpuinfo_dump(void)
-{
- if (cpu_init())
- return -1;
- if (cpu_dump_cpuinfo())
- return -1;
- return 0;
-}
-
-int cpuinfo_check(void)
-{
- if (cpu_init())
- return 1;
-
- /*
- * Force to check all caps if empty passed,
- * still allow to check instructions only
- * and etc.
- */
- if (!opts.cpu_cap)
- opts.cpu_cap = CPU_CAP_ALL;
-
- if (cpu_validate_cpuinfo())
- return 1;
-
- return 0;
-}
diff --git a/arch/x86/crtools.c b/arch/x86/crtools.c
deleted file mode 100644
index f713b0d3fd40..000000000000
--- a/arch/x86/crtools.c
+++ /dev/null
@@ -1,572 +0,0 @@
-#include <string.h>
-#include <unistd.h>
-#include <elf.h>
-#include <sys/user.h>
-#include <sys/mman.h>
-
-#include "asm/processor-flags.h"
-#include "asm/restorer.h"
-#include "asm/types.h"
-#include "asm/fpu.h"
-
-#include "cr_options.h"
-#include "compiler.h"
-#include "ptrace.h"
-#include "parasite-syscall.h"
-#include "restorer.h"
-#include "log.h"
-#include "util.h"
-#include "cpu.h"
-#include "errno.h"
-
-#include "protobuf.h"
-#include "protobuf/core.pb-c.h"
-#include "protobuf/creds.pb-c.h"
-
-/*
- * Injected syscall instruction
- */
-const char code_syscall[] = {
- 0x0f, 0x05, /* syscall */
- 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc /* int 3, ... */
-};
-
-const int code_syscall_size = round_up(sizeof(code_syscall), sizeof(long));
-
-static inline __always_unused void __check_code_syscall(void)
-{
- BUILD_BUG_ON(sizeof(code_syscall) != BUILTIN_SYSCALL_SIZE);
- BUILD_BUG_ON(!is_log2(sizeof(code_syscall)));
-}
-
-void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs)
-{
- regs->ip = new_ip;
- if (stack)
- regs->sp = (unsigned long) stack;
-
- /* Avoid end of syscall processing */
- regs->orig_ax = -1;
-
- /* Make sure flags are in known state */
- regs->flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_DF | X86_EFLAGS_IF);
-}
-
-static int task_in_compat_mode(pid_t pid)
-{
- unsigned long cs, ds;
-
- errno = 0;
- cs = ptrace(PTRACE_PEEKUSER, pid, offsetof(user_regs_struct_t, cs), 0);
- if (errno != 0) {
- pr_perror("Can't get CS register for %d", pid);
- return -1;
- }
-
- errno = 0;
- ds = ptrace(PTRACE_PEEKUSER, pid, offsetof(user_regs_struct_t, ds), 0);
- if (errno != 0) {
- pr_perror("Can't get DS register for %d", pid);
- return -1;
- }
-
- /* It's x86-32 or x32 */
- return cs != 0x33 || ds == 0x2b;
-}
-
-bool arch_can_dump_task(pid_t pid)
-{
- if (task_in_compat_mode(pid)) {
- pr_err("Can't dump task %d running in 32-bit mode\n", pid);
- return false;
- }
-
- return true;
-}
-
-int syscall_seized(struct parasite_ctl *ctl, int nr, unsigned long *ret,
- unsigned long arg1,
- unsigned long arg2,
- unsigned long arg3,
- unsigned long arg4,
- unsigned long arg5,
- unsigned long arg6)
-{
- user_regs_struct_t regs = ctl->orig.regs;
- int err;
-
- regs.ax = (unsigned long)nr;
- regs.di = arg1;
- regs.si = arg2;
- regs.dx = arg3;
- regs.r10 = arg4;
- regs.r8 = arg5;
- regs.r9 = arg6;
-
- err = __parasite_execute_syscall(ctl, &regs);
-
- *ret = regs.ax;
- return err;
-}
-
-int get_task_regs(pid_t pid, user_regs_struct_t regs, CoreEntry *core)
-{
- struct xsave_struct xsave = { };
-
- struct iovec iov;
- int ret = -1;
-
- pr_info("Dumping GP/FPU registers for %d\n", pid);
-
- /* Did we come from a system call? */
- if ((int)regs.orig_ax >= 0) {
- /* Restart the system call */
- switch ((long)(int)regs.ax) {
- case -ERESTARTNOHAND:
- case -ERESTARTSYS:
- case -ERESTARTNOINTR:
- regs.ax = regs.orig_ax;
- regs.ip -= 2;
- break;
- case -ERESTART_RESTARTBLOCK:
- pr_warn("Will restore %d with interrupted system call\n", pid);
- regs.ax = -EINTR;
- break;
- }
- }
-
-#define assign_reg(dst, src, e) do { dst->e = (__typeof__(dst->e))src.e; } while (0)
-#define assign_array(dst, src, e) memcpy(dst->e, &src.e, sizeof(src.e))
-
- assign_reg(core->thread_info->gpregs, regs, r15);
- assign_reg(core->thread_info->gpregs, regs, r14);
- assign_reg(core->thread_info->gpregs, regs, r13);
- assign_reg(core->thread_info->gpregs, regs, r12);
- assign_reg(core->thread_info->gpregs, regs, bp);
- assign_reg(core->thread_info->gpregs, regs, bx);
- assign_reg(core->thread_info->gpregs, regs, r11);
- assign_reg(core->thread_info->gpregs, regs, r10);
- assign_reg(core->thread_info->gpregs, regs, r9);
- assign_reg(core->thread_info->gpregs, regs, r8);
- assign_reg(core->thread_info->gpregs, regs, ax);
- assign_reg(core->thread_info->gpregs, regs, cx);
- assign_reg(core->thread_info->gpregs, regs, dx);
- assign_reg(core->thread_info->gpregs, regs, si);
- assign_reg(core->thread_info->gpregs, regs, di);
- assign_reg(core->thread_info->gpregs, regs, orig_ax);
- assign_reg(core->thread_info->gpregs, regs, ip);
- assign_reg(core->thread_info->gpregs, regs, cs);
- assign_reg(core->thread_info->gpregs, regs, flags);
- assign_reg(core->thread_info->gpregs, regs, sp);
- assign_reg(core->thread_info->gpregs, regs, ss);
- assign_reg(core->thread_info->gpregs, regs, fs_base);
- assign_reg(core->thread_info->gpregs, regs, gs_base);
- assign_reg(core->thread_info->gpregs, regs, ds);
- assign_reg(core->thread_info->gpregs, regs, es);
- assign_reg(core->thread_info->gpregs, regs, fs);
- assign_reg(core->thread_info->gpregs, regs, gs);
-
-#ifndef PTRACE_GETREGSET
-# define PTRACE_GETREGSET 0x4204
-#endif
-
- if (!cpu_has_feature(X86_FEATURE_FPU))
- goto out;
-
- /*
- * FPU fetched either via fxsave or via xsave,
- * thus decode it accrodingly.
- */
-
- if (cpu_has_feature(X86_FEATURE_XSAVE)) {
- iov.iov_base = &xsave;
- iov.iov_len = sizeof(xsave);
-
- if (ptrace(PTRACE_GETREGSET, pid, (unsigned int)NT_X86_XSTATE, &iov) < 0) {
- pr_perror("Can't obtain FPU registers for %d", pid);
- goto err;
- }
- } else {
- if (ptrace(PTRACE_GETFPREGS, pid, NULL, &xsave)) {
- pr_perror("Can't obtain FPU registers for %d", pid);
- goto err;
- }
- }
-
- assign_reg(core->thread_info->fpregs, xsave.i387, cwd);
- assign_reg(core->thread_info->fpregs, xsave.i387, swd);
- assign_reg(core->thread_info->fpregs, xsave.i387, twd);
- assign_reg(core->thread_info->fpregs, xsave.i387, fop);
- assign_reg(core->thread_info->fpregs, xsave.i387, rip);
- assign_reg(core->thread_info->fpregs, xsave.i387, rdp);
- assign_reg(core->thread_info->fpregs, xsave.i387, mxcsr);
- assign_reg(core->thread_info->fpregs, xsave.i387, mxcsr_mask);
-
- /* Make sure we have enough space */
- BUG_ON(core->thread_info->fpregs->n_st_space != ARRAY_SIZE(xsave.i387.st_space));
- BUG_ON(core->thread_info->fpregs->n_xmm_space != ARRAY_SIZE(xsave.i387.xmm_space));
-
- assign_array(core->thread_info->fpregs, xsave.i387, st_space);
- assign_array(core->thread_info->fpregs, xsave.i387, xmm_space);
-
- if (cpu_has_feature(X86_FEATURE_XSAVE)) {
- BUG_ON(core->thread_info->fpregs->xsave->n_ymmh_space != ARRAY_SIZE(xsave.ymmh.ymmh_space));
-
- assign_reg(core->thread_info->fpregs->xsave, xsave.xsave_hdr, xstate_bv);
- assign_array(core->thread_info->fpregs->xsave, xsave.ymmh, ymmh_space);
- }
-
-#undef assign_reg
-#undef assign_array
-
-out:
- ret = 0;
-
-err:
- return ret;
-}
-
-int arch_alloc_thread_info(CoreEntry *core)
-{
- size_t sz;
- bool with_fpu, with_xsave = false;
- void *m;
- ThreadInfoX86 *ti = NULL;
-
-
- with_fpu = cpu_has_feature(X86_FEATURE_FPU);
-
- sz = sizeof(ThreadInfoX86) + sizeof(UserX86RegsEntry);
- if (with_fpu) {
- sz += sizeof(UserX86FpregsEntry);
- with_xsave = cpu_has_feature(X86_FEATURE_XSAVE);
- if (with_xsave)
- sz += sizeof(UserX86XsaveEntry);
- }
-
- m = xmalloc(sz);
- if (!m)
- return -1;
-
- ti = core->thread_info = xptr_pull(&m, ThreadInfoX86);
- thread_info_x86__init(ti);
- ti->gpregs = xptr_pull(&m, UserX86RegsEntry);
- user_x86_regs_entry__init(ti->gpregs);
-
- if (with_fpu) {
- UserX86FpregsEntry *fpregs;
-
- fpregs = ti->fpregs = xptr_pull(&m, UserX86FpregsEntry);
- user_x86_fpregs_entry__init(fpregs);
-
- /* These are numbers from kernel */
- fpregs->n_st_space = 32;
- fpregs->n_xmm_space = 64;
-
- fpregs->st_space = xzalloc(pb_repeated_size(fpregs, st_space));
- fpregs->xmm_space = xzalloc(pb_repeated_size(fpregs, xmm_space));
-
- if (!fpregs->st_space || !fpregs->xmm_space)
- goto err;
-
- if (with_xsave) {
- UserX86XsaveEntry *xsave;
-
- xsave = fpregs->xsave = xptr_pull(&m, UserX86XsaveEntry);
- user_x86_xsave_entry__init(xsave);
-
- xsave->n_ymmh_space = 64;
- xsave->ymmh_space = xzalloc(pb_repeated_size(xsave, ymmh_space));
- if (!xsave->ymmh_space)
- goto err;
- }
- }
-
- return 0;
-err:
- return -1;
-}
-
-void arch_free_thread_info(CoreEntry *core)
-{
- if (!core->thread_info)
- return;
-
- if (core->thread_info->fpregs->xsave)
- xfree(core->thread_info->fpregs->xsave->ymmh_space);
- xfree(core->thread_info->fpregs->st_space);
- xfree(core->thread_info->fpregs->xmm_space);
- xfree(core->thread_info);
-}
-
-static bool valid_xsave_frame(CoreEntry *core)
-{
- struct xsave_struct *x = NULL;
-
- if (core->thread_info->fpregs->n_st_space < ARRAY_SIZE(x->i387.st_space)) {
- pr_err("Corruption in FPU st_space area "
- "(got %li but %li expected)\n",
- (long)core->thread_info->fpregs->n_st_space,
- (long)ARRAY_SIZE(x->i387.st_space));
- return false;
- }
-
- if (core->thread_info->fpregs->n_xmm_space < ARRAY_SIZE(x->i387.xmm_space)) {
- pr_err("Corruption in FPU xmm_space area "
- "(got %li but %li expected)\n",
- (long)core->thread_info->fpregs->n_st_space,
- (long)ARRAY_SIZE(x->i387.xmm_space));
- return false;
- }
-
- if (cpu_has_feature(X86_FEATURE_XSAVE)) {
- if (core->thread_info->fpregs->xsave &&
- core->thread_info->fpregs->xsave->n_ymmh_space < ARRAY_SIZE(x->ymmh.ymmh_space)) {
- pr_err("Corruption in FPU ymmh_space area "
- "(got %li but %li expected)\n",
- (long)core->thread_info->fpregs->xsave->n_ymmh_space,
- (long)ARRAY_SIZE(x->ymmh.ymmh_space));
- return false;
- }
- } else {
- /*
- * If the image has xsave area present then CPU we're restoring
- * on must have X86_FEATURE_XSAVE feature until explicitly
- * stated in options.
- */
- if (core->thread_info->fpregs->xsave) {
- if (opts.cpu_cap & CPU_CAP_FPU) {
- pr_err("FPU xsave area present, "
- "but host cpu doesn't support it\n");
- return false;
- } else
- pr_warn_once("FPU is about to restore ignoring ymm state!\n");
- }
- }
-
- return true;
-}
-
-static void show_rt_xsave_frame(struct xsave_struct *x)
-{
- struct fpx_sw_bytes *fpx = (void *)&x->i387.sw_reserved;
- struct xsave_hdr_struct *xsave_hdr = &x->xsave_hdr;
- struct i387_fxsave_struct *i387 = &x->i387;
-
- pr_debug("xsave runtime structure\n");
- pr_debug("-----------------------\n");
-
- pr_debug("cwd:%x swd:%x twd:%x fop:%x mxcsr:%x mxcsr_mask:%x\n",
- (int)i387->cwd, (int)i387->swd, (int)i387->twd,
- (int)i387->fop, (int)i387->mxcsr, (int)i387->mxcsr_mask);
-
- pr_debug("magic1:%x extended_size:%x xstate_bv:%lx xstate_size:%x\n",
- fpx->magic1, fpx->extended_size, (long)fpx->xstate_bv, fpx->xstate_size);
-
- pr_debug("xstate_bv: %lx\n", (long)xsave_hdr->xstate_bv);
-
- pr_debug("-----------------------\n");
-}
-
-int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core)
-{
- fpu_state_t *fpu_state = &sigframe->fpu_state;
- struct xsave_struct *x = &fpu_state->xsave;
-
- /*
- * If no FPU information provided -- we're restoring
- * old image which has no FPU support, or the dump simply
- * has no FPU support at all.
- */
- if (!core->thread_info->fpregs) {
- fpu_state->has_fpu = false;
- return 0;
- }
-
- if (!valid_xsave_frame(core))
- return -1;
-
- fpu_state->has_fpu = true;
-
-#define assign_reg(dst, src, e) do { dst.e = (__typeof__(dst.e))src->e; } while (0)
-#define assign_array(dst, src, e) memcpy(dst.e, (src)->e, sizeof(dst.e))
-
- assign_reg(x->i387, core->thread_info->fpregs, cwd);
- assign_reg(x->i387, core->thread_info->fpregs, swd);
- assign_reg(x->i387, core->thread_info->fpregs, twd);
- assign_reg(x->i387, core->thread_info->fpregs, fop);
- assign_reg(x->i387, core->thread_info->fpregs, rip);
- assign_reg(x->i387, core->thread_info->fpregs, rdp);
- assign_reg(x->i387, core->thread_info->fpregs, mxcsr);
- assign_reg(x->i387, core->thread_info->fpregs, mxcsr_mask);
-
- assign_array(x->i387, core->thread_info->fpregs, st_space);
- assign_array(x->i387, core->thread_info->fpregs, xmm_space);
-
- if (cpu_has_feature(X86_FEATURE_XSAVE)) {
- struct fpx_sw_bytes *fpx_sw = (void *)&x->i387.sw_reserved;
- void *magic2;
-
- x->xsave_hdr.xstate_bv = XSTATE_FP | XSTATE_SSE | XSTATE_YMM;
-
- /*
- * fpregs->xsave pointer might not present on image so we
- * simply clear out all ymm registers.
- */
- if (core->thread_info->fpregs->xsave)
- assign_array(x->ymmh, core->thread_info->fpregs->xsave, ymmh_space);
-
- fpx_sw->magic1 = FP_XSTATE_MAGIC1;
- fpx_sw->xstate_bv = XSTATE_FP | XSTATE_SSE | XSTATE_YMM;
- fpx_sw->xstate_size = sizeof(struct xsave_struct);
- fpx_sw->extended_size = sizeof(struct xsave_struct) + FP_XSTATE_MAGIC2_SIZE;
-
- /*
- * This should be at the end of xsave frame.
- */
- magic2 = fpu_state->__pad + sizeof(struct xsave_struct);
- *(u32 *)magic2 = FP_XSTATE_MAGIC2;
- }
-
- show_rt_xsave_frame(x);
-
-#undef assign_reg
-#undef assign_array
-
- return 0;
-}
-
-void *mmap_seized(struct parasite_ctl *ctl,
- void *addr, size_t length, int prot,
- int flags, int fd, off_t offset)
-{
- unsigned long map;
- int err;
-
- err = syscall_seized(ctl, __NR_mmap, &map,
- (unsigned long)addr, length, prot, flags, fd, offset);
- if (err < 0)
- return NULL;
-
- if (IS_ERR_VALUE(map)) {
- if (map == -EACCES && (prot & PROT_WRITE) && (prot & PROT_EXEC))
- pr_warn("mmap(PROT_WRITE | PROT_EXEC) failed for %d, "
- "check selinux execmem policy\n", ctl->pid.real);
- return NULL;
- }
-
- return (void *)map;
-}
-
-int restore_gpregs(struct rt_sigframe *f, UserX86RegsEntry *r)
-{
-#define CPREG1(d) f->uc.uc_mcontext.d = r->d
-#define CPREG2(d, s) f->uc.uc_mcontext.d = r->s
-
-#ifdef CONFIG_X86_64
- CPREG1(r8);
- CPREG1(r9);
- CPREG1(r10);
- CPREG1(r11);
- CPREG1(r12);
- CPREG1(r13);
- CPREG1(r14);
- CPREG1(r15);
-#endif
-
- CPREG2(rdi, di);
- CPREG2(rsi, si);
- CPREG2(rbp, bp);
- CPREG2(rbx, bx);
- CPREG2(rdx, dx);
- CPREG2(rax, ax);
- CPREG2(rcx, cx);
- CPREG2(rsp, sp);
- CPREG2(rip, ip);
- CPREG2(eflags, flags);
-
- CPREG1(cs);
- CPREG1(ss);
-
-#ifdef CONFIG_X86_32
- CPREG1(gs);
- CPREG1(fs);
- CPREG1(es);
- CPREG1(ds);
-#endif
-
- return 0;
-}
-
-int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, fpu_state_t *fpu_state)
-{
- unsigned long addr = (unsigned long)(void *)&fpu_state->xsave;
-
- if ((addr % 64ul) == 0ul) {
- sigframe->uc.uc_mcontext.fpstate = &fpu_state->xsave;
- } else {
- pr_err("Unaligned address passed: %lx\n", addr);
- return -1;
- }
-
- return 0;
-}
-
-/* Copied from the gdb header gdb/nat/x86-dregs.h */
-
-/* Debug registers' indices. */
-#define DR_FIRSTADDR 0
-#define DR_LASTADDR 3
-#define DR_NADDR 4 /* The number of debug address registers. */
-#define DR_STATUS 6 /* Index of debug status register (DR6). */
-#define DR_CONTROL 7 /* Index of debug control register (DR7). */
-
-#define DR_LOCAL_ENABLE_SHIFT 0 /* Extra shift to the local enable bit. */
-#define DR_GLOBAL_ENABLE_SHIFT 1 /* Extra shift to the global enable bit. */
-#define DR_ENABLE_SIZE 2 /* Two enable bits per debug register. */
-
-/* Locally enable the break/watchpoint in the I'th debug register. */
-#define X86_DR_LOCAL_ENABLE(i) (1 << (DR_LOCAL_ENABLE_SHIFT + DR_ENABLE_SIZE * (i)))
-
-int ptrace_set_breakpoint(pid_t pid, void *addr)
-{
- int ret;
-
- /* Set a breakpoint */
- if (ptrace(PTRACE_POKEUSER, pid,
- offsetof(struct user, u_debugreg[DR_FIRSTADDR]),
- addr)) {
- pr_perror("Unable to setup a breakpoint into %d", pid);
- return -1;
- }
-
- /* Enable the breakpoint */
- if (ptrace(PTRACE_POKEUSER, pid,
- offsetof(struct user, u_debugreg[DR_CONTROL]),
- X86_DR_LOCAL_ENABLE(DR_FIRSTADDR))) {
- pr_perror("Unable to enable the breakpoint for %d", pid);
- return -1;
- }
-
- ret = ptrace(PTRACE_CONT, pid, NULL, NULL);
- if (ret) {
- pr_perror("Unable to restart the stopped tracee process %d", pid);
- return -1;
- }
-
- return 1;
-}
-
-int ptrace_flush_breakpoints(pid_t pid)
-{
- /* Disable the breakpoint */
- if (ptrace(PTRACE_POKEUSER, pid,
- offsetof(struct user, u_debugreg[DR_CONTROL]),
- 0)) {
- pr_perror("Unable to disable the breakpoint for %d", pid);
- return -1;
- }
-
- return 0;
-}
-
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
deleted file mode 100644
index d447b65cb4c6..000000000000
--- a/arch/x86/include/asm/atomic.h
+++ /dev/null
@@ -1,78 +0,0 @@
-#ifndef __CR_ATOMIC_H__
-#define __CR_ATOMIC_H__
-
-#include "asm/cmpxchg.h"
-
-#define LOCK_PREFIX "\n\tlock; "
-
-typedef struct {
- int counter;
-} atomic_t;
-
-#define ATOMIC_INIT(i) { (i) }
-
-static inline int atomic_read(const atomic_t *v)
-{
- return (*(volatile int *)&(v)->counter);
-}
-
-static inline void atomic_set(atomic_t *v, int i)
-{
- v->counter = i;
-}
-
-static inline void atomic_add(int i, atomic_t *v)
-{
- asm volatile(LOCK_PREFIX "addl %1,%0"
- : "+m" (v->counter)
- : "ir" (i));
-}
-
-static inline void atomic_sub(int i, atomic_t *v)
-{
- asm volatile(LOCK_PREFIX "subl %1,%0"
- : "+m" (v->counter)
- : "ir" (i));
-}
-
-static inline void atomic_inc(atomic_t *v)
-{
- asm volatile(LOCK_PREFIX "incl %0"
- : "+m" (v->counter));
-}
-
-static inline void atomic_dec(atomic_t *v)
-{
- asm volatile(LOCK_PREFIX "decl %0"
- : "+m" (v->counter));
-}
-
-static inline int atomic_dec_and_test(atomic_t *v)
-{
- unsigned char c;
-
- asm volatile(LOCK_PREFIX "decl %0; sete %1"
- : "+m" (v->counter), "=qm" (c)
- : : "memory");
- return c != 0;
-}
-
-static inline int atomic_add_return(int i, atomic_t *v)
-{
- return i + xadd(&v->counter, i);
-}
-
-static inline int atomic_sub_return(int i, atomic_t *v)
-{
- return atomic_add_return(-i, v);
-}
-
-#define atomic_inc_return(v) (atomic_add_return(1, v))
-#define atomic_dec_return(v) (atomic_sub_return(1, v))
-
-static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
-{
- return cmpxchg(&v->counter, old, new);
-}
-
-#endif /* __CR_ATOMIC_H__ */
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
deleted file mode 100644
index 7d6283183953..000000000000
--- a/arch/x86/include/asm/bitops.h
+++ /dev/null
@@ -1,113 +0,0 @@
-#ifndef __CR_BITOPS_H__
-#define __CR_BITOPS_H__
-
-#include "asm/bitsperlong.h"
-
-#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
-#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_LONG)
-
-#define DECLARE_BITMAP(name, bits) \
- unsigned long name[BITS_TO_LONGS(bits)]
-
-#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 1)
-/* Technically wrong, but this avoids compilation errors on some gcc
- versions. */
-#define BITOP_ADDR(x) "=m" (*(volatile long *) (x))
-#else
-#define BITOP_ADDR(x) "+m" (*(volatile long *) (x))
-#endif
-
-#define ADDR BITOP_ADDR(addr)
-
-static inline void set_bit(int nr, volatile unsigned long *addr)
-{
- asm volatile("btsl %1,%0" : ADDR : "Ir" (nr) : "memory");
-}
-
-static inline void change_bit(int nr, volatile unsigned long *addr)
-{
- asm volatile("btcl %1,%0" : ADDR : "Ir" (nr));
-}
-
-static inline int test_bit(int nr, volatile const unsigned long *addr)
-{
- int oldbit;
-
- asm volatile("bt %2,%1\n\t"
- "sbb %0,%0"
- : "=r" (oldbit)
- : "m" (*(unsigned long *)addr), "Ir" (nr));
-
- return oldbit;
-}
-
-static inline void clear_bit(int nr, volatile unsigned long *addr)
-{
- asm volatile("btrl %1,%0" : ADDR : "Ir" (nr));
-}
-
-/**
- * __ffs - find first set bit in word
- * @word: The word to search
- *
- * Undefined if no bit exists, so code should check against 0 first.
- */
-static inline unsigned long __ffs(unsigned long word)
-{
- asm("bsf %1,%0"
- : "=r" (word)
- : "rm" (word));
- return word;
-}
-
-#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG)
-
-/*
- * Find the next set bit in a memory region.
- */
-static inline
-unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
- unsigned long offset)
-{
- const unsigned long *p = addr + BITOP_WORD(offset);
- unsigned long result = offset & ~(BITS_PER_LONG-1);
- unsigned long tmp;
-
- if (offset >= size)
- return size;
- size -= result;
- offset %= BITS_PER_LONG;
- if (offset) {
- tmp = *(p++);
- tmp &= (~0UL << offset);
- if (size < BITS_PER_LONG)
- goto found_first;
- if (tmp)
- goto found_middle;
- size -= BITS_PER_LONG;
- result += BITS_PER_LONG;
- }
- while (size & ~(BITS_PER_LONG-1)) {
- if ((tmp = *(p++)))
- goto found_middle;
- result += BITS_PER_LONG;
- size -= BITS_PER_LONG;
- }
- if (!size)
- return result;
- tmp = *p;
-
-found_first:
- tmp &= (~0UL >> (BITS_PER_LONG - size));
- if (tmp == 0UL) /* Are any bits set? */
- return result + size; /* Nope. */
-found_middle:
- return result + __ffs(tmp);
-}
-
-#define for_each_bit(i, bitmask) \
- for (i = find_next_bit(bitmask, sizeof(bitmask), 0); \
- i < sizeof(bitmask); \
- i = find_next_bit(bitmask, sizeof(bitmask), i + 1))
-
-#endif /* __CR_BITOPS_H__ */
diff --git a/arch/x86/include/asm/bitsperlong.h b/arch/x86/include/asm/bitsperlong.h
deleted file mode 100644
index 7e0a71e8d71d..000000000000
--- a/arch/x86/include/asm/bitsperlong.h
+++ /dev/null
@@ -1,10 +0,0 @@
-#ifndef __CR_BITSPERLONG_H__
-#define __CR_BITSPERLONG_H__
-
-#ifdef CONFIG_X86_64
-# define BITS_PER_LONG 64
-#else
-# define BITS_PER_LONG 32
-#endif
-
-#endif /* __CR_BITSPERLONG_H__ */
diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h
deleted file mode 100644
index 600d0a7fff84..000000000000
--- a/arch/x86/include/asm/cmpxchg.h
+++ /dev/null
@@ -1,105 +0,0 @@
-#ifndef __CR_CMPXCHG_H__
-#define __CR_CMPXCHG_H__
-
-#include "asm/int.h"
-
-#define __X86_CASE_B 1
-#define __X86_CASE_W 2
-#define __X86_CASE_L 4
-#define __X86_CASE_Q 8
-
-/*
- * An exchange-type operation, which takes a value and a pointer, and
- * returns the old value. Make sure you never reach non-case statement
- * here, otherwise behaviour is undefined.
- */
-#define __xchg_op(ptr, arg, op, lock) \
- ({ \
- __typeof__ (*(ptr)) __ret = (arg); \
- switch (sizeof(*(ptr))) { \
- case __X86_CASE_B: \
- asm volatile (lock #op "b %b0, %1\n" \
- : "+q" (__ret), "+m" (*(ptr)) \
- : : "memory", "cc"); \
- break; \
- case __X86_CASE_W: \
- asm volatile (lock #op "w %w0, %1\n" \
- : "+r" (__ret), "+m" (*(ptr)) \
- : : "memory", "cc"); \
- break; \
- case __X86_CASE_L: \
- asm volatile (lock #op "l %0, %1\n" \
- : "+r" (__ret), "+m" (*(ptr)) \
- : : "memory", "cc"); \
- break; \
- case __X86_CASE_Q: \
- asm volatile (lock #op "q %q0, %1\n" \
- : "+r" (__ret), "+m" (*(ptr)) \
- : : "memory", "cc"); \
- break; \
- } \
- __ret; \
- })
-
-#define __xadd(ptr, inc, lock) __xchg_op((ptr), (inc), xadd, lock)
-#define xadd(ptr, inc) __xadd((ptr), (inc), "lock ;")
-
-/* Borrowed from linux kernel arch/x86/include/asm/cmpxchg.h */
-
-/*
- * Atomic compare and exchange. Compare OLD with MEM, if identical,
- * store NEW in MEM. Return the initial value in MEM. Success is
- * indicated by comparing RETURN with OLD.
- */
-#define __raw_cmpxchg(ptr, old, new, size, lock) \
-({ \
- __typeof__(*(ptr)) __ret; \
- __typeof__(*(ptr)) __old = (old); \
- __typeof__(*(ptr)) __new = (new); \
- switch (size) { \
- case __X86_CASE_B: \
- { \
- volatile u8 *__ptr = (volatile u8 *)(ptr); \
- asm volatile(lock "cmpxchgb %2,%1" \
- : "=a" (__ret), "+m" (*__ptr) \
- : "q" (__new), "0" (__old) \
- : "memory"); \
- break; \
- } \
- case __X86_CASE_W: \
- { \
- volatile u16 *__ptr = (volatile u16 *)(ptr); \
- asm volatile(lock "cmpxchgw %2,%1" \
- : "=a" (__ret), "+m" (*__ptr) \
- : "r" (__new), "0" (__old) \
- : "memory"); \
- break; \
- } \
- case __X86_CASE_L: \
- { \
- volatile u32 *__ptr = (volatile u32 *)(ptr); \
- asm volatile(lock "cmpxchgl %2,%1" \
- : "=a" (__ret), "+m" (*__ptr) \
- : "r" (__new), "0" (__old) \
- : "memory"); \
- break; \
- } \
- case __X86_CASE_Q: \
- { \
- volatile u64 *__ptr = (volatile u64 *)(ptr); \
- asm volatile(lock "cmpxchgq %2,%1" \
- : "=a" (__ret), "+m" (*__ptr) \
- : "r" (__new), "0" (__old) \
- : "memory"); \
- break; \
- } \
- } \
- __ret; \
-})
-
-#define __cmpxchg(ptr, old, new, size) \
- __raw_cmpxchg((ptr), (old), (new), (size), LOCK_PREFIX)
-#define cmpxchg(ptr, old, new) \
- __cmpxchg(ptr, old, new, sizeof(*(ptr)))
-
-#endif /* __CR_CMPXCHG_H__ */
diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
deleted file mode 100644
index 6f49229d6396..000000000000
--- a/arch/x86/include/asm/cpu.h
+++ /dev/null
@@ -1,207 +0,0 @@
-#ifndef __CR_ASM_CPU_H__
-#define __CR_ASM_CPU_H__
-
-#include "asm/types.h"
-
-/*
- * Adopted from linux kernel and enhanced from Intel/AMD manuals.
- */
-
-#define NCAPINTS (12) /* N 32-bit words worth of info */
-#define NCAPINTS_BITS (NCAPINTS * 32)
-
-#define X86_FEATURE_FPU (0*32+ 0) /* Onboard FPU */
-#define X86_FEATURE_VME (0*32+ 1) /* Virtual 8086 Mode Enhancements */
-#define X86_FEATURE_DE (0*32+ 2) /* Debugging Extensions */
-#define X86_FEATURE_PSE (0*32+ 3) /* Page Size Extension */
-#define X86_FEATURE_TSC (0*32+ 4) /* Time Stamp Counter */
-#define X86_FEATURE_MSR (0*32+ 5) /* Model Specific Registers RDMSR and WRMSR Instructions */
-#define X86_FEATURE_PAE (0*32+ 6) /* Physical Address Extension */
-#define X86_FEATURE_MCE (0*32+ 7) /* Machine Check Exception */
-#define X86_FEATURE_CX8 (0*32+ 8) /* CMPXCHG8 instruction */
-#define X86_FEATURE_APIC (0*32+ 9) /* APIC On-Chip */
-#define X86_FEATURE_SEP (0*32+11) /* SYSENTER and SYSEXIT Instructions */
-#define X86_FEATURE_MTRR (0*32+12) /* Memory Type Range Registers */
-#define X86_FEATURE_PGE (0*32+13) /* PTE Global Bit */
-#define X86_FEATURE_MCA (0*32+14) /* Machine Check Architecture */
-#define X86_FEATURE_CMOV (0*32+15) /* CMOV instructions (plus FCMOVcc, FCOMI with FPU) */
-#define X86_FEATURE_PAT (0*32+16) /* Page Attribute Table */
-#define X86_FEATURE_PSE36 (0*32+17) /* 36-Bit Page Size Extension */
-#define X86_FEATURE_PSN (0*32+18) /* Processor Serial Number */
-#define X86_FEATURE_DS (0*32+21) /* Debug Store */
-#define X86_FEATURE_CLFLUSH (0*32+19) /* CLFLUSH instruction */
-#define X86_FEATURE_ACPI (0*32+22) /* Thermal Monitor and Software Controlled Clock Facilities */
-#define X86_FEATURE_MMX (0*32+23) /* Multimedia Extensions */
-#define X86_FEATURE_FXSR (0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */
-#define X86_FEATURE_XMM (0*32+25) /* "sse" */
-#define X86_FEATURE_XMM2 (0*32+26) /* "sse2" */
-#define X86_FEATURE_SS (0*32+27) /* Self Snoop */
-#define X86_FEATURE_HTT (0*32+28) /* Multi-Threading */
-#define X86_FEATURE_TM (0*32+29) /* Thermal Monitor */
-#define X86_FEATURE_PBE (0*32+31) /* Pending Break Enable */
-
-#define X86_FEATURE_SYSCALL (1*32+11) /* SYSCALL/SYSRET */
-#define X86_FEATURE_MMXEXT (1*32+22) /* AMD MMX extensions */
-#define X86_FEATURE_RDTSCP (1*32+27) /* RDTSCP */
-#define X86_FEATURE_3DNOWEXT (1*32+30) /* AMD 3DNow! extensions */
-#define X86_FEATURE_3DNOW (1*32+31) /* 3DNow! */
-
-#define X86_FEATURE_REP_GOOD (3*32+16) /* rep microcode works well */
-#define X86_FEATURE_NOPL (3*32+20) /* The NOPL (0F 1F) instructions */
-
-#define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */
-#define X86_FEATURE_PCLMULQDQ (4*32+ 1) /* PCLMULQDQ instruction */
-#define X86_FEATURE_DTES64 (4*32+ 2) /* 64-bit DS Area */
-#define X86_FEATURE_MWAIT (4*32+ 3) /* "monitor" Monitor/Mwait support */
-#define X86_FEATURE_DSCPL (4*32+ 4) /* CPL Qualified Debug Store */
-#define X86_FEATURE_VMX (4*32+ 5) /* Virtual Machine Extensions */
-#define X86_FEATURE_SMX (4*32+ 6) /* Safer Mode Extensions */
-#define X86_FEATURE_EST (4*32+ 7) /* Enhanced Intel SpeedStep technology */
-#define X86_FEATURE_TM2 (4*32+ 8) /* Thermal Monitor 2 */
-#define X86_FEATURE_SSSE3 (4*32+ 9) /* Supplemental SSE-3 */
-#define X86_FEATURE_CNXTID (4*32+10) /* L1 Context ID */
-#define X86_FEATURE_FMA (4*32+12) /* Fused multiply-add */
-#define X86_FEATURE_CX16 (4*32+13) /* CMPXCHG16B */
-#define X86_FEATURE_XTPR_UCTL (4*32+14) /* xTPR Update Control */
-#define X86_FEATURE_PDCM (4*32+15) /* Perfmon and Debug Capability */
-#define X86_FEATURE_PCID (4*32+17) /* Process-context identifiers */
-#define X86_FEATURE_DCA (4*32+18) /* Ability to prefetch data from a memory mapped device */
-#define X86_FEATURE_XMM4_1 (4*32+19) /* "sse4_1" SSE-4.1 */
-#define X86_FEATURE_XMM4_2 (4*32+20) /* "sse4_2" SSE-4.2 */
-#define X86_FEATURE_X2APIC (4*32+21) /* x2APIC */
-#define X86_FEATURE_MOVBE (4*32+22) /* MOVBE instruction */
-#define X86_FEATURE_POPCNT (4*32+23) /* POPCNT instruction */
-#define X86_FEATURE_TSCDL (4*32+24) /* Local APIC timer supports one-shot operation using a TSC deadline value */
-#define X86_FEATURE_AES (4*32+25) /* AES instructions */
-#define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
-#define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */
-#define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */
-#define X86_FEATURE_F16C (4*32+29) /* 16-bit fp conversions */
-#define X86_FEATURE_RDRAND (4*32+30) /* The RDRAND instruction */
-
-#define X86_FEATURE_ABM (6*32+ 5) /* Advanced bit manipulation */
-#define X86_FEATURE_SSE4A (6*32+ 6) /* SSE-4A */
-#define X86_FEATURE_MISALIGNSSE (6*32+ 7) /* Misaligned SSE mode */
-#define X86_FEATURE_3DNOWPREFETCH (6*32+ 8) /* 3DNow prefetch instructions */
-#define X86_FEATURE_XOP (6*32+11) /* extended AVX instructions */
-#define X86_FEATURE_FMA4 (6*32+16) /* 4 operands MAC instructions */
-#define X86_FEATURE_TBM (6*32+21) /* trailing bit manipulations */
-
-#define X86_FEATURE_FSGSBASE (9*32+ 0) /* Supports RDFSBASE/RDGSBASE/WRFSBASE/WRGSBASE */
-#define X86_FEATURE_BMI1 (9*32+ 3) /* 1st group bit manipulation extensions */
-#define X86_FEATURE_HLE (9*32+ 4) /* Hardware Lock Elision */
-#define X86_FEATURE_AVX2 (9*32+ 5) /* AVX2 instructions */
-#define X86_FEATURE_SMEP (9*32+ 7) /* Supervisor Mode Execution Protection */
-#define X86_FEATURE_BMI2 (9*32+ 8) /* 2nd group bit manipulation extensions */
-#define X86_FEATURE_ERMS (9*32+ 9) /* Enhanced REP MOVSB/STOSB */
-#define X86_FEATURE_INVPCID (9*32+10) /* Invalidate Processor Context ID */
-#define X86_FEATURE_RTM (9*32+11) /* Restricted Transactional Memory */
-#define X86_FEATURE_MPX (9*32+14) /* Memory Protection Extension */
-#define X86_FEATURE_AVX512F (9*32+16) /* AVX-512 Foundation */
-#define X86_FEATURE_AVX512DQ (9*32+17) /* AVX-512 Foundation */
-#define X86_FEATURE_RDSEED (9*32+18) /* The RDSEED instruction */
-#define X86_FEATURE_ADX (9*32+19) /* The ADCX and ADOX instructions */
-#define X86_FEATURE_SMAP (9*32+20) /* Supervisor Mode Access Prevention */
-#define X86_FEATURE_CLFLUSHOPT (9*32+23) /* CLFLUSHOPT instruction */
-#define X86_FEATURE_IPT (9*32+25) /* Intel Processor Trace */
-#define X86_FEATURE_AVX512PF (9*32+26) /* AVX-512 Prefetch */
-#define X86_FEATURE_AVX512ER (9*32+27) /* AVX-512 Exponential and Reciprocal */
-#define X86_FEATURE_AVX512CD (9*32+28) /* AVX-512 Conflict Detection */
-#define X86_FEATURE_SHA (9*32+29) /* Intel SHA extensions */
-#define X86_FEATURE_AVX512BW (9*32+30) /* AVX-512 */
-#define X86_FEATURE_AVXVL (9*32+31) /* AVX-512 */
-
-#define X86_FEATURE_XSAVEOPT (10*32+0) /* XSAVEOPT */
-#define X86_FEATURE_XSAVEC (10*32+1) /* XSAVEC */
-#define X86_FEATURE_XGETBV1 (10*32+2) /* XGETBV with ECX = 1 */
-#define X86_FEATURE_XSAVES (10*32+3) /* XSAVES/XRSTORS */
-
-/*
- * Node 11 is our own, kernel has not such entry.
- */
-#define X86_FEATURE_PREFETCHWT1 (11*32+0) /* The PREFETCHWT1 instruction */
-
-static inline void native_cpuid(unsigned int *eax, unsigned int *ebx,
- unsigned int *ecx, unsigned int *edx)
-{
- /* ecx is often an input as well as an output. */
- asm volatile("cpuid"
- : "=a" (*eax),
- "=b" (*ebx),
- "=c" (*ecx),
- "=d" (*edx)
- : "0" (*eax), "2" (*ecx)
- : "memory");
-}
-
-static inline void cpuid(unsigned int op,
- unsigned int *eax, unsigned int *ebx,
- unsigned int *ecx, unsigned int *edx)
-{
- *eax = op;
- *ecx = 0;
- native_cpuid(eax, ebx, ecx, edx);
-}
-
-static inline void cpuid_count(unsigned int op, int count,
- unsigned int *eax, unsigned int *ebx,
- unsigned int *ecx, unsigned int *edx)
-{
- *eax = op;
- *ecx = count;
- native_cpuid(eax, ebx, ecx, edx);
-}
-
-static inline unsigned int cpuid_eax(unsigned int op)
-{
- unsigned int eax, ebx, ecx, edx;
-
- cpuid(op, &eax, &ebx, &ecx, &edx);
- return eax;
-}
-
-static inline unsigned int cpuid_ecx(unsigned int op)
-{
- unsigned int eax, ebx, ecx, edx;
-
- cpuid(op, &eax, &ebx, &ecx, &edx);
- return ecx;
-}
-
-static inline unsigned int cpuid_edx(unsigned int op)
-{
- unsigned int eax, ebx, ecx, edx;
-
- cpuid(op, &eax, &ebx, &ecx, &edx);
- return edx;
-}
-
-#define X86_FEATURE_VERSION 1
-
-enum {
- X86_VENDOR_INTEL = 0,
- X86_VENDOR_AMD = 1,
-
- X86_VENDOR_MAX
-};
-
-struct cpuinfo_x86 {
- u8 x86_family;
- u8 x86_vendor;
- u8 x86_model;
- u8 x86_mask;
- u32 x86_capability[NCAPINTS];
- u32 extended_cpuid_level;
- int cpuid_level;
- char x86_vendor_id[16];
- char x86_model_id[64];
-};
-
-extern bool cpu_has_feature(unsigned int feature);
-extern int cpu_init(void);
-extern int cpu_dump_cpuinfo(void);
-extern int cpu_validate_cpuinfo(void);
-extern int cpuinfo_dump(void);
-extern int cpuinfo_check(void);
-
-#endif /* __CR_CPU_H__ */
diff --git a/arch/x86/include/asm/dump.h b/arch/x86/include/asm/dump.h
deleted file mode 100644
index 1505fd2983b0..000000000000
--- a/arch/x86/include/asm/dump.h
+++ /dev/null
@@ -1,11 +0,0 @@
-#ifndef __CR_ASM_DUMP_H__
-#define __CR_ASM_DUMP_H__
-
-extern int get_task_regs(pid_t pid, user_regs_struct_t regs, CoreEntry *core);
-extern int arch_alloc_thread_info(CoreEntry *core);
-extern void arch_free_thread_info(CoreEntry *core);
-
-
-#define core_put_tls(core, tls)
-
-#endif
diff --git a/arch/x86/include/asm/fpu.h b/arch/x86/include/asm/fpu.h
deleted file mode 100644
index be168324bc96..000000000000
--- a/arch/x86/include/asm/fpu.h
+++ /dev/null
@@ -1,102 +0,0 @@
-#ifndef __CR_ASM_FPU_H__
-#define __CR_ASM_FPU_H__
-
-#include <sys/types.h>
-#include <stdbool.h>
-
-#include "compiler.h"
-#include "asm/int.h"
-
-#define FP_MIN_ALIGN_BYTES 64
-
-#define FP_XSTATE_MAGIC1 0x46505853U
-#define FP_XSTATE_MAGIC2 0x46505845U
-#define FP_XSTATE_MAGIC2_SIZE sizeof(FP_XSTATE_MAGIC2)
-
-#define XSTATE_FP 0x1
-#define XSTATE_SSE 0x2
-#define XSTATE_YMM 0x4
-
-#define FXSAVE_SIZE 512
-#define XSAVE_SIZE 832
-
-struct fpx_sw_bytes {
- u32 magic1;
- u32 extended_size;
- u64 xstate_bv;
- u32 xstate_size;
- u32 padding[7];
-};
-
-struct i387_fxsave_struct {
- u16 cwd; /* Control Word */
- u16 swd; /* Status Word */
- u16 twd; /* Tag Word */
- u16 fop; /* Last Instruction Opcode */
- union {
- struct {
- u64 rip; /* Instruction Pointer */
- u64 rdp; /* Data Pointer */
- };
- struct {
- u32 fip; /* FPU IP Offset */
- u32 fcs; /* FPU IP Selector */
- u32 foo; /* FPU Operand Offset */
- u32 fos; /* FPU Operand Selector */
- };
- };
- u32 mxcsr; /* MXCSR Register State */
- u32 mxcsr_mask; /* MXCSR Mask */
-
- /* 8*16 bytes for each FP-reg = 128 bytes */
- u32 st_space[32];
-
- /* 16*16 bytes for each XMM-reg = 256 bytes */
- u32 xmm_space[64];
-
- u32 padding[12];
-
- union {
- u32 padding1[12];
- u32 sw_reserved[12];
- };
-
-} __aligned(16);
-
-struct xsave_hdr_struct {
- u64 xstate_bv;
- u64 reserved1[2];
- u64 reserved2[5];
-} __packed;
-
-struct ymmh_struct {
- u32 ymmh_space[64];
-} __packed;
-
-/*
- * cpu requires it to be 64 byte aligned
- */
-struct xsave_struct {
- struct i387_fxsave_struct i387;
- struct xsave_hdr_struct xsave_hdr;
- struct ymmh_struct ymmh;
-} __aligned(FP_MIN_ALIGN_BYTES) __packed;
-
-/*
- * This one is used in restorer.
- */
-typedef struct {
- /*
- * The FPU xsave area must be continious and FP_MIN_ALIGN_BYTES
- * aligned, thus make sure the compiler won't insert any hole here.
- */
-
- union {
- struct xsave_struct xsave;
- unsigned char __pad[sizeof(struct xsave_struct) + FP_XSTATE_MAGIC2_SIZE];
- };
-
- bool has_fpu;
-} fpu_state_t;
-
-#endif /* __CR_ASM_FPU_H__ */
diff --git a/arch/x86/include/asm/int.h b/arch/x86/include/asm/int.h
deleted file mode 100644
index 642804e9b485..000000000000
--- a/arch/x86/include/asm/int.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef __CR_ASM_INT_H__
-#define __CR_ASM_INT_H__
-
-#include "asm-generic/int.h"
-
-#endif /* __CR_ASM_INT_H__ */
diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h
deleted file mode 100644
index 5e0948f07ee6..000000000000
--- a/arch/x86/include/asm/linkage.h
+++ /dev/null
@@ -1,24 +0,0 @@
-#ifndef __CR_LINKAGE_H__
-#define __CR_LINKAGE_H__
-
-#ifdef __ASSEMBLY__
-
-#define __ALIGN .align 4, 0x90
-#define __ALIGN_STR ".align 4, 0x90"
-
-#define GLOBAL(name) \
- .globl name; \
- name:
-
-#define ENTRY(name) \
- .globl name; \
- .type name, @function; \
- __ALIGN; \
- name:
-
-#define END(sym) \
- .size sym, . - sym
-
-#endif /* __ASSEMBLY__ */
-
-#endif /* __CR_LINKAGE_H__ */
diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
deleted file mode 100644
index 134835556c62..000000000000
--- a/arch/x86/include/asm/page.h
+++ /dev/null
@@ -1,19 +0,0 @@
-#ifndef __CR_ASM_PAGE_H__
-#define __CR_ASM_PAGE_H__
-
-#ifndef PAGE_SHIFT
-# define PAGE_SHIFT 12
-#endif
-
-#ifndef PAGE_SIZE
-# define PAGE_SIZE (1UL << PAGE_SHIFT)
-#endif
-
-#ifndef PAGE_MASK
-# define PAGE_MASK (~(PAGE_SIZE - 1))
-#endif
-
-#define PAGE_PFN(addr) ((addr) / PAGE_SIZE)
-#define page_size() PAGE_SIZE
-
-#endif /* __CR_ASM_PAGE_H__ */
diff --git a/arch/x86/include/asm/parasite-syscall.h b/arch/x86/include/asm/parasite-syscall.h
deleted file mode 100644
index 4d56cb07220c..000000000000
--- a/arch/x86/include/asm/parasite-syscall.h
+++ /dev/null
@@ -1,20 +0,0 @@
-#ifndef __CR_ASM_PARASITE_SYSCALL_H__
-#define __CR_ASM_PARASITE_SYSCALL_H__
-
-#include "asm/types.h"
-
-struct parasite_ctl;
-
-#define ARCH_SI_TRAP SI_KERNEL
-
-
-extern const char code_syscall[];
-extern const int code_syscall_size;
-
-void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs);
-
-void *mmap_seized(struct parasite_ctl *ctl,
- void *addr, size_t length, int prot,
- int flags, int fd, off_t offset);
-
-#endif
diff --git a/arch/x86/include/asm/parasite.h b/arch/x86/include/asm/parasite.h
deleted file mode 100644
index 669ae63e26cc..000000000000
--- a/arch/x86/include/asm/parasite.h
+++ /dev/null
@@ -1,10 +0,0 @@
-#ifndef __ASM_PARASITE_H__
-#define __ASM_PARASITE_H__
-
-#ifdef CONFIG_X86_32
-# define __parasite_entry __attribute__((regparm(3)))
-#endif
-
-static inline void arch_get_tls(tls_t *ptls) { (void)ptls; }
-
-#endif
diff --git a/arch/x86/include/asm/prlimit.h b/arch/x86/include/asm/prlimit.h
deleted file mode 100644
index 6746ba0e6f19..000000000000
--- a/arch/x86/include/asm/prlimit.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#ifndef __CR_PRLIMIT_H__
-#define __CR_PRLIMIT_H__
-
-#include <sys/types.h>
-#include <sys/time.h>
-#include <sys/resource.h>
-
-#include "config.h"
-
-#ifndef CONFIG_HAS_PRLIMIT
-extern int prlimit(pid_t pid, int resource, const struct rlimit *new_rlimit, struct rlimit *old_rlimit);
-#endif
-
-#endif /* __CR_PRLIMIT_H__ */
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
deleted file mode 100644
index 9f1bccdbece8..000000000000
--- a/arch/x86/include/asm/processor-flags.h
+++ /dev/null
@@ -1,28 +0,0 @@
-#ifndef __CR_PROCESSOR_FLAGS_H__
-#define __CR_PROCESSOR_FLAGS_H__
-
-/* Taken from linux kernel headers */
-
-/*
- * EFLAGS bits
- */
-#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */
-#define X86_EFLAGS_BIT1 0x00000002 /* Bit 1 - always on */
-#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */
-#define X86_EFLAGS_AF 0x00000010 /* Auxiliary carry Flag */
-#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */
-#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */
-#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */
-#define X86_EFLAGS_IF 0x00000200 /* Interrupt Flag */
-#define X86_EFLAGS_DF 0x00000400 /* Direction Flag */
-#define X86_EFLAGS_OF 0x00000800 /* Overflow Flag */
-#define X86_EFLAGS_IOPL 0x00003000 /* IOPL mask */
-#define X86_EFLAGS_NT 0x00004000 /* Nested Task */
-#define X86_EFLAGS_RF 0x00010000 /* Resume Flag */
-#define X86_EFLAGS_VM 0x00020000 /* Virtual Mode */
-#define X86_EFLAGS_AC 0x00040000 /* Alignment Check */
-#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */
-#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */
-#define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */
-
-#endif /* __CR_PROCESSOR_FLAGS_H__ */
diff --git a/arch/x86/include/asm/restore.h b/arch/x86/include/asm/restore.h
deleted file mode 100644
index 9d39b2bbae50..000000000000
--- a/arch/x86/include/asm/restore.h
+++ /dev/null
@@ -1,36 +0,0 @@
-#ifndef __CR_ASM_RESTORE_H__
-#define __CR_ASM_RESTORE_H__
-
-#include "asm/restorer.h"
-
-#include "protobuf/core.pb-c.h"
-
-#ifdef CONFIG_X86_64
-#define JUMP_TO_RESTORER_BLOB(new_sp, restore_task_exec_start, \
- task_args) \
- asm volatile( \
- "movq %0, %%rbx \n" \
- "movq %1, %%rax \n" \
- "movq %2, %%rdi \n" \
- "movq %%rbx, %%rsp \n" \
- "callq *%%rax \n" \
- : \
- : "g"(new_sp), \
- "g"(restore_task_exec_start), \
- "g"(task_args) \
- : "rsp", "rdi", "rsi", "rbx", "rax", "memory")
-#else /* CONFIG_X86_64 */
-#define JUMP_TO_RESTORER_BLOB(new_sp, restore_task_exec_start, \
- task_args) \
- (void)new_sp; \
- (void)restore_task_exec_start; \
- (void)task_args; \
- ;
-#endif /* CONFIG_X86_64 */
-
-#define core_get_tls(pcore, ptls)
-
-
-int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core);
-
-#endif
diff --git a/arch/x86/include/asm/restorer.h b/arch/x86/include/asm/restorer.h
deleted file mode 100644
index 2021c41f5bb4..000000000000
--- a/arch/x86/include/asm/restorer.h
+++ /dev/null
@@ -1,181 +0,0 @@
-#ifndef __CR_ASM_RESTORER_H__
-#define __CR_ASM_RESTORER_H__
-
-#include "asm/types.h"
-#include "asm/fpu.h"
-#include "protobuf/core.pb-c.h"
-
-struct pt_regs {
- unsigned long r15;
- unsigned long r14;
- unsigned long r13;
- unsigned long r12;
- unsigned long bp;
- unsigned long bx;
-
- unsigned long r11;
- unsigned long r10;
- unsigned long r9;
- unsigned long r8;
- unsigned long ax;
- unsigned long cx;
- unsigned long dx;
- unsigned long si;
- unsigned long di;
- unsigned long orig_ax;
-
- unsigned long ip;
- unsigned long cs;
- unsigned long flags;
- unsigned long sp;
- unsigned long ss;
-};
-
-struct rt_sigcontext {
- unsigned long r8;
- unsigned long r9;
- unsigned long r10;
- unsigned long r11;
- unsigned long r12;
- unsigned long r13;
- unsigned long r14;
- unsigned long r15;
- unsigned long rdi;
- unsigned long rsi;
- unsigned long rbp;
- unsigned long rbx;
- unsigned long rdx;
- unsigned long rax;
- unsigned long rcx;
- unsigned long rsp;
- unsigned long rip;
- unsigned long eflags;
- unsigned short cs;
- unsigned short gs;
- unsigned short fs;
- unsigned short ss;
- unsigned long err;
- unsigned long trapno;
- unsigned long oldmask;
- unsigned long cr2;
- void *fpstate;
- unsigned long reserved1[8];
-};
-
-#include "sigframe.h"
-
-struct rt_sigframe {
- char *pretcode;
- struct rt_ucontext uc;
- struct rt_siginfo info;
-
- fpu_state_t fpu_state;
-};
-
-#ifdef CONFIG_X86_64
-#define ARCH_RT_SIGRETURN(new_sp) \
- asm volatile( \
- "movq %0, %%rax \n" \
- "movq %%rax, %%rsp \n" \
- "movl $"__stringify(__NR_rt_sigreturn)", %%eax \n" \
- "syscall \n" \
- : \
- : "r"(new_sp) \
- : "rax","rsp","memory")
-
-#define RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, \
- thread_args, clone_restore_fn) \
- asm volatile( \
- "clone_emul: \n" \
- "movq %2, %%rsi \n" \
- "subq $16, %%rsi \n" \
- "movq %6, %%rdi \n" \
- "movq %%rdi, 8(%%rsi) \n" \
- "movq %5, %%rdi \n" \
- "movq %%rdi, 0(%%rsi) \n" \
- "movq %1, %%rdi \n" \
- "movq %3, %%rdx \n" \
- "movq %4, %%r10 \n" \
- "movl $"__stringify(__NR_clone)", %%eax \n" \
- "syscall \n" \
- \
- "testq %%rax,%%rax \n" \
- "jz thread_run \n" \
- \
- "movq %%rax, %0 \n" \
- "jmp clone_end \n" \
- \
- "thread_run: \n" \
- "xorq %%rbp, %%rbp \n" \
- "popq %%rax \n" \
- "popq %%rdi \n" \
- "callq *%%rax \n" \
- \
- "clone_end: \n" \
- : "=r"(ret) \
- : "g"(clone_flags), \
- "g"(new_sp), \
- "g"(&parent_tid), \
- "g"(&thread_args[i].pid), \
- "g"(clone_restore_fn), \
- "g"(&thread_args[i]) \
- : "rax", "rcx", "rdi", "rsi", "rdx", "r10", "r11", "memory")
-
-#define ARCH_FAIL_CORE_RESTORE \
- asm volatile( \
- "movq %0, %%rsp \n" \
- "movq 0, %%rax \n" \
- "jmp *%%rax \n" \
- : \
- : "r"(ret) \
- : "memory")
-#else /* CONFIG_X86_64 */
-#define ARCH_RT_SIGRETURN(new_sp) \
- asm volatile( \
- "movl %0, %%eax \n" \
- "movl %%eax, %%esp \n" \
- "movl $"__stringify(__NR_rt_sigreturn)", %%eax \n" \
- "int $0x80 \n" \
- : \
- : "r"(new_sp) \
- : "eax","esp","memory")
-
-#define RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, \
- thread_args, clone_restore_fn) \
- (void)ret; \
- (void)clone_flags; \
- (void)new_sp; \
- (void)parent_tid; \
- (void)thread_args; \
- (void)clone_restore_fn; \
- ;
-#define ARCH_FAIL_CORE_RESTORE \
- asm volatile( \
- "movl %0, %%esp \n" \
- "xorl %%eax, %%eax \n" \
- "jmp *%%eax \n" \
- : \
- : "r"(ret) \
- : "memory")
-#endif /* CONFIG_X86_64 */
-
-#define RT_SIGFRAME_UC(rt_sigframe) rt_sigframe->uc
-#define RT_SIGFRAME_REGIP(rt_sigframe) (rt_sigframe)->uc.uc_mcontext.rip
-#define RT_SIGFRAME_HAS_FPU(rt_sigframe) (rt_sigframe)->fpu_state.has_fpu
-#define RT_SIGFRAME_FPU(rt_sigframe) (rt_sigframe)->fpu_state
-
-#define SIGFRAME_OFFSET 8
-
-
-int restore_gpregs(struct rt_sigframe *f, UserX86RegsEntry *r);
-int restore_nonsigframe_gpregs(UserX86RegsEntry *r);
-
-int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, fpu_state_t *fpu_state);
-
-static inline void restore_tls(tls_t *ptls) { (void)ptls; }
-
-int ptrace_set_breakpoint(pid_t pid, void *addr);
-int ptrace_flush_breakpoints(pid_t pid);
-
-
-#endif
diff --git a/arch/x86/include/asm/string.h b/arch/x86/include/asm/string.h
deleted file mode 100644
index e1d875e45f95..000000000000
--- a/arch/x86/include/asm/string.h
+++ /dev/null
@@ -1,24 +0,0 @@
-#ifndef __CR_ASM_STRING_H__
-#define __CR_ASM_STRING_H__
-
-#define HAS_BUILTIN_MEMCPY
-
-#include "compiler.h"
-#include "asm-generic/string.h"
-
-static always_inline void *builtin_memcpy(void *to, const void *from, unsigned int n)
-{
- int d0, d1, d2;
- asm volatile("rep ; movsl \n"
- "movl %4,%%ecx \n"
- "andl $3,%%ecx \n"
- "jz 1f \n"
- "rep ; movsb \n"
- "1:"
- : "=&c" (d0), "=&D" (d1), "=&S" (d2)
- : "0" (n / 4), "g" (n), "1" ((long)to), "2" ((long)from)
- : "memory");
- return to;
-}
-
-#endif /* __CR_ASM_STRING_H__ */
diff --git a/arch/x86/include/asm/syscall32.h b/arch/x86/include/asm/syscall32.h
deleted file mode 100644
index b0d5cb71d3a5..000000000000
--- a/arch/x86/include/asm/syscall32.h
+++ /dev/null
@@ -1,25 +0,0 @@
-#ifndef __CR_SYSCALL32_H__
-#define __CR_SYSCALL32_H__
-
-extern long sys_socket(int domain, int type, int protocol);
-extern long sys_connect(int sockfd, struct sockaddr *addr, int addrlen);
-extern long sys_sendto(int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len);
-extern long sys_recvfrom(int sockfd, void *ubuf, size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len);
-extern long sys_sendmsg(int sockfd, const struct msghdr *msg, int flags);
-extern long sys_recvmsg(int sockfd, struct msghdr *msg, int flags);
-extern long sys_shutdown(int sockfd, int how);
-extern long sys_bind(int sockfd, const struct sockaddr *addr, int addrlen);
-extern long sys_setsockopt(int sockfd, int level, int optname, const void *optval, unsigned int optlen);
-extern long sys_getsockopt(int sockfd, int level, int optname, const void *optval, unsigned int *optlen);
-extern long sys_shmat(int shmid, void *shmaddr, int shmflag);
-extern long sys_pread(unsigned int fd, char *ubuf, u32 count, u64 pos);
-
-/*
- * For x86_32 __NR_mmap inside the kernel represents old_mmap system
- * call, but since we didn't use it yet lets go further and simply
- * define own alias for __NR_mmap2 which would allow us to unify code
- * between 32 and 64 bits version.
- */
-#define __NR_mmap __NR_mmap2
-
-#endif /* __CR_SYSCALL32_H__ */
diff --git a/arch/x86/include/asm/types.h b/arch/x86/include/asm/types.h
deleted file mode 100644
index b2d018983ffa..000000000000
--- a/arch/x86/include/asm/types.h
+++ /dev/null
@@ -1,142 +0,0 @@
-#ifndef __CR_ASM_TYPES_H__
-#define __CR_ASM_TYPES_H__
-
-#include <stdbool.h>
-#include <signal.h>
-
-#include "asm/page.h"
-#include "asm/bitops.h"
-#include "asm/int.h"
-#include "asm/prlimit.h"
-
-#include "protobuf/core.pb-c.h"
-
-#define SIGMAX 64
-#define SIGMAX_OLD 31
-
-#define MAJOR(dev) ((dev)>>8)
-#define MINOR(dev) ((dev) & 0xff)
-
-typedef void rt_signalfn_t(int, siginfo_t *, void *);
-typedef rt_signalfn_t *rt_sighandler_t;
-
-typedef void rt_restorefn_t(void);
-typedef rt_restorefn_t *rt_sigrestore_t;
-
-#define _KNSIG 64
-# define _NSIG_BPW 64
-
-#define _KNSIG_WORDS (_KNSIG / _NSIG_BPW)
-
-typedef struct {
- unsigned long sig[_KNSIG_WORDS];
-} k_rtsigset_t;
-
-static inline void ksigfillset(k_rtsigset_t *set)
-{
- int i;
- for (i = 0; i < _KNSIG_WORDS; i++)
- set->sig[i] = (unsigned long)-1;
-}
-
-#define SA_RESTORER 0x04000000
-
-typedef struct {
- rt_sighandler_t rt_sa_handler;
- unsigned long rt_sa_flags;
- rt_sigrestore_t rt_sa_restorer;
- k_rtsigset_t rt_sa_mask;
-} rt_sigaction_t;
-
-typedef struct {
- unsigned int entry_number;
- unsigned int base_addr;
- unsigned int limit;
- unsigned int seg_32bit:1;
- unsigned int contents:2;
- unsigned int read_exec_only:1;
- unsigned int limit_in_pages:1;
- unsigned int seg_not_present:1;
- unsigned int useable:1;
- unsigned int lm:1;
-} user_desc_t;
-
-typedef struct {
- unsigned long r15;
- unsigned long r14;
- unsigned long r13;
- unsigned long r12;
- unsigned long bp;
- unsigned long bx;
- unsigned long r11;
- unsigned long r10;
- unsigned long r9;
- unsigned long r8;
- unsigned long ax;
- unsigned long cx;
- unsigned long dx;
- unsigned long si;
- unsigned long di;
- unsigned long orig_ax;
- unsigned long ip;
- unsigned long cs;
- unsigned long flags;
- unsigned long sp;
- unsigned long ss;
- unsigned long fs_base;
- unsigned long gs_base;
- unsigned long ds;
- unsigned long es;
- unsigned long fs;
- unsigned long gs;
-} user_regs_struct_t;
-
-typedef struct {
- unsigned short cwd;
- unsigned short swd;
- unsigned short twd; /* Note this is not the same as
- the 32bit/x87/FSAVE twd */
- unsigned short fop;
- u64 rip;
- u64 rdp;
- u32 mxcsr;
- u32 mxcsr_mask;
- u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
- u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
- u32 padding[24];
-} user_fpregs_struct_t;
-
-#define ASSIGN_TYPED(a, b) do { a = (typeof(a))b; } while (0)
-#define ASSIGN_MEMBER(a,b,m) do { ASSIGN_TYPED((a)->m, (b)->m); } while (0)
-
-#ifdef CONFIG_X86_64
-# define TASK_SIZE ((1UL << 47) - PAGE_SIZE)
-#else
-/*
- * Task size may be limited to 3G but we need a
- * higher limit, because it's backward compatible.
- */
-# define TASK_SIZE (0xffffe000)
-#endif
-
-static inline unsigned long task_size() { return TASK_SIZE; }
-
-typedef u64 auxv_t;
-typedef u32 tls_t;
-
-#define REG_RES(regs) ((regs).ax)
-#define REG_IP(regs) ((regs).ip)
-#define REG_SYSCALL_NR(regs) ((regs).orig_ax)
-
-#define CORE_ENTRY__MARCH CORE_ENTRY__MARCH__X86_64
-
-#define AT_VECTOR_SIZE 44
-
-#define CORE_THREAD_ARCH_INFO(core) core->thread_info
-
-typedef UserX86RegsEntry UserRegsEntry;
-
-static inline u64 encode_pointer(void *p) { return (u64)(long)p; }
-static inline void *decode_pointer(u64 v) { return (void*)(long)v; }
-
-#endif /* __CR_ASM_TYPES_H__ */
diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
deleted file mode 100644
index a1cc9bb9751b..000000000000
--- a/arch/x86/include/asm/vdso.h
+++ /dev/null
@@ -1,30 +0,0 @@
-#ifndef __CR_ASM_VDSO_H__
-#define __CR_ASM_VDSO_H__
-
-#include "asm/int.h"
-#include "asm-generic/vdso.h"
-
-/* This definition is used in pie/util-vdso.c to initialize the vdso symbol
- * name string table 'vdso_symbols'
- */
-
-/*
- * This is a minimal amount of symbols
- * we should support at the moment.
- */
-#define VDSO_SYMBOL_MAX 4
-
-#define ARCH_VDSO_SYMBOLS \
- "__vdso_clock_gettime", \
- "__vdso_getcpu", \
- "__vdso_gettimeofday", \
- "__vdso_time"
-
-
-struct vdso_symtable;
-extern int vdso_redirect_calls(unsigned long base_to,
- unsigned long base_from,
- struct vdso_symtable *to,
- struct vdso_symtable *from);
-
-#endif /* __CR_ASM_VDSO_H__ */
diff --git a/arch/x86/parasite-head.S b/arch/x86/parasite-head.S
deleted file mode 100644
index 5fb00a5c87c1..000000000000
--- a/arch/x86/parasite-head.S
+++ /dev/null
@@ -1,40 +0,0 @@
-#include "asm/linkage.h"
-#include "parasite.h"
-
- .section .head.text, "ax"
-
-#ifdef CONFIG_X86_64
-
-ENTRY(__export_parasite_head_start)
- subq $16, %rsp
- andq $~15, %rsp
- pushq $0
- movq %rsp, %rbp
- movl __export_parasite_cmd(%rip), %edi
- leaq __export_parasite_args(%rip), %rsi
- call parasite_service
- int $0x03
- .align 8
-__export_parasite_cmd:
- .long 0
-END(__export_parasite_head_start)
-
-#else /* CONFIG_X86_64 */
-
-ENTRY(__export_parasite_head_start)
- subl $16, %esp
- andl $~15, %esp
- pushl $0
- movl %esp, %ebp
- call 1f
-1: popl %ecx
-2: leal (__export_parasite_cmd-2b)(%ecx), %eax
- leal (__export_parasite_args-2b)(%ecx), %edx
- call parasite_service
- int $0x03
- .align 8
-GLOBAL(__export_parasite_cmd)
- .long 0
-END(__export_parasite_head_start)
-
-#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/prlimit.c b/arch/x86/prlimit.c
deleted file mode 100644
index 8bc4aba9f6a6..000000000000
--- a/arch/x86/prlimit.c
+++ /dev/null
@@ -1,68 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <errno.h>
-
-#include "asm/types.h"
-#include "asm/prlimit.h"
-
-#include "compiler.h"
-#include "config.h"
-
-#ifndef CONFIG_HAS_PRLIMIT
-
-#ifndef RLIM64_INFINITY
-# define RLIM64_INFINITY (~0ULL)
-#endif
-
-int prlimit(pid_t pid, int resource, const struct rlimit *new_rlimit, struct rlimit *old_rlimit)
-{
- struct rlimit64 new_rlimit64_mem;
- struct rlimit64 old_rlimit64_mem;
- struct rlimit64 *new_rlimit64 = NULL;
- struct rlimit64 *old_rlimit64 = NULL;
- int ret;
-
- if (old_rlimit)
- old_rlimit64 = &old_rlimit64_mem;
-
- if (new_rlimit) {
- if (new_rlimit->rlim_cur == RLIM_INFINITY)
- new_rlimit64_mem.rlim_cur = RLIM64_INFINITY;
- else
- new_rlimit64_mem.rlim_cur = new_rlimit->rlim_cur;
- if (new_rlimit->rlim_max == RLIM_INFINITY)
- new_rlimit64_mem.rlim_max = RLIM64_INFINITY;
- else
- new_rlimit64_mem.rlim_max = new_rlimit->rlim_max;
- new_rlimit64 = &new_rlimit64_mem;
- }
-
- ret = sys_prlimit64(pid, resource, new_rlimit64, old_rlimit64);
-
- if (ret == 0 && old_rlimit) {
- old_rlimit->rlim_cur = old_rlimit64_mem.rlim_cur;
- if (old_rlimit->rlim_cur != old_rlimit64_mem.rlim_cur) {
- if (new_rlimit) {
- errno = EOVERFLOW;
- return -1;
- }
- old_rlimit->rlim_cur = RLIM_INFINITY;
- }
- old_rlimit->rlim_max = old_rlimit64_mem.rlim_max;
- if (old_rlimit->rlim_max != old_rlimit64_mem.rlim_max) {
- if (new_rlimit) {
- errno = EOVERFLOW;
- return -1;
- }
- old_rlimit->rlim_max = RLIM_INFINITY;
- }
- } else if (ret) {
- errno = -ret;
- ret = -1;
- }
-
- return ret;
-}
-
-#endif /* CONFIG_HAS_PRLIMIT */
diff --git a/arch/x86/restorer.c b/arch/x86/restorer.c
deleted file mode 100644
index 364b156be91e..000000000000
--- a/arch/x86/restorer.c
+++ /dev/null
@@ -1,33 +0,0 @@
-#include <asm/prctl.h>
-#include <unistd.h>
-
-#include "restorer.h"
-#include "asm/restorer.h"
-#include "asm/fpu.h"
-
-#include "syscall.h"
-#include "log.h"
-#include "cpu.h"
-
-int restore_nonsigframe_gpregs(UserX86RegsEntry *r)
-{
-#ifdef CONFIG_X86_64
- long ret;
- unsigned long fsgs_base;
-
- fsgs_base = r->fs_base;
- ret = sys_arch_prctl(ARCH_SET_FS, fsgs_base);
- if (ret) {
- pr_info("SET_FS fail %ld\n", ret);
- return -1;
- }
-
- fsgs_base = r->gs_base;
- ret = sys_arch_prctl(ARCH_SET_GS, fsgs_base);
- if (ret) {
- pr_info("SET_GS fail %ld\n", ret);
- return -1;
- }
-#endif
- return 0;
-}
diff --git a/arch/x86/syscalls/syscall-common-x86-32.S b/arch/x86/syscalls/syscall-common-x86-32.S
deleted file mode 100644
index ae6d594dc4fe..000000000000
--- a/arch/x86/syscalls/syscall-common-x86-32.S
+++ /dev/null
@@ -1,36 +0,0 @@
-#include "asm/linkage.h"
-
-#define SYSCALL(name, opcode) \
- ENTRY(name); \
- movl $opcode, %eax; \
- jmp __syscall_common; \
- END(name)
-
-ENTRY(__syscall_common)
- pushl %ebx
- pushl %esi
- pushl %edi
- pushl %ebp
-
-#define __arg(n) (4 * (n) + 20)(%esp)
- movl __arg(0),%ebx
- movl __arg(1),%ecx
- movl __arg(2),%edx
- movl __arg(3),%esi
- movl __arg(4),%edi
- movl __arg(5),%ebp
-#undef __arg
-
- int $0x80
-
- popl %ebp
- popl %edi
- popl %esi
- popl %ebx
- ret
-END(__syscall_common)
-
-ENTRY(__cr_restore_rt)
- movl $__NR_rt_sigreturn, %eax
- jmp __syscall_common
-END(__cr_restore_rt)
diff --git a/arch/x86/syscalls/syscall-common-x86-64.S b/arch/x86/syscalls/syscall-common-x86-64.S
deleted file mode 100644
index b93c31288a20..000000000000
--- a/arch/x86/syscalls/syscall-common-x86-64.S
+++ /dev/null
@@ -1,21 +0,0 @@
-#include "asm/linkage.h"
-
-#define SYSCALL(name, opcode) \
- ENTRY(name); \
- movl $opcode, %eax; \
- jmp __syscall_common; \
- END(name)
-
- .text
- .align 4
-
-ENTRY(__syscall_common)
- movq %rcx, %r10
- syscall
- ret
-END(__syscall_common)
-
-ENTRY(__cr_restore_rt)
- movq $__NR_rt_sigreturn, %rax
- syscall
-END(__cr_restore_rt)
diff --git a/arch/x86/syscalls/syscall32.c b/arch/x86/syscalls/syscall32.c
deleted file mode 100644
index b68ef09572f3..000000000000
--- a/arch/x86/syscalls/syscall32.c
+++ /dev/null
@@ -1,85 +0,0 @@
-#include "asm/types.h"
-#include "syscall.h"
-
-#define SYS_SOCKET 1 /* sys_socket(2) */
-#define SYS_BIND 2 /* sys_bind(2) */
-#define SYS_CONNECT 3 /* sys_connect(2) */
-#define SYS_SENDTO 11 /* sys_sendto(2) */
-#define SYS_RECVFROM 12 /* sys_recvfrom(2) */
-#define SYS_SHUTDOWN 13 /* sys_shutdown(2) */
-#define SYS_SETSOCKOPT 14 /* sys_setsockopt(2) */
-#define SYS_GETSOCKOPT 15 /* sys_getsockopt(2) */
-#define SYS_SENDMSG 16 /* sys_sendmsg(2) */
-#define SYS_RECVMSG 17 /* sys_recvmsg(2) */
-
-long sys_socket(int domain, int type, int protocol)
-{
- u32 a[] = { (u32)domain, (u32)type, (u32)protocol };
- return sys_socketcall(SYS_SOCKET, (unsigned long *)a);
-}
-
-long sys_connect(int sockfd, struct sockaddr *addr, int addrlen)
-{
- u32 a[] = {(u32)sockfd, (u32)addr, (u32)addrlen};
- return sys_socketcall(SYS_CONNECT, (unsigned long *)a);
-}
-
-long sys_sendto(int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len)
-{
- u32 a[] = {(u32)sockfd, (u32)buff, (u32)len, (u32)flags, (u32)addr, (u32)addr_len};
- return sys_socketcall(SYS_SENDTO, (unsigned long *)a);
-}
-
-long sys_recvfrom(int sockfd, void *ubuf, size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len)
-{
- u32 a[] = {(u32)sockfd, (u32)ubuf, (u32)size, (u32)flags, (u32)addr, (u32)addr_len};
- return sys_socketcall(SYS_RECVFROM, (unsigned long *)a);
-}
-
-long sys_sendmsg(int sockfd, const struct msghdr *msg, int flags)
-{
- u32 a[] = {(u32)sockfd, (u32)msg, (u32)flags};
- return sys_socketcall(SYS_SENDMSG, (unsigned long *)a);
-}
-
-long sys_recvmsg(int sockfd, struct msghdr *msg, int flags)
-{
- u32 a[] = {(u32)sockfd, (u32)msg, (u32)flags};
- return sys_socketcall(SYS_RECVMSG, (unsigned long *)a);
-}
-
-long sys_shutdown(int sockfd, int how)
-{
- u32 a[] = {(u32)sockfd, (u32)how};
- return sys_socketcall(SYS_SHUTDOWN, (unsigned long *)a);
-}
-
-long sys_bind(int sockfd, const struct sockaddr *addr, int addrlen)
-{
- u32 a[] = {(u32)sockfd, (u32)addr, (u32)addrlen};
- return sys_socketcall(SYS_BIND, (unsigned long *)a);
-}
-
-long sys_setsockopt(int sockfd, int level, int optname, const void *optval, unsigned int optlen)
-{
- u32 a[] = {(u32)sockfd, (u32)level, (u32)optname, (u32)optval, (u32)optlen};
- return sys_socketcall(SYS_SETSOCKOPT, (unsigned long *)a);
-}
-
-long sys_getsockopt(int sockfd, int level, int optname, const void *optval, unsigned int *optlen)
-{
- u32 a[] = {(u32)sockfd, (u32)level, (u32)optname, (u32)optval, (u32)optlen};
- return sys_socketcall(SYS_GETSOCKOPT, (unsigned long *)a);
-}
-
-#define SHMAT 21
-
-long sys_shmat(int shmid, void *shmaddr, int shmflag)
-{
- return sys_ipc(SHMAT, shmid, shmflag, 0, shmaddr, 0);
-}
-
-long sys_pread(unsigned int fd, char *ubuf, u32 count, u64 pos)
-{
- return sys_pread64(fd, ubuf, count, (u32)(pos & 0xffffffffu), (u32)(pos >> 32));
-}
diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
deleted file mode 100644
index 04d10d119c3f..000000000000
--- a/arch/x86/syscalls/syscall_32.tbl
+++ /dev/null
@@ -1,92 +0,0 @@
-#
-# System calls table, please make sure the table consist only the syscalls
-# really used somewhere in project.
-#
-# code name arguments
-# -------------------------------------------------------------------------------------------------------------------------------------------------------------
-__NR_restart_syscall 0 sys_restart_syscall (void)
-__NR_exit 1 sys_exit (unsigned long error_code)
-__NR_read 3 sys_read (int fd, void *buf, unsigned long count)
-__NR_write 4 sys_write (int fd, const void *buf, unsigned long count)
-__NR_open 5 sys_open (const char *filename, int flags, unsigned int mode)
-__NR_close 6 sys_close (int fd)
-__NR_unlink 10 sys_unlink (char *pathname)
-__NR_lseek 19 sys_lseek (int fd, s32 offset, unsigned int origin)
-__NR_getpid 20 sys_getpid (void)
-__NR_mount 21 sys_mount (const char *dev_name, const char *dir_name, const char *type, unsigned long flags, const void *data)
-__NR_ptrace 26 sys_ptrace (long request, pid_t pid, void *addr, void *data)
-__NR_kill 37 sys_kill (long pid, int sig)
-__NR_mkdir 39 sys_mkdir (const char *name, int mode)
-__NR_rmdir 40 sys_rmdir (const char *name)
-__NR_brk 45 sys_brk (void *addr)
-__NR_umount2 52 sys_umount2 (char *name, int flags)
-__NR_ioctl 54 sys_ioctl (unsigned int fd, unsigned int cmd, unsigned long arg)
-__NR_fcntl 55 sys_fcntl (unsigned int fd, unsigned int cmd, unsigned long arg)
-__NR_umask 60 sys_umask (int mask)
-__NR_setrlimit 75 sys_setrlimit (unsigned int resource, struct krlimit *rlim)
-__NR_munmap 91 sys_munmap (void *addr, unsigned long len)
-__NR_setpriority 97 sys_setpriority (int which, int who, int nice)
-__NR_socketcall 102 sys_socketcall (int call, unsigned long *args)
-__NR_setitimer 104 sys_setitimer (int which, struct itimerval *in, struct itimerval *out)
-__NR_getitimer 105 sys_getitimer (int which, struct itimerval *it)
-__NR_wait4 114 sys_wait4 (pid_t pid, int *stat_addr, int options, struct rusage *ru)
-__NR_ipc 117 sys_ipc (unsigned int call, int first, unsigned long second, unsigned long third, void *ptr, long fifth)
-__NR_clone 120 sys_clone (unsigned long flags, void *child_stack, void *parent_tid, void *child_tid)
-__NR_mprotect 125 sys_mprotect (const void *addr, unsigned long len, unsigned long prot)
-__NR_getpgid 132 sys_getpgid (pid_t pid)
-__NR_personality 136 sys_personality (unsigned int personality)
-__NR_flock 143 sys_flock (int fd, unsigned long cmd)
-__NR_getsid 147 sys_getsid (void)
-__NR_sched_setscheduler 156 sys_sched_setscheduler (int pid, int policy, struct sched_param *p)
-__NR_nanosleep 162 sys_nanosleep (struct timespec *rqtp, struct timespec *rmtp)
-__NR_mremap 163 sys_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr)
-__NR_prctl 172 sys_prctl (int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5)
-__NR_rt_sigreturn 173 sys_rt_sigreturn (void)
-__NR_rt_sigaction 174 sys_sigaction (int signum, const rt_sigaction_t *act, rt_sigaction_t *oldact, size_t sigsetsize)
-__NR_rt_sigprocmask 175 sys_sigprocmask (int how, k_rtsigset_t *set, k_rtsigset_t *oset, size_t sigsetsize)
-__NR_rt_sigqueueinfo 178 sys_rt_sigqueueinfo (pid_t pid, int sig, siginfo_t *uinfo)
-__NR_pread64 180 sys_pread64 (unsigned int fd, char *ubuf, u32 count, u32 poslo, u32 poshi)
-__NR_capget 184 sys_capget (struct cap_header *h, struct cap_data *d)
-__NR_capset 185 sys_capset (struct cap_header *h, struct cap_data *d)
-__NR_sigaltstack 186 sys_sigaltstack (const void *uss_ptr, void *uoss_ptr)
-__NR_mmap2 192 sys_mmap (void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long pgoff)
-__NR_getgroups32 205 sys_getgroups (int gsize, unsigned int *groups)
-__NR_setgroups32 206 sys_setgroups (int gsize, unsigned int *groups)
-__NR_setresuid32 208 sys_setresuid (int uid, int euid, int suid)
-__NR_getresuid32 209 sys_getresuid (int *uid, int *euid, int *suid)
-__NR_setresgid32 210 sys_setresgid (int gid, int egid, int sgid)
-__NR_getresgid32 211 sys_getresgid (int *gid, int *egid, int *sgid)
-__NR_setfsuid32 215 sys_setfsuid (int fsuid)
-__NR_setfsgid32 216 sys_setfsgid (int fsgid)
-__NR_mincore 218 sys_mincore (void *addr, unsigned long size, unsigned char *vec)
-__NR_madvise 219 sys_madvise (unsigned long start, size_t len, int behavior)
-__NR_gettid 224 sys_gettid (void)
-__NR_futex 240 sys_futex (u32 *uaddr, int op, u32 val, struct timespec *utime, u32 *uaddr2, u32 val3)
-__NR_set_thread_area 243 sys_set_thread_area (user_desc_t *info)
-__NR_get_thread_area 244 sys_get_thread_area (user_desc_t *info)
-__NR_io_setup 245 sys_io_setup (unsigned nr_reqs, aio_context_t *ctx32p)
-__NR_io_getevents 247 sys_io_getevents (aio_context_t ctx_id, long min_nr, long nr, struct io_event *events, struct timespec *timeout)
-__NR_exit_group 252 sys_exit_group (int error_code)
-__NR_set_tid_address 258 sys_set_tid_address (int *tid_addr)
-__NR_timer_create 259 sys_timer_create (clockid_t which_clock, struct sigevent *timer_event_spec, kernel_timer_t *created_timer_id)
-__NR_timer_settime 260 sys_timer_settime (kernel_timer_t timer_id, int flags, struct itimerspec *new, struct itimerspec *old)
-__NR_timer_gettime 261 sys_timer_gettime (int timer_id, struct itimerspec *setting)
-__NR_timer_getoverrun 262 sys_timer_getoverrun (int timer_id)
-__NR_timer_delete 263 sys_timer_delete (kernel_timer_t timer_id)
-__NR_clock_gettime 265 sys_clock_gettime (int which_clock, struct timespec *tp)
-__NR_seccomp 354 sys_seccomp (unsigned int op, unsigned int flags, const char *uargs)
-__NR_waitid 284 sys_waitid (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru)
-__NR_readlinkat 305 sys_readlinkat (int fd, const char *path, char *buf, int bufsize)
-__NR_set_robust_list 311 sys_set_robust_list (struct robust_list_head *head, size_t len)
-__NR_get_robust_list 312 sys_get_robust_list (int pid, struct robust_list_head **head_ptr, size_t *len_ptr)
-__NR_vmsplice 316 sys_vmsplice (int fd, const struct iovec *iov, unsigned int nr_segs, unsigned int flags)
-__NR_signalfd 321 sys_signalfd (int ufd, const k_rtsigset_t *sigmask, size_t sigsetsize)
-__NR_timerfd_settime 325 sys_timerfd_settime (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr)
-__NR_rt_tgsigqueueinfo 335 sys_rt_tgsigqueueinfo (pid_t tgid, pid_t pid, int sig, siginfo_t *uinfo)
-__NR_fanotify_init 338 sys_fanotify_init (unsigned int flags, unsigned int event_f_flags)
-__NR_fanotify_mark 339 sys_fanotify_mark (int fanotify_fd, unsigned int flag, u32 mask, int dfd, const char *pathname)
-__NR_prlimit64 340 sys_prlimit64 (pid_t pid, unsigned int resource, const struct rlimit64 *new_rlim, struct rlimit64 *old_rlim)
-__NR_open_by_handle_at 342 sys_open_by_handle_at (int mountdirfd, struct file_handle *handle, int flags)
-__NR_setns 346 sys_setns (int fd, int nstype)
-__NR_kcmp 349 sys_kcmp (pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2)
-__NR_memfd_create 356 sys_memfd_create (const char *name, unsigned int flags)
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
deleted file mode 100644
index 5c32d4c46c75..000000000000
--- a/arch/x86/syscalls/syscall_64.tbl
+++ /dev/null
@@ -1,103 +0,0 @@
-#
-# System calls table, please make sure the table consist only the syscalls
-# really used somewhere in project.
-#
-# __NR_name code name arguments
-# -------------------------------------------------------------------------------------------------------------------------------------------------------------
-__NR_read 0 sys_read (int fd, void *buf, unsigned long count)
-__NR_write 1 sys_write (int fd, const void *buf, unsigned long count)
-__NR_open 2 sys_open (const char *filename, unsigned long flags, unsigned long mode)
-__NR_close 3 sys_close (int fd)
-__NR_lseek 8 sys_lseek (int fd, unsigned long offset, unsigned long origin)
-__NR_mmap 9 sys_mmap (void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset)
-__NR_mprotect 10 sys_mprotect (const void *addr, unsigned long len, unsigned long prot)
-__NR_munmap 11 sys_munmap (void *addr, unsigned long len)
-__NR_brk 12 sys_brk (void *addr)
-__NR_rt_sigaction 13 sys_sigaction (int signum, const rt_sigaction_t *act, rt_sigaction_t *oldact, size_t sigsetsize)
-__NR_rt_sigprocmask 14 sys_sigprocmask (int how, k_rtsigset_t *set, k_rtsigset_t *old, size_t sigsetsize)
-__NR_rt_sigreturn 15 sys_rt_sigreturn (void)
-__NR_ioctl 16 sys_ioctl (unsigned int fd, unsigned int cmd, unsigned long arg)
-__NR_pread64 17 sys_pread (unsigned int fd, char *buf, size_t count, loff_t pos)
-__NR_mremap 25 sys_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr)
-__NR_mincore 27 sys_mincore (void *addr, unsigned long size, unsigned char *vec)
-__NR_madvise 28 sys_madvise (unsigned long start, size_t len, int behavior)
-__NR_shmat 30 sys_shmat (int shmid, void *shmaddr, int shmflag)
-__NR_nanosleep 35 sys_nanosleep (struct timespec *req, struct timespec *rem)
-__NR_getitimer 36 sys_getitimer (int which, const struct itimerval *val)
-__NR_setitimer 38 sys_setitimer (int which, const struct itimerval *val, struct itimerval *old)
-__NR_getpid 39 sys_getpid (void)
-__NR_socket 41 sys_socket (int domain, int type, int protocol)
-__NR_connect 42 sys_connect (int sockfd, struct sockaddr *addr, int addrlen)
-__NR_sendto 44 sys_sendto (int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len)
-__NR_recvfrom 45 sys_recvfrom (int sockfd, void *ubuf, size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len)
-__NR_sendmsg 46 sys_sendmsg (int sockfd, const struct msghdr *msg, int flags)
-__NR_recvmsg 47 sys_recvmsg (int sockfd, struct msghdr *msg, int flags)
-__NR_shutdown 48 sys_shutdown (int sockfd, int how)
-__NR_bind 49 sys_bind (int sockfd, const struct sockaddr *addr, int addrlen)
-__NR_setsockopt 54 sys_setsockopt (int sockfd, int level, int optname, const void *optval, socklen_t optlen)
-__NR_getsockopt 55 sys_getsockopt (int sockfd, int level, int optname, const void *optval, socklen_t *optlen)
-__NR_clone 56 sys_clone (unsigned long flags, void *child_stack, void *parent_tid, void *child_tid)
-__NR_exit 60 sys_exit (unsigned long error_code)
-__NR_wait4 61 sys_wait4 (int pid, int *status, int options, struct rusage *ru)
-__NR_kill 62 sys_kill (long pid, int sig)
-__NR_fcntl 72 sys_fcntl (int fd, int type, long arg)
-__NR_flock 73 sys_flock (int fd, unsigned long cmd)
-__NR_mkdir 83 sys_mkdir (const char *name, int mode)
-__NR_rmdir 84 sys_rmdir (const char *name)
-__NR_unlink 87 sys_unlink (char *pathname)
-__NR_umask 95 sys_umask (int mask)
-__NR_ptrace 101 sys_ptrace (long request, pid_t pid, void *addr, void *data)
-__NR_getgroups 115 sys_getgroups (int gsize, unsigned int *groups)
-__NR_setgroups 116 sys_setgroups (int gsize, unsigned int *groups)
-__NR_setresuid 117 sys_setresuid (int uid, int euid, int suid)
-__NR_getresuid 118 sys_getresuid (int *uid, int *euid, int *suid)
-__NR_setresgid 119 sys_setresgid (int gid, int egid, int sgid)
-__NR_getresgid 120 sys_getresgid (int *gid, int *egid, int *sgid)
-__NR_getpgid 121 sys_getpgid (pid_t pid)
-__NR_setfsuid 122 sys_setfsuid (int fsuid)
-__NR_setfsgid 123 sys_setfsgid (int fsgid)
-__NR_getsid 124 sys_getsid (void)
-__NR_capget 125 sys_capget (struct cap_header *h, struct cap_data *d)
-__NR_capset 126 sys_capset (struct cap_header *h, struct cap_data *d)
-__NR_rt_sigqueueinfo 129 sys_rt_sigqueueinfo (pid_t pid, int sig, siginfo_t *info)
-__NR_sigaltstack 131 sys_sigaltstack (const void *uss, void *uoss)
-__NR_personality 135 sys_personality (unsigned int personality)
-__NR_setpriority 141 sys_setpriority (int which, int who, int nice)
-__NR_sched_setscheduler 144 sys_sched_setscheduler (int pid, int policy, struct sched_param *p)
-__NR_prctl 157 sys_prctl (int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5)
-__NR_arch_prctl 158 sys_arch_prctl (int option, unsigned long addr)
-__NR_setrlimit 160 sys_setrlimit (int resource, struct krlimit *rlim)
-__NR_mount 165 sys_mount (char *dev_nmae, char *dir_name, char *type, unsigned long flags, void *data)
-__NR_umount2 166 sys_umount2 (char *name, int flags)
-__NR_gettid 186 sys_gettid (void)
-__NR_futex 202 sys_futex (u32 *uaddr, int op, u32 val, struct timespec *utime, u32 *uaddr2, u32 val3)
-__NR_set_thread_area 205 sys_set_thread_area (user_desc_t *info)
-__NR_io_setup 206 sys_io_setup (unsigned nr_events, aio_context_t *ctx)
-__NR_io_getevents 208 sys_io_getevents (aio_context_t ctx, long min_nr, long nr, struct io_event *evs, struct timespec *tmo)
-__NR_get_thread_area 211 sys_get_thread_area (user_desc_t *info)
-__NR_set_tid_address 218 sys_set_tid_address (int *tid_addr)
-__NR_restart_syscall 219 sys_restart_syscall (void)
-__NR_sys_timer_create 222 sys_timer_create (clockid_t which_clock, struct sigevent *timer_event_spec, kernel_timer_t *created_timer_id)
-__NR_sys_timer_settime 223 sys_timer_settime (kernel_timer_t timer_id, int flags, const struct itimerspec *new_setting, struct itimerspec *old_setting)
-__NR_sys_timer_gettime 224 sys_timer_gettime (int timer_id, const struct itimerspec *setting)
-__NR_sys_timer_getoverrun 225 sys_timer_getoverrun (int timer_id)
-__NR_sys_timer_delete 226 sys_timer_delete (kernel_timer_t timer_id)
-__NR_clock_gettime 228 sys_clock_gettime (const clockid_t which_clock, const struct timespec *tp)
-__NR_exit_group 231 sys_exit_group (int error_code)
-__NR_openat 257 sys_openat (int dfd, const char *filename, int flags, int mode)
-__NR_waitid 247 sys_waitid (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru)
-__NR_readlinkat 267 sys_readlinkat (int fd, const char *path, char *buf, int bufsize)
-__NR_set_robust_list 273 sys_set_robust_list (struct robust_list_head *head, size_t len)
-__NR_get_robust_list 274 sys_get_robust_list (int pid, struct robust_list_head **head_ptr, size_t *len_ptr)
-__NR_seccomp 317 sys_seccomp (unsigned int op, unsigned int flags, const char *uargs)
-__NR_vmsplice 278 sys_vmsplice (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags)
-__NR_timerfd_settime 286 sys_timerfd_settime (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr)
-__NR_signalfd4 289 sys_signalfd (int fd, k_rtsigset_t *mask, size_t sizemask, int flags)
-__NR_rt_tgsigqueueinfo 297 sys_rt_tgsigqueueinfo (pid_t tgid, pid_t pid, int sig, siginfo_t *info)
-__NR_fanotify_init 300 sys_fanotify_init (unsigned int flags, unsigned int event_f_flags)
-__NR_fanotify_mark 301 sys_fanotify_mark (int fanotify_fd, unsigned int flags, u64 mask, int dfd, const char *pathname)
-__NR_prlimit64 302 sys_prlimit64 (pid_t pid, unsigned int resource, const struct rlimit64 *new_rlim, struct rlimit64 *old_rlim)
-__NR_open_by_handle_at 304 sys_open_by_handle_at (int mountdirfd, struct file_handle *handle, int flags)
-__NR_setns 308 sys_setns (int fd, int nstype)
-__NR_kcmp 312 sys_kcmp (pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2)
-__NR_memfd_create 319 sys_memfd_create (const char *name, unsigned int flags)
diff --git a/arch/x86/vdso-pie.c b/arch/x86/vdso-pie.c
deleted file mode 100644
index b1e087cd8837..000000000000
--- a/arch/x86/vdso-pie.c
+++ /dev/null
@@ -1,59 +0,0 @@
-#include <unistd.h>
-
-#include "asm/string.h"
-#include "asm/types.h"
-
-#include "syscall.h"
-#include "parasite-vdso.h"
-#include "log.h"
-#include "bug.h"
-
-#ifdef LOG_PREFIX
-# undef LOG_PREFIX
-#endif
-#define LOG_PREFIX "vdso: "
-
-#ifdef CONFIG_X86_64
-typedef struct {
- u16 movabs;
- u64 imm64;
- u16 jmp_rax;
- u32 guards;
-} __packed jmp_t;
-
-int vdso_redirect_calls(unsigned long base_to, unsigned long base_from,
- struct vdso_symtable *to,
- struct vdso_symtable *from)
-{
- jmp_t jmp = {
- .movabs = 0xb848,
- .jmp_rax = 0xe0ff,
- .guards = 0xcccccccc,
- };
- unsigned int i;
-
- for (i = 0; i < ARRAY_SIZE(to->symbols); i++) {
- if (vdso_symbol_empty(&from->symbols[i]))
- continue;
-
- pr_debug("jmp: %lx/%lx -> %lx/%lx (index %d)\n",
- base_from, from->symbols[i].offset,
- base_to, to->symbols[i].offset, i);
-
- jmp.imm64 = base_to + to->symbols[i].offset;
- builtin_memcpy((void *)(base_from + from->symbols[i].offset), &jmp, sizeof(jmp));
- }
-
- return 0;
-}
-
-#else /* CONFIG_X86_64 */
-
-int vdso_redirect_calls(unsigned long base_to, unsigned long base_from,
- struct vdso_symtable *to,
- struct vdso_symtable *from)
-{
- return 0;
-}
-
-#endif /* CONFIG_X86_64 */
diff --git a/bfd.c b/bfd.c
deleted file mode 100644
index 66c318c6ee5b..000000000000
--- a/bfd.c
+++ /dev/null
@@ -1,327 +0,0 @@
-#include <unistd.h>
-#include <stdbool.h>
-#include <string.h>
-#include <stdio.h>
-#include <sys/mman.h>
-#include <fcntl.h>
-#include <sys/uio.h>
-#include <errno.h>
-
-#include "bug.h"
-#include "log.h"
-#include "bfd.h"
-#include "list.h"
-#include "util.h"
-#include "xmalloc.h"
-#include "asm/page.h"
-
-#undef LOG_PREFIX
-#define LOG_PREFIX "bfd: "
-
-/*
- * Kernel doesn't produce more than one page of
- * date per one read call on proc files.
- */
-#define BUFSIZE (PAGE_SIZE)
-
-struct bfd_buf {
- char *mem;
- struct list_head l;
-};
-
-static LIST_HEAD(bufs);
-
-#define BUFBATCH (16)
-
-static int buf_get(struct xbuf *xb)
-{
- struct bfd_buf *b;
-
- if (list_empty(&bufs)) {
- void *mem;
- int i;
-
- mem = mmap(NULL, BUFBATCH * BUFSIZE, PROT_READ | PROT_WRITE,
- MAP_PRIVATE | MAP_ANON, 0, 0);
- if (mem == MAP_FAILED) {
- pr_perror("No buf");
- return -1;
- }
-
- for (i = 0; i < BUFBATCH; i++) {
- b = xmalloc(sizeof(*b));
- if (!b) {
- if (i == 0) {
- pr_err("No buffer for bfd\n");
- return -1;
- }
-
- pr_warn("BFD buffers partial refil!\n");
- break;
- }
-
- b->mem = mem + i * BUFSIZE;
- list_add_tail(&b->l, &bufs);
- }
- }
-
- b = list_first_entry(&bufs, struct bfd_buf, l);
- list_del_init(&b->l);
-
- xb->mem = b->mem;
- xb->data = xb->mem;
- xb->sz = 0;
- xb->buf = b;
- return 0;
-}
-
-static void buf_put(struct xbuf *xb)
-{
- /*
- * Don't unmap buffer back, it will get reused
- * by next bfdopen call
- */
- list_add(&xb->buf->l, &bufs);
- xb->buf = NULL;
- xb->mem = NULL;
- xb->data = NULL;
-}
-
-static int bfdopen(struct bfd *f, bool writable)
-{
- if (buf_get(&f->b)) {
- close(f->fd);
- return -1;
- }
-
- f->writable = writable;
- return 0;
-}
-
-int bfdopenr(struct bfd *f)
-{
- return bfdopen(f, false);
-}
-
-int bfdopenw(struct bfd *f)
-{
- return bfdopen(f, true);
-}
-
-static int bflush(struct bfd *bfd);
-static bool flush_failed = false;
-
-int bfd_flush_images(void)
-{
- return flush_failed ? -1 : 0;
-}
-
-void bclose(struct bfd *f)
-{
- if (bfd_buffered(f)) {
- if (f->writable && bflush(f) < 0) {
- /*
- * This is to propagate error up. It's
- * hardly possible by returning and
- * checking it, but setting a static
- * flag, failing further bfdopen-s and
- * checking one at the end would work.
- */
- flush_failed = true;
- pr_perror("Error flushing image");
- }
-
- buf_put(&f->b);
- }
- close_safe(&f->fd);
-}
-
-static int brefill(struct bfd *f)
-{
- int ret;
- struct xbuf *b = &f->b;
-
- memmove(b->mem, b->data, b->sz);
- b->data = b->mem;
-
- ret = read(f->fd, b->mem + b->sz, BUFSIZE - b->sz);
- if (ret < 0) {
- pr_perror("Error reading file");
- return -1;
- }
-
- if (ret == 0)
- return 0;
-
- b->sz += ret;
- return 1;
-}
-
-static char *strnchr(char *str, unsigned int len, char c)
-{
- while (len > 0 && *str != c) {
- str++;
- len--;
- }
-
- return len == 0 ? NULL : str;
-}
-
-char *breadline(struct bfd *f)
-{
- return breadchr(f, '\n');
-}
-
-char *breadchr(struct bfd *f, char c)
-{
- struct xbuf *b = &f->b;
- bool refilled = false;
- char *n;
- unsigned int ss = 0;
-
-again:
- n = strnchr(b->data + ss, b->sz - ss, c);
- if (n) {
- char *ret;
-
- ret = b->data;
- b->data = n + 1; /* skip the \n found */
- *n = '\0';
- b->sz -= (b->data - ret);
- return ret;
- }
-
- if (refilled) {
- if (!b->sz)
- return NULL;
-
- /*
- * Last bytes may lack the \n at the
- * end, need to report this as full
- * line anyway
- */
- b->data[b->sz] = '\0';
-
- /*
- * The b->data still points to old data,
- * but we say that no bytes left there
- * so next call to breadline will not
- * "find" these bytes again.
- */
- b->sz = 0;
- return b->data;
- }
-
- /*
- * small optimization -- we've scanned b->sz
- * symols already, no need to re-scan them after
- * the buffer refill.
- */
- ss = b->sz;
-
- /* no full line in the buffer -- refill one */
- if (brefill(f) < 0)
- return ERR_PTR(-EIO);
-
- refilled = true;
-
- goto again;
-}
-
-static int bflush(struct bfd *bfd)
-{
- struct xbuf *b = &bfd->b;
- int ret;
-
- if (!b->sz)
- return 0;
-
- ret = write(bfd->fd, b->data, b->sz);
- if (ret != b->sz)
- return -1;
-
- b->sz = 0;
- return 0;
-}
-
-static int __bwrite(struct bfd *bfd, const void *buf, int size)
-{
- struct xbuf *b = &bfd->b;
-
- if (b->sz + size > BUFSIZE) {
- int ret;
- ret = bflush(bfd);
- if (ret < 0)
- return ret;
- }
-
- if (size > BUFSIZE)
- return write(bfd->fd, buf, size);
-
- memcpy(b->data + b->sz, buf, size);
- b->sz += size;
- return size;
-}
-
-int bwrite(struct bfd *bfd, const void *buf, int size)
-{
- if (!bfd_buffered(bfd))
- return write(bfd->fd, buf, size);
-
- return __bwrite(bfd, buf, size);
-}
-
-int bwritev(struct bfd *bfd, const struct iovec *iov, int cnt)
-{
- int i, written = 0;
-
- if (!bfd_buffered(bfd))
- return writev(bfd->fd, iov, cnt);
-
- for (i = 0; i < cnt; i++) {
- int ret;
-
- ret = __bwrite(bfd, (const void *)iov[i].iov_base, iov[i].iov_len);
- if (ret < 0)
- return ret;
-
- written += ret;
- if (ret < iov[i].iov_len)
- break;
- }
-
- return written;
-}
-
-int bread(struct bfd *bfd, void *buf, int size)
-{
- struct xbuf *b = &bfd->b;
- int more = 1, filled = 0;
-
- if (!bfd_buffered(bfd))
- return read(bfd->fd, buf, size);
-
- while (more > 0) {
- int chunk;
-
- chunk = size - filled;
- if (chunk > b->sz)
- chunk = b->sz;
-
- if (chunk) {
- memcpy(buf + filled, b->data, chunk);
- b->data += chunk;
- b->sz -= chunk;
- filled += chunk;
- }
-
- if (filled < size)
- more = brefill(bfd);
- else {
- BUG_ON(filled > size);
- more = 0;
- }
- }
-
- return more < 0 ? more : filled;
-}
diff --git a/bitmap.c b/bitmap.c
deleted file mode 100644
index 65a501e728cb..000000000000
--- a/bitmap.c
+++ /dev/null
@@ -1,54 +0,0 @@
-#include "asm/bitsperlong.h"
-
-#define BIT_WORD(nr) ((nr) / BITS_PER_LONG)
-
-#define BITMAP_FIRST_WORD_MASK(start) (~0ul << ((start) % BITS_PER_LONG))
-
-#define BITMAP_LAST_WORD_MASK(nbits) \
-( \
- ((nbits) % BITS_PER_LONG) ? \
- (1ul << ((nbits) % BITS_PER_LONG)) - 1 : ~0ul \
-)
-
-#define small_const_nbits(nbits) \
- (__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG)
-
-void bitmap_set(unsigned long *map, int start, int nr)
-{
- unsigned long *p = map + BIT_WORD(start);
- const int size = start + nr;
- int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG);
- unsigned long mask_to_set = BITMAP_FIRST_WORD_MASK(start);
-
- while (nr - bits_to_set >= 0) {
- *p |= mask_to_set;
- nr -= bits_to_set;
- bits_to_set = BITS_PER_LONG;
- mask_to_set = ~0UL;
- p++;
- }
- if (nr) {
- mask_to_set &= BITMAP_LAST_WORD_MASK(size);
- *p |= mask_to_set;
- }
-}
-
-void bitmap_clear(unsigned long *map, int start, int nr)
-{
- unsigned long *p = map + BIT_WORD(start);
- const int size = start + nr;
- int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG);
- unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start);
-
- while (nr - bits_to_clear >= 0) {
- *p &= ~mask_to_clear;
- nr -= bits_to_clear;
- bits_to_clear = BITS_PER_LONG;
- mask_to_clear = ~0UL;
- p++;
- }
- if (nr) {
- mask_to_clear &= BITMAP_LAST_WORD_MASK(size);
- *p &= ~mask_to_clear;
- }
-}
diff --git a/cgroup.c b/cgroup.c
deleted file mode 100644
index 704f144f0001..000000000000
--- a/cgroup.c
+++ /dev/null
@@ -1,1571 +0,0 @@
-#define LOG_PREFIX "cg: "
-#include <sys/types.h>
-#include <stdio.h>
-#include <string.h>
-#include <unistd.h>
-#include <sys/mount.h>
-#include <sys/stat.h>
-#include <ftw.h>
-#include <libgen.h>
-#include "list.h"
-#include "xmalloc.h"
-#include "cgroup.h"
-#include "cr_options.h"
-#include "pstree.h"
-#include "proc_parse.h"
-#include "util.h"
-#include "imgset.h"
-#include "util-pie.h"
-#include "namespaces.h"
-#include "seize.h"
-#include "protobuf.h"
-#include "protobuf/core.pb-c.h"
-#include "protobuf/cgroup.pb-c.h"
-
-/*
- * These string arrays have the names of all the properties that will be
- * restored. To add a property for a cgroup type, add it to the
- * corresponding char array above the NULL terminator. If you are adding
- * a new cgroup family all together, you must also edit get_known_properties()
- * Currently the code only supports properties with 1 value
- */
-
-static const char *cpu_props[] = {
- "cpu.shares",
- "cpu.cfs_period_us",
- "cpu.cfs_quota_us",
- "cpu.rt_period_us",
- "cpu.rt_runtime_us",
- "notify_on_release",
- NULL
-};
-
-static const char *memory_props[] = {
- /* limit_in_bytes and memsw.limit_in_bytes must be set in this order */
- "memory.limit_in_bytes",
- "memory.memsw.limit_in_bytes",
- "memory.use_hierarchy",
- "notify_on_release",
- NULL
-};
-
-static const char *cpuset_props[] = {
- /*
- * cpuset.cpus and cpuset.mems must be set before the process moves
- * into its cgroup; they are "initialized" below to whatever the root
- * values are in copy_special_cg_props so as not to cause ENOSPC when
- * values are restored via this code.
- */
- "cpuset.cpus",
- "cpuset.mems",
- "cpuset.memory_migrate",
- "cpuset.cpu_exclusive",
- "cpuset.mem_exclusive",
- "cpuset.mem_hardwall",
- "cpuset.memory_spread_page",
- "cpuset.memory_spread_slab",
- "cpuset.sched_load_balance",
- "cpuset.sched_relax_domain_level",
- "notify_on_release",
- NULL
-};
-
-static const char *blkio_props[] = {
- "blkio.weight",
- "notify_on_release",
- NULL
-};
-
-static const char *freezer_props[] = {
- "notify_on_release",
- NULL
-};
-
-static const char *global_props[] = {
- "cgroup.clone_children",
- "notify_on_release",
- "cgroup.procs",
- "tasks",
- NULL
-};
-
-/*
- * This structure describes set of controller groups
- * a task lives in. The cg_ctl entries are stored in
- * the @ctls list sorted by the .name field and then
- * by the .path field.
- */
-
-struct cg_set {
- u32 id;
- struct list_head l;
- unsigned int n_ctls;
- struct list_head ctls;
-};
-
-static LIST_HEAD(cg_sets);
-static unsigned int n_sets;
-static CgSetEntry **rst_sets;
-static unsigned int n_controllers;
-static CgControllerEntry **controllers;
-static char *cg_yard;
-static struct cg_set *root_cgset; /* Set root item lives in */
-static struct cg_set *criu_cgset; /* Set criu process lives in */
-static u32 cg_set_ids = 1;
-
-static LIST_HEAD(cgroups);
-static unsigned int n_cgroups;
-
-static CgSetEntry *find_rst_set_by_id(u32 id)
-{
- int i;
-
- for (i = 0; i < n_sets; i++)
- if (rst_sets[i]->id == id)
- return rst_sets[i];
-
- return NULL;
-}
-
-#define CGCMP_MATCH 1 /* check for exact match */
-#define CGCMP_ISSUB 2 /* check set is subset of ctls */
-
-static bool cg_set_compare(struct cg_set *set, struct list_head *ctls, int what)
-{
- struct list_head *l1 = &set->ctls, *l2 = ctls;
-
- while (1) {
- struct cg_ctl *c1 = NULL, *c2 = NULL;
-
- if (l1->next != &set->ctls)
- c1 = list_first_entry(l1, struct cg_ctl, l);
- if (l2->next != ctls)
- c2 = list_first_entry(l2, struct cg_ctl, l);
-
- if (!c1 || !c2) /* Nowhere to move next */
- return !c1 && !c2; /* Both lists scanned -- match */
-
- if (strcmp(c1->name, c2->name))
- return false;
-
- switch (what) {
- case CGCMP_MATCH:
- if (strcmp(c1->path, c2->path))
- return false;
-
- break;
- case CGCMP_ISSUB:
- if (!strstartswith(c1->path, c2->path))
- return false;
-
- break;
- }
-
- l1 = l1->next;
- l2 = l2->next;
- }
-}
-
-static struct cg_set *get_cg_set(struct list_head *ctls, unsigned int n_ctls)
-{
- struct cg_set *cs;
-
- list_for_each_entry(cs, &cg_sets, l)
- if (cg_set_compare(cs, ctls, CGCMP_MATCH)) {
- pr_debug(" `- Existing css %d found\n", cs->id);
- put_ctls(ctls);
- return cs;
- }
-
- pr_debug(" `- New css ID %d\n", cg_set_ids);
- cs = xmalloc(sizeof(*cs));
- if (cs) {
- cs->id = cg_set_ids++;
- INIT_LIST_HEAD(&cs->ctls);
- list_splice(ctls, &cs->ctls);
- cs->n_ctls = n_ctls;
- list_add_tail(&cs->l, &cg_sets);
- n_sets++;
-
- if (!pr_quelled(LOG_DEBUG)) {
- struct cg_ctl *ctl;
-
- list_for_each_entry(ctl, &cs->ctls, l)
- pr_debug(" `- [%s] -> [%s]\n", ctl->name, ctl->path);
- }
- }
-
- return cs;
-}
-
-struct cg_controller *new_controller(const char *name)
-{
- struct cg_controller *nc = xmalloc(sizeof(*nc));
- if (!nc)
- return NULL;
-
- nc->controllers = xmalloc(sizeof(char *));
- if (!nc->controllers) {
- xfree(nc);
- return NULL;
- }
-
- nc->controllers[0] = xstrdup(name);
- if (!nc->controllers[0]) {
- xfree(nc->controllers);
- xfree(nc);
- return NULL;
- }
-
- nc->n_controllers = 1;
-
- nc->n_heads = 0;
- INIT_LIST_HEAD(&nc->heads);
-
- return nc;
-}
-
-int parse_cg_info(void)
-{
- if (collect_controllers(&cgroups, &n_cgroups) < 0)
- return -1;
-
- return 0;
-}
-
-/* Check that co-mounted controllers from /proc/cgroups (e.g. cpu and cpuacct)
- * are contained in a comma separated string (e.g. from /proc/self/cgroup or
- * mount options). */
-static bool cgroup_contains(char **controllers, unsigned int n_controllers, char *name)
-{
- unsigned int i;
- bool all_match = true;
- for (i = 0; i < n_controllers; i++) {
- bool found = false;
- const char *loc = name;
- do {
- loc = strstr(loc, controllers[i]);
- if (loc) {
- loc += strlen(controllers[i]);
- switch (*loc) {
- case '\0':
- case ',':
- found = true;
- break;
- }
- }
- } while (loc);
- all_match &= found;
- }
-
- return all_match && n_controllers > 0;
-}
-
-/* This is for use in add_cgroup() as additional arguments for the ftw()
- * callback */
-static struct cg_controller *current_controller;
-static unsigned int path_pref_len;
-
-#define EXACT_MATCH 0
-#define PARENT_MATCH 1
-#define NO_MATCH 2
-
-static int find_dir(const char *path, struct list_head *dirs, struct cgroup_dir **rdir)
-{
- struct cgroup_dir *d;
- list_for_each_entry(d, dirs, siblings) {
- if (strcmp(d->path, path) == 0) {
- *rdir = d;
- return EXACT_MATCH;
- }
-
- if (strstartswith(path, d->path)) {
- int ret = find_dir(path, &d->children, rdir);
- if (ret == NO_MATCH) {
- *rdir = d;
- return PARENT_MATCH;
- }
- return ret;
-
- }
- }
-
- return NO_MATCH;
-}
-
-/*
- * Strips trailing '\n' from the string
- */
-static inline char *strip(char *str)
-{
- char *e;
-
- e = strchr(str, '\0');
- if (e != str && *(e - 1) == '\n')
- *(e - 1) = '\0';
-
- return str;
-}
-
-/*
- * Currently this function only supports properties that have a string value
- * under 1024 chars.
- */
-static int read_cgroup_prop(struct cgroup_prop *property, const char *fullpath)
-{
- char buf[1024];
- int fd, ret;
- struct stat sb;
-
- fd = open(fullpath, O_RDONLY);
- if (fd == -1) {
- property->value = NULL;
- pr_perror("Failed opening %s", fullpath);
- return -1;
- }
-
- if (fstat(fd, &sb) < 0) {
- pr_perror("failed statting cgroup prop %s", fullpath);
- close(fd);
- return -1;
- }
-
- property->mode = sb.st_mode;
- property->uid = sb.st_uid;
- property->gid = sb.st_gid;
-
- /* skip dumping the value of these, since it doesn't make sense (we
- * just want to restore the perms) */
- if (!strcmp(property->name, "cgroup.procs") || !strcmp(property->name, "tasks")) {
- ret = 0;
- /* libprotobuf segfaults if we leave a null pointer in a
- * string, so let's not do that */
- property->value = xstrdup("");
- if (!property->value)
- ret = -1;
-
- close(fd);
- return ret;
- }
-
- ret = read(fd, buf, sizeof(buf) - 1);
- if (ret == -1) {
- pr_err("Failed scanning %s\n", fullpath);
- close(fd);
- return -1;
- }
- close(fd);
-
- buf[ret] = 0;
-
- if (strtoll(buf, NULL, 10) == LLONG_MAX)
- strcpy(buf, "-1");
-
- property->value = xstrdup(strip(buf));
- if (!property->value)
- return -1;
- return 0;
-}
-
-static struct cgroup_prop *create_cgroup_prop(const char *name)
-{
- struct cgroup_prop *property;
-
- property = xmalloc(sizeof(*property));
- if (!property)
- return NULL;
-
- property->name = xstrdup(name);
- if (!property->name) {
- xfree(property);
- return NULL;
- }
-
- property->value = NULL;
- return property;
-}
-
-static void free_cgroup_prop(struct cgroup_prop *prop)
-{
- xfree(prop->name);
- xfree(prop->value);
- xfree(prop);
-}
-
-static void free_all_cgroup_props(struct cgroup_dir *ncd)
-{
- struct cgroup_prop *prop, *t;
-
- list_for_each_entry_safe(prop, t, &ncd->properties, list) {
- list_del(&prop->list);
- free_cgroup_prop(prop);
- }
-
- INIT_LIST_HEAD(&ncd->properties);
- ncd->n_properties = 0;
-}
-
-static const char **get_known_properties(char *controller)
-{
- const char **prop_arr = NULL;
-
- if (!strcmp(controller, "cpu"))
- prop_arr = cpu_props;
- else if (!strcmp(controller, "memory"))
- prop_arr = memory_props;
- else if (!strcmp(controller, "cpuset"))
- prop_arr = cpuset_props;
- else if (!strcmp(controller, "blkio"))
- prop_arr = blkio_props;
- else if (!strcmp(controller, "freezer"))
- prop_arr = freezer_props;
-
- return prop_arr;
-}
-
-static int dump_cg_props_array(const char *fpath, struct cgroup_dir *ncd,
- const char **prop_arr)
-{
- int j;
- char buf[PATH_MAX];
- struct cgroup_prop *prop;
-
- for (j = 0; prop_arr != NULL && prop_arr[j] != NULL; ++j) {
- if (snprintf(buf, PATH_MAX, "%s/%s", fpath, prop_arr[j]) >= PATH_MAX) {
- pr_err("snprintf output was truncated\n");
- return -1;
- }
-
- if (access(buf, F_OK) < 0 && errno == ENOENT) {
- pr_info("Couldn't open %s. This cgroup property may not exist on this kernel\n", buf);
- continue;
- }
-
- prop = create_cgroup_prop(prop_arr[j]);
- if (!prop) {
- free_all_cgroup_props(ncd);
- return -1;
- }
-
- if (read_cgroup_prop(prop, buf) < 0) {
- free_cgroup_prop(prop);
- free_all_cgroup_props(ncd);
- return -1;
- }
-
- pr_info("Dumping value %s from %s/%s\n", prop->value, fpath, prop->name);
- list_add_tail(&prop->list, &ncd->properties);
- ncd->n_properties++;
- }
-
- return 0;
-}
-
-static int add_cgroup_properties(const char *fpath, struct cgroup_dir *ncd,
- struct cg_controller *controller)
-{
- int i;
-
- for (i = 0; i < controller->n_controllers; ++i) {
-
- const char **prop_arr = get_known_properties(controller->controllers[i]);
-
- if (dump_cg_props_array(fpath, ncd, prop_arr) < 0) {
- pr_err("dumping known properties failed");
- return -1;
- }
-
- if (dump_cg_props_array(fpath, ncd, global_props) < 0) {
- pr_err("dumping global properties failed");
- return -1;
- }
- }
-
- return 0;
-}
-
-static int add_cgroup(const char *fpath, const struct stat *sb, int typeflag)
-{
- struct cgroup_dir *ncd = NULL, *match;
- int exit_code = -1;
-
- if (typeflag == FTW_D) {
- int mtype;
-
- pr_info("adding cgroup %s\n", fpath);
-
- ncd = xmalloc(sizeof(*ncd));
- if (!ncd)
- goto out;
-
- ncd->mode = sb->st_mode;
- ncd->uid = sb->st_uid;
- ncd->gid = sb->st_gid;
-
- /* chop off the first "/proc/self/fd/N" str */
- if (fpath[path_pref_len] == '\0')
- ncd->path = xstrdup("/");
- else
- ncd->path = xstrdup(fpath + path_pref_len);
-
- if (!ncd->path)
- goto out;
-
- mtype = find_dir(ncd->path, ¤t_controller->heads, &match);
-
- switch (mtype) {
- /* ignore co-mounted cgroups */
- case EXACT_MATCH:
- exit_code = 0;
- goto out;
- case PARENT_MATCH:
- list_add_tail(&ncd->siblings, &match->children);
- match->n_children++;
- break;
- case NO_MATCH:
- list_add_tail(&ncd->siblings, ¤t_controller->heads);
- current_controller->n_heads++;
- break;
- default:
- BUG();
- }
-
- INIT_LIST_HEAD(&ncd->children);
- ncd->n_children = 0;
-
- INIT_LIST_HEAD(&ncd->properties);
- ncd->n_properties = 0;
- if (add_cgroup_properties(fpath, ncd, current_controller) < 0)
- goto out;
- }
-
- return 0;
-
-out:
- if (ncd)
- xfree(ncd->path);
- xfree(ncd);
- return exit_code;
-}
-
-static int add_freezer_state(struct cg_controller *controller)
-{
- struct cgroup_dir *root_dir;
- struct cgroup_prop *prop;
-
- /*
- * Here we rely on --freeze-cgroup option assumption that all tasks are in a
- * specified freezer cgroup hierarchy, so we need to dump only one root freezer cgroup.
- */
- if (!list_is_singular(&controller->heads)) {
- pr_err("Should be only one root freezer cgroup");
- return -1;
- }
- root_dir = list_first_entry(&controller->heads, struct cgroup_dir, siblings);
-
- prop = create_cgroup_prop("freezer.state");
- if (!prop)
- return -1;
- prop->value = xstrdup(get_real_freezer_state());
- if (!prop->value) {
- free_cgroup_prop(prop);
- return -1;
- }
-
- list_add_tail(&prop->list, &root_dir->properties);
- root_dir->n_properties++;
-
- return 0;
-}
-
-static int collect_cgroups(struct list_head *ctls)
-{
- struct cg_ctl *cc;
- int ret = 0;
- int fd = -1;
-
- list_for_each_entry(cc, ctls, l) {
- char path[PATH_MAX], mopts[1024];
- char *name, prefix[] = ".criu.cgmounts.XXXXXX";
- struct cg_controller *cg;
-
- current_controller = NULL;
-
- /* We should get all the "real" (i.e. not name=systemd type)
- * controller from parse_cgroups(), so find that controller if
- * it exists. */
- list_for_each_entry(cg, &cgroups, l) {
- if (cgroup_contains(cg->controllers, cg->n_controllers, cc->name)) {
- current_controller = cg;
- break;
- }
- }
-
- if (!current_controller) {
- /* only allow "fake" controllers to be created this way */
- if (!strstartswith(cc->name, "name=")) {
- pr_err("controller %s not found\n", cc->name);
- return -1;
- } else {
- struct cg_controller *nc = new_controller(cc->name);
- list_add_tail(&nc->l, &cg->l);
- n_cgroups++;
- current_controller = nc;
- }
- }
-
- if (!opts.manage_cgroups)
- continue;
-
- if (strstartswith(cc->name, "name=")) {
- name = cc->name + 5;
- snprintf(mopts, sizeof(mopts), "none,%s", cc->name);
- } else {
- name = cc->name;
- snprintf(mopts, sizeof(mopts), "%s", name);
- }
-
- if (mkdtemp(prefix) == NULL) {
- pr_perror("can't make dir for cg mounts");
- return -1;
- }
-
- if (mount("none", prefix, "cgroup", 0, mopts) < 0) {
- pr_perror("couldn't mount %s", mopts);
- rmdir(prefix);
- return -1;
- }
-
- fd = open_detach_mount(prefix);
- if (fd < 0)
- return -1;
-
- path_pref_len = snprintf(path, PATH_MAX, "/proc/self/fd/%d", fd);
- snprintf(path + path_pref_len, PATH_MAX - path_pref_len, "%s", cc->path);
-
- ret = ftw(path, add_cgroup, 4);
- if (ret < 0)
- pr_perror("failed walking %s for empty cgroups", path);
-
- close_safe(&fd);
-
- if (ret < 0)
- return ret;
-
- if (opts.freeze_cgroup && !strcmp(cc->name, "freezer") &&
- add_freezer_state(current_controller))
- return -1;
- }
-
- return 0;
-}
-
-int dump_task_cgroup(struct pstree_item *item, u32 *cg_id)
-{
- int pid;
- LIST_HEAD(ctls);
- unsigned int n_ctls = 0;
- struct cg_set *cs;
-
- if (item)
- pid = item->pid.real;
- else
- pid = getpid();
-
- pr_info("Dumping cgroups for %d\n", pid);
- if (parse_task_cgroup(pid, &ctls, &n_ctls))
- return -1;
-
- cs = get_cg_set(&ctls, n_ctls);
- if (!cs)
- return -1;
-
- if (!item) {
- BUG_ON(criu_cgset);
- criu_cgset = cs;
- pr_info("Set %d is criu one\n", cs->id);
- } else if (item == root_item) {
- BUG_ON(root_cgset);
- root_cgset = cs;
- pr_info("Set %d is root one\n", cs->id);
-
- /*
- * The on-stack ctls is moved into cs inside
- * the get_cg_set routine.
- */
- if (cs != criu_cgset && collect_cgroups(&cs->ctls))
- return -1;
- }
-
- *cg_id = cs->id;
- return 0;
-}
-
-static int dump_cg_dir_props(struct list_head *props, size_t n_props,
- CgroupPropEntry ***ents)
-{
- struct cgroup_prop *prop_cur;
- CgroupPropEntry *cpe;
- void *m;
- int i = 0;
-
- m = xmalloc(n_props * (sizeof(CgroupPropEntry *) + sizeof(CgroupPropEntry)));
- *ents = m;
- if (!m)
- return -1;
-
- cpe = m + n_props * sizeof(CgroupPropEntry *);
-
- list_for_each_entry(prop_cur, props, list) {
- cgroup_prop_entry__init(cpe);
-
- cpe->perms = xmalloc(sizeof(*cpe->perms));
- if (!cpe->perms)
- goto error;
- cgroup_perms__init(cpe->perms);
-
- cpe->name = xstrdup(prop_cur->name);
- cpe->value = xstrdup(prop_cur->value);
- if (!cpe->name || !cpe->value)
- goto error;
- cpe->perms->mode = prop_cur->mode;
- cpe->perms->uid = prop_cur->uid;
- cpe->perms->gid = prop_cur->gid;
-
- (*ents)[i++] = cpe++;
- }
-
- return 0;
-
-error:
- while (i >= 0) {
- xfree(cpe->name);
- xfree(cpe->value);
- --cpe;
- --i;
- }
-
- xfree(*ents);
- return -1;
-}
-
-static int dump_cg_dirs(struct list_head *dirs, size_t n_dirs, CgroupDirEntry ***ents, int poff)
-{
- struct cgroup_dir *cur;
- CgroupDirEntry *cde;
- void *m;
- int i = 0;
-
- m = xmalloc(n_dirs * (sizeof(CgroupDirEntry *) + sizeof(CgroupDirEntry)));
- *ents = m;
- if (!m)
- return -1;
-
- cde = m + n_dirs * sizeof(CgroupDirEntry *);
-
- list_for_each_entry(cur, dirs, siblings) {
- cgroup_dir_entry__init(cde);
-
- cde->dir_perms = xmalloc(sizeof(*cde->dir_perms));
- if (!cde->dir_perms)
- return -1;
- cgroup_perms__init(cde->dir_perms);
-
- cde->dir_perms->mode = cur->mode;
- cde->dir_perms->uid = cur->uid;
- cde->dir_perms->gid = cur->gid;
-
- cde->dir_name = cur->path + poff;
- if (poff != 1) /* parent isn't "/" */
- cde->dir_name++; /* leading / */
- cde->n_children = cur->n_children;
- if (cur->n_children > 0)
- if (dump_cg_dirs(&cur->children, cur->n_children, &cde->children, strlen(cur->path)) < 0) {
- xfree(*ents);
- return -1;
- }
-
- cde->n_properties = cur->n_properties;
- if (cde->n_properties > 0) {
- if (dump_cg_dir_props(&cur->properties,
- cde->n_properties, &cde->properties) < 0) {
- xfree(*ents);
- return -1;
- }
- }
-
- (*ents)[i++] = cde++;
- }
-
- return 0;
-}
-
-static int dump_controllers(CgroupEntry *cg)
-{
- struct cg_controller *cur;
- CgControllerEntry *ce;
- void *m;
- int i;
-
- cg->n_controllers = n_cgroups;
- m = xmalloc(n_cgroups * (sizeof(CgControllerEntry *) + sizeof(CgControllerEntry)));
- cg->controllers = m;
- ce = m + cg->n_controllers * sizeof(CgControllerEntry *);
- if (!m)
- return -1;
-
- i = 0;
- list_for_each_entry(cur, &cgroups, l) {
- cg_controller_entry__init(ce);
-
- ce->cnames = cur->controllers;
- ce->n_cnames = cur->n_controllers;
- ce->n_dirs = cur->n_heads;
- if (ce->n_dirs > 0)
- if (dump_cg_dirs(&cur->heads, cur->n_heads, &ce->dirs, 0) < 0) {
- xfree(cg->controllers);
- return -1;
- }
- cg->controllers[i++] = ce++;
- }
-
- return 0;
-}
-
-
-static int dump_sets(CgroupEntry *cg)
-{
- struct cg_set *set;
- struct cg_ctl *ctl;
- int s, c;
- void *m;
- CgSetEntry *se;
- CgMemberEntry *ce;
-
- pr_info("Dumping %d sets\n", n_sets - 1);
-
- cg->n_sets = n_sets - 1;
- m = xmalloc(cg->n_sets * (sizeof(CgSetEntry *) + sizeof(CgSetEntry)));
- cg->sets = m;
- se = m + cg->n_sets * sizeof(CgSetEntry *);
- if (!m)
- return -1;
-
- s = 0;
- list_for_each_entry(set, &cg_sets, l) {
- if (set == criu_cgset)
- continue;
-
- /*
- * Check that all sets we've found that tasks live in are
- * subsets of the one root task lives in
- */
-
- pr_info(" `- Dumping %d set (%d ctls)\n", set->id, set->n_ctls);
- if (!cg_set_compare(set, &root_cgset->ctls, CGCMP_ISSUB)) {
- pr_err("Set %d is not subset of %d\n",
- set->id, root_cgset->id);
-
- list_for_each_entry(ctl, &set->ctls, l)
- pr_info(" `- %s of %s\n", ctl->name, ctl->path);
- return -1;
- }
-
- /*
- * Now encode them onto the image entry
- */
-
- cg_set_entry__init(se);
- se->id = set->id;
-
- se->n_ctls = set->n_ctls;
- m = xmalloc(se->n_ctls * (sizeof(CgMemberEntry *) + sizeof(CgMemberEntry)));
- se->ctls = m;
- ce = m + se->n_ctls * sizeof(CgMemberEntry *);
- if (!m)
- return -1;
-
- c = 0;
- list_for_each_entry(ctl, &set->ctls, l) {
- pr_info(" `- Dumping %s of %s\n", ctl->name, ctl->path);
- cg_member_entry__init(ce);
- ce->name = ctl->name;
- ce->path = ctl->path;
- se->ctls[c++] = ce++;
- }
-
- cg->sets[s++] = se++;
- }
-
- return 0;
-}
-
-int dump_cgroups(void)
-{
- CgroupEntry cg = CGROUP_ENTRY__INIT;
-
- BUG_ON(!criu_cgset || !root_cgset);
-
- /*
- * Check whether root task lives in its own set as compared
- * to criu. If yes, we should not dump anything, but make
- * sure no other sets exist. The latter case can be supported,
- * but requires some trickery and is hardly needed at the
- * moment.
- */
-
- if (root_cgset == criu_cgset) {
- if (!list_is_singular(&cg_sets)) {
- pr_err("Non supported sub-cgroups found\n");
- return -1;
- }
-
- pr_info("All tasks in criu's cgroups. Nothing to dump.\n");
- return 0;
- }
-
- if (dump_sets(&cg))
- return -1;
- if (dump_controllers(&cg))
- return -1;
-
- pr_info("Writing CG image\n");
- return pb_write_one(img_from_set(glob_imgset, CR_FD_CGROUP), &cg, PB_CGROUP);
-}
-
-static int ctrl_dir_and_opt(CgControllerEntry *ctl, char *dir, int ds,
- char *opt, int os)
-{
- int i, doff = 0, ooff = 0;
- bool none_opt = false;
-
- for (i = 0; i < ctl->n_cnames; i++) {
- char *n;
-
- n = ctl->cnames[i];
- if (strstartswith(n, "name=")) {
- n += 5;
- if (opt && !none_opt) {
- ooff += snprintf(opt + ooff, os - ooff, "none,");
- none_opt = true;
- }
- }
-
- doff += snprintf(dir + doff, ds - doff, "%s,", n);
- if (opt)
- ooff += snprintf(opt + ooff, os - ooff, "%s,", ctl->cnames[i]);
- }
-
- /* Chop the trailing ','-s */
- dir[--doff] = '\0';
- if (opt)
- opt[ooff - 1] = '\0';
-
- return doff;
-}
-
-static const char *special_cpuset_props[] = {
- "cpuset.cpus",
- "cpuset.mems",
- NULL,
-};
-
-static int userns_move(void *arg, int fd, pid_t pid)
-{
- char pidbuf[32];
- int cg, len, err;
-
- len = snprintf(pidbuf, sizeof(pidbuf), "%d", pid);
-
- if (len >= sizeof(pidbuf)) {
- pr_err("pid printing failed: %d\n", pid);
- return -1;
- }
-
- cg = get_service_fd(CGROUP_YARD);
- err = fd = openat(cg, arg, O_WRONLY);
- if (fd >= 0) {
- err = write(fd, pidbuf, len);
- close(fd);
- }
-
- if (err < 0) {
- pr_perror("Can't move %s into %s (%d/%d)", pidbuf, (char *)arg, err, fd);
- return -1;
- }
-
- return 0;
-}
-
-static int move_in_cgroup(CgSetEntry *se)
-{
- int i;
-
- pr_info("Move into %d\n", se->id);
- for (i = 0; i < se->n_ctls; i++) {
- char aux[PATH_MAX];
- int fd = -1, err, j, aux_off;
- CgMemberEntry *ce = se->ctls[i];
- CgControllerEntry *ctrl = NULL;
-
- for (j = 0; j < n_controllers; j++) {
- CgControllerEntry *cur = controllers[j];
- if (cgroup_contains(cur->cnames, cur->n_cnames, ce->name)) {
- ctrl = cur;
- break;
- }
- }
-
- if (!ctrl) {
- pr_err("No cg_controller_entry found for %s/%s\n", ce->name, ce->path);
- return -1;
- }
-
- aux_off = ctrl_dir_and_opt(ctrl, aux, sizeof(aux), NULL, 0);
-
- snprintf(aux + aux_off, sizeof(aux) - aux_off, "/%s/tasks", ce->path);
- pr_debug(" `-> %s\n", aux);
- err = userns_call(userns_move, UNS_ASYNC, aux, strlen(aux) + 1, -1);
- if (err < 0) {
- pr_perror("Can't move into %s (%d/%d)", aux, err, fd);
- return -1;
- }
- }
-
- return 0;
-}
-
-int prepare_task_cgroup(struct pstree_item *me)
-{
- CgSetEntry *se;
- u32 current_cgset;
-
- if (!rsti(me)->cg_set)
- return 0;
-
- if (me->parent)
- current_cgset = rsti(me->parent)->cg_set;
- else
- current_cgset = root_cg_set;
-
- if (rsti(me)->cg_set == current_cgset) {
- pr_info("Cgroups %d inherited from parent\n", current_cgset);
- return 0;
- }
-
- se = find_rst_set_by_id(rsti(me)->cg_set);
- if (!se) {
- pr_err("No set %d found\n", rsti(me)->cg_set);
- return -1;
- }
-
- return move_in_cgroup(se);
-}
-
-void fini_cgroup(void)
-{
- if (!cg_yard)
- return;
-
- close_service_fd(CGROUP_YARD);
- umount2(cg_yard, MNT_DETACH);
- rmdir(cg_yard);
- xfree(cg_yard);
- cg_yard = NULL;
-}
-
-static int restore_perms(int fd, const char *path, CgroupPerms *perms)
-{
- struct stat sb;
-
- if (perms) {
- if (fstat(fd, &sb) < 0) {
- pr_perror("stat of property %s failed", path);
- return -1;
- }
-
- /* only chmod/chown if the perms are actually different: we aren't
- * allowed to chmod some cgroup props (e.g. the read only ones), so we
- * don't want to try if the perms already match.
- */
- if (sb.st_mode != (mode_t) perms->mode && fchmod(fd, perms->mode) < 0) {
- pr_perror("chmod of %s failed", path);
- return -1;
- }
-
- if ((sb.st_uid != perms->uid || sb.st_gid != perms->gid) &&
- fchown(fd, perms->uid, perms->gid)) {
- pr_perror("chown of %s failed", path);
- return -1;
- }
- }
-
- return 0;
-}
-
-static int restore_cgroup_prop(const CgroupPropEntry * cg_prop_entry_p,
- char *path, int off)
-{
- FILE *f;
- int cg, fd;
- CgroupPerms *perms = cg_prop_entry_p->perms;
-
- if (!cg_prop_entry_p->value) {
- pr_err("cg_prop_entry->value was empty when should have had a value\n");
- return -1;
- }
-
- if (snprintf(path + off, PATH_MAX - off, "/%s", cg_prop_entry_p->name) >= PATH_MAX) {
- pr_err("snprintf output was truncated for %s\n", cg_prop_entry_p->name);
- return -1;
- }
-
- pr_info("Restoring cgroup property value [%s] to [%s]\n", cg_prop_entry_p->value, path);
-
- cg = get_service_fd(CGROUP_YARD);
- f = fopenat(cg, path, "w+");
- if (!f) {
- pr_perror("Failed opening %s for writing", path);
- return -1;
- }
-
- fd = fileno(f);
- if (fd < 0) {
- fclose(f);
- pr_err("bad file stream?");
- return -1;
- }
-
- if (restore_perms(fd, path, perms) < 0) {
- fclose(f);
- return -1;
- }
-
- /* skip these two since restoring their values doesn't make sense */
- if (!strcmp(cg_prop_entry_p->name, "cgroup.procs") || !strcmp(cg_prop_entry_p->name, "tasks")) {
- fclose(f);
- return 0;
- }
-
- if (fprintf(f, "%s", cg_prop_entry_p->value) < 0) {
- fclose(f);
- pr_err("Failed writing %s to %s\n", cg_prop_entry_p->value, path);
- return -1;
- }
-
- if (fclose(f) != 0) {
- pr_perror("Failed closing %s", path);
- return -1;
- }
-
- return 0;
-}
-
-static CgroupPropEntry *freezer_state_entry;
-static char freezer_path[PATH_MAX];
-
-int restore_freezer_state(void)
-{
- size_t freezer_path_len;
-
- if (!freezer_state_entry)
- return 0;
-
- freezer_path_len = strlen(freezer_path);
- return restore_cgroup_prop(freezer_state_entry, freezer_path, freezer_path_len);
-}
-
-static void add_freezer_state_for_restore(CgroupPropEntry *entry, char *path, size_t path_len)
-{
- BUG_ON(freezer_state_entry);
- BUG_ON(path_len >= sizeof(freezer_path));
-
- freezer_state_entry = entry;
- /* Path is not null terminated at path_len */
- strncpy(freezer_path, path, path_len);
- freezer_path[path_len] = 0;
-}
-
-static int prepare_cgroup_dir_properties(char *path, int off, CgroupDirEntry **ents,
- unsigned int n_ents)
-{
- unsigned int i, j;
-
- for (i = 0; i < n_ents; i++) {
- CgroupDirEntry *e = ents[i];
- size_t off2 = off;
-
- if (strcmp(e->dir_name, "") == 0)
- goto skip; /* skip root cgroups */
-
- off2 += sprintf(path + off, "/%s", e->dir_name);
- if (e->n_properties > 0) {
- for (j = 0; j < e->n_properties; ++j) {
- if (!strcmp(e->properties[j]->name, "freezer.state")) {
- add_freezer_state_for_restore(e->properties[j], path, off2);
- continue; /* skip restore now */
- }
- if (restore_cgroup_prop(e->properties[j], path, off2) < 0)
- return -1;
- }
- }
-skip:
- if (prepare_cgroup_dir_properties(path, off2, e->children, e->n_children) < 0)
- return -1;
- }
-
- return 0;
-}
-
-int prepare_cgroup_properties(void)
-{
- char cname_path[PATH_MAX];
- unsigned int i, off;
-
- for (i = 0; i < n_controllers; i++) {
- CgControllerEntry *c = controllers[i];
-
- if (c->n_cnames < 1) {
- pr_err("Each CgControllerEntry should have at least 1 cname\n");
- return -1;
- }
-
- off = ctrl_dir_and_opt(c, cname_path, sizeof(cname_path), NULL, 0);
- if (prepare_cgroup_dir_properties(cname_path, off, c->dirs, c->n_dirs) < 0)
- return -1;
- }
-
- return 0;
-}
-
-static int restore_special_cpuset_props(char *paux, size_t off, CgroupDirEntry *e)
-{
- int i, j;
-
- pr_info("Restore special cpuset props\n");
-
- for (i = 0; special_cpuset_props[i]; i++) {
- const char *name = special_cpuset_props[i];
-
- for (j = 0; j < e->n_properties; j++) {
- CgroupPropEntry *prop = e->properties[j];
-
- if (strcmp(name, prop->name) == 0)
- if (restore_cgroup_prop(prop, paux, off) < 0)
- return -1;
- }
- }
-
- return 0;
-}
-
-static int prepare_dir_perms(int cg, char *path, CgroupPerms *perms)
-{
- int fd, ret;
-
- fd = openat(cg, path, O_DIRECTORY);
- if (fd < 0) {
- pr_perror("failed to open cg dir fd (%s) for chowning", path);
- return -1;
- }
-
- ret = restore_perms(fd, path, perms);
- close(fd);
- return ret;
-}
-
-static int prepare_cgroup_dirs(char **controllers, int n_controllers, char *paux, size_t off,
- CgroupDirEntry **ents, size_t n_ents)
-{
- size_t i, j;
- CgroupDirEntry *e;
- int cg = get_service_fd(CGROUP_YARD);
-
- for (i = 0; i < n_ents; i++) {
- size_t off2 = off;
- e = ents[i];
-
- off2 += sprintf(paux + off, "/%s", e->dir_name);
-
- if (faccessat(cg, paux, F_OK, 0) < 0) {
- if (errno != ENOENT) {
- pr_perror("Failed accessing cgroup dir %s", paux);
- return -1;
- }
-
- if (opts.manage_cgroups & (CG_MODE_NONE | CG_MODE_PROPS)) {
- pr_err("Cgroup dir %s doesn't exist\n", paux);
- return -1;
- }
-
- if (mkdirpat(cg, paux)) {
- pr_perror("Can't make cgroup dir %s", paux);
- return -1;
- }
- pr_info("Created cgroup dir %s\n", paux);
-
- if (prepare_dir_perms(cg, paux, e->dir_perms) < 0)
- return -1;
-
- for (j = 0; j < n_controllers; j++) {
- if (strcmp(controllers[j], "cpuset") == 0) {
- if (restore_special_cpuset_props(paux, off2, e) < 0) {
- pr_err("Restoring special cpuset props failed!\n");
- return -1;
- }
- }
- }
- } else {
- pr_info("Determined cgroup dir %s already exist\n", paux);
-
- if (opts.manage_cgroups & CG_MODE_STRICT) {
- pr_err("Abort restore of existing cgroups\n");
- return -1;
- }
-
- if (opts.manage_cgroups & (CG_MODE_SOFT | CG_MODE_NONE)) {
- pr_info("Skip restoring properties on cgroup dir %s\n", paux);
- if (e->n_properties > 0) {
- xfree(e->properties);
- e->properties = NULL;
- e->n_properties = 0;
- }
- }
-
- if (!(opts.manage_cgroups & CG_MODE_NONE) &&
- prepare_dir_perms(cg, paux, e->dir_perms) < 0)
- return -1;
- }
-
- if (prepare_cgroup_dirs(controllers, n_controllers, paux, off2,
- e->children, e->n_children) < 0)
- return -1;
- }
-
- return 0;
-}
-
-/*
- * Prepare the CGROUP_YARD service descriptor. This guy is
- * tmpfs mount with the set of ctl->name directories each
- * one having the respective cgroup mounted.
- *
- * It's required for two reasons.
- *
- * First, if we move more than one task into cgroups it's
- * faster to have cgroup tree visible by them all in sime
- * single place. Searching for this thing existing in the
- * criu's space is not nice, as parsing /proc/mounts is not
- * very fast, other than this not all cgroups may be mounted.
- *
- * Second, when we have user-namespaces support we will
- * loose the ability to mount cgroups on-demand, so prepare
- * them in advance.
- */
-
-static int prepare_cgroup_sfd(CgroupEntry *ce)
-{
- int off, i, ret;
- char paux[PATH_MAX];
-
- if (!opts.manage_cgroups)
- return 0;
-
- pr_info("Preparing cgroups yard (cgroups restore mode %#x)\n",
- opts.manage_cgroups);
-
- off = sprintf(paux, ".criu.cgyard.XXXXXX");
- if (mkdtemp(paux) == NULL) {
- pr_perror("Can't make temp cgyard dir");
- return -1;
- }
-
- cg_yard = xstrdup(paux);
- if (!cg_yard) {
- rmdir(paux);
- return -1;
- }
-
- if (make_yard(cg_yard))
- goto err;
-
- pr_debug("Opening %s as cg yard\n", cg_yard);
- i = open(cg_yard, O_DIRECTORY);
- if (i < 0) {
- pr_perror("Can't open cgyard");
- goto err;
- }
-
- ret = install_service_fd(CGROUP_YARD, i);
- close(i);
- if (ret < 0)
- goto err;
-
- paux[off++] = '/';
-
- for (i = 0; i < ce->n_controllers; i++) {
- int ctl_off = off, yard_off;
- char opt[128], *yard;
- CgControllerEntry *ctrl = ce->controllers[i];
-
- if (ctrl->n_cnames < 1) {
- pr_err("Each cg_controller_entry must have at least 1 controller\n");
- goto err;
- }
-
- ctl_off += ctrl_dir_and_opt(ctrl,
- paux + ctl_off, sizeof(paux) - ctl_off,
- opt, sizeof(opt));
-
- /* Create controller if not yet present */
- if (access(paux, F_OK)) {
- pr_debug("\tMaking controller dir %s (%s)\n", paux, opt);
- if (mkdir(paux, 0700)) {
- pr_perror("\tCan't make controller dir %s", paux);
- return -1;
- }
- if (mount("none", paux, "cgroup", 0, opt) < 0) {
- pr_perror("\tCan't mount controller dir %s", paux);
- return -1;
- }
- }
-
- /*
- * Finally handle all cgroups for this controller.
- */
- yard = paux + strlen(cg_yard) + 1;
- yard_off = ctl_off - (strlen(cg_yard) + 1);
- if (opts.manage_cgroups &&
- prepare_cgroup_dirs(ctrl->cnames, ctrl->n_cnames, yard, yard_off,
- ctrl->dirs, ctrl->n_dirs))
- goto err;
- }
-
- return 0;
-
-err:
- fini_cgroup();
- return -1;
-}
-
-static int rewrite_cgsets(CgroupEntry *cge, char **controllers, int n_controllers,
- char *from, char *to)
-{
- int i, j;
- for (i = 0; i < cge->n_sets; i++) {
- CgSetEntry *set = cge->sets[i];
- for (j = 0; j < set->n_ctls; j++) {
- CgMemberEntry *cg = set->ctls[j];
- if (cgroup_contains(controllers, n_controllers, cg->name) &&
- /* +1 to get rid of leading / */
- strstartswith(cg->path + 1, from)) {
-
- char *tmp = cg->path;
-
- /* +1 to get rid of leading /, again */
- cg->path = xsprintf("%s%s", to, cg->path +
- strlen(from) + 1);
- if (!cg->path)
- return -1;
- free(tmp);
- }
- }
-
- }
- return 0;
-}
-
-static int rewrite_cgroup_roots(CgroupEntry *cge)
-{
- int i, j;
- struct cg_root_opt *o;
- char *newroot = NULL;
-
- for (i = 0; i < cge->n_controllers; i++) {
- CgControllerEntry *ctrl = cge->controllers[i];
- newroot = opts.new_global_cg_root;
-
- list_for_each_entry(o, &opts.new_cgroup_roots, node) {
- if (cgroup_contains(ctrl->cnames, ctrl->n_cnames, o->controller)) {
- newroot = o->newroot;
- break;
- }
-
- }
-
- if (newroot) {
- for (j = 0; j < ctrl->n_dirs; j++) {
- CgroupDirEntry *cgde = ctrl->dirs[j];
- char *m;
-
- pr_info("rewriting %s to %s\n", cgde->dir_name, newroot);
- if (rewrite_cgsets(cge, ctrl->cnames, ctrl->n_cnames, cgde->dir_name, newroot))
- return -1;
-
- m = xstrdup(newroot);
- if (!m)
- return -1;
-
- free(cgde->dir_name);
- cgde->dir_name = m;
- }
- }
- }
-
- return 0;
-}
-
-int prepare_cgroup(void)
-{
- int ret;
- struct cr_img *img;
- CgroupEntry *ce;
-
- img = open_image(CR_FD_CGROUP, O_RSTR);
- if (!img)
- return -1;
-
- ret = pb_read_one_eof(img, &ce, PB_CGROUP);
- close_image(img);
- if (ret <= 0) /* Zero is OK -- no sets there. */
- return ret;
-
- if (rewrite_cgroup_roots(ce))
- return -1;
-
- n_sets = ce->n_sets;
- rst_sets = ce->sets;
- n_controllers = ce->n_controllers;
- controllers = ce->controllers;
-
- if (n_sets)
- /*
- * We rely on the fact that all sets contain the same
- * set of controllers. This is checked during dump
- * with cg_set_compare(CGCMP_ISSUB) call.
- */
- ret = prepare_cgroup_sfd(ce);
- else
- ret = 0;
-
- return ret;
-}
-
-int new_cg_root_add(char *controller, char *newroot)
-{
- struct cg_root_opt *o;
-
- if (!controller) {
- opts.new_global_cg_root = newroot;
- return 0;
- }
-
- o = xmalloc(sizeof(*o));
- if (!o)
- return -1;
-
- o->controller = controller;
- o->newroot = newroot;
- list_add(&o->node, &opts.new_cgroup_roots);
- return 0;
-}
diff --git a/cr-check.c b/cr-check.c
deleted file mode 100644
index 2bb0d9a2e69f..000000000000
--- a/cr-check.c
+++ /dev/null
@@ -1,958 +0,0 @@
-#include <unistd.h>
-#include <linux/netlink.h>
-#include <sys/socket.h>
-#include <sys/types.h>
-#include <sys/eventfd.h>
-#include <sys/epoll.h>
-#include <sys/inotify.h>
-#include <sys/signalfd.h>
-#include <sys/ptrace.h>
-#include <sys/wait.h>
-#include <sys/socket.h>
-#include <fcntl.h>
-#include <signal.h>
-#include <linux/if.h>
-#include <linux/filter.h>
-#include <linux/seccomp.h>
-#include <sys/ioctl.h>
-#include <termios.h>
-#include <sys/mman.h>
-#include <netinet/in.h>
-#include <sys/prctl.h>
-#include <sched.h>
-#include <linux/aio_abi.h>
-
-#include "proc_parse.h"
-#include "sockets.h"
-#include "crtools.h"
-#include "log.h"
-#include "util-pie.h"
-#include "prctl.h"
-#include "files.h"
-#include "sk-inet.h"
-#include "proc_parse.h"
-#include "mount.h"
-#include "tty.h"
-#include "ptrace.h"
-#include "kerndat.h"
-#include "timerfd.h"
-#include "tun.h"
-#include "namespaces.h"
-#include "pstree.h"
-#include "cr_options.h"
-
-static int check_tty(void)
-{
- int master = -1, slave = -1;
- const int lock = 1;
- struct termios t;
- char *slavename;
- int ret = -1;
-
- if (ARRAY_SIZE(t.c_cc) < TERMIOS_NCC) {
- pr_msg("struct termios has %d @c_cc while "
- "at least %d expected.\n",
- (int)ARRAY_SIZE(t.c_cc),
- TERMIOS_NCC);
- goto out;
- }
-
- master = open("/dev/ptmx", O_RDWR);
- if (master < 0) {
- pr_perror("Can't open /dev/ptmx");
- goto out;
- }
-
- if (ioctl(master, TIOCSPTLCK, &lock)) {
- pr_perror("Can't lock pty master");
- goto out;
- }
-
- slavename = ptsname(master);
- slave = open(slavename, O_RDWR);
- if (slave < 0) {
- if (errno != EIO) {
- pr_perror("Unexpected error on locked pty");
- goto out;
- }
- } else {
- pr_err("Managed to open locked pty.\n");
- goto out;
- }
-
- ret = 0;
-out:
- close_safe(&master);
- close_safe(&slave);
- return ret;
-}
-
-static int check_map_files(void)
-{
- int ret;
-
- ret = access("/proc/self/map_files", R_OK);
- if (!ret)
- return 0;
-
- pr_perror("/proc/<pid>/map_files is inaccessible");
- return -1;
-}
-
-static int check_sock_diag(void)
-{
- int ret;
- struct ns_id ns;
-
- ns.ns_pid = 0;
- ns.type = NS_CRIU;
- ns.net.nlsk = socket(PF_NETLINK, SOCK_RAW, NETLINK_SOCK_DIAG);
- if (ns.net.nlsk < 0) {
- pr_perror("Can't make diag socket for check");
- return -1;
- }
-
- ret = collect_sockets(&ns);
- if (!ret)
- return 0;
-
- pr_msg("The sock diag infrastructure is incomplete.\n");
- pr_msg("Make sure you have:\n");
- pr_msg(" 1. *_DIAG kernel config options turned on;\n");
- pr_msg(" 2. *_diag.ko modules loaded (if compiled as modules).\n");
- return -1;
-}
-
-static int check_ns_last_pid(void)
-{
- int ret;
-
- ret = access("/proc/" LAST_PID_PATH, W_OK);
- if (!ret)
- return 0;
-
- pr_perror("%s sysctl is inaccessible", LAST_PID_PATH);
- return -1;
-}
-
-static int check_sock_peek_off(void)
-{
- int sk;
- int ret, off, sz;
-
- sk = socket(PF_UNIX, SOCK_DGRAM, 0);
- if (sk < 0) {
- pr_perror("Can't create unix socket for check");
- return -1;
- }
-
- sz = sizeof(off);
- ret = getsockopt(sk, SOL_SOCKET, SO_PEEK_OFF, &off, (socklen_t *)&sz);
- close(sk);
-
- if ((ret == 0) && (off == -1) && (sz == sizeof(int)))
- return 0;
-
- pr_msg("SO_PEEK_OFF sockoption doesn't work.\n");
- return -1;
-}
-
-static int check_kcmp(void)
-{
- int ret = syscall(SYS_kcmp, getpid(), -1, -1, -1, -1);
-
- if (ret != -ENOSYS)
- return 0;
-
- errno = -ret;
- pr_perror("System call kcmp is not supported");
- return -1;
-}
-
-static int check_prctl(void)
-{
- unsigned long user_auxv = 0;
- unsigned int *tid_addr;
- unsigned int size = 0;
- int ret;
-
- ret = prctl(PR_GET_TID_ADDRESS, (unsigned long)&tid_addr, 0, 0, 0);
- if (ret) {
- pr_msg("prctl: PR_GET_TID_ADDRESS is not supported");
- return -1;
- }
-
- /*
- * Either new or old interface must be supported in the kernel.
- */
- ret = prctl(PR_SET_MM, PR_SET_MM_MAP_SIZE, (unsigned long)&size, 0, 0);
- if (ret) {
- if (!opts.check_ms_kernel) {
- pr_msg("prctl: PR_SET_MM_MAP is not supported, which "
- "is required for restoring user namespaces\n");
- return -1;
- } else
- pr_warn("Skipping unssuported PR_SET_MM_MAP\n");
-
- ret = prctl(PR_SET_MM, PR_SET_MM_BRK, brk(0), 0, 0);
- if (ret) {
- if (ret == -EPERM)
- pr_msg("prctl: One needs CAP_SYS_RESOURCE capability to perform testing\n");
- else
- pr_msg("prctl: PR_SET_MM is not supported\n");
- return -1;
- }
-
- ret = prctl(PR_SET_MM, PR_SET_MM_EXE_FILE, -1, 0, 0);
- if (ret != -EBADF) {
- pr_msg("prctl: PR_SET_MM_EXE_FILE is not supported (%d)\n", ret);
- return -1;
- }
-
- ret = prctl(PR_SET_MM, PR_SET_MM_AUXV, (long)&user_auxv, sizeof(user_auxv), 0);
- if (ret) {
- pr_msg("prctl: PR_SET_MM_AUXV is not supported\n");
- return -1;
- }
- }
-
- return 0;
-}
-
-static int check_fcntl(void)
-{
- u32 v[2];
- int fd;
-
- fd = open("/proc/self/comm", O_RDONLY);
- if (fd < 0) {
- pr_perror("Can't open self comm file");
- return -1;
- }
-
- if (fcntl(fd, F_GETOWNER_UIDS, (long)v)) {
- pr_perror("Can'r fetch file owner UIDs");
- close(fd);
- return -1;
- }
-
- close(fd);
- return 0;
-}
-
-static int check_proc_stat(void)
-{
- struct proc_pid_stat stat;
- int ret;
-
- ret = parse_pid_stat(getpid(), &stat);
- if (ret) {
- pr_msg("procfs: stat extension is not supported\n");
- return -1;
- }
-
- return 0;
-}
-
-static int check_one_fdinfo(union fdinfo_entries *e, void *arg)
-{
- *(int *)arg = (int)e->efd.counter;
- return 0;
-}
-
-static int check_fdinfo_eventfd(void)
-{
- int fd, ret;
- int cnt = 13, proc_cnt = 0;
-
- fd = eventfd(cnt, 0);
- if (fd < 0) {
- pr_perror("Can't make eventfd");
- return -1;
- }
-
- ret = parse_fdinfo(fd, FD_TYPES__EVENTFD, check_one_fdinfo, &proc_cnt);
- close(fd);
-
- if (ret) {
- pr_err("Error parsing proc fdinfo\n");
- return -1;
- }
-
- if (proc_cnt != cnt) {
- pr_err("Counter mismatch (or not met) %d want %d\n",
- proc_cnt, cnt);
- return -1;
- }
-
- pr_info("Eventfd fdinfo works OK (%d vs %d)\n", cnt, proc_cnt);
- return 0;
-}
-
-static int check_one_sfd(union fdinfo_entries *e, void *arg)
-{
- return 0;
-}
-
-int check_mnt_id(void)
-{
- struct fdinfo_common fdinfo = { .mnt_id = -1 };
- int ret;
-
- ret = parse_fdinfo(get_service_fd(LOG_FD_OFF), FD_TYPES__UND, NULL, &fdinfo);
- if (ret < 0)
- return -1;
-
- if (fdinfo.mnt_id == -1) {
- pr_err("fdinfo doesn't contain the mnt_id field\n");
- return -1;
- }
-
- return 0;
-}
-
-static int check_fdinfo_signalfd(void)
-{
- int fd, ret;
- sigset_t mask;
-
- sigemptyset(&mask);
- sigaddset(&mask, SIGUSR1);
- fd = signalfd(-1, &mask, 0);
- if (fd < 0) {
- pr_perror("Can't make signalfd");
- return -1;
- }
-
- ret = parse_fdinfo(fd, FD_TYPES__SIGNALFD, check_one_sfd, NULL);
- close(fd);
-
- if (ret) {
- pr_err("Error parsing proc fdinfo\n");
- return -1;
- }
-
- return 0;
-}
-
-static int check_one_epoll(union fdinfo_entries *e, void *arg)
-{
- *(int *)arg = e->epl.e.tfd;
- free_event_poll_entry(e);
- return 0;
-}
-
-static int check_fdinfo_eventpoll(void)
-{
- int efd, pfd[2], proc_fd = 0, ret = -1;
- struct epoll_event ev;
-
- if (pipe(pfd)) {
- pr_perror("Can't make pipe to watch");
- return -1;
- }
-
- efd = epoll_create(1);
- if (efd < 0) {
- pr_perror("Can't make epoll fd");
- goto pipe_err;
- }
-
- memset(&ev, 0, sizeof(ev));
- ev.events = EPOLLIN | EPOLLOUT;
-
- if (epoll_ctl(efd, EPOLL_CTL_ADD, pfd[0], &ev)) {
- pr_perror("Can't add epoll tfd");
- goto epoll_err;
- }
-
- ret = parse_fdinfo(efd, FD_TYPES__EVENTPOLL, check_one_epoll, &proc_fd);
- if (ret) {
- pr_err("Error parsing proc fdinfo\n");
- goto epoll_err;
- }
-
- if (pfd[0] != proc_fd) {
- pr_err("TFD mismatch (or not met) %d want %d\n",
- proc_fd, pfd[0]);
- ret = -1;
- goto epoll_err;
- }
-
- pr_info("Epoll fdinfo works OK (%d vs %d)\n", pfd[0], proc_fd);
-
-epoll_err:
- close(efd);
-pipe_err:
- close(pfd[0]);
- close(pfd[1]);
-
- return ret;
-}
-
-static int check_one_inotify(union fdinfo_entries *e, void *arg)
-{
- *(int *)arg = e->ify.e.wd;
- free_inotify_wd_entry(e);
- return 0;
-}
-
-static int check_fdinfo_inotify(void)
-{
- int ifd, wd, proc_wd = -1, ret;
-
- ifd = inotify_init1(0);
- if (ifd < 0) {
- pr_perror("Can't make inotify fd");
- return -1;
- }
-
- wd = inotify_add_watch(ifd, ".", IN_ALL_EVENTS);
- if (wd < 0) {
- pr_perror("Can't add watch");
- close(ifd);
- return -1;
- }
-
- ret = parse_fdinfo(ifd, FD_TYPES__INOTIFY, check_one_inotify, &proc_wd);
- close(ifd);
-
- if (ret < 0) {
- pr_err("Error parsing proc fdinfo\n");
- return -1;
- }
-
- if (wd != proc_wd) {
- pr_err("WD mismatch (or not met) %d want %d\n", proc_wd, wd);
- return -1;
- }
-
- pr_info("Inotify fdinfo works OK (%d vs %d)\n", wd, proc_wd);
- return 0;
-}
-
-static int check_fdinfo_ext(void)
-{
- int ret = 0;
-
- ret |= check_fdinfo_eventfd();
- ret |= check_fdinfo_eventpoll();
- ret |= check_fdinfo_signalfd();
- ret |= check_fdinfo_inotify();
-
- return ret;
-}
-
-static int check_unaligned_vmsplice(void)
-{
- int p[2], ret;
- char buf; /* :) */
- struct iovec iov;
-
- ret = pipe(p);
- if (ret < 0) {
- pr_perror("Can't create pipe");
- return ret;
- }
- iov.iov_base = &buf;
- iov.iov_len = sizeof(buf);
- ret = vmsplice(p[1], &iov, 1, SPLICE_F_GIFT | SPLICE_F_NONBLOCK);
- if (ret < 0) {
- pr_perror("Unaligned vmsplice doesn't work");
- goto err;
- }
-
- pr_info("Unaligned vmsplice works OK\n");
- ret = 0;
-err:
- close(p[0]);
- close(p[1]);
-
- return ret;
-}
-
-#ifndef SO_GET_FILTER
-#define SO_GET_FILTER SO_ATTACH_FILTER
-#endif
-
-static int check_so_gets(void)
-{
- int sk, ret = -1;
- socklen_t len;
- char name[IFNAMSIZ];
-
- sk = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP);
- if (sk < 0) {
- pr_perror("No socket");
- return -1;
- }
-
- len = 0;
- if (getsockopt(sk, SOL_SOCKET, SO_GET_FILTER, NULL, &len)) {
- pr_perror("Can't get socket filter");
- goto err;
- }
-
- len = sizeof(name);
- if (getsockopt(sk, SOL_SOCKET, SO_BINDTODEVICE, name, &len)) {
- pr_perror("Can't get socket bound dev");
- goto err;
- }
-
- ret = 0;
-err:
- close(sk);
- return ret;
-}
-
-static int check_ipc(void)
-{
- int ret;
-
- ret = access("/proc/sys/kernel/sem_next_id", R_OK | W_OK);
- if (!ret)
- return 0;
-
- pr_perror("/proc/sys/kernel/sem_next_id is inaccessible");
- return -1;
-}
-
-static int check_sigqueuinfo()
-{
- siginfo_t info = { .si_code = 1 };
-
- signal(SIGUSR1, SIG_IGN);
-
- if (syscall(SYS_rt_sigqueueinfo, getpid(), SIGUSR1, &info)) {
- pr_perror("Unable to send siginfo with positive si_code to itself");
- return -1;
- }
-
- return 0;
-}
-
-static pid_t fork_and_ptrace_attach(int (*child_setup)(void))
-{
- pid_t pid;
- int sk_pair[2], sk;
- char c = 0;
-
- if (socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair)) {
- pr_perror("socketpair");
- return -1;
- }
-
- pid = fork();
- if (pid < 0) {
- pr_perror("fork");
- return -1;
- } else if (pid == 0) {
- sk = sk_pair[1];
- close(sk_pair[0]);
-
- if (child_setup && child_setup() != 0)
- exit(1);
-
- if (write(sk, &c, 1) != 1) {
- pr_perror("write");
- exit(1);
- }
-
- while (1)
- sleep(1000);
- exit(1);
- }
-
- sk = sk_pair[0];
- close(sk_pair[1]);
-
- if (read(sk, &c, 1) != 1) {
- close(sk);
- kill(pid, SIGKILL);
- pr_perror("read");
- return -1;
- }
-
- close(sk);
-
- if (ptrace(PTRACE_ATTACH, pid, NULL, NULL) == -1) {
- pr_perror("Unable to ptrace the child");
- kill(pid, SIGKILL);
- return -1;
- }
-
- waitpid(pid, NULL, 0);
-
- return pid;
-}
-
-static int check_ptrace_peeksiginfo()
-{
- struct ptrace_peeksiginfo_args arg;
- siginfo_t siginfo;
- pid_t pid, ret = 0;
- k_rtsigset_t mask;
-
- pid = fork_and_ptrace_attach(NULL);
- if (pid < 0)
- return -1;
-
- arg.flags = 0;
- arg.off = 0;
- arg.nr = 1;
-
- if (ptrace(PTRACE_PEEKSIGINFO, pid, &arg, &siginfo) != 0) {
- pr_perror("Unable to dump pending signals");
- ret = -1;
- }
-
- if (ptrace(PTRACE_GETSIGMASK, pid, sizeof(mask), &mask) != 0) {
- pr_perror("Unable to dump signal blocking mask");
- ret = -1;
- }
-
- kill(pid, SIGKILL);
- return ret;
-}
-
-static int check_ptrace_suspend_seccomp(void)
-{
- pid_t pid;
- int ret = 0;
-
- pid = fork_and_ptrace_attach(NULL);
- if (pid < 0)
- return -1;
-
- if (ptrace(PTRACE_SETOPTIONS, pid, NULL, PTRACE_O_SUSPEND_SECCOMP) < 0) {
- if (errno == EINVAL) {
- pr_err("Kernel doesn't support PTRACE_O_SUSPEND_SECCOMP\n");
- } else {
- pr_perror("couldn't suspend seccomp");
- }
- ret = -1;
- }
-
- kill(pid, SIGKILL);
- return ret;
-}
-
-static int setup_seccomp_filter(void)
-{
- struct sock_filter filter[] = {
- BPF_STMT(BPF_LD+BPF_W+BPF_ABS, offsetof(struct seccomp_data, nr)),
- /* Allow all syscalls except ptrace */
- BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_ptrace, 0, 1),
- BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL),
- BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
- };
-
- struct sock_fprog bpf_prog = {
- .len = (unsigned short)(sizeof(filter)/sizeof(filter[0])),
- .filter = filter,
- };
-
- if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, (long) &bpf_prog, 0, 0) < 0)
- return -1;
-
- return 0;
-}
-
-static int check_ptrace_dump_seccomp_filters(void)
-{
- pid_t pid;
- int ret = 0, len;
-
- pid = fork_and_ptrace_attach(setup_seccomp_filter);
- if (pid < 0)
- return -1;
-
- len = ptrace(PTRACE_SECCOMP_GET_FILTER, pid, 0, NULL);
- if (len < 0) {
- ret = -1;
- pr_perror("Dumping seccomp filters not supported");
- }
-
- kill(pid, SIGKILL);
- return ret;
-}
-
-static int check_mem_dirty_track(void)
-{
- if (kerndat_get_dirty_track() < 0)
- return -1;
-
- if (!kdat.has_dirty_track)
- pr_warn("Dirty tracking is OFF. Memory snapshot will not work.\n");
- return 0;
-}
-
-static int check_posix_timers(void)
-{
- int ret;
-
- ret = access("/proc/self/timers", R_OK);
- if (!ret)
- return 0;
-
- pr_msg("/proc/<pid>/timers file is missing.\n");
- return -1;
-}
-
-static unsigned long get_ring_len(unsigned long addr)
-{
- FILE *maps;
- char buf[256];
-
- maps = fopen("/proc/self/maps", "r");
- if (!maps) {
- pr_perror("No maps proc file");
- return 0;
- }
-
- while (fgets(buf, sizeof(buf), maps)) {
- unsigned long start, end;
- int r, tail;
-
- r = sscanf(buf, "%lx-%lx %*s %*s %*s %*s %n\n", &start, &end, &tail);
- if (r != 2) {
- fclose(maps);
- pr_err("Bad maps format %d.%d (%s)\n", r, tail, buf + tail);
- return 0;
- }
-
- if (start == addr) {
- fclose(maps);
- if (strcmp(buf + tail, "/[aio] (deleted)\n"))
- goto notfound;
-
- return end - start;
- }
- }
-
- fclose(maps);
-notfound:
- pr_err("No AIO ring at expected location\n");
- return 0;
-}
-
-static int check_aio_remap(void)
-{
- aio_context_t ctx = 0;
- unsigned long len;
- void *naddr;
- int r;
-
- if (syscall(SYS_io_setup, 16, &ctx) < 0) {
- pr_err("No AIO syscall\n");
- return -1;
- }
-
- len = get_ring_len((unsigned long) ctx);
- if (!len)
- return -1;
-
- naddr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, 0, 0);
- if (naddr == MAP_FAILED) {
- pr_perror("Can't find place for new AIO ring");
- return -1;
- }
-
- if (mremap((void *)ctx, len, len, MREMAP_FIXED | MREMAP_MAYMOVE, naddr) == MAP_FAILED) {
- pr_perror("Can't remap AIO ring");
- return -1;
- }
-
- ctx = (aio_context_t)naddr;
- r = syscall(SYS_io_getevents, ctx, 0, 1, NULL, NULL);
- if (r < 0) {
- if (!opts.check_ms_kernel) {
- pr_err("AIO remap doesn't work properly\n");
- return -1;
- } else
- pr_warn("Skipping unsupported AIO remap\n");
- }
-
- return 0;
-}
-
-static int check_fdinfo_lock(void)
-{
- if (kerndat_fdinfo_has_lock())
- return -1;
-
- if (!kdat.has_fdinfo_lock) {
- if (!opts.check_ms_kernel) {
- pr_err("fdinfo doesn't contain the lock field\n");
- return -1;
- } else {
- pr_warn("fdinfo doesn't contain the lock field\n");
- }
- }
-
- return 0;
-}
-
-struct clone_arg {
- /*
- * Reserve some space for clone() to locate arguments
- * and retcode in this place
- */
- char stack[128] __stack_aligned__;
- char stack_ptr[0];
-};
-
-static int clone_cb(void *_arg) {
- exit(0);
-}
-
-static int check_clone_parent_vs_pid()
-{
- struct clone_arg ca;
- pid_t pid;
-
- pid = clone(clone_cb, ca.stack_ptr, CLONE_NEWPID | CLONE_PARENT, &ca);
- if (pid < 0) {
- pr_err("CLONE_PARENT | CLONE_NEWPID don't work together\n");
- return -1;
- }
-
- return 0;
-}
-
-static int (*chk_feature)(void);
-
-int cr_check(void)
-{
- struct ns_id ns = { .type = NS_CRIU, .ns_pid = PROC_SELF, .nd = &mnt_ns_desc };
- int ret = 0;
-
- if (!is_root_user())
- return -1;
-
- root_item = alloc_pstree_item();
- if (root_item == NULL)
- return -1;
-
- root_item->pid.real = getpid();
-
- if (collect_pstree_ids())
- return -1;
-
- ns.id = root_item->ids->mnt_ns_id;
-
- mntinfo = collect_mntinfo(&ns, false);
- if (mntinfo == NULL)
- return -1;
-
- if (chk_feature) {
- ret = chk_feature();
- goto out;
- }
-
- ret |= check_map_files();
- ret |= check_sock_diag();
- ret |= check_ns_last_pid();
- ret |= check_sock_peek_off();
- ret |= check_kcmp();
- ret |= check_prctl();
- ret |= check_fcntl();
- ret |= check_proc_stat();
- ret |= check_tcp();
- ret |= check_fdinfo_ext();
- ret |= check_unaligned_vmsplice();
- ret |= check_tty();
- ret |= check_so_gets();
- ret |= check_ipc();
- ret |= check_sigqueuinfo();
- ret |= check_ptrace_peeksiginfo();
- ret |= check_ptrace_suspend_seccomp();
- ret |= check_ptrace_dump_seccomp_filters();
- ret |= check_mem_dirty_track();
- ret |= check_posix_timers();
- ret |= check_tun_cr(0);
- ret |= check_timerfd();
- ret |= check_mnt_id();
- ret |= check_aio_remap();
- ret |= check_fdinfo_lock();
- ret |= check_clone_parent_vs_pid();
-
-out:
- if (!ret)
- print_on_level(DEFAULT_LOGLEVEL, "Looks good.\n");
-
- return ret;
-}
-
-static int check_tun(void)
-{
- /*
- * In case there's no TUN support at all we
- * should report error. Unlike this plain criu
- * check would report "Looks good" in this case
- * since C/R effectively works, just not for TUN.
- */
- return check_tun_cr(-1);
-}
-
-static int check_userns(void)
-{
- int ret;
- unsigned long size = 0;
-
- ret = access("/proc/self/ns/user", F_OK);
- if (ret) {
- pr_perror("No userns proc file");
- return -1;
- }
-
- ret = prctl(PR_SET_MM, PR_SET_MM_MAP_SIZE, (unsigned long)&size, 0, 0);
- if (ret) {
- errno = -ret;
- pr_perror("No new prctl API");
- return -1;
- }
-
- return 0;
-}
-
-static int check_loginuid(void)
-{
- if (kerndat_loginuid(false) < 0)
- return -1;
-
- if (!kdat.has_loginuid) {
- pr_warn("Loginuid restore is OFF.\n");
- return -1;
- }
-
- return 0;
-}
-
-int check_add_feature(char *feat)
-{
- if (!strcmp(feat, "mnt_id"))
- chk_feature = check_mnt_id;
- else if (!strcmp(feat, "aio_remap"))
- chk_feature = check_aio_remap;
- else if (!strcmp(feat, "timerfd"))
- chk_feature = check_timerfd;
- else if (!strcmp(feat, "tun"))
- chk_feature = check_tun;
- else if (!strcmp(feat, "userns"))
- chk_feature = check_userns;
- else if (!strcmp(feat, "fdinfo_lock"))
- chk_feature = check_fdinfo_lock;
- else if (!strcmp(feat, "seccomp_suspend"))
- chk_feature = check_ptrace_suspend_seccomp;
- else if (!strcmp(feat, "seccomp_filters"))
- chk_feature = check_ptrace_dump_seccomp_filters;
- else if (!strcmp(feat, "loginuid"))
- chk_feature = check_loginuid;
- else {
- pr_err("Unknown feature %s\n", feat);
- return -1;
- }
-
- return 0;
-}
diff --git a/cr-dedup.c b/cr-dedup.c
deleted file mode 100644
index b453c3e61f4e..000000000000
--- a/cr-dedup.c
+++ /dev/null
@@ -1,197 +0,0 @@
-#include <sys/uio.h>
-#include <fcntl.h>
-#include <linux/falloc.h>
-#include <unistd.h>
-
-#include "crtools.h"
-#include "page-read.h"
-#include "restorer.h"
-
-#define MAX_BUNCH_SIZE 256
-
-static int cr_dedup_one_pagemap(int pid);
-
-int cr_dedup(void)
-{
- int close_ret, ret = 0;
- int pid;
- DIR * dirp;
- struct dirent *ent;
-
- dirp = opendir(CR_PARENT_LINK);
- if (dirp == NULL) {
- pr_perror("Can't enter previous snapshot folder, error=%d", errno);
- ret = -1;
- goto err;
- }
-
- while (1) {
- errno = 0;
- ent = readdir(dirp);
- if (ent == NULL) {
- if (errno) {
- pr_perror("Failed readdir, error=%d", errno);
- ret = -1;
- goto err;
- }
- break;
- }
-
- ret = sscanf(ent->d_name, "pagemap-%d.img", &pid);
- if (ret == 1) {
- pr_info("pid=%d\n", pid);
- ret = cr_dedup_one_pagemap(pid);
- if (ret < 0)
- break;
- }
- }
-
-err:
- if (dirp) {
- close_ret = closedir(dirp);
- if (close_ret == -1)
- return close_ret;
- }
-
- if (ret < 0)
- return ret;
-
- pr_info("Deduplicated\n");
- return 0;
-}
-
-static int cr_dedup_one_pagemap(int pid)
-{
- int ret;
- struct page_read pr;
- struct page_read * prp;
- struct iovec iov;
-
- ret = open_page_read(pid, &pr, PR_TASK | PR_MOD);
- if (ret <= 0) {
- ret = -1;
- goto exit;
- }
-
- prp = pr.parent;
- if (!prp)
- goto exit;
-
- ret = pr.get_pagemap(&pr, &iov);
- if (ret <= 0)
- goto exit;
-
- while (1) {
- pr_debug("dedup iovec base=%p, len=%zu\n", iov.iov_base, iov.iov_len);
- if (!pr.pe->in_parent) {
- ret = dedup_one_iovec(prp, &iov);
- if (ret)
- goto exit;
- }
-
- pr.put_pagemap(&pr);
- ret = pr.get_pagemap(&pr, &iov);
- if (ret <= 0)
- goto exit;
- }
-exit:
- pr.close(&pr);
-
- if (ret < 0)
- return ret;
-
- return 0;
-}
-
-static inline bool can_extend_batch(struct iovec *bunch,
- unsigned long off, unsigned long len)
-{
- return /* The next region is the continuation of the existing */
- ((unsigned long)bunch->iov_base + bunch->iov_len == off) &&
- /* The resulting region is non empty and is small enough */
- (bunch->iov_len == 0 || bunch->iov_len + len < MAX_BUNCH_SIZE * PAGE_SIZE);
-}
-
-int punch_hole(struct page_read *pr, unsigned long off, unsigned long len,
- bool cleanup)
-{
- int ret;
- struct iovec * bunch = &pr->bunch;
-
- if (!cleanup && can_extend_batch(bunch, off, len)) {
- pr_debug("pr%d:Extend bunch len from %zu to %lu\n", pr->id,
- bunch->iov_len, bunch->iov_len + len);
- bunch->iov_len += len;
- } else {
- if (bunch->iov_len > 0) {
- pr_debug("Punch!/%p/%zu/\n", bunch->iov_base, bunch->iov_len);
- ret = fallocate(img_raw_fd(pr->pi), FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
- (unsigned long)bunch->iov_base, bunch->iov_len);
- if (ret != 0) {
- pr_perror("Error punching hole");
- return -1;
- }
- }
- bunch->iov_base = (void *)off;
- bunch->iov_len = len;
- pr_debug("pr%d:New bunch/%p/%zu/\n", pr->id, bunch->iov_base, bunch->iov_len);
- }
- return 0;
-}
-
-int dedup_one_iovec(struct page_read *pr, struct iovec *iov)
-{
- unsigned long off;
- unsigned long off_real;
- unsigned long iov_end;
-
- iov_end = (unsigned long)iov->iov_base + iov->iov_len;
- off = (unsigned long)iov->iov_base;
- while (1) {
- int ret;
- struct iovec piov;
- unsigned long piov_end;
- struct iovec tiov;
- struct page_read * prp;
-
- ret = seek_pagemap_page(pr, off, false);
- if (ret == -1)
- return -1;
-
- if (ret == 0) {
- if (off < pr->cvaddr && pr->cvaddr < iov_end)
- off = pr->cvaddr;
- else
- return 0;
- }
-
- if (!pr->pe)
- return -1;
- pagemap2iovec(pr->pe, &piov);
- piov_end = (unsigned long)piov.iov_base + piov.iov_len;
- off_real = lseek(img_raw_fd(pr->pi), 0, SEEK_CUR);
- if (!pr->pe->in_parent) {
- ret = punch_hole(pr, off_real, min(piov_end, iov_end) - off, false);
- if (ret == -1)
- return ret;
- }
-
- prp = pr->parent;
- if (prp) {
- /* recursively */
- pr_debug("Go to next parent level\n");
- tiov.iov_base = (void*)off;
- tiov.iov_len = min(piov_end, iov_end) - off;
- ret = dedup_one_iovec(prp, &tiov);
- if (ret != 0)
- return -1;
- }
-
- if (piov_end < iov_end) {
- off = piov_end;
- continue;
- } else
- return 0;
- }
- return 0;
-}
diff --git a/cr-dump.c b/cr-dump.c
deleted file mode 100644
index 385b7bb7b25a..000000000000
--- a/cr-dump.c
+++ /dev/null
@@ -1,1720 +0,0 @@
-#include <sys/time.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdarg.h>
-#include <signal.h>
-#include <limits.h>
-#include <unistd.h>
-#include <errno.h>
-#include <string.h>
-
-#include <fcntl.h>
-
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/vfs.h>
-#include <sys/time.h>
-#include <sys/resource.h>
-#include <sys/wait.h>
-
-#include <sys/sendfile.h>
-
-#include <sched.h>
-#include <sys/resource.h>
-
-#include "protobuf.h"
-#include "protobuf/fdinfo.pb-c.h"
-#include "protobuf/fs.pb-c.h"
-#include "protobuf/mm.pb-c.h"
-#include "protobuf/creds.pb-c.h"
-#include "protobuf/core.pb-c.h"
-#include "protobuf/file-lock.pb-c.h"
-#include "protobuf/rlimit.pb-c.h"
-#include "protobuf/siginfo.pb-c.h"
-
-#include "asm/types.h"
-#include "list.h"
-#include "imgset.h"
-#include "file-ids.h"
-#include "kcmp-ids.h"
-#include "compiler.h"
-#include "crtools.h"
-#include "cr_options.h"
-#include "servicefd.h"
-#include "string.h"
-#include "ptrace.h"
-#include "util.h"
-#include "namespaces.h"
-#include "image.h"
-#include "proc_parse.h"
-#include "parasite.h"
-#include "parasite-syscall.h"
-#include "files.h"
-#include "files-reg.h"
-#include "shmem.h"
-#include "sk-inet.h"
-#include "pstree.h"
-#include "mount.h"
-#include "tty.h"
-#include "net.h"
-#include "sk-packet.h"
-#include "cpu.h"
-#include "elf.h"
-#include "cgroup.h"
-#include "file-lock.h"
-#include "page-xfer.h"
-#include "kerndat.h"
-#include "stats.h"
-#include "mem.h"
-#include "page-pipe.h"
-#include "posix-timer.h"
-#include "vdso.h"
-#include "vma.h"
-#include "cr-service.h"
-#include "plugin.h"
-#include "irmap.h"
-#include "sysfs_parse.h"
-#include "action-scripts.h"
-#include "aio.h"
-#include "lsm.h"
-#include "seccomp.h"
-#include "seize.h"
-#include "fault-injection.h"
-
-#include "asm/dump.h"
-
-static char loc_buf[PAGE_SIZE];
-
-static void close_vma_file(struct vma_area *vma)
-{
- if (vma->vm_file_fd < 0)
- return;
- if (vma->e->status & VMA_AREA_SOCKET)
- return;
- if (vma->file_borrowed)
- return;
- if (vma_area_is(vma, VMA_AREA_AIORING))
- return;
-
- close(vma->vm_file_fd);
-}
-
-void free_mappings(struct vm_area_list *vma_area_list)
-{
- struct vma_area *vma_area, *p;
-
- list_for_each_entry_safe(vma_area, p, &vma_area_list->h, list) {
- close_vma_file(vma_area);
- if (!vma_area->file_borrowed)
- free(vma_area->vmst);
- free(vma_area);
- }
-
- INIT_LIST_HEAD(&vma_area_list->h);
- vma_area_list->nr = 0;
-}
-
-int collect_mappings(pid_t pid, struct vm_area_list *vma_area_list)
-{
- int ret = -1;
-
- pr_info("\n");
- pr_info("Collecting mappings (pid: %d)\n", pid);
- pr_info("----------------------------------------\n");
-
- ret = parse_smaps(pid, vma_area_list);
- if (ret < 0)
- goto err;
-
- pr_info("Collected, longest area occupies %lu pages\n", vma_area_list->longest);
- pr_info_vma_list(&vma_area_list->h);
-
- pr_info("----------------------------------------\n");
-err:
- return ret;
-}
-
-static int dump_sched_info(int pid, ThreadCoreEntry *tc)
-{
- int ret;
- struct sched_param sp;
-
- BUILD_BUG_ON(SCHED_OTHER != 0); /* default in proto message */
-
- ret = sched_getscheduler(pid);
- if (ret < 0) {
- pr_perror("Can't get sched policy for %d", pid);
- return -1;
- }
-
- pr_info("%d has %d sched policy\n", pid, ret);
- tc->has_sched_policy = true;
- tc->sched_policy = ret;
-
- if ((ret == SCHED_RR) || (ret == SCHED_FIFO)) {
- ret = sched_getparam(pid, &sp);
- if (ret < 0) {
- pr_perror("Can't get sched param for %d", pid);
- return -1;
- }
-
- pr_info("\tdumping %d prio for %d\n", sp.sched_priority, pid);
- tc->has_sched_prio = true;
- tc->sched_prio = sp.sched_priority;
- }
-
- /*
- * The nice is ignored for RT sched policies, but is stored
- * in kernel. Thus we have to take it with us in the image.
- */
-
- errno = 0;
- ret = getpriority(PRIO_PROCESS, pid);
- if (errno) {
- pr_perror("Can't get nice for %d", pid);
- return -1;
- }
-
- pr_info("\tdumping %d nice for %d\n", ret, pid);
- tc->has_sched_nice = true;
- tc->sched_nice = ret;
-
- return 0;
-}
-
-struct cr_imgset *glob_imgset;
-
-static int collect_fds(pid_t pid, struct parasite_drain_fd *dfds)
-{
- struct dirent *de;
- DIR *fd_dir;
- int n;
-
- pr_info("\n");
- pr_info("Collecting fds (pid: %d)\n", pid);
- pr_info("----------------------------------------\n");
-
- fd_dir = opendir_proc(pid, "fd");
- if (!fd_dir)
- return -1;
-
- n = 0;
- while ((de = readdir(fd_dir))) {
- if (dir_dots(de))
- continue;
-
- if (n > PARASITE_MAX_FDS - 1)
- return -ENOMEM;
-
- dfds->fds[n++] = atoi(de->d_name);
- }
-
- dfds->nr_fds = n;
- pr_info("Found %d file descriptors\n", n);
- pr_info("----------------------------------------\n");
-
- closedir(fd_dir);
-
- return 0;
-}
-
-static int fill_fd_params_special(int fd, struct fd_parms *p)
-{
- *p = FD_PARMS_INIT;
-
- if (fstat(fd, &p->stat) < 0) {
- pr_perror("Can't fstat exe link");
- return -1;
- }
-
- if (get_fd_mntid(fd, &p->mnt_id))
- return -1;
-
- return 0;
-}
-
-static int dump_task_exe_link(pid_t pid, MmEntry *mm)
-{
- struct fd_parms params;
- int fd, ret = 0;
-
- fd = open_proc_path(pid, "exe");
- if (fd < 0)
- return -1;
-
- if (fill_fd_params_special(fd, ¶ms))
- return -1;
-
- if (fd_id_generate_special(¶ms, &mm->exe_file_id))
- ret = dump_one_reg_file(fd, mm->exe_file_id, ¶ms);
-
- close(fd);
- return ret;
-}
-
-static int dump_task_fs(pid_t pid, struct parasite_dump_misc *misc, struct cr_imgset *imgset)
-{
- struct fd_parms p;
- FsEntry fe = FS_ENTRY__INIT;
- int fd, ret;
-
- fe.has_umask = true;
- fe.umask = misc->umask;
-
- fd = open_proc_path(pid, "cwd");
- if (fd < 0)
- return -1;
-
- if (fill_fd_params_special(fd, &p))
- return -1;
-
- if (fd_id_generate_special(&p, &fe.cwd_id)) {
- ret = dump_one_reg_file(fd, fe.cwd_id, &p);
- if (ret < 0)
- return ret;
- }
-
- close(fd);
-
- fd = open_proc_path(pid, "root");
- if (fd < 0)
- return -1;
-
- if (fill_fd_params_special(fd, &p))
- return -1;
-
- if (fd_id_generate_special(&p, &fe.root_id)) {
- ret = dump_one_reg_file(fd, fe.root_id, &p);
- if (ret < 0)
- return ret;
- }
-
- close(fd);
-
- pr_info("Dumping task cwd id %#x root id %#x\n",
- fe.cwd_id, fe.root_id);
-
- return pb_write_one(img_from_set(imgset, CR_FD_FS), &fe, PB_FS);
-}
-
-static inline u_int64_t encode_rlim(unsigned long val)
-{
- return val == RLIM_INFINITY ? -1 : val;
-}
-
-static int dump_task_rlimits(int pid, TaskRlimitsEntry *rls)
-{
- int res;
-
- for (res = 0; res <rls->n_rlimits ; res++) {
- struct rlimit lim;
-
- if (prlimit(pid, res, NULL, &lim)) {
- pr_perror("Can't get rlimit %d", res);
- return -1;
- }
-
- rls->rlimits[res]->cur = encode_rlim(lim.rlim_cur);
- rls->rlimits[res]->max = encode_rlim(lim.rlim_max);
- }
-
- return 0;
-}
-
-static int dump_pid_misc(pid_t pid, TaskCoreEntry *tc)
-{
- int ret;
-
- if (kdat.has_loginuid) {
- pr_info("dumping /proc/%d/loginuid\n", pid);
-
- tc->has_loginuid = true;
- tc->loginuid = parse_pid_loginuid(pid, &ret, false);
- tc->loginuid = userns_uid(tc->loginuid);
- /*
- * loginuid dumping is critical, as if not correctly
- * restored, you may loss ability to login via SSH to CT
- */
- if (ret < 0)
- return ret;
- } else {
- tc->has_loginuid = false;
- }
-
- pr_info("dumping /proc/%d/oom_score_adj\n", pid);
-
- tc->oom_score_adj = parse_pid_oom_score_adj(pid, &ret);
- /*
- * oom_score_adj dumping is not very critical, as it will affect
- * on victim in OOM situation and one will find dumping error in log
- */
- if (ret < 0)
- tc->has_oom_score_adj = false;
- else
- tc->has_oom_score_adj = true;
-
- return 0;
-}
-
-static int dump_filemap(pid_t pid, struct vma_area *vma_area,
- const struct cr_imgset *imgset)
-{
- struct fd_parms p = FD_PARMS_INIT;
- VmaEntry *vma = vma_area->e;
- int ret = 0;
- u32 id;
-
- BUG_ON(!vma_area->vmst);
- p.stat = *vma_area->vmst;
- p.mnt_id = vma_area->mnt_id;
-
- /*
- * AUFS support to compensate for the kernel bug
- * exposing branch pathnames in map_files.
- *
- * If the link found in vma_get_mapfile() pointed
- * inside a branch, we should use the pathname
- * from root that was saved in vma_area->aufs_rpath.
- */
- if (vma_area->aufs_rpath) {
- struct fd_link aufs_link;
-
- strlcpy(aufs_link.name, vma_area->aufs_rpath,
- sizeof(aufs_link.name));
- aufs_link.len = strlen(aufs_link.name);
- p.link = &aufs_link;
- }
-
- /* Flags will be set during restore in get_filemap_fd() */
-
- if (fd_id_generate_special(&p, &id))
- ret = dump_one_reg_file(vma_area->vm_file_fd, id, &p);
-
- vma->shmid = id;
- return ret;
-}
-
-/*
- * A SysV IPC shmem mapping is only accepted when CLONE_NEWIPC is set
- * in root_ns_mask.  Returns 0 in that case; otherwise logs the
- * offending task and mapping start and returns -1.
- */
-static int check_sysvipc_map_dump(pid_t pid, VmaEntry *vma)
-{
-	if (!(root_ns_mask & CLONE_NEWIPC)) {
-		pr_err("Task %d with SysVIPC shmem map @%"PRIx64" doesn't live in IPC ns\n",
-		       pid, vma->start);
-		return -1;
-	}
-
-	return 0;
-}
-
-/*
- * Read /proc/<pid>/auxv and copy it into mm->mm_saved_auxv, setting
- * mm->n_mm_saved_auxv to the number of entries actually read.
- *
- * The caller provides mm->mm_saved_auxv; at most AT_VECTOR_SIZE
- * entries are read.  Returns 0 on success, -1 on open or read failure.
- */
-static int get_task_auxv(pid_t pid, MmEntry *mm)
-{
-	auxv_t mm_saved_auxv[AT_VECTOR_SIZE];
-	int fd, i, ret;
-
-	pr_info("Obtaining task auxv ...\n");
-
-	fd = open_proc(pid, "auxv");
-	if (fd < 0)
-		return -1;
-
-	ret = read(fd, mm_saved_auxv, sizeof(mm_saved_auxv));
-	if (ret < 0) {
-		pr_perror("Error reading %d's auxv", pid);
-		ret = -1;
-		goto err;
-	}
-
-	/* widen every entry to u64 for the image */
-	mm->n_mm_saved_auxv = ret / sizeof(auxv_t);
-	for (i = 0; i < mm->n_mm_saved_auxv; i++)
-		mm->mm_saved_auxv[i] = (u64)mm_saved_auxv[i];
-
-	ret = 0;
-err:
-	close_safe(&fd);
-	return ret;
-}
-
-/*
- * Build the MmEntry for task @pid and write it to the CR_FD_MM image.
- *
- * Every VMA from @vma_area_list is put into the entry; depending on
- * its type the backing object (SysV IPC check, anon shmem, file map,
- * socket map, AIO ring) is handled on the way.  The mm boundaries come
- * from @stat, while brk and the dumpable flag come from @misc.
- *
- * Returns 0 on success and non-zero on any failure.
- */
-static int dump_task_mm(pid_t pid, const struct proc_pid_stat *stat,
-		const struct parasite_dump_misc *misc,
-		const struct vm_area_list *vma_area_list,
-		const struct cr_imgset *imgset)
-{
-	MmEntry mme = MM_ENTRY__INIT;
-	struct vma_area *vma_area;
-	int ret = -1, i = 0;
-
-	pr_info("\n");
-	pr_info("Dumping mm (pid: %d)\n", pid);
-	pr_info("----------------------------------------\n");
-
-	mme.n_vmas = vma_area_list->nr;
-	mme.vmas = xmalloc(mme.n_vmas * sizeof(VmaEntry *));
-	if (!mme.vmas)
-		goto err;
-
-	list_for_each_entry(vma_area, &vma_area_list->h, list) {
-		VmaEntry *vma = vma_area->e;
-
-		pr_info_vma(vma_area);
-
-		if (!vma_entry_is(vma, VMA_AREA_REGULAR))
-			ret = 0;
-		else if (vma_entry_is(vma, VMA_AREA_SYSVIPC))
-			ret = check_sysvipc_map_dump(pid, vma);
-		else if (vma_entry_is(vma, VMA_ANON_SHARED))
-			ret = add_shmem_area(pid, vma);
-		else if (vma_entry_is(vma, VMA_FILE_PRIVATE) ||
-				vma_entry_is(vma, VMA_FILE_SHARED))
-			ret = dump_filemap(pid, vma_area, imgset);
-		else if (vma_entry_is(vma, VMA_AREA_SOCKET))
-			ret = dump_socket_map(vma_area);
-		else
-			ret = 0;
-		if (ret)
-			goto err_free;
-
-		mme.vmas[i++] = vma;
-
-		if (vma_entry_is(vma, VMA_AREA_AIORING)) {
-			ret = dump_aio_ring(&mme, vma_area);
-			if (ret)
-				goto err_free;
-		}
-	}
-
-	/*
-	 * The loop leaves ret at 0; re-arm it so that the bare
-	 * "goto err_free" failures below are not reported as success.
-	 */
-	ret = -1;
-
-	mme.mm_start_code = stat->start_code;
-	mme.mm_end_code = stat->end_code;
-	mme.mm_start_data = stat->start_data;
-	mme.mm_end_data = stat->end_data;
-	mme.mm_start_stack = stat->start_stack;
-	mme.mm_start_brk = stat->start_brk;
-
-	mme.mm_arg_start = stat->arg_start;
-	mme.mm_arg_end = stat->arg_end;
-	mme.mm_env_start = stat->env_start;
-	mme.mm_env_end = stat->env_end;
-
-	mme.mm_brk = misc->brk;
-
-	mme.dumpable = misc->dumpable;
-	mme.has_dumpable = true;
-
-	mme.n_mm_saved_auxv = AT_VECTOR_SIZE;
-	mme.mm_saved_auxv = xmalloc(pb_repeated_size(&mme, mm_saved_auxv));
-	if (!mme.mm_saved_auxv)
-		goto err_free;
-
-	if (get_task_auxv(pid, &mme))
-		goto err_free;
-
-	if (dump_task_exe_link(pid, &mme))
-		goto err_free;
-
-	ret = pb_write_one(img_from_set(imgset, CR_FD_MM), &mme, PB_MM);
-	free_aios(&mme);
-err_free:
-	/*
-	 * Only the arrays are owned here; the VmaEntry objects belong to
-	 * the vma_area list.  mm_saved_auxv may still be NULL.
-	 * NOTE(review): AIO entries added before a failure are not
-	 * released on the error paths -- confirm whether free_aios() is
-	 * safe to call on a partially filled entry.
-	 */
-	xfree(mme.mm_saved_auxv);
-	xfree(mme.vmas);
-err:
-	return ret;
-}
-
-/*
- * Fetch the futex robust list head and length of thread @pid and store
- * them in @info (futex_rla, futex_rla_len).
- *
- * Returns 0 on success, -1 (with an error logged) on failure.
- */
-static int get_task_futex_robust_list(pid_t pid, ThreadCoreEntry *info)
-{
-	struct robust_list_head *head = NULL;
-	size_t len = 0;
-	int ret;
-
-	/*
-	 * syscall() reports failure as -1 with the reason in errno; it
-	 * never returns a negated errno value itself.
-	 */
-	ret = syscall(SYS_get_robust_list, pid, &head, &len);
-	if (ret < 0 && errno == ENOSYS) {
-		/*
-		 * If the kernel says get_robust_list is not implemented, then
-		 * check whether set_robust_list is also not implemented, in
-		 * that case we can assume it is empty, since set_robust_list
-		 * is the only way to populate it. This case is possible when
-		 * "futex_cmpxchg_enabled" is unset in the kernel.
-		 *
-		 * The following system call should always fail, even if it is
-		 * implemented, in which case it will fail with EINVAL because
-		 * len should be greater than zero.
-		 */
-		if (syscall(SYS_set_robust_list, NULL, 0) >= 0 || errno != ENOSYS)
-			goto err;
-
-		head = NULL;
-		len = 0;
-	} else if (ret) {
-		goto err;
-	}
-
-	info->futex_rla = encode_pointer(head);
-	info->futex_rla_len = (u32)len;
-
-	return 0;
-
-err:
-	pr_err("Failed obtaining futex robust list on %d\n", pid);
-	return -1;
-}
-
-/*
- * Read /proc/<pid>/personality into *@personality.
- *
- * Returns a negative value when the file can't be opened or read and
- * the (non-negative) number of bytes read otherwise; callers only test
- * for "< 0".  *@personality is left untouched on failure.
- */
-static int get_task_personality(pid_t pid, u32 *personality)
-{
-	int fd, ret = -1;
-
-	pr_info("Obtaining personality ... \n");
-
-	fd = open_proc(pid, "personality");
-	if (fd < 0)
-		goto err;
-
-	ret = read(fd, loc_buf, sizeof(loc_buf) - 1);
-	close(fd);
-
-	if (ret >= 0) {
-		loc_buf[ret] = '\0';
-		/*
-		 * The kernel prints the value in hexadecimal without a
-		 * "0x" prefix, so it must not be parsed as decimal.
-		 */
-		*personality = (u32)strtoul(loc_buf, NULL, 16);
-	}
-err:
-	return ret;
-}
-
-/* One kcmp tree per kernel object type that gets an id below. */
-static DECLARE_KCMP_TREE(vm_tree, KCMP_VM);
-static DECLARE_KCMP_TREE(fs_tree, KCMP_FS);
-static DECLARE_KCMP_TREE(files_tree, KCMP_FILES);
-static DECLARE_KCMP_TREE(sighand_tree, KCMP_SIGHAND);
-
-/*
- * Generate the VM, FS, FILES and SIGHAND object ids of @item and store
- * them in item->ids.
- *
- * VM, FS and SIGHAND ids must be newly generated; a FILES id that
- * already exists is accepted only when shared_fdtable(item) is true.
- * Returns 0 on success, -1 (with an error logged) otherwise.
- */
-static int dump_task_kobj_ids(struct pstree_item *item)
-{
-	int new;
-	struct kid_elem elem;
-	int pid = item->pid.real;
-	TaskKobjIdsEntry *ids = item->ids;
-
-	elem.pid = pid;
-	elem.idx = 0; /* really 0 for all */
-	elem.genid = 0; /* FIXME optimize */
-
-	new = 0;
-	ids->vm_id = kid_generate_gen(&vm_tree, &elem, &new);
-	if (!ids->vm_id || !new) {
-		pr_err("Can't make VM id for %d\n", pid);
-		return -1;
-	}
-
-	new = 0;
-	ids->fs_id = kid_generate_gen(&fs_tree, &elem, &new);
-	if (!ids->fs_id || !new) {
-		pr_err("Can't make FS id for %d\n", pid);
-		return -1;
-	}
-
-	new = 0;
-	ids->files_id = kid_generate_gen(&files_tree, &elem, &new);
-	if (!ids->files_id || (!new && !shared_fdtable(item))) {
-		pr_err("Can't make FILES id for %d\n", pid);
-		return -1;
-	}
-
-	new = 0;
-	ids->sighand_id = kid_generate_gen(&sighand_tree, &elem, &new);
-	if (!ids->sighand_id || !new) {
-		/* name the object that actually failed, not "IO" */
-		pr_err("Can't make SIGHAND id for %d\n", pid);
-		return -1;
-	}
-
-	return 0;
-}
-
-/*
- * Allocate and initialise item->ids and, for tasks that are not dead,
- * fill in the kernel object ids and the namespace ids.
- *
- * Returns 0 on success.  On failure returns -1; if the ids were
- * already allocated they are freed and item->ids is reset to NULL.
- */
-int get_task_ids(struct pstree_item *item)
-{
-	item->ids = xmalloc(sizeof(*item->ids));
-	if (!item->ids)
-		return -1;
-
-	task_kobj_ids_entry__init(item->ids);
-
-	/* dead tasks keep the freshly initialised entry as is */
-	if (item->state == TASK_DEAD)
-		return 0;
-
-	if (dump_task_kobj_ids(item) || dump_task_ns_ids(item)) {
-		xfree(item->ids);
-		item->ids = NULL;
-		return -1;
-	}
-
-	return 0;
-}
-
-/* Write item->ids into the CR_FD_IDS image of @cr_imgset. */
-static int dump_task_ids(struct pstree_item *item, const struct cr_imgset *cr_imgset)
-{
-	struct cr_img *ids_img = img_from_set(cr_imgset, CR_FD_IDS);
-
-	return pb_write_one(ids_img, item->ids, PB_IDS);
-}
-
-/*
- * Fill the thread part of @core for thread @pid: LSM profile, futex
- * robust list, scheduler info and the values the parasite reported in
- * @ti (TLS, clear_tid address, sigaltstack, pdeath signal).
- *
- * Stops at the first failing step and returns its result; 0 on success.
- */
-int dump_thread_core(int pid, CoreEntry *core, const struct parasite_dump_thread *ti)
-{
-	ThreadCoreEntry *tc = core->thread_core;
-	int ret;
-
-	ret = collect_lsm_profile(pid, tc->creds);
-	if (ret)
-		return ret;
-
-	ret = get_task_futex_robust_list(pid, tc);
-	if (ret)
-		return ret;
-
-	ret = dump_sched_info(pid, tc);
-	if (ret)
-		return ret;
-
-	core_put_tls(core, ti->tls);
-	CORE_THREAD_ARCH_INFO(core)->clear_tid_addr = encode_pointer(ti->tid_addr);
-	BUG_ON(!tc->sas);
-	copy_sas(tc->sas, &ti->sas);
-
-	/* the signal is recorded only when one is actually set */
-	if (ti->pdeath_sig) {
-		tc->has_pdeath_sig = true;
-		tc->pdeath_sig = ti->pdeath_sig;
-	}
-
-	return 0;
-}
-
-/*
- * Fill and write the core image of the thread-group leader of @item:
- * personality, seccomp state, comm/flags/state from @stat, the leader's
- * thread core (via the parasite), misc values, rlimits and cgroup set.
- *
- * Returns 0 on success, non-zero on the first failing step.
- */
-static int dump_task_core_all(struct parasite_ctl *ctl,
-			      struct pstree_item *item,
-			      const struct proc_pid_stat *stat,
-			      const struct cr_imgset *cr_imgset)
-{
-	CoreEntry *core = item->core[0];
-	pid_t pid = item->pid.real;
-	struct proc_status_creds *pc;
-	int ret;
-
-	pr_info("\n");
-	pr_info("Dumping core (pid: %d)\n", pid);
-	pr_info("----------------------------------------\n");
-
-	ret = get_task_personality(pid, &core->tc->personality);
-	if (ret < 0)
-		goto out;
-
-	pc = dmpi(item)->pi_creds;
-	if (pc->seccomp_mode != SECCOMP_MODE_DISABLED) {
-		pr_info("got seccomp mode %d for %d\n", pc->seccomp_mode, item->pid.virt);
-		core->tc->has_seccomp_mode = true;
-		core->tc->seccomp_mode = pc->seccomp_mode;
-
-		if (pc->seccomp_mode == SECCOMP_MODE_FILTER) {
-			core->tc->has_seccomp_filter = true;
-			core->tc->seccomp_filter = pc->last_filter;
-		}
-	}
-
-	strlcpy((char *)core->tc->comm, stat->comm, TASK_COMM_LEN);
-	core->tc->flags = stat->flags;
-	core->tc->task_state = item->state;
-	core->tc->exit_code = 0;
-
-	ret = parasite_dump_thread_leader_seized(ctl, pid, core);
-	if (!ret)
-		ret = dump_pid_misc(pid, core->tc);
-	if (!ret)
-		ret = dump_task_rlimits(pid, core->tc->rlimits);
-	if (ret)
-		goto out;
-
-	core->tc->has_cg_set = true;
-	ret = dump_task_cgroup(item, &core->tc->cg_set);
-	if (ret)
-		goto out;
-
-	ret = pb_write_one(img_from_set(cr_imgset, CR_FD_CORE), core, PB_CORE);
-out:
-	pr_info("----------------------------------------\n");
-
-	return ret;
-}
-
-/*
- * Pre-dump flavour of id collection: run predump_task_ns_ids() for a
- * stand-in item describing the criu process itself and then for every
- * task in the tree that is not dead.
- *
- * Returns 0 on success, -1 on the first failure.
- */
-static int collect_pstree_ids_predump(void)
-{
-	struct pstree_item *item;
-	struct {
-		struct pstree_item i;
-		struct dmp_info d;
-	} crt = { };
-
-	/*
-	 * This thing is normally done inside
-	 * write_img_inventory().
-	 */
-
-	crt.i.pid.real = getpid();
-	crt.i.state = TASK_ALIVE;
-
-	if (predump_task_ns_ids(&crt.i))
-		return -1;
-
-	for_each_pstree_item(item) {
-		if (item->state != TASK_DEAD && predump_task_ns_ids(item))
-			return -1;
-	}
-
-	return 0;
-}
-
-/*
- * Run get_task_ids() on every item of the process tree.
- * Returns 0 on success, -1 as soon as one item fails.
- */
-int collect_pstree_ids(void)
-{
-	struct pstree_item *pi;
-
-	for_each_pstree_item(pi) {
-		if (get_task_ids(pi))
-			return -1;
-	}
-
-	return 0;
-}
-
-/* Thin wrapper: collecting file locks is just parse_file_locks(). */
-static int collect_file_locks(void)
-{
-	int ret = parse_file_locks();
-
-	return ret;
-}
-
-/*
- * Dump the core of the non-leader thread number @id of @item: collect
- * the thread state via the parasite and write it into a CR_FD_CORE
- * image opened for the thread's virtual pid.
- *
- * Returns 0 on success, non-zero on failure.
- */
-static int dump_task_thread(struct parasite_ctl *parasite_ctl,
-				const struct pstree_item *item, int id)
-{
-	struct pid *tid = &item->threads[id];
-	CoreEntry *core = item->core[id];
-	pid_t pid = tid->real;
-	int ret = -1;
-	struct cr_img *img;
-
-	pr_info("\n");
-	pr_info("Dumping core for thread (pid: %d)\n", pid);
-	pr_info("----------------------------------------\n");
-
-	ret = parasite_dump_thread_seized(parasite_ctl, id, tid, core);
-	if (ret) {
-		pr_err("Can't dump thread for pid %d\n", pid);
-		goto err;
-	}
-
-	/*
-	 * ret is 0 at this point; re-arm it so that a failed
-	 * open_image() is not reported to the caller as success.
-	 */
-	ret = -1;
-	img = open_image(CR_FD_CORE, O_DUMP, tid->virt);
-	if (!img)
-		goto err;
-
-	ret = pb_write_one(img, core, PB_CORE);
-
-	close_image(img);
-err:
-	pr_info("----------------------------------------\n");
-	return ret;
-}
-
-/*
- * Write the core image of a dead task: only the command name, the
- * TASK_DEAD state and the exit code taken from @pps are stored.
- *
- * Returns the pb_write_one() result, or -1 when the core entry or the
- * image can't be obtained.
- */
-static int dump_one_zombie(const struct pstree_item *item,
-			   const struct proc_pid_stat *pps)
-{
-	struct cr_img *img;
-	CoreEntry *core;
-	int ret = -1;
-
-	core = core_entry_alloc(0, 1);
-	if (!core)
-		return -1;
-
-	strlcpy((char *)core->tc->comm, pps->comm, TASK_COMM_LEN);
-	core->tc->task_state = TASK_DEAD;
-	core->tc->exit_code = pps->exit_code;
-
-	img = open_image(CR_FD_CORE, O_DUMP, item->pid.virt);
-	if (img) {
-		ret = pb_write_one(img, core, PB_CORE);
-		close_image(img);
-	}
-
-	core_entry_free(core);
-	return ret;
-}
-
-/* Number of siginfo entries requested per PTRACE_PEEKSIGINFO call. */
-#define SI_BATCH 32
-
-/*
- * Read the pending signals of @tid with PTRACE_PEEKSIGINFO and store
- * them in a newly allocated queue returned through @sqe.  @group
- * selects the shared (process-wide) queue instead of the private one.
- *
- * A kernel without PTRACE_PEEKSIGINFO (EIO) is not an error: a warning
- * is logged and whatever was collected is returned.
- *
- * Returns 0 on success and a negative value on failure.  Unless the
- * queue itself can't be allocated, *@sqe is set even on failure and
- * holds only fully initialised entries.
- */
-static int dump_signal_queue(pid_t tid, SignalQueueEntry **sqe, bool group)
-{
-	struct ptrace_peeksiginfo_args arg;
-	int ret;
-	SignalQueueEntry *queue = NULL;
-
-	pr_debug("Dump %s signals of %d\n", group ? "shared" : "private", tid);
-
-	arg.nr = SI_BATCH;
-	arg.flags = 0;
-	if (group)
-		arg.flags |= PTRACE_PEEKSIGINFO_SHARED;
-	arg.off = 0;
-
-	queue = xmalloc(sizeof(*queue));
-	if (!queue)
-		return -1;
-
-	signal_queue_entry__init(queue);
-
-	while (1) {
-		siginfo_t *si;
-		void *grown;
-		int nr, i;
-
-		si = xmalloc(SI_BATCH * sizeof(*si));
-		if (!si) {
-			ret = -1;
-			break;
-		}
-
-		nr = ret = ptrace(PTRACE_PEEKSIGINFO, tid, &arg, si);
-		if (ret <= 0) {
-			if (ret < 0) {
-				if (errno == EIO) {
-					pr_warn("ptrace doesn't support PTRACE_PEEKSIGINFO\n");
-					ret = 0;
-				} else
-					pr_perror("ptrace");
-			}
-
-			/* finished or failed: nothing points into this batch */
-			xfree(si);
-			break;
-		}
-
-		/* keep the old array reachable if the resize fails */
-		grown = xrealloc(queue->signals,
-				 sizeof(*queue->signals) * (queue->n_signals + nr));
-		if (!grown) {
-			xfree(si);
-			ret = -1;
-			break;
-		}
-		queue->signals = grown;
-
-		for (i = 0; i < nr; i++) {
-			SiginfoEntry *se;
-
-			se = xmalloc(sizeof(*se));
-			if (!se) {
-				ret = -1;
-				break;
-			}
-
-			siginfo_entry__init(se);
-			se->siginfo.len = sizeof(siginfo_t);
-			/*
-			 * XXX the entries point into the batch buffer, so
-			 * only the first one of a batch holds an address
-			 * that may be freed.  Cores are not freed today,
-			 * but this matters once they are.
-			 */
-			se->siginfo.data = (void *)&si[i];
-			/* count an entry only once its slot is filled */
-			queue->signals[queue->n_signals++] = se;
-		}
-
-		if (ret < 0) {
-			/* no entry took a reference to the batch */
-			if (i == 0)
-				xfree(si);
-			break;
-		}
-
-		arg.off += nr;
-	}
-
-	*sqe = queue;
-	return ret;
-}
-
-/*
- * Collect the pending signals of a task: the private queue of each
- * thread of @item first, then the shared queue of @pid, which is
- * stored in the leader's core.
- *
- * Returns 0 on success, -1 (with an error logged) on failure.
- */
-static int dump_task_signals(pid_t pid, struct pstree_item *item)
-{
-	int t;
-
-	/* per-thread (private) queues */
-	for (t = 0; t < item->nr_threads; t++) {
-		if (!dump_signal_queue(item->threads[t].real,
-				       &item->core[t]->thread_core->signals_p, false))
-			continue;
-
-		pr_err("Can't dump private signals for thread %d\n", item->threads[t].real);
-		return -1;
-	}
-
-	/* process-wide (shared) queue */
-	if (dump_signal_queue(pid, &item->core[0]->tc->signals_s, true)) {
-		pr_err("Can't dump shared signals (pid: %d)\n", pid);
-		return -1;
-	}
-
-	return 0;
-}
-
-/* Scratch buffer for parse_pid_stat() results used further below. */
-static struct proc_pid_stat pps_buf;
-
-/*
- * Dump the cores of all non-leader threads of @item.  The leader was
- * dumped before, so its thread slot just inherits the item's virtual
- * pid.
- *
- * Returns 0 on success, -1 when dumping a thread fails.
- */
-static int dump_task_threads(struct parasite_ctl *parasite_ctl,
-			     const struct pstree_item *item)
-{
-	int idx;
-
-	for (idx = 0; idx < item->nr_threads; idx++) {
-		struct pid *thr = &item->threads[idx];
-
-		if (thr->real == item->pid.real) {
-			/* Leader is already dumped */
-			thr->virt = item->pid.virt;
-		} else if (dump_task_thread(parasite_ctl, item, idx)) {
-			return -1;
-		}
-	}
-
-	return 0;
-}
-
-/*
- * What this routine does is just read the pids of dead
- * tasks in item's children list from item's ns proc.
- *
- * It does *not* find which real pid corresponds to
- * which virtual one, but it's not required -- all we
- * need to dump for zombie can be found in the same
- * ns proc.
- *
- * Returns 0 on success and -1 when the children of @item
- * can't be parsed.
- */
-
-static int fill_zombies_pids(struct pstree_item *item)
-{
- struct pstree_item *child;
- int i, nr;
- pid_t *ch;
-
- /*
- * Pids read here are virtual -- caller has set up
- * the proc of target pid namespace.
- */
- if (parse_children(item->pid.virt, &ch, &nr) < 0)
- return -1;
-
- /*
- * Step 1 -- filter out of ch the pids of alive tasks,
- * i.e. of children whose virtual pid is already known.
- */
- list_for_each_entry(child, &item->children, sibling) {
- if (child->pid.virt < 0)
- continue;
- for (i = 0; i < nr; i++) {
- if (ch[i] == child->pid.virt) {
- ch[i] = -1;
- break;
- }
- }
- }
-
- /*
- * Step 2 -- assign remaining pids from ch on
- * children's items in arbitrary order. The caller
- * will then re-read everything needed to dump
- * zombies using newly obtained virtual pids.
- *
- * The cursor i is deliberately shared between children:
- * a consumed slot is set to -1 and skipped next time.
- */
- i = 0;
- list_for_each_entry(child, &item->children, sibling) {
- if (child->pid.virt > 0)
- continue;
- for (; i < nr; i++) {
- if (ch[i] < 0)
- continue;
- child->pid.virt = ch[i];
- ch[i] = -1;
- break;
- }
- BUG_ON(i == nr);
- }
-
- xfree(ch);
-
- return 0;
-}
-
-/*
- * Dump every TASK_DEAD item of the process tree: resolve its virtual
- * pid when still unknown, read its stat, record sid/pgid on the item
- * and write the zombie's core image.
- *
- * Returns 0 on success and -1 on failure.
- */
-static int dump_zombies(void)
-{
- struct pstree_item *item;
- int ret = -1;
- int pidns = root_ns_mask & CLONE_NEWPID;
-
- if (pidns && set_proc_fd(get_service_fd(CR_PROC_FD_OFF)))
- return -1;
-
- /*
- * We dump zombies separately because for pid-ns case
- * we'd have to resolve their pids w/o parasite via
- * target ns' proc.
- */
-
- for_each_pstree_item(item) {
- if (item->state != TASK_DEAD)
- continue;
-
- if (item->pid.virt < 0) {
- if (!pidns)
- item->pid.virt = item->pid.real;
- else if (root_item == item) {
- pr_err("A root task is dead\n");
- goto err;
- } else if (fill_zombies_pids(item->parent))
- goto err;
- }
-
- pr_info("Obtaining zombie stat ... \n");
- if (parse_pid_stat(item->pid.virt, &pps_buf) < 0)
- goto err;
-
- item->sid = pps_buf.sid;
- item->pgid = pps_buf.pgid;
-
- BUG_ON(!list_empty(&item->children));
- if (dump_one_zombie(item, &pps_buf) < 0)
- goto err;
- }
-
- ret = 0;
-err:
- if (pidns)
- close_proc();
-
- return ret;
-}
-
-/*
- * Pre-dump one task: collect its mappings, infect it with the parasite,
- * fix up vdso VMAs, read misc values, pre-dump its files and drain its
- * memory into the parasite control's page pipe.  On success the control
- * block is queued on @ctls for the later memory write-out.
- *
- * Stopped and dead tasks are skipped (0 is returned).
- * Returns 0 on success, non-zero on failure.
- */
-static int pre_dump_one_task(struct pstree_item *item, struct list_head *ctls)
-{
-	pid_t pid = item->pid.real;
-	struct vm_area_list vmas;
-	struct parasite_ctl *parasite_ctl;
-	int ret = -1;
-	struct parasite_dump_misc misc;
-
-	INIT_LIST_HEAD(&vmas.h);
-	vmas.nr = 0;
-
-	pr_info("========================================\n");
-	pr_info("Pre-dumping task (pid: %d)\n", pid);
-	pr_info("========================================\n");
-
-	if (item->state == TASK_STOPPED) {
-		pr_warn("Stopped tasks are not supported\n");
-		return 0;
-	}
-
-	if (item->state == TASK_DEAD)
-		return 0;
-
-	ret = collect_mappings(pid, &vmas);
-	if (ret) {
-		pr_err("Collect mappings (pid: %d) failed with %d\n", pid, ret);
-		goto err;
-	}
-
-	ret = -1;
-	parasite_ctl = parasite_infect_seized(pid, item, &vmas);
-	if (!parasite_ctl) {
-		pr_err("Can't infect (pid: %d) with parasite\n", pid);
-		goto err_free;
-	}
-
-	ret = parasite_fixup_vdso(parasite_ctl, pid, &vmas);
-	if (ret) {
-		pr_err("Can't fixup vdso VMAs (pid: %d)\n", pid);
-		goto err_cure;
-	}
-
-	ret = parasite_dump_misc_seized(parasite_ctl, &misc);
-	if (ret) {
-		pr_err("Can't dump misc (pid: %d)\n", pid);
-		goto err_cure;
-	}
-
-	ret = predump_task_files(pid);
-	if (ret) {
-		pr_err("Pre-dumping files failed (pid: %d)\n", pid);
-		goto err_cure;
-	}
-
-	parasite_ctl->pid.virt = item->pid.virt = misc.pid;
-
-	/* the drained pages stay in the page pipe of the control block */
-	ret = parasite_dump_pages_seized(parasite_ctl, &vmas, &parasite_ctl->mem_pp);
-	if (ret)
-		goto err_cure;
-
-	/* only the remote side is cured here; ctl is kept for the write-out */
-	if (parasite_cure_remote(parasite_ctl))
-		pr_err("Can't cure (pid: %d) from parasite\n", pid);
-	list_add_tail(&parasite_ctl->pre_list, ctls);
-err_free:
-	free_mappings(&vmas);
-err:
-	return ret;
-
-err_cure:
-	if (parasite_cure_seized(parasite_ctl))
-		pr_err("Can't cure (pid: %d) from parasite\n", pid);
-	goto err_free;
-}
-
-/*
- * Dump one live task.  In order: read its stat and mappings, collect
- * fds (unless the fd table is shared) and posix timers, dump pending
- * signals, infect the task with the parasite, then dump ids, files,
- * pages, sigactions, timers and the core through it.  After the
- * parasite daemon is stopped the remaining threads are dumped, the
- * task is cured and finally mm and fs are written.
- *
- * Dead tasks are skipped here (they are handled by dump_zombies()).
- * Returns 0 on success and -1 on any failure; intermediate "ret"
- * values are used only for logging.
- */
-static int dump_one_task(struct pstree_item *item)
-{
- pid_t pid = item->pid.real;
- struct vm_area_list vmas;
- struct parasite_ctl *parasite_ctl;
- int ret, exit_code = -1;
- struct parasite_dump_misc misc;
- struct cr_imgset *cr_imgset = NULL;
- struct parasite_drain_fd *dfds = NULL;
- struct proc_posix_timers_stat proc_args;
-
- INIT_LIST_HEAD(&vmas.h);
- vmas.nr = 0;
-
- pr_info("========================================\n");
- pr_info("Dumping task (pid: %d)\n", pid);
- pr_info("========================================\n");
-
- if (item->state == TASK_DEAD)
- /*
- * zombies are dumped separately in dump_zombies()
- */
- return 0;
-
- pr_info("Obtaining task stat ... \n");
- ret = parse_pid_stat(pid, &pps_buf);
- if (ret < 0)
- goto err;
-
- ret = collect_mappings(pid, &vmas);
- if (ret) {
- pr_err("Collect mappings (pid: %d) failed with %d\n", pid, ret);
- goto err;
- }
-
- if (!shared_fdtable(item)) {
- dfds = xmalloc(sizeof(*dfds));
- if (!dfds)
- goto err;
-
- ret = collect_fds(pid, dfds);
- if (ret) {
- pr_err("Collect fds (pid: %d) failed with %d\n", pid, ret);
- goto err;
- }
-
- parasite_ensure_args_size(drain_fds_size(dfds));
- }
-
- ret = parse_posix_timers(pid, &proc_args);
- if (ret < 0) {
- pr_err("Can't read posix timers file (pid: %d)\n", pid);
- goto err;
- }
-
- parasite_ensure_args_size(posix_timers_dump_size(proc_args.timer_n));
-
- ret = dump_task_signals(pid, item);
- if (ret) {
- pr_err("Dump %d signals failed %d\n", pid, ret);
- goto err;
- }
-
- parasite_ctl = parasite_infect_seized(pid, item, &vmas);
- if (!parasite_ctl) {
- pr_err("Can't infect (pid: %d) with parasite\n", pid);
- goto err;
- }
-
- if (fault_injected(FI_DUMP_EARLY)) {
- pr_info("fault: CRIU sudden detach\n");
- BUG();
- }
-
- if (root_ns_mask & CLONE_NEWPID && root_item == item) {
- int pfd;
-
- pfd = parasite_get_proc_fd_seized(parasite_ctl);
- if (pfd < 0) {
- pr_err("Can't get proc fd (pid: %d)\n", pid);
- goto err_cure_imgset;
- }
-
- if (install_service_fd(CR_PROC_FD_OFF, pfd) < 0)
- goto err_cure_imgset;
-
- close(pfd);
- }
-
- ret = parasite_fixup_vdso(parasite_ctl, pid, &vmas);
- if (ret) {
- pr_err("Can't fixup vdso VMAs (pid: %d)\n", pid);
- goto err_cure_imgset;
- }
-
- ret = parasite_check_aios(parasite_ctl, &vmas); /* FIXME -- merge with above */
- if (ret) {
- pr_err("Failed to check aio rings (pid: %d)\n", pid);
- goto err_cure_imgset;
- }
-
- ret = parasite_dump_misc_seized(parasite_ctl, &misc);
- if (ret) {
- pr_err("Can't dump misc (pid: %d)\n", pid);
- goto err_cure_imgset;
- }
-
- parasite_ctl->pid.virt = item->pid.virt = misc.pid;
- item->sid = misc.sid;
- item->pgid = misc.pgid;
-
- pr_info("sid=%d pgid=%d pid=%d\n",
- item->sid, item->pgid, item->pid.virt);
-
- if (item->sid == 0) {
- pr_err("A session leader of %d(%d) is outside of its pid namespace\n",
- item->pid.real, item->pid.virt);
- goto err_cure;
- }
-
- cr_imgset = cr_task_imgset_open(item->pid.virt, O_DUMP);
- if (!cr_imgset)
- goto err_cure;
-
- ret = dump_task_ids(item, cr_imgset);
- if (ret) {
- pr_err("Dump ids (pid: %d) failed with %d\n", pid, ret);
- goto err_cure;
- }
-
- if (dfds) {
- ret = dump_task_files_seized(parasite_ctl, item, dfds);
- if (ret) {
- pr_err("Dump files (pid: %d) failed with %d\n", pid, ret);
- goto err_cure;
- }
- }
-
- ret = parasite_dump_pages_seized(parasite_ctl, &vmas, NULL);
- if (ret)
- goto err_cure;
-
- ret = parasite_dump_sigacts_seized(parasite_ctl, cr_imgset);
- if (ret) {
- pr_err("Can't dump sigactions (pid: %d) with parasite\n", pid);
- goto err_cure;
- }
-
- ret = parasite_dump_itimers_seized(parasite_ctl, item);
- if (ret) {
- pr_err("Can't dump itimers (pid: %d)\n", pid);
- goto err_cure;
- }
-
- ret = parasite_dump_posix_timers_seized(&proc_args, parasite_ctl, item);
- if (ret) {
- pr_err("Can't dump posix timers (pid: %d)\n", pid);
- goto err_cure;
- }
-
- ret = dump_task_core_all(parasite_ctl, item, &pps_buf, cr_imgset);
- if (ret) {
- pr_err("Dump core (pid: %d) failed with %d\n", pid, ret);
- goto err_cure;
- }
-
- /*
- * NOTE(review): from here on failures jump to "err", which does
- * not call close_cr_imgset() -- looks like the image set stays
- * open on these paths; confirm whether that is intended.
- */
- ret = parasite_stop_daemon(parasite_ctl);
- if (ret) {
- pr_err("Can't cure (pid: %d) from parasite\n", pid);
- goto err;
- }
-
- ret = dump_task_threads(parasite_ctl, item);
- if (ret) {
- pr_err("Can't dump threads\n");
- goto err;
- }
-
- ret = parasite_cure_seized(parasite_ctl);
- if (ret) {
- pr_err("Can't cure (pid: %d) from parasite\n", pid);
- goto err;
- }
-
- ret = dump_task_mm(pid, &pps_buf, &misc, &vmas, cr_imgset);
- if (ret) {
- pr_err("Dump mappings (pid: %d) failed with %d\n", pid, ret);
- goto err;
- }
-
- ret = dump_task_fs(pid, &misc, cr_imgset);
- if (ret) {
- pr_err("Dump fs (pid: %d) failed with %d\n", pid, ret);
- goto err;
- }
-
- close_cr_imgset(&cr_imgset);
- exit_code = 0;
-err:
- close_pid_proc();
- free_mappings(&vmas);
- xfree(dfds);
- return exit_code;
-
-err_cure:
- close_cr_imgset(&cr_imgset);
-err_cure_imgset:
- parasite_cure_seized(parasite_ctl);
- goto err;
-}
-
-/* Signature of a plain (non-siginfo) signal handler. */
-typedef void (*sa_handler_t)(int);
-
-/*
- * Install @handler for SIGALRM with no special flags; SIGALRM itself
- * is put into the handler's signal mask.
- *
- * Returns 0 on success, -1 (with the error logged) if sigaction() fails.
- */
-static int setup_alarm_handler(sa_handler_t handler)
-{
-	struct sigaction act = {
-		.sa_flags	= 0,
-		.sa_handler	= handler,
-	};
-
-	sigemptyset(&act.sa_mask);
-	sigaddset(&act.sa_mask, SIGALRM);
-
-	if (sigaction(SIGALRM, &act, NULL) == 0)
-		return 0;
-
-	pr_perror("Unable to setup SIGALRM handler");
-	return -1;
-}
-
-/*
- * Common tail of a pre-dump.  Switches the tree to its final state
- * (TASK_ALIVE when @ret reports a failure, opts.final_state otherwise),
- * frees the tree, then writes out the memory queued on @ctls and
- * finishes irmap, page-server and image handling.
- *
- * Returns 0 on success and non-zero when @ret was non-zero or any of
- * the finishing steps failed.
- */
-static int cr_pre_dump_finish(struct list_head *ctls, int ret)
-{
- struct parasite_ctl *ctl, *n;
-
- pstree_switch_state(root_item,
- ret ? TASK_ALIVE : opts.final_state);
- free_pstree(root_item);
-
- timing_stop(TIME_FROZEN);
-
- pr_info("Pre-dumping tasks' memory\n");
- list_for_each_entry_safe(ctl, n, ctls, pre_list) {
- struct page_xfer xfer;
-
- pr_info("\tPre-dumping %d\n", ctl->pid.virt);
- timing_start(TIME_MEMWRITE);
- ret = open_page_xfer(&xfer, CR_FD_PAGEMAP, ctl->pid.virt);
- if (ret < 0)
- break;
-
- ret = page_xfer_dump_pages(&xfer, ctl->mem_pp, 0);
-
- xfer.close(&xfer);
-
- /*
- * NOTE(review): both "break"s leave the loop without the
- * matching timing_stop(TIME_MEMWRITE) and without releasing
- * the current and remaining ctls -- confirm this is acceptable
- * on the failure path.
- */
- if (ret)
- break;
-
- timing_stop(TIME_MEMWRITE);
-
- destroy_page_pipe(ctl->mem_pp);
- list_del(&ctl->pre_list);
- parasite_cure_local(ctl);
- }
-
- if (irmap_predump_run())
- ret = -1;
-
- if (disconnect_from_page_server())
- ret = -1;
-
- if (bfd_flush_images())
- ret = -1;
-
- if (ret)
- pr_err("Pre-dumping FAILED.\n");
- else {
- write_stats(DUMP_STATS);
- pr_info("Pre-dumping finished successfully\n");
- }
- return ret;
-}
-
-/*
- * SIGALRM handler for the pre-dump timeout: log it, run the pre-dump
- * tail with an empty ctl list and a failure code, then terminate.
- */
-void pre_dump_alarm_handler(int signum)
-{
-	LIST_HEAD(no_ctls);
-
-	pr_err("Timeout reached\n");
-	cr_pre_dump_finish(&no_ctls, -1);
-	exit(-1);
-}
-
-/*
- * Entry point of the pre-dump action for the tree rooted at @pid.
- *
- * Memory tracking is forced on and a final state of TASK_DEAD is
- * replaced with TASK_ALIVE.  After the init chain every task is
- * pre-dumped; cr_pre_dump_finish() always runs and its result is
- * returned.
- */
-int cr_pre_dump_tasks(pid_t pid)
-{
- struct pstree_item *item;
- int ret = -1;
- LIST_HEAD(ctls);
-
- if (!opts.track_mem) {
- pr_info("Enforcing memory tracking for pre-dump.\n");
- opts.track_mem = true;
- }
-
- if (opts.final_state == TASK_DEAD) {
- pr_info("Enforcing tasks run after pre-dump.\n");
- opts.final_state = TASK_ALIVE;
- }
-
- if (init_stats(DUMP_STATS))
- goto err;
-
- if (cr_plugin_init(CR_PLUGIN_STAGE__PRE_DUMP))
- goto err;
-
- if (kerndat_init())
- goto err;
-
- if (irmap_load_cache())
- goto err;
-
- if (cpu_init())
- goto err;
-
- if (vdso_init())
- goto err;
-
- if (connect_to_page_server())
- goto err;
-
- if (setup_alarm_handler(pre_dump_alarm_handler))
- goto err;
-
- if (collect_pstree(pid))
- goto err;
-
- if (collect_pstree_ids_predump())
- goto err;
-
- if (collect_namespaces(false) < 0)
- goto err;
-
- for_each_pstree_item(item)
- if (pre_dump_one_task(item, &ctls))
- goto err;
-
- if (irmap_predump_prep())
- goto err;
-
- ret = 0;
-err:
- return cr_pre_dump_finish(&ctls, ret);
-}
-
-/*
- * Common tail of a dump: flush and close images, run the post-dump
- * scripts on success, switch the tree to its final state and release
- * the resources collected during the dump.
- *
- * Returns the post-dump script's exit status when it is non-zero,
- * otherwise 1 if @ret (or a finishing step) reported a failure and
- * 0 on success.
- */
-static int cr_dump_finish(int ret)
-{
- int post_dump_ret = 0;
-
- if (disconnect_from_page_server())
- ret = -1;
-
- close_cr_imgset(&glob_imgset);
-
- if (bfd_flush_images())
- ret = -1;
-
- cr_plugin_fini(CR_PLUGIN_STAGE__DUMP, ret);
-
- if (!ret) {
- /*
- * It might be a migration case, where we're asked
- * to dump everything, then some script transfers the
- * image to a new node and we're supposed to kill the
- * dumpee because it continues running somewhere
- * else.
- *
- * Thus ask user via script if we're to break
- * checkpoint.
- */
- post_dump_ret = run_scripts(ACT_POST_DUMP);
- if (post_dump_ret) {
- post_dump_ret = WEXITSTATUS(post_dump_ret);
- pr_info("Post dump script passed with %d\n", post_dump_ret);
- }
- }
-
- /*
- * Dump is complete at this stage. To choose what
- * to do next we need to consider the following
- * scenarios
- *
- * - error happened during checkpoint: just clean up
- * everything and continue execution of the dumpee;
- *
- * - dump succeeded but post-dump script returned
- * some ret code: same as in previous scenario --
- * just clean up everything and continue execution,
- * we will return script ret code back to criu caller
- * and it's up to a caller what to do with running instance
- * of the dumpee -- either kill it, or continue running;
- *
- * - dump succeeded but -R option passed, pointing that
- * we're asked to continue execution of the dumpee. It's
- * assumed that a user will use post-dump script to keep
- * consistency of the FS and other resources, we simply
- * start rollback procedure and cleanup everything.
- */
- if (ret || post_dump_ret || opts.final_state == TASK_ALIVE) {
- network_unlock();
- delete_link_remaps();
- }
- pstree_switch_state(root_item,
- (ret || post_dump_ret) ?
- TASK_ALIVE : opts.final_state);
- timing_stop(TIME_FROZEN);
- free_pstree(root_item);
- free_file_locks();
- free_link_remaps();
- free_aufs_branches();
- free_userns_maps();
-
- close_service_fd(CR_PROC_FD_OFF);
-
- if (ret) {
- pr_err("Dumping FAILED.\n");
- } else {
- write_stats(DUMP_STATS);
- pr_info("Dumping finished successfully\n");
- }
- return post_dump_ret ? : (ret != 0);
-}
-
-/*
- * SIGALRM handler for the dump timeout: log it, run the dump tail
- * with a failure code and terminate the process.
- */
-void dump_alarm_handler(int signum)
-{
-	pr_err("Timeout reached\n");
-
-	cr_dump_finish(-1);
-
-	exit(-1);
-}
-
-/*
- * Entry point of the dump action for the process tree rooted at @pid.
- *
- * Runs the pre-dump scripts and the init chain, collects the tree and
- * the per-tree state, dumps every task and then the tree-wide objects.
- * cr_dump_finish() always runs and its result is returned.
- *
- * "ret" stays -1 through the bare "goto err" part of the chain and
- * carries the step's own result from dump_cgroups() on.
- */
-int cr_dump_tasks(pid_t pid)
-{
- InventoryEntry he = INVENTORY_ENTRY__INIT;
- struct pstree_item *item;
- int pre_dump_ret = 0;
- int ret = -1;
-
- pr_info("========================================\n");
- pr_info("Dumping processes (pid: %d)\n", pid);
- pr_info("========================================\n");
-
- pre_dump_ret = run_scripts(ACT_PRE_DUMP);
- if (pre_dump_ret != 0) {
- pr_err("Pre dump script failed with %d!\n", pre_dump_ret);
- goto err;
- }
- if (init_stats(DUMP_STATS))
- goto err;
-
- if (cr_plugin_init(CR_PLUGIN_STAGE__DUMP))
- goto err;
-
- if (kerndat_init())
- goto err;
-
- if (irmap_load_cache())
- goto err;
-
- if (cpu_init())
- goto err;
-
- if (vdso_init())
- goto err;
-
- if (parse_cg_info())
- goto err;
-
- if (prepare_inventory(&he))
- goto err;
-
- if (opts.cpu_cap & (CPU_CAP_CPU | CPU_CAP_INS)) {
- if (cpu_dump_cpuinfo())
- goto err;
- }
-
- if (connect_to_page_server())
- goto err;
-
- if (setup_alarm_handler(dump_alarm_handler))
- goto err;
-
- /*
- * The collect_pstree will also stop (PTRACE_SEIZE) the tasks
- * thus ensuring that they don't modify anything we collect
- * afterwards.
- */
-
- if (collect_pstree(pid))
- goto err;
-
- if (collect_pstree_ids())
- goto err;
-
- if (network_lock())
- goto err;
-
- if (collect_file_locks())
- goto err;
-
- if (collect_namespaces(true) < 0)
- goto err;
-
- glob_imgset = cr_glob_imgset_open(O_DUMP);
- if (!glob_imgset)
- goto err;
-
- if (collect_seccomp_filters() < 0)
- goto err;
-
- for_each_pstree_item(item) {
- if (dump_one_task(item))
- goto err;
- }
-
- /* MNT namespaces are dumped after files to save remapped links */
- if (dump_mnt_namespaces() < 0)
- goto err;
-
- if (dump_file_locks())
- goto err;
-
- if (dump_verify_tty_sids())
- goto err;
-
- if (dump_zombies())
- goto err;
-
- if (dump_pstree(root_item))
- goto err;
-
- if (root_ns_mask)
- if (dump_namespaces(root_item, root_ns_mask) < 0)
- goto err;
-
- ret = dump_cgroups();
- if (ret)
- goto err;
-
- ret = cr_dump_shmem();
- if (ret)
- goto err;
-
- ret = fix_external_unix_sockets();
- if (ret)
- goto err;
-
- ret = tty_verify_active_pairs();
- if (ret)
- goto err;
-
- ret = write_img_inventory(&he);
- if (ret)
- goto err;
-err:
- return cr_dump_finish(ret);
-}
diff --git a/cr-errno.c b/cr-errno.c
deleted file mode 100644
index b62bb545a174..000000000000
--- a/cr-errno.c
+++ /dev/null
@@ -1,12 +0,0 @@
-static int cr_errno;
-
-int get_cr_errno(void)
-{
- return cr_errno;
-}
-
-void set_cr_errno(int new_err)
-{
- if (!cr_errno)
- cr_errno = new_err;
-}
diff --git a/cr-exec.c b/cr-exec.c
deleted file mode 100644
index 8beb80f88914..000000000000
--- a/cr-exec.c
+++ /dev/null
@@ -1,170 +0,0 @@
-#include <unistd.h>
-#include <string.h>
-#include <stdlib.h>
-#include "crtools.h"
-#include "ptrace.h"
-#include "parasite-syscall.h"
-#include "vma.h"
-#include "log.h"
-
-/* One executable syscall: its name and its number. */
-struct syscall_exec_desc {
-	char *name;
-	unsigned nr;
-};
-
-/*
- * Table of known syscalls, generated by expanding SYSCALL() over
- * sys-exec-tbl.c and closed by an all-zero entry.
- */
-static struct syscall_exec_desc sc_exec_table[] = {
-#define SYSCALL(__name, __nr) { .name = #__name, .nr = __nr, },
-#include "sys-exec-tbl.c"
-#undef SYSCALL
-	{ }, /* terminator */
-};
-
-/*
- * Look @name up in sc_exec_table.  Returns the matching descriptor or
- * NULL when the terminator is reached without a match.
- */
-static struct syscall_exec_desc *find_syscall(char *name)
-{
-	struct syscall_exec_desc *desc;
-
-	for (desc = sc_exec_table; desc->name != NULL; desc++) {
-		if (strcmp(desc->name, name) == 0)
-			return desc;
-	}
-
-	return NULL;
-}
-
-/* Upper bound on the number of syscall arguments taken from @opt. */
-#define MAX_ARGS 6
-
-/*
- * Execute syscall @scd in the seized task behind @ctl, with arguments
- * parsed from the NULL-terminated (or MAX_ARGS long) string array @opt.
- *
- * Plain arguments are converted with strtol() (base auto-detected).
- * "&foo" and "@<size>" arguments are placed in the memory area shared
- * with the task, which is mapped (one page) on first use, and the
- * task-side address of the slot is passed instead.
- *
- * Returns 0 on success, -1 on a bad argument and the helper's error
- * code when mapping the area or running the syscall fails.
- */
-static int execute_syscall(struct parasite_ctl *ctl,
- struct syscall_exec_desc *scd, char **opt)
-{
- int i, err;
- unsigned long args[MAX_ARGS] = {}, ret, r_mem_size = 0;
- unsigned int ret_args[MAX_ARGS] = {};
- void *r_mem = NULL;
-
- for (i = 0; i < MAX_ARGS; i++) {
- if (opt[i] == NULL)
- break;
-
- /*
- * &foo -- argument string "foo"
- * @<size> -- ret-arg of size <size>
- */
-
- if ((opt[i][0] == '&') || (opt[i][0] == '@')) {
- int len;
-
- if (!r_mem) {
- err = parasite_map_exchange(ctl, PAGE_SIZE);
- if (err)
- return err;
-
- r_mem_size = PAGE_SIZE;
- r_mem = ctl->local_map;
- }
-
- if (opt[i][0] == '&') {
- /*
- * strlen() counts the leading '&', so len is exactly
- * the size of the payload plus its terminating NUL.
- */
- len = strlen(opt[i]);
- if (r_mem_size < len) {
- pr_err("Arg size overflow\n");
- return -1;
- }
-
- memcpy(r_mem, opt[i] + 1, len);
- } else {
- len = strtol(opt[i] + 1, NULL, 0);
- if (!len || (r_mem_size < len)) {
- pr_err("Bad argument size %d\n", len);
- return -1;
- }
-
- ret_args[i] = len;
- }
-
- /* translate the local slot address into the task's mapping */
- args[i] = (unsigned long)ctl->remote_map + (r_mem - ctl->local_map);
- pr_info("Pushing %c mem arg [%s]\n", opt[i][0], (char *)r_mem);
- r_mem_size -= len;
- r_mem += len;
- } else
- args[i] = strtol(opt[i], NULL, 0);
- }
-
- pr_info("Calling %d with %lu %lu %lu %lu %lu %lu\n", scd->nr,
- args[0], args[1], args[2], args[3], args[4], args[5]);
-
- err = syscall_seized(ctl, scd->nr, &ret,
- args[0], args[1], args[2], args[3], args[4], args[5]);
- if (err)
- return err;
-
- pr_msg("Syscall returned %lx(%d)\n", ret, (int)ret);
- for (i = 0; i < MAX_ARGS; i++) {
- unsigned long addr;
-
- /* only "@<size>" arguments have something to print back */
- if (!ret_args[i])
- continue;
-
- pr_msg("Argument %d returns:\n", i);
- addr = (unsigned long)ctl->local_map + (args[i] - (unsigned long)ctl->remote_map);
- print_data(0, (unsigned char *)addr, ret_args[i]);
- }
-
- return 0;
-}
-
-/*
- * Entry point of the "exec" action: seize task @pid, execute in its
- * context the syscall named by opt[0] with the arguments that follow,
- * then release the task in the state it was found in.
- *
- * Returns 0 on success and non-zero on any failure.
- */
-int cr_exec(int pid, char **opt)
-{
-	char *sys_name = opt[0];
-	struct syscall_exec_desc *si;
-	struct parasite_ctl *ctl;
-	struct vm_area_list vmas;
-	int ret = -1, prev_state;
-	struct proc_status_creds *creds;
-
-	if (!sys_name) {
-		pr_err("Syscall name required\n");
-		goto out;
-	}
-
-	si = find_syscall(sys_name);
-	if (!si) {
-		pr_err("Unknown syscall [%s]\n", sys_name);
-		goto out;
-	}
-
-	if (seize_catch_task(pid))
-		goto out;
-
-	prev_state = ret = seize_wait_task(pid, -1, &creds);
-	if (ret < 0) {
-		pr_err("Can't seize task %d\n", pid);
-		goto out;
-	}
-
-	/*
-	 * We don't seize a task's threads here, and there is no reason to
-	 * compare threads' creds in this use case anyway, so let's just free
-	 * the creds.
-	 */
-	free(creds);
-
-	ret = collect_mappings(pid, &vmas);
-	if (ret) {
-		pr_err("Can't collect vmas for %d\n", pid);
-		goto out_unseize;
-	}
-
-	/*
-	 * ret is 0 at this point; re-arm it so that a failure to
-	 * prepare the ctl is not reported to the caller as success.
-	 */
-	ret = -1;
-	ctl = parasite_prep_ctl(pid, &vmas);
-	if (!ctl) {
-		pr_err("Can't prep ctl %d\n", pid);
-		goto out_unseize;
-	}
-
-	ret = execute_syscall(ctl, si, opt + 1);
-	if (ret < 0)
-		pr_err("Can't execute syscall remotely\n");
-
-	parasite_cure_seized(ctl);
-out_unseize:
-	unseize_task(pid, prev_state, prev_state);
-out:
-	return ret;
-}
diff --git a/cr-restore.c b/cr-restore.c
deleted file mode 100644
index 0985b8675f64..000000000000
--- a/cr-restore.c
+++ /dev/null
@@ -1,3364 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <signal.h>
-#include <limits.h>
-#include <unistd.h>
-#include <errno.h>
-#include <dirent.h>
-#include <string.h>
-
-#include <fcntl.h>
-#include <grp.h>
-
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/mman.h>
-#include <sys/vfs.h>
-#include <sys/wait.h>
-#include <sys/file.h>
-#include <sys/shm.h>
-#include <sys/mount.h>
-#include <sys/prctl.h>
-
-#include <sched.h>
-
-#include <sys/sendfile.h>
-
-#include "ptrace.h"
-#include "compiler.h"
-#include "asm/types.h"
-#include "asm/restorer.h"
-
-#include "cr_options.h"
-#include "servicefd.h"
-#include "image.h"
-#include "util.h"
-#include "util-pie.h"
-#include "log.h"
-#include "restorer.h"
-#include "sockets.h"
-#include "sk-packet.h"
-#include "lock.h"
-#include "files.h"
-#include "files-reg.h"
-#include "pipes.h"
-#include "fifo.h"
-#include "sk-inet.h"
-#include "eventfd.h"
-#include "eventpoll.h"
-#include "signalfd.h"
-#include "proc_parse.h"
-#include "restorer-blob.h"
-#include "crtools.h"
-#include "namespaces.h"
-#include "mem.h"
-#include "mount.h"
-#include "fsnotify.h"
-#include "pstree.h"
-#include "net.h"
-#include "tty.h"
-#include "cpu.h"
-#include "file-lock.h"
-#include "page-read.h"
-#include "vdso.h"
-#include "stats.h"
-#include "tun.h"
-#include "vma.h"
-#include "kerndat.h"
-#include "rst-malloc.h"
-#include "plugin.h"
-#include "cgroup.h"
-#include "timerfd.h"
-#include "file-lock.h"
-#include "action-scripts.h"
-#include "aio.h"
-#include "lsm.h"
-#include "seccomp.h"
-#include "bitmap.h"
-#include "fault-injection.h"
-#include "parasite-syscall.h"
-
-#include "protobuf.h"
-#include "protobuf/sa.pb-c.h"
-#include "protobuf/timer.pb-c.h"
-#include "protobuf/vma.pb-c.h"
-#include "protobuf/rlimit.pb-c.h"
-#include "protobuf/pagemap.pb-c.h"
-#include "protobuf/siginfo.pb-c.h"
-
-#include "asm/restore.h"
-#include "asm/atomic.h"
-#include "asm/bitops.h"
-
-#include "cr-errno.h"
-
-#include "pie/pie-relocs.h"
-
-#ifndef arch_export_restore_thread
-#define arch_export_restore_thread __export_restore_thread
-#endif
-
-#ifndef arch_export_restore_task
-#define arch_export_restore_task __export_restore_task
-#endif
-
-#ifndef arch_export_unmap
-#define arch_export_unmap __export_unmap
-#endif
-
-static struct pstree_item *current;
-
-static int restore_task_with_children(void *);
-static int sigreturn_restore(pid_t pid, CoreEntry *core);
-static int prepare_restorer_blob(void);
-static int prepare_rlimits(int pid, CoreEntry *core);
-static int prepare_posix_timers(int pid, CoreEntry *core);
-static int prepare_signals(int pid, CoreEntry *core);
-
-static int root_as_sibling;
-static unsigned long helpers_pos = 0;
-static int n_helpers = 0;
-static unsigned long zombies_pos = 0;
-static int n_zombies = 0;
-
-static int crtools_prepare_shared(void)
-{
- if (prepare_shared_fdinfo())
- return -1;
-
- /* We might want to remove ghost files on failed restore */
- if (collect_remaps_and_regfiles())
- return -1;
-
- /* dead pid remap needs to allocate task helpers which all tasks need
- * to see */
- if (prepare_procfs_remaps())
- return -1;
-
- /* Connections are unlocked from criu */
- if (collect_inet_sockets())
- return -1;
-
- if (tty_prep_fds())
- return -1;
-
- if (prepare_cgroup())
- return -1;
-
- return 0;
-}
-
-/*
- * Collect order information:
- * - reg_file should be before remap, as the latter needs
- * to find file_desc objects
- * - per-pid collects (mm and fd) should be after remap and
- * reg_file since both per-pid ones need to get fdesc-s
- * and bump counters on remaps if they exist
- */
-
-static struct collect_image_info *cinfos[] = {
- &nsfile_cinfo,
- &pipe_cinfo,
- &fifo_cinfo,
- &unix_sk_cinfo,
- &packet_sk_cinfo,
- &netlink_sk_cinfo,
- &eventfd_cinfo,
- &epoll_tfd_cinfo,
- &epoll_cinfo,
- &signalfd_cinfo,
- &inotify_cinfo,
- &inotify_mark_cinfo,
- &fanotify_cinfo,
- &fanotify_mark_cinfo,
- &tty_info_cinfo,
- &tty_cinfo,
- &tunfile_cinfo,
- &ext_file_cinfo,
- &timerfd_cinfo,
- &file_locks_cinfo,
-};
-
-static int root_prepare_shared(void)
-{
- int ret = 0, i;
- struct pstree_item *pi;
-
- pr_info("Preparing info about shared resources\n");
-
- if (prepare_shared_tty())
- return -1;
-
- if (prepare_shared_reg_files())
- return -1;
-
- if (prepare_remaps())
- return -1;
-
- if (prepare_seccomp_filters())
- return -1;
-
- for (i = 0; i < ARRAY_SIZE(cinfos); i++) {
- ret = collect_image(cinfos[i]);
- if (ret)
- return -1;
- }
-
- if (collect_pipes())
- return -1;
- if (collect_fifo())
- return -1;
- if (collect_unix_sockets())
- return -1;
-
- if (tty_verify_active_pairs())
- return -1;
-
- for_each_pstree_item(pi) {
- if (pi->state == TASK_HELPER)
- continue;
-
- ret = prepare_mm_pid(pi);
- if (ret < 0)
- break;
-
- ret = prepare_fd_pid(pi);
- if (ret < 0)
- break;
-
- ret = prepare_fs_pid(pi);
- if (ret < 0)
- break;
- }
-
- if (ret < 0)
- goto err;
-
- mark_pipe_master();
-
- ret = tty_setup_slavery();
- if (ret)
- goto err;
-
- ret = resolve_unix_peers();
- if (ret)
- goto err;
-
- ret = prepare_restorer_blob();
- if (ret)
- goto err;
-
- show_saved_shmems();
- show_saved_files();
-err:
- return ret;
-}
-
-/* Map a private vma, if it is not mapped by a parent yet */
-static int map_private_vma(struct vma_area *vma, void **tgt_addr,
- struct vma_area **pvma, struct list_head *pvma_list)
-{
- int ret;
- void *addr, *paddr = NULL;
- unsigned long nr_pages, size;
- struct vma_area *p = *pvma;
-
- if (vma_area_is(vma, VMA_FILE_PRIVATE)) {
- ret = get_filemap_fd(vma);
- if (ret < 0) {
- pr_err("Can't fixup VMA's fd\n");
- return -1;
- }
- vma->e->fd = ret;
- }
-
- nr_pages = vma_entry_len(vma->e) / PAGE_SIZE;
- vma->page_bitmap = xzalloc(BITS_TO_LONGS(nr_pages) * sizeof(long));
- if (vma->page_bitmap == NULL)
- return -1;
-
- list_for_each_entry_from(p, pvma_list, list) {
- if (p->e->start > vma->e->start)
- break;
-
- if (!vma_area_is_private(p, kdat.task_size))
- continue;
-
- if (p->e->end != vma->e->end ||
- p->e->start != vma->e->start)
- continue;
-
- /* Check flags, which must be identical for both vma-s */
- if ((vma->e->flags ^ p->e->flags) & (MAP_GROWSDOWN | MAP_ANONYMOUS))
- break;
-
- if (!(vma->e->flags & MAP_ANONYMOUS) &&
- vma->e->shmid != p->e->shmid)
- break;
-
- pr_info("COW 0x%016"PRIx64"-0x%016"PRIx64" 0x%016"PRIx64" vma\n",
- vma->e->start, vma->e->end, vma->e->pgoff);
- paddr = decode_pointer(p->premmaped_addr);
-
- break;
- }
-
- /*
- * A grow-down VMA has a guard page, which protect a VMA below it.
- * So one more page is mapped here to restore content of the first page
- */
- if (vma->e->flags & MAP_GROWSDOWN) {
- vma->e->start -= PAGE_SIZE;
- if (paddr)
- paddr -= PAGE_SIZE;
- }
-
- size = vma_entry_len(vma->e);
- if (paddr == NULL) {
- /*
- * The respective memory area was NOT found in the parent.
- * Map a new one.
- */
- pr_info("Map 0x%016"PRIx64"-0x%016"PRIx64" 0x%016"PRIx64" vma\n",
- vma->e->start, vma->e->end, vma->e->pgoff);
-
- addr = mmap(*tgt_addr, size,
- vma->e->prot | PROT_WRITE,
- vma->e->flags | MAP_FIXED,
- vma->e->fd, vma->e->pgoff);
-
- if (addr == MAP_FAILED) {
- pr_perror("Unable to map ANON_VMA");
- return -1;
- }
-
- *pvma = p;
- } else {
- /*
- * This region was found in parent -- remap it to inherit physical
- * pages (if any) from it (and COW them later if required).
- */
- vma->ppage_bitmap = p->page_bitmap;
-
- addr = mremap(paddr, size, size,
- MREMAP_FIXED | MREMAP_MAYMOVE, *tgt_addr);
- if (addr != *tgt_addr) {
- pr_perror("Unable to remap a private vma");
- return -1;
- }
-
- *pvma = list_entry(p->list.next, struct vma_area, list);
- }
-
- vma->premmaped_addr = (unsigned long) addr;
- pr_debug("\tpremap 0x%016"PRIx64"-0x%016"PRIx64" -> %016lx\n",
- vma->e->start, vma->e->end, (unsigned long)addr);
-
- if (vma->e->flags & MAP_GROWSDOWN) { /* Skip gurad page */
- vma->e->start += PAGE_SIZE;
- vma->premmaped_addr += PAGE_SIZE;
- }
-
- if (vma_area_is(vma, VMA_FILE_PRIVATE))
- close(vma->e->fd);
-
- *tgt_addr += size;
- return 0;
-}
-
-static int premap_priv_vmas(struct vm_area_list *vmas, void *at)
-{
- struct list_head *parent_vmas;
- struct vma_area *pvma, *vma;
- unsigned long pstart = 0;
- int ret = 0;
- LIST_HEAD(empty);
-
- /*
- * Keep parent vmas at hands to check whether we can "inherit" them.
- * See comments in map_private_vma.
- */
- if (current->parent)
- parent_vmas = &rsti(current->parent)->vmas.h;
- else
- parent_vmas = &empty;
-
- pvma = list_first_entry(parent_vmas, struct vma_area, list);
-
- list_for_each_entry(vma, &vmas->h, list) {
- if (pstart > vma->e->start) {
- ret = -1;
- pr_err("VMA-s are not sorted in the image file\n");
- break;
- }
- pstart = vma->e->start;
-
- if (!vma_area_is_private(vma, kdat.task_size))
- continue;
-
- ret = map_private_vma(vma, &at, &pvma, parent_vmas);
- if (ret < 0)
- break;
- }
-
- return ret;
-}
-
-static int restore_priv_vma_content(void)
-{
- struct vma_area *vma;
- int ret = 0;
- struct list_head *vmas = &rsti(current)->vmas.h;
-
- unsigned int nr_restored = 0;
- unsigned int nr_shared = 0;
- unsigned int nr_droped = 0;
- unsigned int nr_compared = 0;
- unsigned long va;
- struct page_read pr;
-
- vma = list_first_entry(vmas, struct vma_area, list);
-
- ret = open_page_read(current->pid.virt, &pr, PR_TASK);
- if (ret <= 0)
- return -1;
-
- /*
- * Read page contents.
- */
- while (1) {
- unsigned long off, i, nr_pages;
- struct iovec iov;
-
- ret = pr.get_pagemap(&pr, &iov);
- if (ret <= 0)
- break;
-
- va = (unsigned long)iov.iov_base;
- nr_pages = iov.iov_len / PAGE_SIZE;
-
- for (i = 0; i < nr_pages; i++) {
- unsigned char buf[PAGE_SIZE];
- void *p;
-
- /*
- * The lookup is over *all* possible VMAs
- * read from image file.
- */
- while (va >= vma->e->end) {
- if (vma->list.next == vmas)
- goto err_addr;
- vma = list_entry(vma->list.next, struct vma_area, list);
- }
-
- /*
- * Make sure the page address is inside existing VMA
- * and the VMA it refers to still private one, since
- * there is no guarantee that the data from pagemap is
- * valid.
- */
- if (va < vma->e->start)
- goto err_addr;
- else if (unlikely(!vma_area_is_private(vma, kdat.task_size))) {
- pr_err("Trying to restore page for non-private VMA\n");
- goto err_addr;
- }
-
- off = (va - vma->e->start) / PAGE_SIZE;
- p = decode_pointer((off) * PAGE_SIZE +
- vma->premmaped_addr);
-
- set_bit(off, vma->page_bitmap);
- if (vma->ppage_bitmap) { /* inherited vma */
- clear_bit(off, vma->ppage_bitmap);
-
- ret = pr.read_pages(&pr, va, 1, buf);
- if (ret < 0)
- goto err_read;
-
- va += PAGE_SIZE;
- nr_compared++;
-
- if (memcmp(p, buf, PAGE_SIZE) == 0) {
- nr_shared++; /* the page is cowed */
- continue;
- }
-
- nr_restored++;
- memcpy(p, buf, PAGE_SIZE);
- } else {
- int nr;
-
- /*
- * Try to read as many pages as possible at once.
- *
- * Within the current pagemap we still have
- * nr_pages - i pages (not all, as we might have
- * switched VMA above), within the current VMA
- * we have at most (vma->end - current_addr) bytes.
- */
-
- nr = min_t(int, nr_pages - i, (vma->e->end - va) / PAGE_SIZE);
-
- ret = pr.read_pages(&pr, va, nr, p);
- if (ret < 0)
- goto err_read;
-
- va += nr * PAGE_SIZE;
- nr_restored += nr;
- i += nr - 1;
-
- bitmap_set(vma->page_bitmap, off + 1, nr - 1);
- }
-
- }
-
- if (pr.put_pagemap)
- pr.put_pagemap(&pr);
- }
-
-err_read:
- pr.close(&pr);
- if (ret < 0)
- return ret;
-
- /* Remove pages, which were not shared with a child */
- list_for_each_entry(vma, vmas, list) {
- unsigned long size, i = 0;
- void *addr = decode_pointer(vma->premmaped_addr);
-
- if (vma->ppage_bitmap == NULL)
- continue;
-
- size = vma_entry_len(vma->e) / PAGE_SIZE;
- while (1) {
- /* Find all pages, which are not shared with this child */
- i = find_next_bit(vma->ppage_bitmap, size, i);
-
- if ( i >= size)
- break;
-
- ret = madvise(addr + PAGE_SIZE * i,
- PAGE_SIZE, MADV_DONTNEED);
- if (ret < 0) {
- pr_perror("madvise failed");
- return -1;
- }
- i++;
- nr_droped++;
- }
- }
-
- cnt_add(CNT_PAGES_COMPARED, nr_compared);
- cnt_add(CNT_PAGES_SKIPPED_COW, nr_shared);
- cnt_add(CNT_PAGES_RESTORED, nr_restored);
-
- pr_info("nr_restored_pages: %d\n", nr_restored);
- pr_info("nr_shared_pages: %d\n", nr_shared);
- pr_info("nr_droped_pages: %d\n", nr_droped);
-
- return 0;
-
-err_addr:
- pr_err("Page entry address %lx outside of VMA %lx-%lx\n",
- va, (long)vma->e->start, (long)vma->e->end);
- return -1;
-}
-
-static int prepare_mappings(void)
-{
- int ret = 0;
- void *addr;
- struct vm_area_list *vmas;
-
- void *old_premmapped_addr = NULL;
- unsigned long old_premmapped_len;
-
- vmas = &rsti(current)->vmas;
- if (vmas->nr == 0) /* Zombie */
- goto out;
-
- /* Reserve a place for mapping private vma-s one by one */
- addr = mmap(NULL, vmas->priv_size, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
- if (addr == MAP_FAILED) {
- ret = -1;
- pr_perror("Unable to reserve memory (%lu bytes)", vmas->priv_size);
- goto out;
- }
-
- old_premmapped_addr = rsti(current)->premmapped_addr;
- old_premmapped_len = rsti(current)->premmapped_len;
- rsti(current)->premmapped_addr = addr;
- rsti(current)->premmapped_len = vmas->priv_size;
-
- ret = premap_priv_vmas(vmas, addr);
- if (ret < 0)
- goto out;
-
- ret = restore_priv_vma_content();
- if (ret < 0)
- goto out;
-
- if (old_premmapped_addr) {
- ret = munmap(old_premmapped_addr, old_premmapped_len);
- if (ret < 0)
- pr_perror("Unable to unmap %p(%lx)",
- old_premmapped_addr, old_premmapped_len);
- }
-
-out:
- return ret;
-}
-
-/*
- * A gard page must be unmapped after restoring content and
- * forking children to restore COW memory.
- */
-static int unmap_guard_pages()
-{
- struct vma_area *vma;
- struct list_head *vmas = &rsti(current)->vmas.h;
-
- list_for_each_entry(vma, vmas, list) {
- if (!vma_area_is_private(vma, kdat.task_size))
- continue;
-
- if (vma->e->flags & MAP_GROWSDOWN) {
- void *addr = decode_pointer(vma->premmaped_addr);
-
- if (munmap(addr - PAGE_SIZE, PAGE_SIZE)) {
- pr_perror("Can't unmap guard page");
- return -1;
- }
- }
- }
-
- return 0;
-}
-
-static int open_vmas(int pid)
-{
- struct vma_area *vma;
- int ret = 0;
- struct list_head *vmas = &rsti(current)->vmas.h;
-
- list_for_each_entry(vma, vmas, list) {
- if (!(vma_area_is(vma, VMA_AREA_REGULAR)))
- continue;
-
- pr_info("Opening 0x%016"PRIx64"-0x%016"PRIx64" 0x%016"PRIx64" (%x) vma\n",
- vma->e->start, vma->e->end,
- vma->e->pgoff, vma->e->status);
-
- if (vma_area_is(vma, VMA_AREA_SYSVIPC))
- ret = vma->e->shmid;
- else if (vma_area_is(vma, VMA_ANON_SHARED))
- ret = get_shmem_fd(pid, vma->e);
- else if (vma_area_is(vma, VMA_FILE_SHARED))
- ret = get_filemap_fd(vma);
- else if (vma_area_is(vma, VMA_AREA_SOCKET))
- ret = get_socket_fd(pid, vma->e);
- else
- continue;
-
- if (ret < 0) {
- pr_err("Can't fixup fd\n");
- break;
- }
-
- pr_info("\t`- setting %d as mapping fd\n", ret);
- vma->e->fd = ret;
- }
-
- return ret < 0 ? -1 : 0;
-}
-
-static rt_sigaction_t sigchld_act;
-static rt_sigaction_t parent_act[SIGMAX];
-
-static bool sa_inherited(int sig, rt_sigaction_t *sa)
-{
- rt_sigaction_t *pa;
-
- if (current == root_item)
- return false; /* XXX -- inherit from CRIU? */
-
- pa = &parent_act[sig];
- return pa->rt_sa_handler == sa->rt_sa_handler &&
- pa->rt_sa_flags == sa->rt_sa_flags &&
- pa->rt_sa_restorer == sa->rt_sa_restorer &&
- pa->rt_sa_mask.sig[0] == sa->rt_sa_mask.sig[0];
-}
-
-static int prepare_sigactions(void)
-{
- int pid = current->pid.virt;
- rt_sigaction_t act;
- struct cr_img *img;
- SaEntry *e;
- int sig, rst = 0;
- int ret = 0;
-
- if (!task_alive(current))
- return 0;
-
- pr_info("Restore sigacts for %d\n", pid);
-
- img = open_image(CR_FD_SIGACT, O_RSTR, pid);
- if (!img)
- return -1;
-
- for (sig = 1; sig <= SIGMAX; sig++) {
- if (sig == SIGKILL || sig == SIGSTOP)
- continue;
-
- ret = pb_read_one_eof(img, &e, PB_SIGACT);
- if (ret == 0) {
- if (sig != SIGMAX_OLD + 1) { /* backward compatibility */
- pr_err("Unexpected EOF %d\n", sig);
- ret = -1;
- break;
- }
- pr_warn("This format of sigacts-%d.img is deprecated\n", pid);
- break;
- }
- if (ret < 0)
- break;
-
- ASSIGN_TYPED(act.rt_sa_handler, decode_pointer(e->sigaction));
- ASSIGN_TYPED(act.rt_sa_flags, e->flags);
- ASSIGN_TYPED(act.rt_sa_restorer, decode_pointer(e->restorer));
- ASSIGN_TYPED(act.rt_sa_mask.sig[0], e->mask);
-
- sa_entry__free_unpacked(e, NULL);
-
- if (sig == SIGCHLD) {
- sigchld_act = act;
- continue;
- }
-
- if (sa_inherited(sig - 1, &act))
- continue;
-
- /*
- * A pure syscall is used, because glibc
- * sigaction overwrites se_restorer.
- */
- ret = syscall(SYS_rt_sigaction, sig, &act, NULL, sizeof(k_rtsigset_t));
- if (ret < 0) {
- errno = -ret;
- pr_perror("Can't restore sigaction");
- goto err;
- }
-
- parent_act[sig - 1] = act;
- rst++;
- }
-
- pr_info("Restored %d/%d sigacts\n", rst,
- SIGMAX - 3 /* KILL, STOP and CHLD */);
-
-err:
- close_image(img);
- return ret;
-}
-
-static int collect_child_pids(int state, int *n)
-{
- struct pstree_item *pi;
-
- *n = 0;
- list_for_each_entry(pi, &current->children, sibling) {
- pid_t *child;
-
- if (pi->state != state)
- continue;
-
- child = rst_mem_alloc(sizeof(*child), RM_PRIVATE);
- if (!child)
- return -1;
-
- (*n)++;
- *child = pi->pid.virt;
- }
-
- return 0;
-}
-
-static int collect_helper_pids()
-{
- helpers_pos = rst_mem_align_cpos(RM_PRIVATE);
- return collect_child_pids(TASK_HELPER, &n_helpers);
-}
-
-static int collect_zombie_pids()
-{
- zombies_pos = rst_mem_align_cpos(RM_PRIVATE);
- return collect_child_pids(TASK_DEAD, &n_zombies);
-}
-
-static int open_cores(int pid, CoreEntry *leader_core)
-{
- int i, tpid;
- CoreEntry **cores = NULL;
-
- cores = xmalloc(sizeof(*cores)*current->nr_threads);
- if (!cores)
- goto err;
-
- for (i = 0; i < current->nr_threads; i++) {
- tpid = current->threads[i].virt;
-
- if (tpid == pid)
- cores[i] = leader_core;
- else {
- struct cr_img *img;
-
- img = open_image(CR_FD_CORE, O_RSTR, tpid);
- if (!img) {
- pr_err("Can't open core data for thread %d\n", tpid);
- goto err;
- }
-
- if (pb_read_one(img, &cores[i], PB_CORE) <= 0) {
- close_image(img);
- goto err;
- }
-
- close_image(img);
- }
- }
-
- current->core = cores;
-
- return 0;
-err:
- xfree(cores);
- return -1;
-}
-
-static int prepare_oom_score_adj(int value)
-{
- int fd, ret = 0;
- char buf[11];
-
- fd = open_proc_rw(PROC_SELF, "oom_score_adj");
- if (fd < 0)
- return -1;
-
- snprintf(buf, 11, "%d", value);
-
- if (write(fd, buf, 11) < 0) {
- pr_perror("Write %s to /proc/self/oom_score_adj failed", buf);
- ret = -1;
- }
-
- close(fd);
- return ret;
-}
-
-static int prepare_proc_misc(pid_t pid, TaskCoreEntry *tc)
-{
- int ret;
-
- /* loginuid value is critical to restore */
- if (kdat.has_loginuid && tc->has_loginuid &&
- tc->loginuid != INVALID_UID) {
- ret = prepare_loginuid(tc->loginuid, LOG_ERROR);
- if (ret < 0)
- return ret;
- }
-
- /* oom_score_adj is not critical: only log errors */
- if (tc->has_oom_score_adj && tc->oom_score_adj != 0)
- prepare_oom_score_adj(tc->oom_score_adj);
-
- return 0;
-}
-
-static int restore_one_alive_task(int pid, CoreEntry *core)
-{
- pr_info("Restoring resources\n");
-
- rst_mem_switch_to_private();
-
- if (prepare_fds(current))
- return -1;
-
- if (prepare_file_locks(pid))
- return -1;
-
- if (open_vmas(pid))
- return -1;
-
- if (open_cores(pid, core))
- return -1;
-
- if (prepare_signals(pid, core))
- return -1;
-
- if (prepare_posix_timers(pid, core))
- return -1;
-
- if (prepare_rlimits(pid, core) < 0)
- return -1;
-
- if (collect_helper_pids() < 0)
- return -1;
-
- if (collect_zombie_pids() < 0)
- return -1;
-
- if (inherit_fd_fini() < 0)
- return -1;
-
- if (prepare_proc_misc(pid, core->tc))
- return -1;
-
- return sigreturn_restore(pid, core);
-}
-
-static void zombie_prepare_signals(void)
-{
- sigset_t blockmask;
- int sig;
- struct sigaction act;
-
- sigfillset(&blockmask);
- sigprocmask(SIG_UNBLOCK, &blockmask, NULL);
-
- memset(&act, 0, sizeof(act));
- act.sa_handler = SIG_DFL;
-
- for (sig = 1; sig <= SIGMAX; sig++)
- sigaction(sig, &act, NULL);
-}
-
-#define SIG_FATAL_MASK ( \
- (1 << SIGHUP) |\
- (1 << SIGINT) |\
- (1 << SIGQUIT) |\
- (1 << SIGILL) |\
- (1 << SIGTRAP) |\
- (1 << SIGABRT) |\
- (1 << SIGIOT) |\
- (1 << SIGBUS) |\
- (1 << SIGFPE) |\
- (1 << SIGKILL) |\
- (1 << SIGUSR1) |\
- (1 << SIGSEGV) |\
- (1 << SIGUSR2) |\
- (1 << SIGPIPE) |\
- (1 << SIGALRM) |\
- (1 << SIGTERM) |\
- (1 << SIGXCPU) |\
- (1 << SIGXFSZ) |\
- (1 << SIGVTALRM)|\
- (1 << SIGPROF) |\
- (1 << SIGPOLL) |\
- (1 << SIGIO) |\
- (1 << SIGSYS) |\
- (1 << SIGUNUSED)|\
- (1 << SIGSTKFLT)|\
- (1 << SIGPWR) \
- )
-
-static inline int sig_fatal(int sig)
-{
- return (sig > 0) && (sig < SIGMAX) && (SIG_FATAL_MASK & (1UL << sig));
-}
-
-struct task_entries *task_entries;
-static unsigned long task_entries_pos;
-
-static int restore_one_zombie(CoreEntry *core)
-{
- int exit_code = core->tc->exit_code;
-
- pr_info("Restoring zombie with %d code\n", exit_code);
-
- if (inherit_fd_fini() < 0)
- return -1;
-
- prctl(PR_SET_NAME, (long)(void *)core->tc->comm, 0, 0, 0);
-
- if (task_entries != NULL) {
- restore_finish_stage(CR_STATE_RESTORE);
- zombie_prepare_signals();
- }
-
- if (exit_code & 0x7f) {
- int signr;
-
- /* prevent generating core files */
- if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0))
- pr_perror("Can't drop the dumpable flag");
-
- signr = exit_code & 0x7F;
- if (!sig_fatal(signr)) {
- pr_warn("Exit with non fatal signal ignored\n");
- signr = SIGABRT;
- }
-
- if (kill(current->pid.virt, signr) < 0)
- pr_perror("Can't kill myself, will just exit");
-
- exit_code = 0;
- }
-
- exit((exit_code >> 8) & 0x7f);
-
- /* never reached */
- BUG_ON(1);
- return -1;
-}
-
-static int check_core(CoreEntry *core, struct pstree_item *me)
-{
- int ret = -1;
-
- if (core->mtype != CORE_ENTRY__MARCH) {
- pr_err("Core march mismatch %d\n", (int)core->mtype);
- goto out;
- }
-
- if (!core->tc) {
- pr_err("Core task state data missed\n");
- goto out;
- }
-
- if (core->tc->task_state != TASK_DEAD) {
- if (!core->ids && !me->ids) {
- pr_err("Core IDS data missed for non-zombie\n");
- goto out;
- }
-
- if (!CORE_THREAD_ARCH_INFO(core)) {
- pr_err("Core info data missed for non-zombie\n");
- goto out;
- }
- }
-
- ret = 0;
-out:
- return ret;
-}
-
-static int restore_one_task(int pid, CoreEntry *core)
-{
- int ret;
-
- /* No more fork()-s => no more per-pid logs */
-
- if (task_alive(current))
- ret = restore_one_alive_task(pid, core);
- else if (current->state == TASK_DEAD)
- ret = restore_one_zombie(core);
- else if (current->state == TASK_HELPER) {
- restore_finish_stage(CR_STATE_RESTORE);
- ret = 0;
- } else {
- pr_err("Unknown state in code %d\n", (int)core->tc->task_state);
- ret = -1;
- }
-
- if (core)
- core_entry__free_unpacked(core, NULL);
- return ret;
-}
-
-/* All arguments should be above stack, because it grows down */
-struct cr_clone_arg {
- /*
- * Reserve some space for clone() to locate arguments
- * and retcode in this place
- */
- char stack[128] __stack_aligned__;
- char stack_ptr[0];
- struct pstree_item *item;
- unsigned long clone_flags;
- int fd;
-
- CoreEntry *core;
-};
-
-static void maybe_clone_parent(struct pstree_item *item,
- struct cr_clone_arg *ca)
-{
- /*
- * zdtm runs in kernel 3.11, which has the problem described below. We
- * avoid this by including the pdeath_sig test. Once users/zdtm migrate
- * off of 3.11, this condition can be simplified to just test the
- * options and not have the pdeath_sig test.
- */
- if (opts.restore_sibling) {
- /*
- * This means we're called from lib's criu_restore_child().
- * In that case create the root task as the child one to+
- * the caller. This is the only way to correctly restore the
- * pdeath_sig of the root task. But also looks nice.
- *
- * Alternatively, if we are --restore-detached, a similar trick is
- * needed to correctly restore pdeath_sig and prevent processes from
- * dying once restored.
- *
- * There were a problem in kernel 3.11 -- CLONE_PARENT can't be
- * set together with CLONE_NEWPID, which has been solved in further
- * versions of the kernels, but we treat 3.11 as a base, so at
- * least warn a user about potential problems.
- */
- rsti(item)->clone_flags |= CLONE_PARENT;
- root_as_sibling = 1;
- if (rsti(item)->clone_flags & CLONE_NEWPID)
- pr_warn("Set CLONE_PARENT | CLONE_NEWPID but it might cause restore problem,"
- "because not all kernels support such clone flags combinations!\n");
- } else if (opts.restore_detach) {
- if (ca->core->thread_core->pdeath_sig)
- pr_warn("Root task has pdeath_sig configured, so it will receive one _right_"
- "after restore on CRIU exit\n");
- }
-}
-
-static inline int fork_with_pid(struct pstree_item *item)
-{
- struct cr_clone_arg ca;
- int ret = -1;
- pid_t pid = item->pid.virt;
-
- if (item->state != TASK_HELPER) {
- struct cr_img *img;
-
- img = open_image(CR_FD_CORE, O_RSTR, pid);
- if (!img)
- return -1;
-
- ret = pb_read_one(img, &ca.core, PB_CORE);
- close_image(img);
-
- if (ret < 0)
- return -1;
-
- if (check_core(ca.core, item))
- return -1;
-
- item->state = ca.core->tc->task_state;
- rsti(item)->cg_set = ca.core->tc->cg_set;
-
- rsti(item)->has_seccomp = ca.core->tc->seccomp_mode != SECCOMP_MODE_DISABLED;
-
- if (item->state == TASK_DEAD)
- rsti(item->parent)->nr_zombies++;
- else if (!task_alive(item)) {
- pr_err("Unknown task state %d\n", item->state);
- return -1;
- }
-
- if (unlikely(item == root_item))
- maybe_clone_parent(item, &ca);
- } else {
- /*
- * Helper entry will not get moved around and thus
- * will live in the parent's cgset.
- */
- rsti(item)->cg_set = rsti(item->parent)->cg_set;
- ca.core = NULL;
- }
-
- ret = -1;
-
- ca.item = item;
- ca.clone_flags = rsti(item)->clone_flags;
-
- BUG_ON(ca.clone_flags & CLONE_VM);
-
- pr_info("Forking task with %d pid (flags 0x%lx)\n", pid, ca.clone_flags);
-
- if (!(ca.clone_flags & CLONE_NEWPID)) {
- char buf[32];
- int len;
-
- ca.fd = open_proc_rw(PROC_GEN, LAST_PID_PATH);
- if (ca.fd < 0) {
- pr_perror("%d: Can't open %s", pid, LAST_PID_PATH);
- goto err;
- }
-
- if (flock(ca.fd, LOCK_EX)) {
- close(ca.fd);
- pr_perror("%d: Can't lock %s", pid, LAST_PID_PATH);
- goto err;
- }
-
- len = snprintf(buf, sizeof(buf), "%d", pid - 1);
- if (write(ca.fd, buf, len) != len) {
- pr_perror("%d: Write %s to %s", pid, buf, LAST_PID_PATH);
- goto err_unlock;
- }
- } else {
- ca.fd = -1;
- BUG_ON(pid != INIT_PID);
- }
-
- /*
- * Some kernel modules, such as netwrok packet generator
- * run kernel thread upon net-namespace creattion taking
- * the @pid we've been requeting via LAST_PID_PATH interface
- * so that we can't restore a take with pid needed.
- *
- * Here is an idea -- unhare net namespace in callee instead.
- */
- ret = clone(restore_task_with_children, ca.stack_ptr,
- (ca.clone_flags & ~CLONE_NEWNET) | SIGCHLD, &ca);
-
- if (ret < 0) {
- pr_perror("Can't fork for %d", pid);
- goto err_unlock;
- }
-
-
- if (item == root_item) {
- item->pid.real = ret;
- pr_debug("PID: real %d virt %d\n",
- item->pid.real, item->pid.virt);
- }
-
- if (opts.pidfile && root_item == item) {
- int pid;
-
- pid = ret;
-
- ret = write_pidfile(pid);
- if (ret < 0) {
- pr_perror("Can't write pidfile");
- kill(pid, SIGKILL);
- }
- }
-
-err_unlock:
- if (ca.fd >= 0) {
- if (flock(ca.fd, LOCK_UN))
- pr_perror("%d: Can't unlock %s", pid, LAST_PID_PATH);
-
- close(ca.fd);
- }
-err:
- if (ca.core)
- core_entry__free_unpacked(ca.core, NULL);
- return ret;
-}
-
-static void sigchld_handler(int signal, siginfo_t *siginfo, void *data)
-{
- struct pstree_item *pi;
- pid_t pid = siginfo->si_pid;
- int status;
- int exit;
-
- exit = (siginfo->si_code == CLD_EXITED);
- status = siginfo->si_status;
-
- /* skip scripts */
- if (!current && root_item->pid.real != pid) {
- pid = waitpid(root_item->pid.real, &status, WNOHANG);
- if (pid <= 0)
- return;
- exit = WIFEXITED(status);
- status = exit ? WEXITSTATUS(status) : WTERMSIG(status);
- }
-
- if (!current && siginfo->si_code == CLD_TRAPPED &&
- siginfo->si_status == SIGCHLD) {
- /* The root task is ptraced. Allow it to handle SIGCHLD */
- ptrace(PTRACE_CONT, siginfo->si_pid, 0, SIGCHLD);
- return;
- }
-
- if (!current || status)
- goto err;
-
- while (pid) {
- pid = waitpid(-1, &status, WNOHANG);
- if (pid <= 0)
- return;
-
- exit = WIFEXITED(status);
- status = exit ? WEXITSTATUS(status) : WTERMSIG(status);
- if (status)
- break;
-
- /* Exited (with zero code) helpers are OK */
- list_for_each_entry(pi, &current->children, sibling)
- if (pi->pid.virt == siginfo->si_pid)
- break;
-
- BUG_ON(&pi->sibling == &current->children);
- if (pi->state != TASK_HELPER)
- break;
- }
-
-err:
- if (exit)
- pr_err("%d exited, status=%d\n", pid, status);
- else
- pr_err("%d killed by signal %d\n", pid, status);
-
- futex_abort_and_wake(&task_entries->nr_in_progress);
-}
-
-static int criu_signals_setup(void)
-{
- int ret;
- struct sigaction act;
- sigset_t blockmask;
-
- ret = sigaction(SIGCHLD, NULL, &act);
- if (ret < 0) {
- pr_perror("sigaction() failed");
- return -1;
- }
-
- act.sa_flags |= SA_NOCLDSTOP | SA_SIGINFO | SA_RESTART;
- act.sa_sigaction = sigchld_handler;
- sigemptyset(&act.sa_mask);
- sigaddset(&act.sa_mask, SIGCHLD);
-
- ret = sigaction(SIGCHLD, &act, NULL);
- if (ret < 0) {
- pr_perror("sigaction() failed");
- return -1;
- }
-
- /*
- * The block mask will be restored in sigreturn.
- *
- * TODO: This code should be removed, when a freezer will be added.
- */
- sigfillset(&blockmask);
- sigdelset(&blockmask, SIGCHLD);
-
- /*
- * Here we use SIG_SETMASK instead of SIG_BLOCK to avoid the case where
- * we've been forked from a parent who had blocked SIGCHLD. If SIGCHLD
- * is blocked when a task dies (e.g. if the task fails to restore
- * somehow), we hang because our SIGCHLD handler is never run. Since we
- * depend on SIGCHLD being unblocked, let's set the mask explicitly.
- */
- ret = sigprocmask(SIG_SETMASK, &blockmask, NULL);
- if (ret < 0) {
- pr_perror("Can't block signals");
- return -1;
- }
-
- return 0;
-}
-
-static void restore_sid(void)
-{
- pid_t sid;
-
- /*
- * SID can only be reset to pid or inherited from parent.
- * Thus we restore it right here to let our kids inherit
- * one in case they need it.
- *
- * PGIDs are restored late when all tasks are forked and
- * we can call setpgid() on custom values.
- */
-
- if (current->pid.virt == current->sid) {
- pr_info("Restoring %d to %d sid\n", current->pid.virt, current->sid);
- sid = setsid();
- if (sid != current->sid) {
- pr_perror("Can't restore sid (%d)", sid);
- exit(1);
- }
- } else {
- sid = getsid(getpid());
- if (sid != current->sid) {
- /* Skip the root task if it's not init */
- if (current == root_item && root_item->pid.virt != INIT_PID)
- return;
- pr_err("Requested sid %d doesn't match inherited %d\n",
- current->sid, sid);
- exit(1);
- }
- }
-}
-
-static void restore_pgid(void)
-{
- /*
- * Unlike sessions, process groups (a.k.a. pgids) can be joined
- * by any task, provided the task with pid == pgid (group leader)
- * exists. Thus, in order to restore pgid we must make sure that
- * group leader was born and created the group, then join one.
- *
- * We do this _before_ finishing the forking stage to make sure
- * helpers are still with us.
- */
-
- pid_t pgid, my_pgid = current->pgid;
-
- pr_info("Restoring %d to %d pgid\n", current->pid.virt, my_pgid);
-
- pgid = getpgrp();
- if (my_pgid == pgid)
- return;
-
- if (my_pgid != current->pid.virt) {
- struct pstree_item *leader;
-
- /*
- * Wait for leader to become such.
- * Missing leader means we're going to crtools
- * group (-j option).
- */
-
- leader = rsti(current)->pgrp_leader;
- if (leader) {
- BUG_ON(my_pgid != leader->pid.virt);
- futex_wait_until(&rsti(leader)->pgrp_set, 1);
- }
- }
-
- pr_info("\twill call setpgid, mine pgid is %d\n", pgid);
- if (setpgid(0, my_pgid) != 0) {
- pr_perror("Can't restore pgid (%d/%d->%d)", current->pid.virt, pgid, current->pgid);
- exit(1);
- }
-
- if (my_pgid == current->pid.virt)
- futex_set_and_wake(&rsti(current)->pgrp_set, 1);
-}
-
-static int mount_proc(void)
-{
- int fd, ret;
- char proc_mountpoint[] = "crtools-proc.XXXXXX";
-
- if (mkdtemp(proc_mountpoint) == NULL) {
- pr_perror("mkdtemp failed %s", proc_mountpoint);
- return -1;
- }
-
- pr_info("Mount procfs in %s\n", proc_mountpoint);
- if (mount("proc", proc_mountpoint, "proc", MS_MGC_VAL | MS_NOSUID | MS_NOEXEC | MS_NODEV, NULL)) {
- pr_perror("mount failed");
- rmdir(proc_mountpoint);
- return -1;
- }
-
- ret = fd = open_detach_mount(proc_mountpoint);
- if (fd >= 0) {
- ret = set_proc_fd(fd);
- close(fd);
- }
-
- return ret;
-}
-
-/*
- * Tasks cannot change sid (session id) arbitrary, but can either
- * inherit one from ancestor, or create a new one with id equal to
- * their pid. Thus sid-s restore is tied with children creation.
- */
-
-static int create_children_and_session(void)
-{
- int ret;
- struct pstree_item *child;
-
- pr_info("Restoring children in alien sessions:\n");
- list_for_each_entry(child, &current->children, sibling) {
- if (!restore_before_setsid(child))
- continue;
-
- BUG_ON(child->born_sid != -1 && getsid(getpid()) != child->born_sid);
-
- ret = fork_with_pid(child);
- if (ret < 0)
- return ret;
- }
-
- if (current->parent)
- restore_sid();
-
- pr_info("Restoring children in our session:\n");
- list_for_each_entry(child, &current->children, sibling) {
- if (restore_before_setsid(child))
- continue;
-
- ret = fork_with_pid(child);
- if (ret < 0)
- return ret;
- }
-
- return 0;
-}
-
-static int restore_task_with_children(void *_arg)
-{
- struct cr_clone_arg *ca = _arg;
- pid_t pid;
- int ret;
-
- current = ca->item;
-
- if (current != root_item) {
- char buf[12];
- int fd;
-
- /* Determine PID in CRIU's namespace */
- fd = get_service_fd(CR_PROC_FD_OFF);
- if (fd < 0)
- goto err;
-
- ret = readlinkat(fd, "self", buf, sizeof(buf) - 1);
- if (ret < 0) {
- pr_perror("Unable to read the /proc/self link");
- goto err;
- }
- buf[ret] = '\0';
-
- current->pid.real = atoi(buf);
- pr_debug("PID: real %d virt %d\n",
- current->pid.real, current->pid.virt);
- }
-
- if ( !(ca->clone_flags & CLONE_FILES))
- close_safe(&ca->fd);
-
- if (current->state != TASK_HELPER) {
- ret = clone_service_fd(rsti(current)->service_fd_id);
- if (ret)
- goto err;
- }
-
- pid = getpid();
- if (current->pid.virt != pid) {
- pr_err("Pid %d do not match expected %d\n", pid, current->pid.virt);
- set_task_cr_err(EEXIST);
- goto err;
- }
-
- ret = log_init_by_pid();
- if (ret < 0)
- goto err;
-
- if (ca->clone_flags & CLONE_NEWNET) {
- ret = unshare(CLONE_NEWNET);
- if (ret) {
- pr_perror("Can't unshare net-namespace");
- goto err;
- }
- }
-
- if (!(ca->clone_flags & CLONE_FILES)) {
- ret = close_old_fds();
- if (ret)
- goto err;
- }
-
- /* Restore root task */
- if (current->parent == NULL) {
- if (restore_finish_stage(CR_STATE_RESTORE_NS) < 0)
- goto err;
-
- pr_info("Calling restore_sid() for init\n");
- restore_sid();
-
- /*
- * We need non /proc proc mount for restoring pid and mount
- * namespaces and do not care for the rest of the cases.
- * Thus -- mount proc at custom location for any new namespace
- */
- if (mount_proc())
- goto err;
-
- if (prepare_namespace(current, ca->clone_flags))
- goto err;
-
- if (root_prepare_shared())
- goto err;
-
- if (restore_finish_stage(CR_STATE_RESTORE_SHARED) < 0)
- goto err;
- }
-
- if (restore_task_mnt_ns(current))
- goto err;
-
- if (prepare_mappings())
- goto err;
-
- /*
- * Call this _before_ forking to optimize cgroups
- * restore -- if all tasks live in one set of cgroups
- * we will only move the root one there, others will
- * just have it inherited.
- */
- if (prepare_task_cgroup(current) < 0)
- goto err;
-
- if (prepare_sigactions() < 0)
- goto err;
-
- if (fault_injected(FI_RESTORE_ROOT_ONLY)) {
- pr_info("fault: Restore root task failure!\n");
- BUG();
- }
-
- if (create_children_and_session())
- goto err;
-
-
- if (unmap_guard_pages())
- goto err;
-
- restore_pgid();
-
- if (restore_finish_stage(CR_STATE_FORKING) < 0)
- goto err;
-
- if (current->parent == NULL) {
- if (depopulate_roots_yard())
- goto err;
-
- fini_restore_mntns();
- }
-
- if (restore_one_task(current->pid.virt, ca->core))
- goto err;
-
- return 0;
-
-err:
- if (current->parent == NULL)
- futex_abort_and_wake(&task_entries->nr_in_progress);
- exit(1);
-}
-
-static inline int stage_participants(int next_stage)
-{
- switch (next_stage) {
- case CR_STATE_FAIL:
- return 0;
- case CR_STATE_RESTORE_NS:
- case CR_STATE_RESTORE_SHARED:
- return 1;
- case CR_STATE_FORKING:
- return task_entries->nr_tasks + task_entries->nr_helpers;
- case CR_STATE_RESTORE:
- return task_entries->nr_threads + task_entries->nr_helpers;
- case CR_STATE_RESTORE_SIGCHLD:
- return task_entries->nr_threads;
- case CR_STATE_RESTORE_CREDS:
- return task_entries->nr_threads;
- }
-
- BUG();
- return -1;
-}
-
-static int restore_wait_inprogress_tasks()
-{
- int ret;
- futex_t *np = &task_entries->nr_in_progress;
-
- futex_wait_while_gt(np, 0);
- ret = (int)futex_get(np);
- if (ret < 0) {
- set_cr_errno(get_task_cr_err());
- return ret;
- }
-
- return 0;
-}
-
-static void __restore_switch_stage(int next_stage)
-{
- futex_set(&task_entries->nr_in_progress,
- stage_participants(next_stage));
- futex_set_and_wake(&task_entries->start, next_stage);
-}
-
-static int restore_switch_stage(int next_stage)
-{
- __restore_switch_stage(next_stage);
- return restore_wait_inprogress_tasks();
-}
-
-static int attach_to_tasks(bool root_seized)
-{
- struct pstree_item *item;
-
- for_each_pstree_item(item) {
- pid_t pid = item->pid.real;
- int status, i;
-
- if (!task_alive(item))
- continue;
-
- if (parse_threads(item->pid.real, &item->threads, &item->nr_threads))
- return -1;
-
- for (i = 0; i < item->nr_threads; i++) {
- pid = item->threads[i].real;
-
- if (item != root_item || !root_seized || i != 0) {
- if (ptrace(PTRACE_SEIZE, pid, 0, 0)) {
- pr_perror("Can't attach to %d", pid);
- return -1;
- }
- }
- if (ptrace(PTRACE_INTERRUPT, pid, 0, 0)) {
- pr_perror("Can't interrupt the %d task", pid);
- return -1;
- }
-
-
- if (wait4(pid, &status, __WALL, NULL) != pid) {
- pr_perror("waitpid(%d) failed", pid);
- return -1;
- }
-
- /*
- * Suspend seccomp if necessary. We need to do this because
- * although seccomp is restored at the very end of the
- * restorer blob (and the final sigreturn is ok), here we're
- * doing an munmap in the process, which may be blocked by
- * seccomp and cause the task to be killed.
- */
- if (rsti(item)->has_seccomp && suspend_seccomp(pid) < 0)
- pr_err("failed to suspend seccomp, restore will probably fail...\n");
-
- if (ptrace(PTRACE_CONT, pid, NULL, NULL) ) {
- pr_perror("Unable to resume %d", pid);
- return -1;
- }
- }
- }
-
- return 0;
-}
-
-static int catch_tasks(bool root_seized, enum trace_flags *flag)
-{
- struct pstree_item *item;
-
- for_each_pstree_item(item) {
- pid_t pid = item->pid.real;
- int status, i, ret;
-
- if (!task_alive(item))
- continue;
-
- if (parse_threads(item->pid.real, &item->threads, &item->nr_threads))
- return -1;
-
- for (i = 0; i < item->nr_threads; i++) {
- pid = item->threads[i].real;
-
- if (ptrace(PTRACE_INTERRUPT, pid, 0, 0)) {
- pr_perror("Can't interrupt the %d task", pid);
- return -1;
- }
-
- if (wait4(pid, &status, __WALL, NULL) != pid) {
- pr_perror("waitpid(%d) failed", pid);
- return -1;
- }
-
- ret = ptrace_stop_pie(pid, rsti(item)->breakpoint, flag);
- if (ret < 0)
- return -1;
- }
- }
-
- return 0;
-}
-
-static int clear_breakpoints()
-{
- struct pstree_item *item;
- int ret = 0, i;
-
- for_each_pstree_item(item) {
- if (!task_alive(item))
- continue;
- for (i = 0; i < item->nr_threads; i++)
- ret |= ptrace_flush_breakpoints(item->threads[i].real);
- }
-
- return ret;
-}
-
-static void finalize_restore(void)
-{
- struct pstree_item *item;
-
- for_each_pstree_item(item) {
- pid_t pid = item->pid.real;
- struct parasite_ctl *ctl;
-
- if (!task_alive(item))
- continue;
-
- /* Unmap the restorer blob */
- ctl = parasite_prep_ctl(pid, NULL);
- if (ctl == NULL)
- continue;
-
- parasite_unmap(ctl, (unsigned long)rsti(item)->munmap_restorer);
-
- xfree(ctl);
-
- if (item->state == TASK_STOPPED)
- kill(item->pid.real, SIGSTOP);
- }
-}
-
-static void finalize_restore_detach(int status)
-{
- struct pstree_item *item;
-
- for_each_pstree_item(item) {
- pid_t pid;
- int i;
-
- if (!task_alive(item))
- continue;
-
- for (i = 0; i < item->nr_threads; i++) {
- pid = item->threads[i].real;
- if (pid < 0) {
- BUG_ON(status >= 0);
- break;
- }
-
- if (ptrace(PTRACE_DETACH, pid, NULL, 0))
- pr_perror("Unable to execute %d", pid);
- }
- }
-}
-
-static void ignore_kids(void)
-{
- struct sigaction sa = { .sa_handler = SIG_DFL };
-
- if (sigaction(SIGCHLD, &sa, NULL) < 0)
- pr_perror("Restoring CHLD sigaction failed");
-}
-
-static unsigned int saved_loginuid;
-
-static int prepare_userns_hook(void)
-{
- int ret;
-
- if (!kdat.has_loginuid)
- return 0;
- /*
- * Save old loginuid and set it to INVALID_UID:
- * this value means that loginuid is unset and it will be inherited.
- * After you set some value to /proc/<>/loginuid it can't be changed
- * inside container due to permissions.
- * But you still can set this value if it was unset.
- */
- saved_loginuid = parse_pid_loginuid(getpid(), &ret, false);
- if (ret < 0)
- return -1;
-
- if (prepare_loginuid(INVALID_UID, LOG_ERROR) < 0) {
- pr_err("Setting loginuid for CT init task failed, CAP_AUDIT_CONTROL?");
- return -1;
- }
- return 0;
-}
-
-static void restore_origin_ns_hook(void)
-{
- if (!kdat.has_loginuid)
- return;
-
- /* not critical: it does not affect CT in any way */
- if (prepare_loginuid(saved_loginuid, LOG_ERROR) < 0)
- pr_err("Restore original /proc/self/loginuid failed");
-}
-
-static int restore_root_task(struct pstree_item *init)
-{
- enum trace_flags flag = TRACE_ALL;
- int ret, fd, mnt_ns_fd = -1;
- int clean_remaps = 1;
-
- ret = run_scripts(ACT_PRE_RESTORE);
- if (ret != 0) {
- pr_err("Aborting restore due to pre-restore script ret code %d\n", ret);
- return -1;
- }
-
- fd = open("/proc", O_DIRECTORY | O_RDONLY);
- if (fd < 0) {
- pr_perror("Unable to open /proc");
- return -1;
- }
-
- ret = install_service_fd(CR_PROC_FD_OFF, fd);
- close(fd);
- if (ret < 0)
- return -1;
-
- /*
- * FIXME -- currently we assume that all the tasks live
- * in the same set of namespaces. This is done to debug
- * the ns contents dumping/restoring. Need to revisit
- * this later.
- */
-
- if (init->pid.virt == INIT_PID) {
- if (!(root_ns_mask & CLONE_NEWPID)) {
- pr_err("This process tree can only be restored "
- "in a new pid namespace.\n"
- "criu should be re-executed with the "
- "\"--namespace pid\" option.\n");
- return -1;
- }
- } else if (root_ns_mask & CLONE_NEWPID) {
- pr_err("Can't restore pid namespace without the process init\n");
- return -1;
- }
-
- if (prepare_userns_hook())
- return -1;
-
- if (prepare_namespace_before_tasks())
- return -1;
-
- futex_set(&task_entries->nr_in_progress,
- stage_participants(CR_STATE_RESTORE_NS));
-
- ret = fork_with_pid(init);
- if (ret < 0)
- goto out;
-
- restore_origin_ns_hook();
-
- if (root_as_sibling) {
- struct sigaction act;
- /*
- * Root task will be our sibling. This means, that
- * we will not notice when (if) it dies in SIGCHLD
- * handler, but we should. To do this -- attach to
- * the guy with ptrace (below) and (!) make the kernel
- * deliver us the signal when it will get stopped.
- * It will in case of e.g. segfault before handling
- * the signal.
- */
- sigaction(SIGCHLD, NULL, &act);
- act.sa_flags &= ~SA_NOCLDSTOP;
- sigaction(SIGCHLD, &act, NULL);
-
- if (ptrace(PTRACE_SEIZE, init->pid.real, 0, 0)) {
- pr_perror("Can't attach to init");
- goto out_kill;
- }
- }
-
- /*
- * uid_map and gid_map must be filled from a parent user namespace.
- * prepare_userns_creds() must be called after filling mappings.
- */
- if ((root_ns_mask & CLONE_NEWUSER) && prepare_userns(init))
- goto out_kill;
-
- pr_info("Wait until namespaces are created\n");
- ret = restore_wait_inprogress_tasks();
- if (ret)
- goto out_kill;
-
- if (root_ns_mask & CLONE_NEWNS) {
- mnt_ns_fd = open_proc(init->pid.real, "ns/mnt");
- if (mnt_ns_fd < 0) {
- pr_perror("Can't open init's mntns fd");
- goto out_kill;
- }
- }
-
- ret = run_scripts(ACT_SETUP_NS);
- if (ret)
- goto out_kill;
-
- timing_start(TIME_FORK);
- ret = restore_switch_stage(CR_STATE_RESTORE_SHARED);
- if (ret < 0)
- goto out_kill;
-
- ret = run_scripts(ACT_POST_SETUP_NS);
- if (ret)
- goto out_kill;
-
- ret = restore_switch_stage(CR_STATE_FORKING);
- if (ret < 0)
- goto out_kill;
-
- timing_stop(TIME_FORK);
-
- ret = restore_switch_stage(CR_STATE_RESTORE);
- if (ret < 0)
- goto out_kill;
-
- ret = restore_switch_stage(CR_STATE_RESTORE_SIGCHLD);
- if (ret < 0)
- goto out_kill;
-
- /*
- * The task_entries->nr_zombies is updated in the
- * CR_STATE_RESTORE_SIGCHLD in pie code.
- */
- task_entries->nr_threads -= atomic_read(&task_entries->nr_zombies);
-
- /*
- * There is no need to call try_clean_remaps() after this point,
- * as restore went OK and all ghosts were removed by the openers.
- */
- clean_remaps = 0;
- close_safe(&mnt_ns_fd);
- cleanup_mnt_ns();
-
- ret = stop_usernsd();
- if (ret < 0)
- goto out_kill;
-
- ret = move_veth_to_bridge();
- if (ret < 0)
- goto out_kill;
-
- ret = prepare_cgroup_properties();
- if (ret < 0)
- goto out_kill;
-
- ret = run_scripts(ACT_POST_RESTORE);
- if (ret != 0) {
- pr_err("Aborting restore due to post-restore script ret code %d\n", ret);
- timing_stop(TIME_RESTORE);
- write_stats(RESTORE_STATS);
- goto out_kill;
- }
-
- /* Unlock network before disabling repair mode on sockets */
- network_unlock();
-
- /*
- * Stop getting sigchld, after we resume the tasks they
- * may start to exit poking criu in vain.
- */
- ignore_kids();
-
- /*
- * -------------------------------------------------------------
- * Below this line nothing should fail, because network is unlocked
- */
- attach_to_tasks(root_as_sibling);
-
- ret = restore_switch_stage(CR_STATE_RESTORE_CREDS);
- BUG_ON(ret);
-
- timing_stop(TIME_RESTORE);
-
- ret = catch_tasks(root_as_sibling, &flag);
-
- pr_info("Restore finished successfully. Resuming tasks.\n");
- futex_set_and_wake(&task_entries->start, CR_STATE_COMPLETE);
-
- if (ret == 0)
- ret = parasite_stop_on_syscall(task_entries->nr_threads,
- __NR_rt_sigreturn, flag);
-
- if (clear_breakpoints())
- pr_err("Unable to flush breakpoints\n");
-
- if (ret == 0)
- finalize_restore();
-
- if (restore_freezer_state())
- pr_err("Unable to restore freezer state\n");
-
- fini_cgroup();
-
- /* Detaches from processes and they continue run through sigreturn. */
- finalize_restore_detach(ret);
-
- write_stats(RESTORE_STATS);
-
- if (!opts.restore_detach && !opts.exec_cmd)
- wait(NULL);
-
- return 0;
-
-out_kill:
- /*
- * The processes can be killed only when all of them have been created,
- * otherwise an external proccesses can be killed.
- */
- if (root_ns_mask & CLONE_NEWPID) {
- int status;
-
- /* Kill init */
- if (root_item->pid.real > 0)
- kill(root_item->pid.real, SIGKILL);
-
- if (waitpid(root_item->pid.real, &status, 0) < 0)
- pr_warn("Unable to wait %d: %s",
- root_item->pid.real, strerror(errno));
- } else {
- struct pstree_item *pi;
-
- for_each_pstree_item(pi)
- if (pi->pid.virt > 0)
- kill(pi->pid.virt, SIGKILL);
- }
-
-out:
- fini_cgroup();
- if (clean_remaps)
- try_clean_remaps(mnt_ns_fd);
- cleanup_mnt_ns();
- stop_usernsd();
- __restore_switch_stage(CR_STATE_FAIL);
- pr_err("Restoring FAILED.\n");
- return -1;
-}
-
-static int prepare_task_entries(void)
-{
- task_entries_pos = rst_mem_align_cpos(RM_SHREMAP);
- task_entries = rst_mem_alloc(sizeof(*task_entries), RM_SHREMAP);
- if (!task_entries) {
- pr_perror("Can't map shmem");
- return -1;
- }
-
- task_entries->nr_threads = 0;
- task_entries->nr_tasks = 0;
- task_entries->nr_helpers = 0;
- atomic_set(&task_entries->nr_zombies, 0);
- futex_set(&task_entries->start, CR_STATE_RESTORE_NS);
- mutex_init(&task_entries->userns_sync_lock);
-
- return 0;
-}
-
-int cr_restore_tasks(void)
-{
- int ret = -1;
-
- if (cr_plugin_init(CR_PLUGIN_STAGE__RESTORE))
- return -1;
-
- if (check_img_inventory() < 0)
- goto err;
-
- if (init_stats(RESTORE_STATS))
- goto err;
-
- if (kerndat_init_rst())
- goto err;
-
- timing_start(TIME_RESTORE);
-
- if (cpu_init() < 0)
- goto err;
-
- if (vdso_init())
- goto err;
-
- if (opts.cpu_cap & (CPU_CAP_INS | CPU_CAP_CPU)) {
- if (cpu_validate_cpuinfo())
- goto err;
- }
-
- if (prepare_task_entries() < 0)
- goto err;
-
- if (prepare_pstree() < 0)
- goto err;
-
- if (crtools_prepare_shared() < 0)
- goto err;
-
- if (criu_signals_setup() < 0)
- goto err;
-
- ret = restore_root_task(root_item);
-err:
- cr_plugin_fini(CR_PLUGIN_STAGE__RESTORE, ret);
- return ret;
-}
-
-static long restorer_get_vma_hint(struct list_head *tgt_vma_list,
- struct list_head *self_vma_list, long vma_len)
-{
- struct vma_area *t_vma, *s_vma;
- long prev_vma_end = 0;
- struct vma_area end_vma;
- VmaEntry end_e;
-
- end_vma.e = &end_e;
- end_e.start = end_e.end = kdat.task_size;
- prev_vma_end = PAGE_SIZE * 0x10; /* CONFIG_LSM_MMAP_MIN_ADDR=65536 */
-
- s_vma = list_first_entry(self_vma_list, struct vma_area, list);
- t_vma = list_first_entry(tgt_vma_list, struct vma_area, list);
-
- while (1) {
- if (prev_vma_end + vma_len > s_vma->e->start) {
- if (s_vma->list.next == self_vma_list) {
- s_vma = &end_vma;
- continue;
- }
- if (s_vma == &end_vma)
- break;
- if (prev_vma_end < s_vma->e->end)
- prev_vma_end = s_vma->e->end;
- s_vma = list_entry(s_vma->list.next, struct vma_area, list);
- continue;
- }
-
- if (prev_vma_end + vma_len > t_vma->e->start) {
- if (t_vma->list.next == tgt_vma_list) {
- t_vma = &end_vma;
- continue;
- }
- if (t_vma == &end_vma)
- break;
- if (prev_vma_end < t_vma->e->end)
- prev_vma_end = t_vma->e->end;
- t_vma = list_entry(t_vma->list.next, struct vma_area, list);
- continue;
- }
-
- return prev_vma_end;
- }
-
- return -1;
-}
-
-static inline int timeval_valid(struct timeval *tv)
-{
- return (tv->tv_sec >= 0) && ((unsigned long)tv->tv_usec < USEC_PER_SEC);
-}
-
-static inline int decode_itimer(char *n, ItimerEntry *ie, struct itimerval *val)
-{
- if (ie->isec == 0 && ie->iusec == 0) {
- memzero_p(val);
- return 0;
- }
-
- val->it_interval.tv_sec = ie->isec;
- val->it_interval.tv_usec = ie->iusec;
-
- if (!timeval_valid(&val->it_interval)) {
- pr_err("Invalid timer interval\n");
- return -1;
- }
-
- if (ie->vsec == 0 && ie->vusec == 0) {
- /*
- * Remaining time was too short. Set it to
- * interval to make the timer armed and work.
- */
- val->it_value.tv_sec = ie->isec;
- val->it_value.tv_usec = ie->iusec;
- } else {
- val->it_value.tv_sec = ie->vsec;
- val->it_value.tv_usec = ie->vusec;
- }
-
- if (!timeval_valid(&val->it_value)) {
- pr_err("Invalid timer value\n");
- return -1;
- }
-
- pr_info("Restored %s timer to %ld.%ld -> %ld.%ld\n", n,
- val->it_value.tv_sec, val->it_value.tv_usec,
- val->it_interval.tv_sec, val->it_interval.tv_usec);
-
- return 0;
-}
-
-/*
- * Legacy itimers restore from CR_FD_ITIMERS
- */
-
-static int prepare_itimers_from_fd(int pid, struct task_restore_args *args)
-{
- int ret = -1;
- struct cr_img *img;
- ItimerEntry *ie;
-
- img = open_image(CR_FD_ITIMERS, O_RSTR, pid);
- if (!img)
- return -1;
-
- ret = pb_read_one(img, &ie, PB_ITIMER);
- if (ret < 0)
- goto out;
- ret = decode_itimer("real", ie, &args->itimers[0]);
- itimer_entry__free_unpacked(ie, NULL);
- if (ret < 0)
- goto out;
-
- ret = pb_read_one(img, &ie, PB_ITIMER);
- if (ret < 0)
- goto out;
- ret = decode_itimer("virt", ie, &args->itimers[1]);
- itimer_entry__free_unpacked(ie, NULL);
- if (ret < 0)
- goto out;
-
- ret = pb_read_one(img, &ie, PB_ITIMER);
- if (ret < 0)
- goto out;
- ret = decode_itimer("prof", ie, &args->itimers[2]);
- itimer_entry__free_unpacked(ie, NULL);
- if (ret < 0)
- goto out;
-out:
- close_image(img);
- return ret;
-}
-
-static int prepare_itimers(int pid, CoreEntry *core, struct task_restore_args *args)
-{
- int ret = 0;
- TaskTimersEntry *tte = core->tc->timers;
-
- if (!tte)
- return prepare_itimers_from_fd(pid, args);
-
- ret |= decode_itimer("real", tte->real, &args->itimers[0]);
- ret |= decode_itimer("virt", tte->virt, &args->itimers[1]);
- ret |= decode_itimer("prof", tte->prof, &args->itimers[2]);
-
- return ret;
-}
-
-static inline int timespec_valid(struct timespec *ts)
-{
- return (ts->tv_sec >= 0) && ((unsigned long)ts->tv_nsec < NSEC_PER_SEC);
-}
-
-static inline int decode_posix_timer(PosixTimerEntry *pte,
- struct restore_posix_timer *pt)
-{
- pt->val.it_interval.tv_sec = pte->isec;
- pt->val.it_interval.tv_nsec = pte->insec;
-
- if (!timespec_valid(&pt->val.it_interval)) {
- pr_err("Invalid timer interval(posix)\n");
- return -1;
- }
-
- if (pte->vsec == 0 && pte->vnsec == 0) {
- // Remaining time was too short. Set it to
- // interval to make the timer armed and work.
- pt->val.it_value.tv_sec = pte->isec;
- pt->val.it_value.tv_nsec = pte->insec;
- } else {
- pt->val.it_value.tv_sec = pte->vsec;
- pt->val.it_value.tv_nsec = pte->vnsec;
- }
-
- if (!timespec_valid(&pt->val.it_value)) {
- pr_err("Invalid timer value(posix)\n");
- return -1;
- }
-
- pt->spt.it_id = pte->it_id;
- pt->spt.clock_id = pte->clock_id;
- pt->spt.si_signo = pte->si_signo;
- pt->spt.it_sigev_notify = pte->it_sigev_notify;
- pt->spt.sival_ptr = decode_pointer(pte->sival_ptr);
- pt->overrun = pte->overrun;
-
- return 0;
-}
-
-static int cmp_posix_timer_proc_id(const void *p1, const void *p2)
-{
- return ((struct restore_posix_timer *)p1)->spt.it_id - ((struct restore_posix_timer *)p2)->spt.it_id;
-}
-
-static unsigned long posix_timers_cpos;
-static unsigned int posix_timers_nr;
-
-static void sort_posix_timers(void)
-{
- /*
- * This is required for restorer's create_posix_timers(),
- * it will probe them one-by-one for the desired ID, since
- * kernel doesn't provide another API for timer creation
- * with given ID.
- */
-
- if (posix_timers_nr > 0)
- qsort(rst_mem_remap_ptr(posix_timers_cpos, RM_PRIVATE),
- posix_timers_nr,
- sizeof(struct restore_posix_timer),
- cmp_posix_timer_proc_id);
-}
-
-/*
- * Legacy posix timers restoration from CR_FD_POSIX_TIMERS
- */
-
-static int prepare_posix_timers_from_fd(int pid)
-{
- struct cr_img *img;
- int ret = -1;
- struct restore_posix_timer *t;
-
- img = open_image(CR_FD_POSIX_TIMERS, O_RSTR, pid);
- if (!img)
- return -1;
-
- while (1) {
- PosixTimerEntry *pte;
-
- ret = pb_read_one_eof(img, &pte, PB_POSIX_TIMER);
- if (ret <= 0)
- break;
-
- t = rst_mem_alloc(sizeof(struct restore_posix_timer), RM_PRIVATE);
- if (!t)
- break;
-
- ret = decode_posix_timer(pte, t);
- if (ret < 0)
- break;
-
- posix_timer_entry__free_unpacked(pte, NULL);
- posix_timers_nr++;
- }
-
- close_image(img);
- if (!ret)
- sort_posix_timers();
-
- return ret;
-}
-
-static int prepare_posix_timers(int pid, CoreEntry *core)
-{
- int i, ret = -1;
- TaskTimersEntry *tte = core->tc->timers;
- struct restore_posix_timer *t;
-
- posix_timers_cpos = rst_mem_align_cpos(RM_PRIVATE);
-
- if (!tte)
- return prepare_posix_timers_from_fd(pid);
-
- posix_timers_nr = tte->n_posix;
- for (i = 0; i < posix_timers_nr; i++) {
- t = rst_mem_alloc(sizeof(struct restore_posix_timer), RM_PRIVATE);
- if (!t)
- goto out;
-
- if (decode_posix_timer(tte->posix[i], t))
- goto out;
- }
-
- ret = 0;
- sort_posix_timers();
-out:
- return ret;
-}
-
-static inline int verify_cap_size(CredsEntry *ce)
-{
- return ((ce->n_cap_inh == CR_CAP_SIZE) && (ce->n_cap_eff == CR_CAP_SIZE) &&
- (ce->n_cap_prm == CR_CAP_SIZE) && (ce->n_cap_bnd == CR_CAP_SIZE));
-}
-
-static int prepare_mm(pid_t pid, struct task_restore_args *args)
-{
- int exe_fd, i, ret = -1;
- MmEntry *mm = rsti(current)->mm;
-
- args->mm = *mm;
- args->mm.n_mm_saved_auxv = 0;
- args->mm.mm_saved_auxv = NULL;
-
- if (mm->n_mm_saved_auxv > AT_VECTOR_SIZE) {
- pr_err("Image corrupted on pid %d\n", pid);
- goto out;
- }
-
- args->mm_saved_auxv_size = mm->n_mm_saved_auxv*sizeof(auxv_t);
- for (i = 0; i < mm->n_mm_saved_auxv; ++i) {
- args->mm_saved_auxv[i] = (auxv_t)mm->mm_saved_auxv[i];
- }
-
- exe_fd = open_reg_by_id(mm->exe_file_id);
- if (exe_fd < 0)
- goto out;
-
- args->fd_exe_link = exe_fd;
- ret = 0;
-out:
- return ret;
-}
-
-static void *restorer;
-static unsigned long restorer_len;
-
-static int prepare_restorer_blob(void)
-{
- /*
- * We map anonymous mapping, not mremap the restorer itself later.
- * Otherwise the restorer vma would be tied to criu binary which
- * in turn will lead to set-exe-file prctl to fail with EBUSY.
- */
-
- restorer_len = pie_size(restorer_blob);
- restorer = mmap(NULL, restorer_len,
- PROT_READ | PROT_WRITE | PROT_EXEC,
- MAP_PRIVATE | MAP_ANON, 0, 0);
- if (restorer == MAP_FAILED) {
- pr_perror("Can't map restorer code");
- return -1;
- }
-
- memcpy(restorer, &restorer_blob, sizeof(restorer_blob));
- return 0;
-}
-
-static int remap_restorer_blob(void *addr)
-{
- void *mem;
-
- mem = mremap(restorer, restorer_len, restorer_len,
- MREMAP_FIXED | MREMAP_MAYMOVE, addr);
- if (mem != addr) {
- pr_perror("Can't remap restorer blob");
- return -1;
- }
-
- ELF_RELOCS_APPLY_RESTORER(addr, addr);
- return 0;
-}
-
-static int validate_sched_parm(struct rst_sched_param *sp)
-{
- if ((sp->nice < -20) || (sp->nice > 19))
- return 0;
-
- switch (sp->policy) {
- case SCHED_RR:
- case SCHED_FIFO:
- return ((sp->prio > 0) && (sp->prio < 100));
- case SCHED_IDLE:
- case SCHED_OTHER:
- case SCHED_BATCH:
- return sp->prio == 0;
- }
-
- return 0;
-}
-
-static int prep_sched_info(struct rst_sched_param *sp, ThreadCoreEntry *tc)
-{
- if (!tc->has_sched_policy) {
- sp->policy = SCHED_OTHER;
- sp->nice = 0;
- return 0;
- }
-
- sp->policy = tc->sched_policy;
- sp->nice = tc->sched_nice;
- sp->prio = tc->sched_prio;
-
- if (!validate_sched_parm(sp)) {
- pr_err("Inconsistent sched params received (%d.%d.%d)\n",
- sp->policy, sp->nice, sp->prio);
- return -1;
- }
-
- return 0;
-}
-
-static unsigned long decode_rlim(u_int64_t ival)
-{
- return ival == -1 ? RLIM_INFINITY : ival;
-}
-
-static unsigned long rlims_cpos;
-static unsigned int rlims_nr;
-
-/*
- * Legacy rlimits restore from CR_FD_RLIMIT
- */
-
-static int prepare_rlimits_from_fd(int pid)
-{
- struct rlimit *r;
- int ret;
- struct cr_img *img;
-
- /*
- * Old image -- read from the file.
- */
- img = open_image(CR_FD_RLIMIT, O_RSTR, pid);
- if (!img)
- return -1;
-
- while (1) {
- RlimitEntry *re;
-
- ret = pb_read_one_eof(img, &re, PB_RLIMIT);
- if (ret <= 0)
- break;
-
- r = rst_mem_alloc(sizeof(*r), RM_PRIVATE);
- if (!r) {
- pr_err("Can't allocate memory for resource %d\n",
- rlims_nr);
- return -1;
- }
-
- r->rlim_cur = decode_rlim(re->cur);
- r->rlim_max = decode_rlim(re->max);
- if (r->rlim_cur > r->rlim_max) {
- pr_err("Can't restore cur > max for %d.%d\n",
- pid, rlims_nr);
- r->rlim_cur = r->rlim_max;
- }
-
- rlimit_entry__free_unpacked(re, NULL);
-
- rlims_nr++;
- }
-
- close_image(img);
-
- return 0;
-}
-
-static int prepare_rlimits(int pid, CoreEntry *core)
-{
- int i;
- TaskRlimitsEntry *rls = core->tc->rlimits;
- struct rlimit *r;
-
- rlims_cpos = rst_mem_align_cpos(RM_PRIVATE);
-
- if (!rls)
- return prepare_rlimits_from_fd(pid);
-
- for (i = 0; i < rls->n_rlimits; i++) {
- r = rst_mem_alloc(sizeof(*r), RM_PRIVATE);
- if (!r) {
- pr_err("Can't allocate memory for resource %d\n", i);
- return -1;
- }
-
- r->rlim_cur = decode_rlim(rls->rlimits[i]->cur);
- r->rlim_max = decode_rlim(rls->rlimits[i]->max);
-
- if (r->rlim_cur > r->rlim_max) {
- pr_warn("Can't restore cur > max for %d.%d\n", pid, i);
- r->rlim_cur = r->rlim_max;
- }
- }
-
- rlims_nr = rls->n_rlimits;
- return 0;
-}
-
-static int signal_to_mem(SiginfoEntry *sie)
-{
- siginfo_t *info, *t;
-
- info = (siginfo_t *) sie->siginfo.data;
- t = rst_mem_alloc(sizeof(siginfo_t), RM_PRIVATE);
- if (!t)
- return -1;
-
- memcpy(t, info, sizeof(*info));
-
- return 0;
-}
-
-static int open_signal_image(int type, pid_t pid, unsigned int *nr)
-{
- int ret;
- struct cr_img *img;
-
- img = open_image(type, O_RSTR, pid);
- if (!img)
- return -1;
-
- *nr = 0;
- while (1) {
- SiginfoEntry *sie;
-
- ret = pb_read_one_eof(img, &sie, PB_SIGINFO);
- if (ret <= 0)
- break;
- if (sie->siginfo.len != sizeof(siginfo_t)) {
- pr_err("Unknown image format\n");
- ret = -1;
- break;
- }
-
- ret = signal_to_mem(sie);
- if (ret)
- break;
-
- (*nr)++;
-
- siginfo_entry__free_unpacked(sie, NULL);
- }
-
- close_image(img);
-
- return ret ? : 0;
-}
-
-static int prepare_one_signal_queue(SignalQueueEntry *sqe, unsigned int *nr)
-{
- int i;
-
- for (i = 0; i < sqe->n_signals; i++)
- if (signal_to_mem(sqe->signals[i]))
- return -1;
-
- *nr = sqe->n_signals;
-
- return 0;
-}
-
-static unsigned long siginfo_cpos;
-static unsigned int siginfo_nr, *siginfo_priv_nr;
-
-static int prepare_signals(int pid, CoreEntry *leader_core)
-{
- int ret = -1, i;
-
- siginfo_cpos = rst_mem_align_cpos(RM_PRIVATE);
- siginfo_priv_nr = xmalloc(sizeof(int) * current->nr_threads);
- if (siginfo_priv_nr == NULL)
- goto out;
-
- /* Prepare shared signals */
- if (!leader_core->tc->signals_s)/*backward compatibility*/
- ret = open_signal_image(CR_FD_SIGNAL, pid, &siginfo_nr);
- else
- ret = prepare_one_signal_queue(leader_core->tc->signals_s, &siginfo_nr);
-
- if (ret < 0)
- goto out;
-
- for (i = 0; i < current->nr_threads; i++) {
- if (!current->core[i]->thread_core->signals_p)/*backward compatibility*/
- ret = open_signal_image(CR_FD_PSIGNAL,
- current->threads[i].virt, &siginfo_priv_nr[i]);
- else
- ret = prepare_one_signal_queue(current->core[i]->thread_core->signals_p,
- &siginfo_priv_nr[i]);
- if (ret < 0)
- goto out;
- }
-out:
- return ret;
-}
-
-extern void __gcov_flush(void) __attribute__((weak));
-void __gcov_flush(void) {}
-
-static void rst_reloc_creds(struct thread_restore_args *thread_args,
- unsigned long *creds_pos_next)
-{
- struct thread_creds_args *args;
-
- if (unlikely(!*creds_pos_next))
- return;
-
- args = rst_mem_remap_ptr(*creds_pos_next, RM_PRIVATE);
-
- if (args->lsm_profile)
- args->lsm_profile = rst_mem_remap_ptr(args->mem_lsm_profile_pos, RM_PRIVATE);
- if (args->groups)
- args->groups = rst_mem_remap_ptr(args->mem_groups_pos, RM_PRIVATE);
-
- *creds_pos_next = args->mem_pos_next;
- thread_args->creds_args = args;
-}
-
-static struct thread_creds_args *
-rst_prep_creds_args(CredsEntry *ce, unsigned long *prev_pos)
-{
- unsigned long this_pos;
- struct thread_creds_args *args;
-
- if (!verify_cap_size(ce)) {
- pr_err("Caps size mismatch %d %d %d %d\n",
- (int)ce->n_cap_inh, (int)ce->n_cap_eff,
- (int)ce->n_cap_prm, (int)ce->n_cap_bnd);
- return ERR_PTR(-EINVAL);
- }
-
- this_pos = rst_mem_align_cpos(RM_PRIVATE);
-
- args = rst_mem_alloc(sizeof(*args), RM_PRIVATE);
- if (!args)
- return ERR_PTR(-ENOMEM);
-
- args->cap_last_cap = kdat.last_cap;
- memcpy(&args->creds, ce, sizeof(args->creds));
-
- if (ce->lsm_profile || opts.lsm_supplied) {
- char *rendered, *profile;
-
- profile = ce->lsm_profile;
- if (opts.lsm_supplied)
- profile = opts.lsm_profile;
-
- if (validate_lsm(profile) < 0)
- return ERR_PTR(-EINVAL);
-
- if (profile && render_lsm_profile(profile, &rendered)) {
- return ERR_PTR(-EINVAL);
- }
-
- if (rendered) {
- size_t lsm_profile_len;
- char *lsm_profile;
-
- args->mem_lsm_profile_pos = rst_mem_align_cpos(RM_PRIVATE);
- lsm_profile_len = strlen(rendered);
- lsm_profile = rst_mem_alloc(lsm_profile_len + 1, RM_PRIVATE);
- if (!lsm_profile) {
- xfree(rendered);
- return ERR_PTR(-ENOMEM);
- }
-
- args = rst_mem_remap_ptr(this_pos, RM_PRIVATE);
- args->lsm_profile = lsm_profile;
- strncpy(args->lsm_profile, rendered, lsm_profile_len);
- xfree(rendered);
- }
- } else {
- args->lsm_profile = NULL;
- args->mem_lsm_profile_pos = 0;
- }
-
- /*
- * Zap fields which we cant use.
- */
- args->creds.cap_inh = NULL;
- args->creds.cap_eff = NULL;
- args->creds.cap_prm = NULL;
- args->creds.cap_bnd = NULL;
- args->creds.groups = NULL;
- args->creds.lsm_profile = NULL;
-
- memcpy(args->cap_inh, ce->cap_inh, sizeof(args->cap_inh));
- memcpy(args->cap_eff, ce->cap_eff, sizeof(args->cap_eff));
- memcpy(args->cap_prm, ce->cap_prm, sizeof(args->cap_prm));
- memcpy(args->cap_bnd, ce->cap_bnd, sizeof(args->cap_bnd));
-
- if (ce->n_groups) {
- unsigned int *groups;
-
- args->mem_groups_pos = rst_mem_align_cpos(RM_PRIVATE);
- groups = rst_mem_alloc(ce->n_groups * sizeof(u32), RM_PRIVATE);
- if (!groups)
- return ERR_PTR(-ENOMEM);
- args = rst_mem_remap_ptr(this_pos, RM_PRIVATE);
- args->groups = groups;
- memcpy(args->groups, ce->groups, ce->n_groups * sizeof(u32));
- } else {
- args->groups = NULL;
- args->mem_groups_pos = 0;
- }
-
- args->mem_pos_next = 0;
-
- if (prev_pos) {
- if (*prev_pos) {
- struct thread_creds_args *prev;
-
- prev = rst_mem_remap_ptr(*prev_pos, RM_PRIVATE);
- prev->mem_pos_next = this_pos;
- }
- *prev_pos = this_pos;
- }
- return args;
-}
-
-static int rst_prep_creds_from_img(pid_t pid)
-{
- CredsEntry *ce = NULL;
- struct cr_img *img;
- int ret;
-
- img = open_image(CR_FD_CREDS, O_RSTR, pid);
- if (!img)
- return -ENOENT;
-
- ret = pb_read_one(img, &ce, PB_CREDS);
- close_image(img);
-
- if (ret > 0) {
- struct thread_creds_args *args;
-
- args = rst_prep_creds_args(ce, NULL);
- if (IS_ERR(args))
- ret = PTR_ERR(args);
- else
- ret = 0;
- }
- creds_entry__free_unpacked(ce, NULL);
- return ret;
-}
-
-static int rst_prep_creds(pid_t pid, CoreEntry *core, unsigned long *creds_pos)
-{
- struct thread_creds_args *args = NULL;
- unsigned long this_pos = 0;
- size_t i;
-
- /*
- * This is _really_ very old image
- * format where @thread_core were not
- * present. It means we don't have
- * creds either, just ignore and exit
- * early.
- */
- if (unlikely(!core->thread_core)) {
- *creds_pos = 0;
- return 0;
- }
-
- *creds_pos = rst_mem_align_cpos(RM_PRIVATE);
-
- /*
- * Old format: one Creds per task carried in own image file.
- */
- if (!core->thread_core->creds)
- return rst_prep_creds_from_img(pid);
-
- for (i = 0; i < current->nr_threads; i++) {
- CredsEntry *ce = current->core[i]->thread_core->creds;
-
- args = rst_prep_creds_args(ce, &this_pos);
- if (IS_ERR(args))
- return PTR_ERR(args);
- }
-
- return 0;
-}
-
-static int sigreturn_restore(pid_t pid, CoreEntry *core)
-{
- void *mem = MAP_FAILED;
- void *restore_thread_exec_start;
- void *restore_task_exec_start;
-
- long new_sp, exec_mem_hint;
- long ret;
-
- long restore_bootstrap_len;
- long rst_mem_size;
-
- struct task_restore_args *task_args;
- struct thread_restore_args *thread_args;
- long args_len;
-
- struct vma_area *vma;
- unsigned long tgt_vmas;
-
-#ifdef CONFIG_VDSO
- unsigned long vdso_rt_size = 0;
- unsigned long vdso_rt_delta = 0;
-#endif
-
- unsigned long aio_rings;
- MmEntry *mm = rsti(current)->mm;
-
- int n_seccomp_filters = 0;
- unsigned long seccomp_filter_pos = 0;
-
- struct vm_area_list self_vmas;
- struct vm_area_list *vmas = &rsti(current)->vmas;
- int i;
-
- unsigned long creds_pos = 0;
- unsigned long creds_pos_next;
-
- pr_info("Restore via sigreturn\n");
-
- /* pr_info_vma_list(&self_vma_list); */
-
- BUILD_BUG_ON(sizeof(struct task_restore_args) & 1);
- BUILD_BUG_ON(sizeof(struct thread_restore_args) & 1);
-
- args_len = round_up(sizeof(*task_args) + sizeof(*thread_args) * current->nr_threads, page_size());
- pr_info("%d threads require %ldK of memory\n",
- current->nr_threads, KBYTES(args_len));
-
- /*
- * Copy VMAs to private rst memory so that it's able to
- * walk them and m(un|re)map.
- */
-
- tgt_vmas = rst_mem_align_cpos(RM_PRIVATE);
- list_for_each_entry(vma, &vmas->h, list) {
- VmaEntry *vme;
-
- vme = rst_mem_alloc(sizeof(*vme), RM_PRIVATE);
- if (!vme)
- goto err_nv;
-
- *vme = *vma->e;
-
- if (vma_area_is_private(vma, kdat.task_size))
- vma_premmaped_start(vme) = vma->premmaped_addr;
- }
-
- /*
- * Put info about AIO rings, they will get remapped
- */
-
- aio_rings = rst_mem_align_cpos(RM_PRIVATE);
- for (i = 0; i < mm->n_aios; i++) {
- struct rst_aio_ring *raio;
-
- raio = rst_mem_alloc(sizeof(*raio), RM_PRIVATE);
- if (!raio)
- goto err_nv;
-
- raio->addr = mm->aios[i]->id;
- raio->nr_req = mm->aios[i]->nr_req;
- raio->len = mm->aios[i]->ring_len;
- }
-
- /*
- * Get all the tcp sockets fds into rst memory -- restorer
- * will turn repair off before going sigreturn
- */
- if (rst_tcp_socks_prep())
- goto err_nv;
-
- /*
- * Copy timerfd params for restorer args, we need to proceed
- * timer setting at the very late.
- */
- if (rst_timerfd_prep())
- goto err_nv;
-
- /*
- * Read creds info for every thread and allocate memory
- * needed so we can use this data inside restorer.
- */
- if (rst_prep_creds(pid, core, &creds_pos))
- goto err_nv;
-
- /*
- * We're about to search for free VM area and inject the restorer blob
- * into it. No irrelevent mmaps/mremaps beyond this point, otherwise
- * this unwanted mapping might get overlapped by the restorer.
- */
-
- ret = parse_self_maps_lite(&self_vmas);
- if (ret < 0)
- goto err;
-
- if (seccomp_filters_get_rst_pos(core, &n_seccomp_filters, &seccomp_filter_pos) < 0)
- goto err;
-
- rst_mem_size = rst_mem_lock();
- restore_bootstrap_len = restorer_len + args_len + rst_mem_size;
-
-#ifdef CONFIG_VDSO
- /*
- * Figure out how much memory runtime vdso and vvar will need.
- */
- vdso_rt_size = vdso_vma_size(&vdso_sym_rt);
- if (vdso_rt_size) {
- vdso_rt_delta = ALIGN(restore_bootstrap_len, PAGE_SIZE) - restore_bootstrap_len;
- vdso_rt_size += vdso_rt_delta;
- if (vvar_vma_size(&vdso_sym_rt))
- vdso_rt_size += ALIGN(vvar_vma_size(&vdso_sym_rt), PAGE_SIZE);
- }
-
- restore_bootstrap_len += vdso_rt_size;
-#endif
-
- /*
- * Restorer is a blob (code + args) that will get mapped in some
- * place, that should _not_ intersect with both -- current mappings
- * and mappings of the task we're restoring here. The subsequent
- * call finds the start address for the restorer.
- *
- * After the start address is found we populate it with the restorer
- * parts one by one (some are remap-ed, some are mmap-ed and copied
- * or inited from scratch).
- */
-
- exec_mem_hint = restorer_get_vma_hint(&vmas->h, &self_vmas.h,
- restore_bootstrap_len);
- if (exec_mem_hint == -1) {
- pr_err("No suitable area for task_restore bootstrap (%ldK)\n",
- restore_bootstrap_len);
- goto err;
- }
-
- pr_info("Found bootstrap VMA hint at: 0x%lx (needs ~%ldK)\n", exec_mem_hint,
- KBYTES(restore_bootstrap_len));
-
- ret = remap_restorer_blob((void *)exec_mem_hint);
- if (ret < 0)
- goto err;
-
- /*
- * Prepare a memory map for restorer. Note a thread space
- * might be completely unused so it's here just for convenience.
- */
- restore_thread_exec_start = restorer_sym(exec_mem_hint, arch_export_restore_thread);
- restore_task_exec_start = restorer_sym(exec_mem_hint, arch_export_restore_task);
- rsti(current)->munmap_restorer = restorer_sym(exec_mem_hint, arch_export_unmap);
-
- exec_mem_hint += restorer_len;
-
- /* VMA we need to run task_restore code */
- mem = mmap((void *)exec_mem_hint, args_len,
- PROT_READ | PROT_WRITE,
- MAP_PRIVATE | MAP_ANON | MAP_FIXED, 0, 0);
- if (mem != (void *)exec_mem_hint) {
- pr_err("Can't mmap section for restore code\n");
- goto err;
- }
-
- exec_mem_hint -= restorer_len;
-
- memzero(mem, args_len);
- task_args = mem;
- thread_args = (struct thread_restore_args *)(task_args + 1);
-
- task_args->proc_fd = dup(get_service_fd(PROC_FD_OFF));
- if (task_args->proc_fd < 0) {
- pr_perror("can't dup proc fd");
- goto err;
- }
-
- /*
- * Get a reference to shared memory area which is
- * used to signal if shmem restoration complete
- * from low-level restore code.
- *
- * This shmem area is mapped right after the whole area of
- * sigreturn rt code. Note we didn't allocated it before
- * but this area is taken into account for 'hint' memory
- * address.
- */
-
- mem += args_len;
- if (rst_mem_remap(mem))
- goto err;
-
- task_args->breakpoint = &rsti(current)->breakpoint;
- task_args->task_entries = rst_mem_remap_ptr(task_entries_pos, RM_SHREMAP);
-
- task_args->rst_mem = mem;
- task_args->rst_mem_size = rst_mem_size;
-
- task_args->bootstrap_start = (void *)exec_mem_hint;
- task_args->bootstrap_len = restore_bootstrap_len;
-
- task_args->premmapped_addr = (unsigned long)rsti(current)->premmapped_addr;
- task_args->premmapped_len = rsti(current)->premmapped_len;
-
- task_args->task_size = kdat.task_size;
-
-#define remap_array(name, nr, cpos) do { \
- task_args->name##_n = nr; \
- task_args->name = rst_mem_remap_ptr(cpos, RM_PRIVATE); \
- } while (0)
-
- remap_array(vmas, vmas->nr, tgt_vmas);
- remap_array(posix_timers, posix_timers_nr, posix_timers_cpos);
- remap_array(timerfd, rst_timerfd_nr, rst_timerfd_cpos);
- remap_array(siginfo, siginfo_nr, siginfo_cpos);
- remap_array(tcp_socks, rst_tcp_socks_nr, rst_tcp_socks_cpos);
- remap_array(rings, mm->n_aios, aio_rings);
- remap_array(rlims, rlims_nr, rlims_cpos);
- remap_array(helpers, n_helpers, helpers_pos);
- remap_array(zombies, n_zombies, zombies_pos);
- remap_array(seccomp_filters, n_seccomp_filters, seccomp_filter_pos);
-
-#undef remap_array
-
- if (core->tc->has_seccomp_mode)
- task_args->seccomp_mode = core->tc->seccomp_mode;
-
- /*
- * Arguments for task restoration.
- */
-
- BUG_ON(core->mtype != CORE_ENTRY__MARCH);
-
- task_args->logfd = log_get_fd();
- task_args->loglevel = log_get_loglevel();
- task_args->sigchld_act = sigchld_act;
-
- strncpy(task_args->comm, core->tc->comm, sizeof(task_args->comm));
-
-
- /*
- * Fill up per-thread data.
- */
- creds_pos_next = creds_pos;
- for (i = 0; i < current->nr_threads; i++) {
- CoreEntry *tcore;
- struct rt_sigframe *sigframe;
-
- thread_args[i].pid = current->threads[i].virt;
- thread_args[i].siginfo_n = siginfo_priv_nr[i];
- thread_args[i].siginfo = rst_mem_remap_ptr(siginfo_cpos, RM_PRIVATE);
- thread_args[i].siginfo += siginfo_nr;
- siginfo_nr += thread_args[i].siginfo_n;
-
- /* skip self */
- if (thread_args[i].pid == pid) {
- task_args->t = thread_args + i;
- tcore = core;
- } else
- tcore = current->core[i];
-
- if ((tcore->tc || tcore->ids) && thread_args[i].pid != pid) {
- pr_err("Thread has optional fields present %d\n",
- thread_args[i].pid);
- ret = -1;
- }
-
- if (ret < 0) {
- pr_err("Can't read core data for thread %d\n",
- thread_args[i].pid);
- goto err;
- }
-
- thread_args[i].ta = task_args;
- thread_args[i].gpregs = *CORE_THREAD_ARCH_INFO(tcore)->gpregs;
- thread_args[i].clear_tid_addr = CORE_THREAD_ARCH_INFO(tcore)->clear_tid_addr;
- core_get_tls(tcore, &thread_args[i].tls);
-
- rst_reloc_creds(&thread_args[i], &creds_pos_next);
-
- if (tcore->thread_core) {
- thread_args[i].has_futex = true;
- thread_args[i].futex_rla = tcore->thread_core->futex_rla;
- thread_args[i].futex_rla_len = tcore->thread_core->futex_rla_len;
- thread_args[i].pdeath_sig = tcore->thread_core->pdeath_sig;
- if (tcore->thread_core->pdeath_sig > _KNSIG) {
- pr_err("Pdeath signal is too big\n");
- goto err;
- }
-
- ret = prep_sched_info(&thread_args[i].sp, tcore->thread_core);
- if (ret)
- goto err;
- }
-
- sigframe = (struct rt_sigframe *)thread_args[i].mem_zone.rt_sigframe;
-
- if (construct_sigframe(sigframe, sigframe, tcore))
- goto err;
-
- if (thread_args[i].pid != pid)
- core_entry__free_unpacked(tcore, NULL);
-
- pr_info("Thread %4d stack %8p rt_sigframe %8p\n",
- i, thread_args[i].mem_zone.stack,
- thread_args[i].mem_zone.rt_sigframe);
-
- }
-
-#ifdef CONFIG_VDSO
- /*
- * Restorer needs own copy of vdso parameters. Runtime
- * vdso must be kept non intersecting with anything else,
- * since we need it being accessible even when own
- * self-vmas are unmaped.
- */
- mem += rst_mem_size;
- task_args->vdso_rt_parked_at = (unsigned long)mem + vdso_rt_delta;
- task_args->vdso_sym_rt = vdso_sym_rt;
- task_args->vdso_rt_size = vdso_rt_size;
-#endif
-
- new_sp = restorer_stack(task_args->t);
-
- ret = prepare_itimers(pid, core, task_args);
- if (ret < 0)
- goto err;
-
- ret = prepare_mm(pid, task_args);
- if (ret < 0)
- goto err;
-
- /* No longer need it */
- core_entry__free_unpacked(core, NULL);
- xfree(current->core);
-
- /*
- * Now prepare run-time data for threads restore.
- */
- task_args->nr_threads = current->nr_threads;
- task_args->clone_restore_fn = (void *)restore_thread_exec_start;
- task_args->thread_args = thread_args;
-
- /*
- * Make root and cwd restore _that_ late not to break any
- * attempts to open files by paths above (e.g. /proc).
- */
-
- if (restore_fs(current))
- goto err;
-
- close_image_dir();
- close_proc();
- close_service_fd(ROOT_FD_OFF);
- close_service_fd(USERNSD_SK);
-
- __gcov_flush();
-
- pr_info("task_args: %p\n"
- "task_args->pid: %d\n"
- "task_args->nr_threads: %d\n"
- "task_args->clone_restore_fn: %p\n"
- "task_args->thread_args: %p\n",
- task_args, task_args->t->pid,
- task_args->nr_threads,
- task_args->clone_restore_fn,
- task_args->thread_args);
-
- /*
- * An indirect call to task_restore, note it never returns
- * and restoring core is extremely destructive.
- */
-
- JUMP_TO_RESTORER_BLOB(new_sp, restore_task_exec_start, task_args);
-
-err:
- free_mappings(&self_vmas);
-err_nv:
- /* Just to be sure */
- exit(1);
- return -1;
-}
diff --git a/cr-service.c b/cr-service.c
deleted file mode 100644
index a1987e713833..000000000000
--- a/cr-service.c
+++ /dev/null
@@ -1,1101 +0,0 @@
-#ifndef _GNU_SOURCE
-#define _GNU_SOURCE
-#endif
-
-#include <unistd.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <errno.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/socket.h>
-#include <sys/un.h>
-#include <sys/wait.h>
-#include <sys/stat.h>
-#include <arpa/inet.h>
-
-#include "crtools.h"
-#include "cr_options.h"
-#include "util.h"
-#include "log.h"
-#include "cpu.h"
-#include "files.h"
-#include "pstree.h"
-#include "cr-service.h"
-#include "cr-service-const.h"
-#include "page-xfer.h"
-#include "net.h"
-#include "mount.h"
-#include "cgroup.h"
-#include "action-scripts.h"
-#include "sockets.h"
-#include "irmap.h"
-#include "kerndat.h"
-#include "proc_parse.h"
-
-#include "setproctitle.h"
-
-#include "cr-errno.h"
-
-unsigned int service_sk_ino = -1;
-
-static int recv_criu_msg(int socket_fd, CriuReq **req)
-{
- unsigned char *buf;
- int len;
-
- len = recv(socket_fd, NULL, 0, MSG_TRUNC | MSG_PEEK);
- if (len == -1) {
- pr_perror("Can't read request");
- return -1;
- }
-
- buf = xmalloc(len);
- if (!buf)
- return -ENOMEM;
-
- len = recv(socket_fd, buf, len, MSG_TRUNC);
- if (len == -1) {
- pr_perror("Can't read request");
- goto err;
- }
-
- if (len == 0) {
- pr_info("Client exited unexpectedly\n");
- errno = ECONNRESET;
- goto err;
- }
-
- *req = criu_req__unpack(NULL, len, buf);
- if (!*req) {
- pr_perror("Failed unpacking request");
- goto err;
- }
-
- xfree(buf);
- return 0;
-err:
- xfree(buf);
- return -1;
-}
-
-static int send_criu_msg(int socket_fd, CriuResp *msg)
-{
- unsigned char *buf;
- int len;
-
- len = criu_resp__get_packed_size(msg);
-
- buf = xmalloc(len);
- if (!buf)
- return -ENOMEM;
-
- if (criu_resp__pack(msg, buf) != len) {
- pr_perror("Failed packing response");
- goto err;
- }
-
- if (write(socket_fd, buf, len) == -1) {
- pr_perror("Can't send response");
- goto err;
- }
-
- xfree(buf);
- return 0;
-err:
- xfree(buf);
- return -1;
-}
-
-static void send_criu_err(int sk, char *msg)
-{
- CriuResp resp = CRIU_RESP__INIT;
-
- pr_perror("RPC error: %s", msg);
-
- resp.type = CRIU_REQ_TYPE__EMPTY;
- resp.success = false;
- if (get_cr_errno()) {
- resp.has_cr_errno = true;
- resp.cr_errno = get_cr_errno();
- }
-
- send_criu_msg(sk, &resp);
-}
-
-int send_criu_dump_resp(int socket_fd, bool success, bool restored)
-{
- CriuResp msg = CRIU_RESP__INIT;
- CriuDumpResp resp = CRIU_DUMP_RESP__INIT;
-
- msg.type = CRIU_REQ_TYPE__DUMP;
- msg.success = success;
- if (get_cr_errno()) {
- msg.has_cr_errno = true;
- msg.cr_errno = get_cr_errno();
- }
- msg.dump = &resp;
-
- resp.has_restored = true;
- resp.restored = restored;
-
- return send_criu_msg(socket_fd, &msg);
-}
-
-static int send_criu_pre_dump_resp(int socket_fd, bool success)
-{
- CriuResp msg = CRIU_RESP__INIT;
-
- msg.type = CRIU_REQ_TYPE__PRE_DUMP;
- msg.success = success;
- if (get_cr_errno()) {
- msg.has_cr_errno = true;
- msg.cr_errno = get_cr_errno();
- }
-
- return send_criu_msg(socket_fd, &msg);
-}
-
-int send_criu_restore_resp(int socket_fd, bool success, int pid)
-{
- CriuResp msg = CRIU_RESP__INIT;
- CriuRestoreResp resp = CRIU_RESTORE_RESP__INIT;
-
- msg.type = CRIU_REQ_TYPE__RESTORE;
- msg.success = success;
- if (get_cr_errno()) {
- msg.has_cr_errno = true;
- msg.cr_errno = get_cr_errno();
- }
- msg.restore = &resp;
-
- resp.pid = pid;
-
- return send_criu_msg(socket_fd, &msg);
-}
-
-int send_criu_rpc_script(enum script_actions act, char *name, int fd)
-{
- int ret;
- CriuResp msg = CRIU_RESP__INIT;
- CriuReq *req;
- CriuNotify cn = CRIU_NOTIFY__INIT;
-
- msg.type = CRIU_REQ_TYPE__NOTIFY;
- msg.success = true;
- msg.notify = &cn;
- cn.script = name;
-
- switch (act) {
- case ACT_SETUP_NS:
- case ACT_POST_RESTORE:
- /*
- * FIXME pid is required only once on
- * restore. Need some more sane way of
- * checking this.
- */
- cn.has_pid = true;
- cn.pid = root_item->pid.real;
- break;
- default:
- break;
- }
-
- ret = send_criu_msg(fd, &msg);
- if (ret < 0)
- return ret;
-
- ret = recv_criu_msg(fd, &req);
- if (ret < 0)
- return ret;
-
- if (req->type != CRIU_REQ_TYPE__NOTIFY || !req->notify_success) {
- pr_err("RPC client reported script error\n");
- return -1;
- }
-
- criu_req__free_unpacked(req, NULL);
- return 0;
-}
-
-static char images_dir[PATH_MAX];
-
-static int setup_opts_from_req(int sk, CriuOpts *req)
-{
- struct ucred ids;
- struct stat st;
- socklen_t ids_len = sizeof(struct ucred);
- char images_dir_path[PATH_MAX];
- char work_dir_path[PATH_MAX];
- int i;
-
- if (getsockopt(sk, SOL_SOCKET, SO_PEERCRED, &ids, &ids_len)) {
- pr_perror("Can't get socket options");
- goto err;
- }
-
- if (fstat(sk, &st)) {
- pr_perror("Can't get socket stat");
- goto err;
- }
-
- BUG_ON(st.st_ino == -1);
- service_sk_ino = st.st_ino;
-
- /* open images_dir */
- sprintf(images_dir_path, "/proc/%d/fd/%d", ids.pid, req->images_dir_fd);
-
- if (req->parent_img)
- opts.img_parent = req->parent_img;
-
- if (open_image_dir(images_dir_path) < 0) {
- pr_perror("Can't open images directory");
- goto err;
- }
-
- /* get full path to images_dir to use in process title */
- if (readlink(images_dir_path, images_dir, PATH_MAX) == -1) {
- pr_perror("Can't readlink %s", images_dir_path);
- goto err;
- }
-
- /* chdir to work dir */
- if (req->has_work_dir_fd)
- sprintf(work_dir_path, "/proc/%d/fd/%d", ids.pid, req->work_dir_fd);
- else
- strcpy(work_dir_path, images_dir_path);
-
- if (chdir(work_dir_path)) {
- pr_perror("Can't chdir to work_dir");
- goto err;
- }
-
- /* initiate log file in work dir */
- if (req->log_file) {
- if (strchr(req->log_file, '/')) {
- pr_perror("No subdirs are allowed in log_file name");
- goto err;
- }
-
- opts.output = req->log_file;
- } else
- opts.output = DEFAULT_LOG_FILENAME;
-
- log_set_loglevel(req->log_level);
- if (log_init(opts.output) == -1) {
- pr_perror("Can't initiate log");
- goto err;
- }
-
- /* checking flags from client */
- if (req->has_leave_running && req->leave_running)
- opts.final_state = TASK_ALIVE;
-
- if (!req->has_pid) {
- req->has_pid = true;
- req->pid = ids.pid;
- }
-
- if (req->has_ext_unix_sk) {
- opts.ext_unix_sk = req->ext_unix_sk;
- for (i = 0; i < req->n_unix_sk_ino; i++) {
- if (unix_sk_id_add(req->unix_sk_ino[i]->inode) < 0)
- goto err;
- }
- }
-
- if (req->root)
- opts.root = req->root;
-
- if (req->has_rst_sibling) {
- if (!opts.swrk_restore) {
- pr_err("rst_sibling is not allowed in standalone service\n");
- goto err;
- }
-
- opts.restore_sibling = req->rst_sibling;
- }
-
- if (req->has_tcp_established)
- opts.tcp_established_ok = req->tcp_established;
-
- if (req->has_evasive_devices)
- opts.evasive_devices = req->evasive_devices;
-
- if (req->has_shell_job)
- opts.shell_job = req->shell_job;
-
- if (req->has_file_locks)
- opts.handle_file_locks = req->file_locks;
-
- if (req->has_track_mem)
- opts.track_mem = req->track_mem;
-
- if (req->has_link_remap)
- opts.link_remap_ok = req->link_remap;
-
- if (req->has_auto_dedup)
- opts.auto_dedup = req->auto_dedup;
-
- if (req->has_force_irmap)
- opts.force_irmap = req->force_irmap;
-
- if (req->n_exec_cmd > 0) {
- opts.exec_cmd = xmalloc((req->n_exec_cmd + 1) * sizeof(char *));
- memcpy(opts.exec_cmd, req->exec_cmd, req->n_exec_cmd * sizeof(char *));
- opts.exec_cmd[req->n_exec_cmd] = NULL;
- }
-
- if (req->ps) {
- opts.use_page_server = true;
- opts.addr = req->ps->address;
- opts.port = htons((short)req->ps->port);
-
- if (req->ps->has_fd) {
- if (!opts.swrk_restore)
- goto err;
-
- opts.ps_socket = req->ps->fd;
- }
- }
-
- if (req->notify_scripts &&
- add_script(SCRIPT_RPC_NOTIFY, sk))
- goto err;
-
- for (i = 0; i < req->n_veths; i++) {
- if (veth_pair_add(req->veths[i]->if_in, req->veths[i]->if_out))
- goto err;
- }
-
- for (i = 0; i < req->n_ext_mnt; i++) {
- if (ext_mount_add(req->ext_mnt[i]->key, req->ext_mnt[i]->val))
- goto err;
- }
-
- if (req->n_inherit_fd && !opts.swrk_restore) {
- pr_err("inherit_fd is not allowed in standalone service\n");
- goto err;
- }
- for (i = 0; i < req->n_inherit_fd; i++) {
- if (inherit_fd_add(req->inherit_fd[i]->fd, req->inherit_fd[i]->key))
- goto err;
- }
-
- for (i = 0; i < req->n_external; i++)
- if (add_external(req->external[i]))
- goto err;
-
- for (i = 0; i < req->n_cg_root; i++) {
- if (new_cg_root_add(req->cg_root[i]->ctrl,
- req->cg_root[i]->path))
- goto err;
- }
-
- for (i = 0; i < req->n_enable_fs; i++) {
- if (!add_fsname_auto(req->enable_fs[i]))
- goto err;
- }
-
- for (i = 0; i < req->n_skip_mnt; i++) {
- if (!add_skip_mount(req->skip_mnt[i]))
- goto err;
- }
-
- if (req->has_cpu_cap)
- opts.cpu_cap = req->cpu_cap;
-
- /*
- * FIXME: For backward compatibility we setup
- * soft mode here, need to enhance to support
- * other modes as well via separate option
- * probably.
- */
- if (req->has_manage_cgroups)
- opts.manage_cgroups = req->manage_cgroups ? CG_MODE_SOFT : CG_MODE_IGNORE;
-
- /* Override the manage_cgroup if mode is set explicitly */
- if (req->has_manage_cgroups_mode) {
- unsigned int mode;
-
- switch (req->manage_cgroups_mode) {
- case CRIU_CG_MODE__IGNORE:
- mode = CG_MODE_IGNORE;
- break;
- case CRIU_CG_MODE__NONE:
- mode = CG_MODE_NONE;
- break;
- case CRIU_CG_MODE__PROPS:
- mode = CG_MODE_PROPS;
- break;
- case CRIU_CG_MODE__SOFT:
- mode = CG_MODE_SOFT;
- break;
- case CRIU_CG_MODE__FULL:
- mode = CG_MODE_FULL;
- break;
- case CRIU_CG_MODE__STRICT:
- mode = CG_MODE_STRICT;
- break;
- case CRIU_CG_MODE__DEFAULT:
- mode = CG_MODE_DEFAULT;
- break;
- default:
- goto err;
- }
-
- opts.manage_cgroups = mode;
- }
-
- if (req->has_auto_ext_mnt)
- opts.autodetect_ext_mounts = req->auto_ext_mnt;
-
- if (req->has_ext_sharing)
- opts.enable_external_sharing = req->ext_sharing;
-
- if (req->has_ext_masters)
- opts.enable_external_masters = req->ext_masters;
-
- if (req->has_ghost_limit)
- opts.ghost_limit = req->ghost_limit;
-
- if (req->n_irmap_scan_paths) {
- for (i = 0; i < req->n_irmap_scan_paths; i++) {
- if (irmap_scan_path_add(req->irmap_scan_paths[i]))
- goto err;
- }
- }
-
- return 0;
-
-err:
- set_cr_errno(EBADRQC);
- return -1;
-}
-
-static int dump_using_req(int sk, CriuOpts *req)
-{
- bool success = false;
- bool self_dump = !req->pid;
-
- if (setup_opts_from_req(sk, req))
- goto exit;
-
- setproctitle("dump --rpc -t %d -D %s", req->pid, images_dir);
-
- /*
- * FIXME -- cr_dump_tasks() may return code from custom
- * scripts, that can be positive. However, right now we
- * don't have ability to push scripts via RPC, so psitive
- * ret values are impossible here.
- */
- if (cr_dump_tasks(req->pid))
- goto exit;
-
- success = true;
-exit:
- if (req->leave_running || !self_dump || !success) {
- if (send_criu_dump_resp(sk, success, false) == -1) {
- pr_perror("Can't send response");
- success = false;
- }
- }
-
- return success ? 0 : 1;
-}
-
-static int restore_using_req(int sk, CriuOpts *req)
-{
- bool success = false;
-
- /*
- * We can't restore processes under arbitrary task yet.
- * Thus for now we force the detached restore under the
- * cr service task.
- */
-
- opts.restore_detach = true;
-
- if (setup_opts_from_req(sk, req))
- goto exit;
-
- setproctitle("restore --rpc -D %s", images_dir);
-
- if (cr_restore_tasks())
- goto exit;
-
- success = true;
-exit:
- if (send_criu_restore_resp(sk, success,
- root_item ? root_item->pid.real : -1) == -1) {
- pr_perror("Can't send response");
- success = false;
- }
-
- if (success && opts.exec_cmd) {
- int logfd;
-
- logfd = log_get_fd();
- if (dup2(logfd, STDOUT_FILENO) == -1 || dup2(logfd, STDERR_FILENO) == -1) {
- pr_perror("Failed to redirect stdout and stderr to the logfile");
- return 1;
- }
-
- close_pid_proc();
- close(sk);
-
- execvp(opts.exec_cmd[0], opts.exec_cmd);
- pr_perror("Failed to exec cmd %s", opts.exec_cmd[0]);
- success = false;
- }
-
- return success ? 0 : 1;
-}
-
-static int check(int sk)
-{
- CriuResp resp = CRIU_RESP__INIT;
-
- resp.type = CRIU_REQ_TYPE__CHECK;
-
- setproctitle("check --rpc");
-
- /* Check only minimal kernel support */
- opts.check_ms_kernel = true;
-
- if (!cr_check())
- resp.success = true;
-
- return send_criu_msg(sk, &resp);
-}
-
-static int pre_dump_using_req(int sk, CriuOpts *req)
-{
- int pid, status;
- bool success = false;
-
- pid = fork();
- if (pid < 0) {
- pr_perror("Can't fork");
- goto out;
- }
-
- if (pid == 0) {
- int ret = 1;
-
- if (setup_opts_from_req(sk, req))
- goto cout;
-
- setproctitle("pre-dump --rpc -t %d -D %s", req->pid, images_dir);
-
- if (cr_pre_dump_tasks(req->pid))
- goto cout;
-
- ret = 0;
-cout:
- exit(ret);
- }
-
- wait(&status);
- if (!WIFEXITED(status))
- goto out;
- if (WEXITSTATUS(status) != 0)
- goto out;
-
- success = true;
-out:
- if (send_criu_pre_dump_resp(sk, success) == -1) {
- pr_perror("Can't send pre-dump resp");
- success = false;
- }
-
- return success ? 0 : -1;
-}
-
-static int pre_dump_loop(int sk, CriuReq *msg)
-{
- int ret;
-
- do {
- ret = pre_dump_using_req(sk, msg->opts);
- if (ret < 0)
- return ret;
-
- criu_req__free_unpacked(msg, NULL);
- if (recv_criu_msg(sk, &msg) == -1) {
- pr_perror("Can't recv request");
- return -1;
- }
- } while (msg->type == CRIU_REQ_TYPE__PRE_DUMP);
-
- if (msg->type != CRIU_REQ_TYPE__DUMP) {
- send_criu_err(sk, "Bad req seq");
- return -1;
- }
-
- return dump_using_req(sk, msg->opts);
-}
-
-struct ps_info {
- int pid;
- unsigned short port;
-};
-
-static int start_page_server_req(int sk, CriuOpts *req)
-{
- int ret = -1, pid, start_pipe[2];
- ssize_t count;
- bool success = false;
- CriuResp resp = CRIU_RESP__INIT;
- CriuPageServerInfo ps = CRIU_PAGE_SERVER_INFO__INIT;
- struct ps_info info;
-
- if (pipe(start_pipe)) {
- pr_perror("No start pipe");
- goto out;
- }
-
- pid = fork();
- if (pid == 0) {
- close(start_pipe[0]);
-
- if (setup_opts_from_req(sk, req))
- goto out_ch;
-
- setproctitle("page-server --rpc --address %s --port %hu", opts.addr, opts.port);
-
- pr_debug("Starting page server\n");
-
- pid = cr_page_server(true, start_pipe[1]);
- if (pid <= 0)
- goto out_ch;
-
- info.pid = pid;
- info.port = opts.port;
-
- count = write(start_pipe[1], &info, sizeof(info));
- if (count != sizeof(info))
- goto out_ch;
-
- ret = 0;
-out_ch:
- if (ret < 0 && pid > 0)
- kill(pid, SIGKILL);
- close(start_pipe[1]);
- exit(ret);
- }
-
- close(start_pipe[1]);
- wait(&ret);
- if (WIFEXITED(ret)) {
- if (WEXITSTATUS(ret)) {
- pr_err("Child exited with an error\n");
- goto out;
- }
- } else {
- pr_err("Child wasn't terminated normally\n");
- goto out;
- }
-
- count = read(start_pipe[0], &info, sizeof(info));
- close(start_pipe[0]);
- if (count != sizeof(info))
- goto out;
-
- success = true;
- ps.has_pid = true;
- ps.pid = info.pid;
- ps.has_port = true;
- ps.port = info.port;
- resp.ps = &ps;
-
- pr_debug("Page server started\n");
-out:
- resp.type = CRIU_REQ_TYPE__PAGE_SERVER;
- resp.success = success;
- return send_criu_msg(sk, &resp);
-}
-
-static int chk_keepopen_req(CriuReq *msg)
-{
- if (!msg->keep_open)
- return 0;
-
- /*
- * Service may (well, it will) leave some
- * resources leaked after processing e.g.
- * dump or restore requests. Before we audit
- * the code for this, let's first enable
- * mreq RPCs for those requests we know do
- * good work
- */
-
- if (msg->type == CRIU_REQ_TYPE__PAGE_SERVER)
- /* This just fork()-s so no leaks */
- return 0;
- else if (msg->type == CRIU_REQ_TYPE__CPUINFO_DUMP ||
- msg->type == CRIU_REQ_TYPE__CPUINFO_CHECK)
- return 0;
- else if (msg->type == CRIU_REQ_TYPE__FEATURE_CHECK)
- return 0;
-
- return -1;
-}
-
-/*
- * Generic function to handle CRIU_REQ_TYPE__FEATURE_CHECK.
- *
- * The function will have resp.sucess = true for most cases
- * and the actual result will be in resp.features.
- *
- * For each feature which has been requested in msg->features
- * the corresponding parameter will be set in resp.features.
- */
-static int handle_feature_check(int sk, CriuReq * msg)
-{
- CriuResp resp = CRIU_RESP__INIT;
- CriuFeatures feat = CRIU_FEATURES__INIT;
- bool success = false;
- int pid, status;
-
- /* enable setting of an optional message */
- feat.has_mem_track = 1;
- feat.mem_track = false;
-
- /*
- * Check if the requested feature check can be answered.
- *
- * This function is right now hard-coded to memory
- * tracking detection and needs other/better logic to
- * handle multiple feature checks.
- */
- if (msg->features->has_mem_track != 1) {
- pr_warn("Feature checking for unknown feature.\n");
- goto out;
- }
-
- /*
- * From this point on the function will always
- * 'succeed'. If the requested features are supported
- * can be seen if the requested optional parameters are
- * set in the message 'criu_features'.
- */
- success = true;
-
- pid = fork();
- if (pid < 0) {
- pr_perror("Can't fork");
- goto out;
- }
-
- if (pid == 0) {
- int ret = 1;
-
- if (setup_opts_from_req(sk, msg->opts))
- goto cout;
-
- setproctitle("feature-check --rpc -D %s", images_dir);
-
- kerndat_get_dirty_track();
-
- if (kdat.has_dirty_track)
- ret = 0;
-cout:
- exit(ret);
- }
-
- wait(&status);
- if (!WIFEXITED(status) || WEXITSTATUS(status))
- goto out;
-
- feat.mem_track = true;
-out:
- resp.features = &feat;
- resp.type = msg->type;
- resp.success = success;
-
- return send_criu_msg(sk, &resp);
-}
-
-static int handle_cpuinfo(int sk, CriuReq *msg)
-{
- CriuResp resp = CRIU_RESP__INIT;
- bool success = false;
- int pid, status;
-
- pid = fork();
- if (pid < 0) {
- pr_perror("Can't fork");
- goto out;
- }
-
- if (pid == 0) {
- int ret = 1;
-
- if (setup_opts_from_req(sk, msg->opts))
- goto cout;
-
- setproctitle("cpuinfo %s --rpc -D %s",
- msg->type == CRIU_REQ_TYPE__CPUINFO_DUMP ?
- "dump" : "check",
- images_dir);
-
- if (msg->type == CRIU_REQ_TYPE__CPUINFO_DUMP)
- ret = cpuinfo_dump();
- else
- ret = cpuinfo_check();
-cout:
- exit(ret);
- }
-
- wait(&status);
- if (!WIFEXITED(status))
- goto out;
- switch (WEXITSTATUS(status)) {
- case (-ENOTSUP & 0xff):
- resp.has_cr_errno = 1;
- /*
- * Let's return the actual error code and
- * not just (-ENOTSUP & 0xff)
- */
- resp.cr_errno = ENOTSUP;
- break;
- case 0:
- success = true;
- break;
- default:
- break;
- }
-
-out:
- resp.type = msg->type;
- resp.success = success;
-
- return send_criu_msg(sk, &resp);
-}
-
-int cr_service_work(int sk)
-{
- int ret = -1;
- CriuReq *msg = 0;
-
-more:
- if (recv_criu_msg(sk, &msg) == -1) {
- pr_perror("Can't recv request");
- goto err;
- }
-
- if (chk_keepopen_req(msg))
- goto err;
-
- switch (msg->type) {
- case CRIU_REQ_TYPE__DUMP:
- ret = dump_using_req(sk, msg->opts);
- break;
- case CRIU_REQ_TYPE__RESTORE:
- ret = restore_using_req(sk, msg->opts);
- break;
- case CRIU_REQ_TYPE__CHECK:
- ret = check(sk);
- break;
- case CRIU_REQ_TYPE__PRE_DUMP:
- ret = pre_dump_loop(sk, msg);
- break;
- case CRIU_REQ_TYPE__PAGE_SERVER:
- ret = start_page_server_req(sk, msg->opts);
- break;
- case CRIU_REQ_TYPE__CPUINFO_DUMP:
- case CRIU_REQ_TYPE__CPUINFO_CHECK:
- ret = handle_cpuinfo(sk, msg);
- break;
- case CRIU_REQ_TYPE__FEATURE_CHECK:
- ret = handle_feature_check(sk, msg);
- break;
-
- default:
- send_criu_err(sk, "Invalid req");
- break;
- }
-
- if (!ret && msg->keep_open) {
- criu_req__free_unpacked(msg, NULL);
- ret = -1;
- goto more;
- }
-
-err:
- return ret;
-}
-
-static void reap_worker(int signo)
-{
- int saved_errno;
- int status;
- pid_t pid;
-
- saved_errno = errno;
-
- /*
- * As we block SIGCHLD, lets wait for every child that has
- * already changed state.
- */
- while (1) {
- pid = waitpid(-1, &status, WNOHANG);
-
- if (pid <= 0) {
- errno = saved_errno;
- return;
- }
-
- if (WIFEXITED(status))
- pr_info("Worker(pid %d) exited with %d\n",
- pid, WEXITSTATUS(status));
- else if (WIFSIGNALED(status))
- pr_info("Worker(pid %d) was killed by %d\n",
- pid, WTERMSIG(status));
- }
-}
-
-static int setup_sigchld_handler()
-{
- struct sigaction action;
-
- sigemptyset(&action.sa_mask);
- sigaddset(&action.sa_mask, SIGCHLD);
- action.sa_handler = reap_worker;
- action.sa_flags = SA_RESTART;
-
- if (sigaction(SIGCHLD, &action, NULL)) {
- pr_perror("Can't setup SIGCHLD handler");
- return -1;
- }
-
- return 0;
-}
-
-static int restore_sigchld_handler()
-{
- struct sigaction action;
-
- sigemptyset(&action.sa_mask);
- sigaddset(&action.sa_mask, SIGCHLD);
- action.sa_handler = SIG_DFL;
- action.sa_flags = SA_RESTART;
-
- if (sigaction(SIGCHLD, &action, NULL)) {
- pr_perror("Can't restore SIGCHLD handler");
- return -1;
- }
-
- return 0;
-}
-
-int cr_service(bool daemon_mode)
-{
- int server_fd = -1;
- int child_pid;
-
- struct sockaddr_un client_addr;
- socklen_t client_addr_len;
-
- {
- struct sockaddr_un server_addr;
- socklen_t server_addr_len;
-
- server_fd = socket(AF_LOCAL, SOCK_SEQPACKET, 0);
- if (server_fd == -1) {
- pr_perror("Can't initialize service socket");
- goto err;
- }
-
- memset(&server_addr, 0, sizeof(server_addr));
- memset(&client_addr, 0, sizeof(client_addr));
- server_addr.sun_family = AF_LOCAL;
-
- if (opts.addr == NULL) {
- pr_warn("Binding to local dir address!\n");
- opts.addr = CR_DEFAULT_SERVICE_ADDRESS;
- }
-
- strcpy(server_addr.sun_path, opts.addr);
-
- server_addr_len = strlen(server_addr.sun_path)
- + sizeof(server_addr.sun_family);
- client_addr_len = sizeof(client_addr);
-
- unlink(server_addr.sun_path);
-
- if (bind(server_fd, (struct sockaddr *) &server_addr,
- server_addr_len) == -1) {
- pr_perror("Can't bind");
- goto err;
- }
-
- pr_info("The service socket is bound to %s\n", server_addr.sun_path);
-
- /* change service socket permissions, so anyone can connect to it */
- if (chmod(server_addr.sun_path, 0666)) {
- pr_perror("Can't change permissions of the service socket");
- goto err;
- }
-
- if (listen(server_fd, 16) == -1) {
- pr_perror("Can't listen for socket connections");
- goto err;
- }
- }
-
- if (daemon_mode) {
- if (daemon(1, 0) == -1) {
- pr_perror("Can't run service server in the background");
- goto err;
- }
- }
-
- if (opts.pidfile) {
- if (write_pidfile(getpid()) == -1) {
- pr_perror("Can't write pidfile");
- goto err;
- }
- }
-
- if (setup_sigchld_handler())
- goto err;
-
- while (1) {
- int sk;
-
- pr_info("Waiting for connection...\n");
-
- sk = accept(server_fd, &client_addr, &client_addr_len);
- if (sk == -1) {
- pr_perror("Can't accept connection");
- goto err;
- }
-
- pr_info("Connected.\n");
- child_pid = fork();
- if (child_pid == 0) {
- int ret;
-
- if (restore_sigchld_handler())
- exit(1);
-
- close(server_fd);
- init_opts();
- ret = cr_service_work(sk);
- close(sk);
- exit(ret != 0);
- }
-
- if (child_pid < 0)
- pr_perror("Can't fork a child");
-
- close(sk);
- }
-
-err:
- close_safe(&server_fd);
-
- return 1;
-}
diff --git a/cr-show.c b/cr-show.c
deleted file mode 100644
index 91d4d095a071..000000000000
--- a/cr-show.c
+++ /dev/null
@@ -1,574 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <signal.h>
-#include <limits.h>
-#include <unistd.h>
-#include <errno.h>
-#include <string.h>
-#include <ctype.h>
-
-#include <fcntl.h>
-
-#include <sys/types.h>
-#include <sys/stat.h>
-
-#include "asm/types.h"
-#include "list.h"
-#include "imgset.h"
-#include "namespaces.h"
-#include "compiler.h"
-#include "cr_options.h"
-#include "util.h"
-#include "sockets.h"
-#include "image.h"
-#include "uts_ns.h"
-#include "ipc_ns.h"
-#include "pstree.h"
-#include "cr-show.h"
-#include "crtools.h"
-
-#include "protobuf.h"
-#include "protobuf/pstree.pb-c.h"
-#include "protobuf/pipe-data.pb-c.h"
-#include "protobuf/siginfo.pb-c.h"
-
-#define DEF_PAGES_PER_LINE 6
-
-
-static LIST_HEAD(pstree_list);
-
-static void pipe_data_handler(struct cr_img *img, void *obj)
-{
- PipeDataEntry *e = obj;
- print_image_data(img, e->bytes, opts.show_pages_content);
-}
-
-static int nice_width_for(unsigned long addr)
-{
- int ret = 3;
-
- while (addr) {
- addr >>= 4;
- ret++;
- }
-
- return ret;
-}
-
-static inline void pr_xdigi(unsigned char *data, size_t len, int pos)
-{
- if (pos < len)
- pr_msg("%02x ", data[pos]);
- else
- pr_msg(" ");
-}
-
-static inline void pr_xsym(unsigned char *data, size_t len, int pos)
-{
- char sym;
-
- if (pos < len)
- sym = data[pos];
- else
- sym = ' ';
-
- pr_msg("%c", isprint(sym) ? sym : '.');
-}
-
-void print_data(unsigned long addr, unsigned char *data, size_t size)
-{
- int i, j, addr_len;
- unsigned zero_line = 0;
-
- addr_len = nice_width_for(addr + size);
-
- for (i = 0; i < size; i += 16) {
- if (*(u64 *)(data + i) == 0 && *(u64 *)(data + i + 8) == 0) {
- if (zero_line == 0)
- zero_line = 1;
- else {
- if (zero_line == 1) {
- pr_msg("*\n");
- zero_line = 2;
- }
-
- continue;
- }
- } else
- zero_line = 0;
-
- pr_msg("%#0*lx: ", addr_len, addr + i);
- for (j = 0; j < 8; j++)
- pr_xdigi(data, size, i + j);
- pr_msg(" ");
- for (j = 8; j < 16; j++)
- pr_xdigi(data, size, i + j);
-
- pr_msg(" |");
- for (j = 0; j < 8; j++)
- pr_xsym(data, size, i + j);
- pr_msg(" ");
- for (j = 8; j < 16; j++)
- pr_xsym(data, size, i + j);
-
- pr_msg("|\n");
- }
-}
-
-void print_image_data(struct cr_img *img, unsigned int length, int show)
-{
- void *data;
- int ret;
-
- if (!show) {
- lseek(img_raw_fd(img), length, SEEK_CUR);
- return;
- }
-
- pr_msg("\n");
-
- data = xmalloc(length);
- if (!data)
- return;
- ret = read_img_buf(img, (unsigned char *)data, length);
- if (ret < 0) {
- xfree(data);
- return;
- }
- print_data(0, (unsigned char *)data, length);
- xfree(data);
-}
-
-static void show_pagemaps(struct cr_img *img, void *obj)
-{
- pb_show_plain_pretty(img, PB_PAGEMAP, "nr_pages:%u");
-}
-
-void show_siginfo(struct cr_img *img)
-{
- int ret;
-
- pr_img_head(CR_FD_SIGNAL);
- while (1) {
- SiginfoEntry *sie;
- siginfo_t *info;
-
- ret = pb_read_one_eof(img, &sie, PB_SIGINFO);
- if (ret <= 0)
- break;
-
- info = (siginfo_t *) sie->siginfo.data;
- pr_msg("signal: si_signo=%d si_code=%x\n",
- info->si_signo, info->si_code);
- siginfo_entry__free_unpacked(sie, NULL);
-
- }
- pr_img_tail(CR_FD_SIGNAL);
-}
-
-static int pstree_item_from_pb(PstreeEntry *e, struct pstree_item *item)
-{
- int i;
-
- item->pid.virt = e->pid;
- item->nr_threads = e->n_threads;
- item->threads = xzalloc(sizeof(struct pid) * e->n_threads);
- if (!item->threads)
- return -1;
-
- for (i = 0; i < item->nr_threads; i++)
- item->threads[i].virt = e->threads[i];
-
- return 0;
-}
-
-static void pstree_handler(struct cr_img *img, void *obj)
-{
- PstreeEntry *e = obj;
- struct pstree_item *item = NULL;
-
- item = xzalloc(sizeof(struct pstree_item));
- if (!item)
- return;
-
- if (pstree_item_from_pb(e, item)) {
- xfree(item);
- return;
- }
-
- list_add_tail(&item->sibling, &pstree_list);
-}
-
-static void show_collect_pstree(struct cr_img *img, int collect)
-{
- pb_show_plain_payload_pretty(img, PB_PSTREE,
- collect ? pstree_handler : NULL, "*:%d");
-}
-
-static inline char *task_state_str(int state)
-{
- switch (state) {
- case TASK_ALIVE:
- return "running/sleeping";
- case TASK_DEAD:
- return "zombie";
- default:
- return "UNKNOWN";
- }
-}
-
-static void show_core_regs(UserX86RegsEntry *regs)
-{
-#define pr_regs4(s, n1, n2, n3, n4) \
- pr_msg("\t%8s: 0x%-16"PRIx64" " \
- "%8s: 0x%-16"PRIx64" " \
- "%8s: 0x%-16"PRIx64" " \
- "%8s: 0x%-16"PRIx64"\n", \
- #n1, s->n1, \
- #n2, s->n2, \
- #n3, s->n3, \
- #n4, s->n4)
-
-#define pr_regs3(s, n1, n2, n3) \
- pr_msg("\t%8s: 0x%-16"PRIx64" " \
- "%8s: 0x%-16"PRIx64" " \
- "%8s: 0x%-16"PRIx64"\n", \
- #n1, s->n1, \
- #n2, s->n2, \
- #n3, s->n3)
-
- pr_msg("\t---[ GP registers set ]---\n");
-
- pr_regs4(regs, cs, ip, ds, es);
- pr_regs4(regs, ss, sp, fs, gs);
- pr_regs4(regs, di, si, dx, cx);
- pr_regs4(regs, ax, r8, r9, r10);
- pr_regs4(regs, r11, r12, r13, r14);
- pr_regs3(regs, r15, bp, bx);
- pr_regs4(regs, orig_ax, flags, fs_base, gs_base);
- pr_msg("\n");
-}
-
-void show_thread_info(ThreadInfoX86 *thread_info)
-{
- if (!thread_info)
- return;
-
- pr_msg("\t---[ Thread info ]---\n");
- pr_msg("\tclear_tid_addr: 0x%"PRIx64"\n", thread_info->clear_tid_addr);
- pr_msg("\n");
-
- show_core_regs(thread_info->gpregs);
-}
-
-static struct {
- u32 magic;
- u32 mask;
- char *hint;
-} magic_hints[] = {
- { .magic = 0x45311224, .mask = 0xffffffff, .hint = "ip route dump", },
- { .magic = 0x47361222, .mask = 0xffffffff, .hint = "ip ifaddr dump", },
- { .magic = 0x00008b1f, .mask = 0x0000ffff, .hint = "gzip file", },
- { },
-};
-
-static void try_hint_magic(u32 magic)
-{
- int i;
-
- for (i = 0; magic_hints[i].hint != 0; i++)
- if ((magic & magic_hints[i].mask) == magic_hints[i].magic)
- pr_msg("This can be %s\n", magic_hints[i].hint);
-}
-
-#define SHOW_PLAIN(name) { name##_MAGIC, PB_##name, false, NULL, NULL, }
-/* nothing special behind this -S, just to avoid heavy patching */
-#define SHOW_PLAINS(name) { name##S_MAGIC, PB_##name, false, NULL, NULL, }
-#define SHOW_VERT(name) { name##_MAGIC, PB_##name, true, NULL, NULL, }
-
-static struct show_image_info show_infos[] = {
- SHOW_VERT(INVENTORY),
- SHOW_VERT(CORE),
- SHOW_VERT(IDS),
- SHOW_VERT(CREDS),
- SHOW_VERT(UTSNS),
- SHOW_VERT(IPC_VAR),
- SHOW_VERT(FS),
- SHOW_VERT(GHOST_FILE),
- SHOW_VERT(MM),
- SHOW_VERT(CGROUP),
-
- SHOW_PLAINS(REG_FILE),
- SHOW_PLAINS(NS_FILE),
- SHOW_PLAIN(EVENTFD_FILE),
- SHOW_PLAIN(EVENTPOLL_FILE),
- SHOW_PLAIN(EVENTPOLL_TFD),
- SHOW_PLAIN(SIGNALFD),
- SHOW_PLAIN(TIMERFD),
- SHOW_PLAIN(INOTIFY_FILE),
- SHOW_PLAIN(INOTIFY_WD),
- SHOW_PLAIN(FANOTIFY_FILE),
- SHOW_PLAIN(FANOTIFY_MARK),
- SHOW_PLAINS(VMA),
- SHOW_PLAINS(PIPE),
- SHOW_PLAIN(FIFO),
- SHOW_PLAIN(SIGACT),
- SHOW_PLAIN(NETLINK_SK),
- SHOW_PLAIN(REMAP_FPATH),
- SHOW_PLAINS(MNT),
- SHOW_PLAINS(TTY_FILE),
- SHOW_PLAIN(TTY_INFO),
- SHOW_PLAIN(RLIMIT),
- SHOW_PLAIN(TUNFILE),
- SHOW_PLAINS(EXT_FILE),
- SHOW_PLAIN(IRMAP_CACHE),
- SHOW_PLAIN(CPUINFO),
- SHOW_PLAIN(USERNS),
- SHOW_PLAIN(NETNS),
-
- { FILE_LOCKS_MAGIC, PB_FILE_LOCK, false, NULL, "3:%u", },
- { TCP_STREAM_MAGIC, PB_TCP_STREAM, true, show_tcp_stream, "1:%u 2:%u 3:%u 4:%u 12:%u", },
- { STATS_MAGIC, PB_STATS, true, NULL, "1.1:%u 1.2:%u 1.3:%u 1.4:%u 1.5:%Lu 1.6:%Lu 1.7:%Lu 1.8:%u", },
- { FDINFO_MAGIC, PB_FDINFO, false, NULL, "flags:%#o fd:%d", },
- { UNIXSK_MAGIC, PB_UNIX_SK, false, NULL, "1:%#x 2:%#x 3:%d 4:%d 5:%d 6:%d 7:%d 8:%#x 11:S", },
- { INETSK_MAGIC, PB_INET_SK, false, NULL, "1:%#x 2:%#x 3:%d 4:%d 5:%d 6:%d 7:%d 8:%d 9:%2x 11:A 12:A", },
- { PACKETSK_MAGIC, PB_PACKET_SOCK, false, NULL, "5:%d", },
- { ITIMERS_MAGIC, PB_ITIMER, false, NULL, "*:%Lu", },
- { POSIX_TIMERS_MAGIC, PB_POSIX_TIMER, false, NULL, "*:%d 5:%Lu 7:%Lu 8:%lu 9:%Lu 10:%Lu", },
- { NETDEV_MAGIC, PB_NETDEV, false, NULL, "2:%d", },
-
- { PAGEMAP_MAGIC, PB_PAGEMAP_HEAD, true, show_pagemaps, NULL, },
- { PIPES_DATA_MAGIC, PB_PIPE_DATA, false, pipe_data_handler, NULL, },
- { FIFO_DATA_MAGIC, PB_PIPE_DATA, false, pipe_data_handler, NULL, },
- { SK_QUEUES_MAGIC, PB_SK_QUEUES, false, sk_queue_data_handler, NULL, },
- { IPCNS_SHM_MAGIC, PB_IPC_SHM, false, ipc_shm_handler, NULL, },
- { IPCNS_SEM_MAGIC, PB_IPC_SEM, false, ipc_sem_handler, NULL, },
- { IPCNS_MSG_MAGIC, PB_IPCNS_MSG_ENT, false, ipc_msg_handler, NULL, },
-
- { }
-};
-
-static int cr_parse_file(void)
-{
- u32 magic;
- int ret = -1, fd;
- struct cr_img *img = NULL;
-
- fd = open(opts.show_dump_file, O_RDONLY);
- if (fd < 0) {
- pr_perror("Can't open %s", opts.show_dump_file);
- goto out;
- }
-
- img = img_from_fd(fd);
- if (!img)
- goto out;
-
- if (read_img(img, &magic) < 0)
- goto out;
-
- ret = cr_parse_fd(img, magic);
-out:
- if (img)
- close_image(img);
- else
- close_safe(&fd);
- return ret;
-}
-
-int cr_parse_fd(struct cr_img *img, u32 magic)
-{
- int ret = 0, i;
-
- if (magic == IMG_COMMON_MAGIC || magic == IMG_SERVICE_MAGIC) {
- if (read_img(img, &magic) < 0)
- goto out;
- }
-
- if (magic == PSTREE_MAGIC) {
- show_collect_pstree(img, 0);
- goto out;
- }
-
- if (magic == SIGNAL_MAGIC || magic == PSIGNAL_MAGIC) {
- show_siginfo(img);
- goto out;
- }
-
- for (i = 0; show_infos[i].magic; i++) {
- struct show_image_info *si;
-
- si = &show_infos[i];
- if (si->magic != magic)
- continue;
-
- do_pb_show_plain(img, si->pb_type, si->single,
- si->payload, si->fmt);
- goto out;
- }
-
- ret = -1;
- pr_err("Unknown magic %#x in %s\n",
- magic, opts.show_dump_file);
- try_hint_magic(magic);
-
-out:
- return ret;
-}
-
-static int cr_show_pstree_item(struct pstree_item *item)
-{
- int ret = -1, i;
- struct cr_img *img;
- struct cr_imgset *cr_imgset = NULL;
- TaskKobjIdsEntry *ids;
-
- cr_imgset = cr_task_imgset_open(item->pid.virt, O_SHOW);
- if (!cr_imgset)
- goto out;
-
- pr_msg("Task %d:\n", item->pid.virt);
- pr_msg("----------------------------------------\n");
-
- cr_parse_fd(img_from_set(cr_imgset, CR_FD_CORE), CORE_MAGIC);
-
- if (item->nr_threads > 1) {
- for (i = 0; i < item->nr_threads; i++) {
-
- if (item->threads[i].virt == item->pid.virt)
- continue;
-
- img = open_image(CR_FD_CORE, O_SHOW, item->threads[i].virt);
- if (!img)
- goto outc;
-
- pr_msg("Thread %d.%d:\n", item->pid.virt, item->threads[i].virt);
- pr_msg("----------------------------------------\n");
-
- cr_parse_fd(img, CORE_MAGIC);
- close_image(img);
- }
- }
-
- pr_msg("Resources for %d:\n", item->pid.virt);
- pr_msg("----------------------------------------\n");
- for (i = _CR_FD_TASK_FROM + 1; i < _CR_FD_TASK_TO; i++)
- if ((i != CR_FD_CORE) && (i != CR_FD_IDS)) {
- pr_msg("* ");
- pr_msg(imgset_template[i].fmt, item->pid.virt);
- pr_msg(":\n");
- cr_parse_fd(img_from_set(cr_imgset, i), imgset_template[i].magic);
- }
-
- img = open_image(CR_FD_RLIMIT, O_SHOW, item->pid.virt);
- if (img) {
- pr_msg("* ");
- pr_msg(imgset_template[CR_FD_RLIMIT].fmt, item->pid.virt);
- pr_msg(":\n");
-
- cr_parse_fd(img, RLIMIT_MAGIC);
- close_image(img);
- }
-
- if (pb_read_one(img_from_set(cr_imgset, CR_FD_IDS), &ids, PB_IDS) > 0) {
- img = open_image(CR_FD_FDINFO, O_SHOW, ids->files_id);
- if (img) {
- pr_msg("* ");
- pr_msg(imgset_template[CR_FD_FDINFO].fmt, ids->files_id);
- pr_msg(":\n");
-
- cr_parse_fd(img, FDINFO_MAGIC);
- close_image(img);
- }
-
- task_kobj_ids_entry__free_unpacked(ids, NULL);
- }
-
- pr_msg("---[ end of task %d ]---\n", item->pid.virt);
-
- ret = 0;
-outc:
- close_cr_imgset(&cr_imgset);
-out:
- return ret;
-}
-
-static int cr_show_pid(int pid)
-{
- int ret;
- struct cr_img *img;
- struct pstree_item item;
-
- img = open_image(CR_FD_PSTREE, O_SHOW);
- if (!img)
- return -1;
-
- while (1) {
- PstreeEntry *pe;
-
- ret = pb_read_one_eof(img, &pe, PB_PSTREE);
- if (ret <= 0) {
- close_image(img);
- return ret;
- }
-
- if (pe->pid == pid) {
- pstree_item_from_pb(pe, &item);
- pstree_entry__free_unpacked(pe, NULL);
- break;
- }
-
- pstree_entry__free_unpacked(pe, NULL);
- }
-
- close_image(img);
-
- return cr_show_pstree_item(&item);
-}
-
-static int cr_show_all(void)
-{
- struct pstree_item *item = NULL, *tmp;
- int ret = -1, pid;
- struct cr_img *img;
-
- img = open_image(CR_FD_PSTREE, O_SHOW);
- if (!img)
- goto out;
- show_collect_pstree(img, 1);
- close_image(img);
-
- pid = list_first_entry(&pstree_list, struct pstree_item, sibling)->pid.virt;
- ret = try_show_namespaces(pid);
- if (ret)
- goto out;
-
- list_for_each_entry(item, &pstree_list, sibling)
- if (cr_show_pstree_item(item))
- break;
-
-out:
- list_for_each_entry_safe(item, tmp, &pstree_list, sibling) {
- list_del(&item->sibling);
- xfree(item->threads);
- xfree(item);
- }
- return ret;
-}
-
-int cr_show(int pid)
-{
- if (isatty(STDOUT_FILENO)) {
- pr_msg("The \"show\" action is deprecated by the CRIT utility.\n");
- pr_msg("To view an image use the \"crit decode -i $name --pretty\" command.\n");
- return -1;
- }
-
- if (opts.show_dump_file)
- return cr_parse_file();
-
- if (pid)
- return cr_show_pid(pid);
-
- return cr_show_all();
-}
diff --git a/criu/Makefile b/criu/Makefile
new file mode 100644
index 000000000000..e4e2afb15064
--- /dev/null
+++ b/criu/Makefile
@@ -0,0 +1,173 @@
+#
+# CRIU version.
+VERSION_MAJOR := 2
+VERSION_MINOR := 0
+VERSION_SUBLEVEL :=
+VERSION_EXTRA :=
+VERSION_NAME :=
+
+export VERSION_MAJOR VERSION_MINOR
+export VERSION_SUBLEVEL VERSION_EXTRA VERSION_NAME
+
+#
+# HOST part is needed to build helper
+# tools such as piegen.
+HOSTCC ?= gcc
+HOSTLD ?= ld
+HOSTCFLAGS ?= $(CFLAGS)
+CFLAGS += $(USERCFLAGS)
+
+export HOSTCC HOSTLD HOSTCFLAGS
+
+ifeq ($(ARCH),x86)
+ SRCARCH := x86
+ LDARCH := i386:x86-64
+ VDSO := y
+endif
+
+ifeq ($(ARCH),arm)
+ SRCARCH := arm
+endif
+
+ifeq ($(ARCH),arm64)
+ ARCH := aarch64
+ SRCARCH := aarch64
+ VDSO := y
+endif
+
+ifeq ($(ARCH),powerpc)
+ ARCH := ppc64
+ SRCARCH := ppc64
+ LDARCH := powerpc:common64
+ VDSO := y
+endif
+
+LDARCH ?= $(SRCARCH)
+
+export SRCARCH LDARCH VDSO
+
+SRCARCH ?= $(ARCH)
+LDARCH ?= $(SRCARCH)
+ARCH_DIR := arch/$(SRCARCH)
+
+export SRCARCH LDARCH ARCH_DIR VDSO
+
+$(if $(wildcard $(ARCH_DIR)),,$(error "The architecture $(ARCH) isn't supported"))
+
+#
+# General flags.
+ccflags-y += -fno-strict-aliasing
+ccflags-y += -iquote $(SRC_DIR)/criu/include
+ccflags-y += -iquote $(SRC_DIR)/images
+ccflags-y += -iquote $(SRC_DIR)/criu/pie
+ccflags-y += -iquote $(SRC_DIR)/criu/$(ARCH_DIR)
+ccflags-y += -iquote $(SRC_DIR)/criu/$(ARCH_DIR)/include
+ccflags-y += -iquote $(SRC_DIR)/
+ccflags-y += -I/usr/include/libnl3
+
+export ccflags-y
+
+LIBS := -lrt -lpthread -lprotobuf-c -ldl -lnl-3
+
+ifeq ($(GMON),1)
+ CFLAGS += -pg
+ GMONLDOPT := -pg
+endif
+
+#
+# piegen tool might be disabled by hand. Don't use it unless
+# you know what you're doing.
+ifneq ($(filter ia32 x86 ppc64,$(ARCH)),)
+ ifneq ($(PIEGEN),no)
+ piegen-y := y
+ export piegen-y
+ endif
+endif
+
+#
+# Version header file.
+include Makefile.version
+
+#
+# Configure variables.
+include Makefile.config
+config: $(VERSION_HEADER)
+
+#
+# System calls library.
+SYSCALL-LIB := $(ARCH_DIR)/syscalls.built-in.o
+$(SYSCALL-LIB): config
+ $(Q) $(MAKE) $(call build-as,Makefile.syscalls,$(ARCH_DIR)) all
+PHONY += $(SYSCALL-LIB)
+
+#
+# Architecture dependent part.
+ARCH-LIB := $(ARCH_DIR)/crtools.built-in.o
+$(ARCH-LIB): config $(SYSCALL-LIB)
+ $(Q) $(MAKE) $(call build-as,Makefile,$(ARCH_DIR)) $@
+PHONY += $(ARCH-LIB)
+
+#
+# piegen tool needed for PIE code.
+ifeq ($(piegen-y),y)
+piegen := pie/piegen/piegen
+
+pie/piegen/%: config
+ $(Q) CC=$(HOSTCC) LD=$(HOSTLD) CFLAGS="$(HOSTCFLAGS) $(WARNINGS) $(DEFINES)" $(MAKE) $(build)=pie/piegen $@
+pie/piegen: config
+ $(Q) CC=$(HOSTCC) LD=$(HOSTLD) CFLAGS="$(HOSTCFLAGS) $(WARNINGS) $(DEFINES)" $(MAKE) $(build)=pie/piegen all
+$(piegen): pie/piegen/built-in.o
+ $(call msg-link, $@)
+ $(Q) $(HOSTCC) $(HOSTCFLAGS) $^ $(LDFLAGS) -o $@
+PHONY += pie/piegen
+endif
+
+#
+# PIE library code.
+pie/lib.a: $(ARCH-LIB)
+ $(Q) $(MAKE) $(call build-as,Makefile.library,pie) all
+
+#
+# PIE code blobs themselves.
+pie: $(ARCH_DIR) $(piegen) pie/lib.a
+ $(Q) $(MAKE) $(build)=pie all
+PHONY += pie
+
+#
+# CRIU executable
+PROGRAM-BUILTINS += ../images/built-in.o
+PROGRAM-BUILTINS += built-in.o
+PROGRAM-BUILTINS += pie/lib.a
+PROGRAM-BUILTINS += $(SYSCALL-LIB)
+
+LIBS += arch/$(ARCH)/crtools.built-in.o
+
+built-in.o: pie/lib.a
+ $(Q) $(MAKE) $(call build-as,Makefile.crtools,.) all
+
+criu: $(PROGRAM-BUILTINS) built-in.o
+ $(call msg-link, $@)
+ $(Q) $(CC) $(CFLAGS) $^ $(LIBS) $(LDFLAGS) $(GMONLDOPT) -rdynamic -o $@
+PHONY += criu
+
+#
+# Cleanup everything.
+clean:
+ $(Q) $(MAKE) $(call build-as,Makefile.syscalls,$(ARCH_DIR)) $@
+ $(Q) $(MAKE) $(call build-as,Makefile.library,pie) $@
+ $(Q) $(MAKE) $(call build-as,Makefile.crtools,.) $@
+ $(Q) $(MAKE) $(build)=pie/piegen $@
+ $(Q) $(MAKE) $(build)=pie $@
+ $(Q) $(RM) ./*.{gcda,gcno,gcov}
+ $(Q) $(RM) ./pie/*.{gcda,gcno,gcov}
+ $(Q) $(RM) ./pie/piegen/*.{gcda,gcno,gcov}
+ $(Q) $(RM) -r ./gcov
+ $(Q) $(RM) $(VERSION_HEADER)
+ $(Q) $(RM) $(CONFIG_HEADER)
+
+#
+# Final @all target.
+all: $(PHONY)
+ @true
+
+.PHONY: $(PHONY) clean
diff --git a/criu/Makefile.config b/criu/Makefile.config
new file mode 100644
index 000000000000..a39f4cd911bd
--- /dev/null
+++ b/criu/Makefile.config
@@ -0,0 +1,61 @@
+include $(__nmk_dir)/utils.mk
+include ../scripts/feature-tests.mak
+
+CONFIG_HEADER := include/config.h
+
+ifeq ($(call try-cc,$(LIBBSD_DEV_TEST),-lbsd),y)
+ LIBS += -lbsd
+ DEFINES += -DCONFIG_HAS_LIBBSD
+endif
+
+ifeq ($(call pkg-config-check,libselinux),y)
+ LIBS += -lselinux $(LIBS)
+ DEFINES += -DCONFIG_HAS_SELINUX
+endif
+
+$(CONFIG_HEADER): include/config-base.h
+ $(call msg-gen, $@)
+ $(Q) @echo '#ifndef __CR_CONFIG_H__' > $@
+ $(Q) @echo '#define __CR_CONFIG_H__' >> $@
+ $(Q) @echo '' >> $@
+ $(Q) @echo '#include "config-base.h"' >> $@
+ $(Q) @echo '' >> $@
+ifeq ($(call try-cc,$(TCP_REPAIR_TEST),,$(DEFINES)),y)
+ $(Q) @echo '#define CONFIG_HAS_TCP_REPAIR' >> $@
+ $(Q) @echo '' >> $@
+endif
+ifeq ($(call try-cc,$(PRLIMIT_TEST),,$(DEFINES)),y)
+ $(Q) @echo '#define CONFIG_HAS_PRLIMIT' >> $@
+ $(Q) @echo '' >> $@
+endif
+ifeq ($(call try-cc,$(STRLCPY_TEST),$(LIBS),$(DEFINES)),y)
+ $(Q) @echo '#define CONFIG_HAS_STRLCPY' >> $@
+ $(Q) @echo '' >> $@
+endif
+ifeq ($(call try-cc,$(STRLCAT_TEST),$(LIBS),$(DEFINES)),y)
+ $(Q) @echo '#define CONFIG_HAS_STRLCAT' >> $@
+ $(Q) @echo '' >> $@
+endif
+ifeq ($(call try-cc,$(PTRACE_PEEKSIGINFO_TEST),,$(DEFINES)),y)
+ $(Q) @echo '#define CONFIG_HAS_PEEKSIGINFO_ARGS' >> $@
+ $(Q) @echo '' >> $@
+endif
+ifeq ($(VDSO),y)
+ $(Q) @echo '#define CONFIG_VDSO' >> $@
+ $(Q) @echo '' >> $@
+endif
+ifeq ($(call try-cc,$(SETPROCTITLE_INIT_TEST),-lbsd,$(DEFINES)),y)
+ $(Q) @echo '#define CONFIG_HAS_SETPROCTITLE_INIT' >> $@
+ $(Q) @echo '' >> $@
+endif
+ifeq ($(call try-cc,$(MEMFD_TEST),),y)
+ $(Q) @echo '#define CONFIG_HAS_MEMFD' >> $@
+endif
+ifeq ($(piegen-y),y)
+ $(Q) @echo '#define CONFIG_PIEGEN' >> $@
+ $(Q) @echo '' >> $@
+endif
+ $(Q) @echo '#endif /* __CR_CONFIG_H__ */' >> $@
+
+config: $(CONFIG_HEADER)
+PHONY += config
diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools
new file mode 100644
index 000000000000..5c3f65cad1a2
--- /dev/null
+++ b/criu/Makefile.crtools
@@ -0,0 +1,92 @@
+ccflags-y += -iquote $(ARCH)
+obj-y += action-scripts.o
+obj-y += aio.o
+obj-y += bfd.o
+obj-y += bitmap.o
+obj-y += cgroup.o
+obj-y += cr-check.o
+obj-y += cr-dedup.o
+obj-y += cr-dump.o
+obj-y += cr-errno.o
+obj-y += cr-exec.o
+obj-y += cr-restore.o
+obj-y += cr-service.o
+obj-y += cr-show.o
+obj-y += crtools.o
+obj-y += eventfd.o
+obj-y += eventpoll.o
+obj-y += fault-injection.o
+obj-y += fifo.o
+obj-y += file-ids.o
+obj-y += file-lock.o
+obj-y += files-ext.o
+obj-y += files.o
+obj-y += files-reg.o
+obj-y += fsnotify.o
+obj-y += image-desc.o
+obj-y += image.o
+obj-y += ipc_ns.o
+obj-y += irmap.o
+obj-y += kcmp-ids.o
+obj-y += kerndat.o
+obj-y += libnetlink.o
+obj-y += log.o
+obj-y += lsm.o
+obj-y += mem.o
+obj-y += mount.o
+obj-y += namespaces.o
+obj-y += netfilter.o
+obj-y += net.o
+obj-y += pagemap-cache.o
+obj-y += page-pipe.o
+obj-y += page-read.o
+obj-y += page-xfer.o
+obj-y += parasite-syscall.o
+obj-y += pie/pie-relocs.o
+obj-y += pie-util-fd.o
+obj-y += pie-util.o
+obj-y += pipes.o
+obj-y += plugin.o
+obj-y += proc_parse.o
+obj-y += protobuf-desc.o
+obj-y += protobuf.o
+obj-y += pstree.o
+obj-y += ptrace.o
+obj-y += rbtree.o
+obj-y += rst-malloc.o
+obj-y += seccomp.o
+obj-y += seize.o
+obj-y += shmem.o
+obj-y += sigframe.o
+obj-y += signalfd.o
+obj-y += sk-inet.o
+obj-y += sk-netlink.o
+obj-y += sk-packet.o
+obj-y += sk-queue.o
+obj-y += sk-tcp.o
+obj-y += sk-unix.o
+obj-y += sockets.o
+obj-y += stats.o
+obj-y += string.o
+obj-y += sysctl.o
+obj-y += sysfs_parse.o
+obj-y += timerfd.o
+obj-y += tty.o
+obj-y += tun.o
+obj-y += util.o
+obj-y += uts_ns.o
+
+ifeq ($(VDSO),y)
+obj-y += pie-util-vdso.o
+obj-y += vdso.o
+endif
+
+PROTOBUF_GEN := $(SRC_DIR)/scripts/protobuf-gen.sh
+
+protobuf-desc.c: protobuf-desc-gen.h
+
+protobuf-desc-gen.h: $(PROTOBUF_GEN) include/protobuf-desc.h
+ $(call msg-gen, $@)
+ $(Q) $(SH) $(PROTOBUF_GEN) > $@
+
+cleanup-y += protobuf-desc-gen.h
diff --git a/criu/Makefile.version b/criu/Makefile.version
new file mode 100644
index 000000000000..44dad6c14266
--- /dev/null
+++ b/criu/Makefile.version
@@ -0,0 +1,31 @@
+CRTOOLSVERSION := $(VERSION_MAJOR)$(if $(VERSION_MINOR),.$(VERSION_MINOR))$(if $(VERSION_SUBLEVEL),.$(VERSION_SUBLEVEL))
+VERSION_HEADER := include/version.h
+GITID_FILE := ../.gitid
+GITID := $(shell if [ -d "../.git" ]; then cd .. && git describe; fi)
+
+ifeq ($(GITID),)
+ GITID := 0
+else
+ GITID_FILE_VALUE := $(shell if [ -f '$(GITID_FILE)' ]; then if [ `cat '$(GITID_FILE)'` = $(GITID) ]; then echo y; fi; fi)
+ ifneq ($(GITID_FILE_VALUE),y)
+ .PHONY: $(GITID_FILE)
+ endif
+endif
+
+$(GITID_FILE):
+ $(call msg-gen, $@)
+ $(Q) echo "$(GITID)" > $(GITID_FILE)
+
+$(VERSION_HEADER): Makefile.version $(GITID_FILE)
+ $(call msg-gen, $@)
+ $(Q) echo "/* Autogenerated, do not edit */" > $(VERSION_HEADER)
+ $(Q) echo "#ifndef __CR_VERSION_H__" >> $(VERSION_HEADER)
+ $(Q) echo "#define __CR_VERSION_H__" >> $(VERSION_HEADER)
+ $(Q) echo "#define CRIU_VERSION \"$(CRTOOLSVERSION)\"" >> $(VERSION_HEADER)
+ $(Q) echo "#define CRIU_VERSION_MAJOR " $(VERSION_MAJOR) >> $(VERSION_HEADER)
+ $(Q) echo "#define CRIU_VERSION_MINOR " $(VERSION_MINOR) >> $(VERSION_HEADER)
+ $(Q) echo "#define CRIU_GITID \"$(GITID)\"" >> $(VERSION_HEADER)
+ $(Q) echo "#endif /* __CR_VERSION_H__ */" >> $(VERSION_HEADER)
+
+Makefile.version:
+ @true
diff --git a/criu/action-scripts.c b/criu/action-scripts.c
new file mode 100644
index 000000000000..05aa9d01326e
--- /dev/null
+++ b/criu/action-scripts.c
@@ -0,0 +1,77 @@
+#include <unistd.h>
+#include <stdio.h>
+#include <limits.h>
+#include <stdlib.h>
+
+#include "cr_options.h"
+#include "list.h"
+#include "xmalloc.h"
+#include "log.h"
+#include "servicefd.h"
+#include "cr-service.h"
+#include "action-scripts.h"
+
+static const char *action_names[ACT_MAX] = {
+ [ ACT_PRE_DUMP ] = "pre-dump",
+ [ ACT_POST_DUMP ] = "post-dump",
+ [ ACT_PRE_RESTORE ] = "pre-restore",
+ [ ACT_POST_RESTORE ] = "post-restore",
+ [ ACT_NET_LOCK ] = "network-lock",
+ [ ACT_NET_UNLOCK ] = "network-unlock",
+ [ ACT_SETUP_NS ] = "setup-namespaces",
+ [ ACT_POST_SETUP_NS ] = "post-setup-namespaces",
+};
+
+int run_scripts(enum script_actions act)
+{
+ struct script *script;
+ int ret = 0;
+ char image_dir[PATH_MAX];
+ const char *action = action_names[act];
+
+ pr_debug("Running %s scripts\n", action);
+
+ if (unlikely(list_empty(&opts.scripts)))
+ return 0;
+
+ if (setenv("CRTOOLS_SCRIPT_ACTION", action, 1)) {
+ pr_perror("Can't set CRTOOLS_SCRIPT_ACTION=%s", action);
+ return -1;
+ }
+
+ sprintf(image_dir, "/proc/%ld/fd/%d", (long) getpid(), get_service_fd(IMG_FD_OFF));
+ if (setenv("CRTOOLS_IMAGE_DIR", image_dir, 1)) {
+ pr_perror("Can't set CRTOOLS_IMAGE_DIR=%s", image_dir);
+ return -1;
+ }
+
+ list_for_each_entry(script, &opts.scripts, node) {
+ if (script->path == SCRIPT_RPC_NOTIFY) {
+ pr_debug("\tRPC\n");
+ ret |= send_criu_rpc_script(act, (char *)action, script->arg);
+ } else {
+ pr_debug("\t[%s]\n", script->path);
+ ret |= system(script->path);
+ }
+ }
+
+ unsetenv("CRTOOLS_SCRIPT_ACTION");
+ if (ret)
+ pr_err("One of more action scripts failed\n");
+ return ret;
+}
+
+int add_script(char *path, int arg)
+{
+ struct script *script;
+
+ script = xmalloc(sizeof(struct script));
+ if (script == NULL)
+ return 1;
+
+ script->path = path;
+ script->arg = arg;
+ list_add(&script->node, &opts.scripts);
+
+ return 0;
+}
diff --git a/criu/aio.c b/criu/aio.c
new file mode 100644
index 000000000000..9965efd8c483
--- /dev/null
+++ b/criu/aio.c
@@ -0,0 +1,120 @@
+#include <unistd.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include "vma.h"
+#include "xmalloc.h"
+#include "aio.h"
+#include "parasite.h"
+#include "parasite-syscall.h"
+#include "protobuf/mm.pb-c.h"
+
+int dump_aio_ring(MmEntry *mme, struct vma_area *vma)
+{
+ int nr = mme->n_aios;
+ AioRingEntry *re;
+
+ pr_info("Dumping AIO ring @%"PRIx64", %u reqs\n",
+ vma->e->start, vma->aio_nr_req);
+
+ mme->aios = xrealloc(mme->aios, (nr + 1) * sizeof(re));
+ if (!mme->aios)
+ return -1;
+
+ re = xmalloc(sizeof(*re));
+ if (!re)
+ return -1;
+
+ aio_ring_entry__init(re);
+ re->id = vma->e->start;
+ re->nr_req = vma->aio_nr_req;
+ re->ring_len = vma->e->end - vma->e->start;
+ mme->aios[nr] = re;
+ mme->n_aios = nr + 1;
+ return 0;
+}
+
+void free_aios(MmEntry *mme)
+{
+ int i;
+
+ if (mme->aios) {
+ for (i = 0; i < mme->n_aios; i++)
+ xfree(mme->aios[i]);
+ xfree(mme->aios);
+ }
+}
+
+static unsigned int aio_estimate_nr_reqs(unsigned int k_max_reqs)
+{
+ /*
+ * Kernel does
+ *
+ * nr_reqs = max(nr_reqs, nr_cpus * 4)
+ * nr_reqs *= 2
+ * nr_reqs += 2
+ * ring = roundup(sizeof(head) + nr_reqs * sizeof(req))
+ * nr_reqs = (ring - sizeof(head)) / sizeof(req)
+ *
+ * And the k_max_reqs here is the resulting value.
+ *
+ * We need to get the initial nr_reqs that would grow
+ * up back to the k_max_reqs.
+ */
+
+ return (k_max_reqs - 2) / 2;
+}
+
+unsigned long aio_rings_args_size(struct vm_area_list *vmas)
+{
+ return sizeof(struct parasite_check_aios_args) +
+ vmas->nr_aios * sizeof(struct parasite_aio);
+}
+
+int parasite_check_aios(struct parasite_ctl *ctl, struct vm_area_list *vmas)
+{
+ struct vma_area *vma;
+ struct parasite_check_aios_args *aa;
+ struct parasite_aio *pa;
+ int i;
+
+ if (!vmas->nr_aios)
+ return 0;
+
+ pr_info("Checking AIO rings\n");
+
+ /*
+ * Go to parasite and
+ * a) check that no requests are currently pending
+ * b) get the maximum number of requests kernel handles
+ * to estimate what was the user request on ring
+ * creation.
+ */
+
+ aa = parasite_args_s(ctl, aio_rings_args_size(vmas));
+ pa = &aa->ring[0];
+ list_for_each_entry(vma, &vmas->h, list) {
+ if (!vma_area_is(vma, VMA_AREA_AIORING))
+ continue;
+
+ pr_debug(" `- Ring #%ld @%"PRIx64"\n",
+ (long)(pa - &aa->ring[0]), vma->e->start);
+ pa->ctx = vma->e->start;
+ pa->max_reqs = 0;
+ pa->vma_nr_reqs = &vma->aio_nr_req;
+ pa++;
+ }
+ aa->nr_rings = vmas->nr_aios;
+
+ if (parasite_execute_daemon(PARASITE_CMD_CHECK_AIOS, ctl))
+ return -1;
+
+ pa = &aa->ring[0];
+ for (i = 0; i < vmas->nr_aios; i++) {
+ pa = &aa->ring[i];
+ *pa->vma_nr_reqs = aio_estimate_nr_reqs(pa->max_reqs);
+ pr_debug(" `- Ring #%d has %u reqs, estimated to %u\n", i,
+ pa->max_reqs, *pa->vma_nr_reqs);
+ }
+
+ return 0;
+}
diff --git a/criu/arch/aarch64/Makefile b/criu/arch/aarch64/Makefile
new file mode 100644
index 000000000000..652d4821bdfd
--- /dev/null
+++ b/criu/arch/aarch64/Makefile
@@ -0,0 +1,7 @@
+builtin-name := crtools.built-in.o
+
+ccflags-y += -iquote $(obj) -iquote $(SRC_DIR)
+ccflags-y += -iquote $(obj)/include -iquote $(SRC_DIR)/criu/include
+
+obj-y += cpu.o
+obj-y += crtools.o
diff --git a/criu/arch/aarch64/Makefile.syscalls b/criu/arch/aarch64/Makefile.syscalls
new file mode 100644
index 000000000000..9b3b5387bd41
--- /dev/null
+++ b/criu/arch/aarch64/Makefile.syscalls
@@ -0,0 +1,50 @@
+builtin-name := syscalls.built-in.o
+
+SYS-TYPES := ../../include/syscall-types.h
+SYS-CODES := ../../include/syscall-codes.h
+SYS-PROTO := ../../include/syscall.h
+
+SYS-DEF := syscall.def
+SYS-ASM-COMMON := syscall-common.S
+
+SYS-GEN := ../scripts/arm/gen-syscalls.pl
+SYS-GEN-TBL := ../scripts/arm/gen-sys-exec-tbl.pl
+
+asflags-y += -D__ASSEMBLY__ -nostdlib -fomit-frame-pointer
+asflags-y += -fpie -Wstrict-prototypes -Wa,--noexecstack
+asflags-y += -iquote $(obj) -iquote $(obj)/include -iquote $(SRC_DIR)/criu/include
+
+SYS-ASM := syscalls.S
+obj-y += $(SYS-ASM:.S=).o
+
+ARCH_BITS := 64
+
+SYS-EXEC-TBL := sys-exec-tbl.c
+
+$(obj)/$(SYS-ASM): $(obj)/$(SYS-GEN) $(obj)/syscalls/$(SYS-DEF) $(obj)/syscalls/$(SYS-ASM-COMMON)
+ $(E) " GEN " $@
+ $(Q) perl \
+ $(obj)/$(SYS-GEN) \
+ $(obj)/syscalls/$(SYS-DEF) \
+ $(obj)/$(SYS-CODES) \
+ $(obj)/$(SYS-PROTO) \
+ $(obj)/$(SYS-ASM) \
+ syscalls/$(SYS-ASM-COMMON) \
+ $(obj)/$(SYS-TYPES) \
+ $(ARCH_BITS)
+
+$(obj)/syscalls.o: $(obj)/$(SYS-ASM)
+
+$(obj)/$(SYS-EXEC-TBL): $(obj)/$(SYS-GEN-TBL) $(obj)/syscalls/$(SYS-DEF)
+ $(E) " GEN " $@
+ $(Q) perl \
+ $(obj)/$(SYS-GEN-TBL) \
+ $(obj)/syscalls/$(SYS-DEF) \
+ $(obj)/$(SYS-EXEC-TBL) \
+ $(ARCH_BITS)
+
+all-y += $(obj)/$(SYS-EXEC-TBL)
+
+cleanup-y += $(obj)/$(SYS-EXEC-TBL) $(obj)/$(SYS-ASM)
+cleanup-y += $(obj)/$(SYS-CODES)
+cleanup-y += $(obj)/$(SYS-PROTO)
diff --git a/criu/arch/aarch64/cpu.c b/criu/arch/aarch64/cpu.c
new file mode 100644
index 000000000000..040fe14fcfb7
--- /dev/null
+++ b/criu/arch/aarch64/cpu.c
@@ -0,0 +1,45 @@
+#undef LOG_PREFIX
+#define LOG_PREFIX "cpu: "
+
+#include <errno.h>
+#include "cpu.h"
+
+bool cpu_has_feature(unsigned int feature)
+{
+ return false;
+}
+
+int cpu_init(void)
+{
+ return 0;
+}
+
+int cpu_dump_cpuinfo(void)
+{
+ return 0;
+}
+
+int cpu_validate_cpuinfo(void)
+{
+ return 0;
+}
+
+int cpu_dump_cpuinfo_single(void)
+{
+ return -ENOTSUP;
+}
+
+int cpu_validate_image_cpuinfo_single(void)
+{
+ return -ENOTSUP;
+}
+
+int cpuinfo_dump(void)
+{
+ return -ENOTSUP;
+}
+
+int cpuinfo_check(void)
+{
+ return -ENOTSUP;
+}
diff --git a/criu/arch/aarch64/crtools.c b/criu/arch/aarch64/crtools.c
new file mode 100644
index 000000000000..5df7f1d5fd6c
--- /dev/null
+++ b/criu/arch/aarch64/crtools.c
@@ -0,0 +1,233 @@
+#include <string.h>
+#include <unistd.h>
+
+#include <linux/elf.h>
+
+#include "asm/types.h"
+#include "asm/restorer.h"
+#include "compiler.h"
+#include "ptrace.h"
+#include "asm/processor-flags.h"
+#include "protobuf.h"
+#include "protobuf/core.pb-c.h"
+#include "protobuf/creds.pb-c.h"
+#include "parasite-syscall.h"
+#include "log.h"
+#include "util.h"
+#include "cpu.h"
+#include "parasite-syscall.h"
+#include "restorer.h"
+
+
+/*
+ * Injected syscall instruction
+ */
+const char code_syscall[] = {
+ 0x01, 0x00, 0x00, 0xd4, /* SVC #0 */
+ 0x00, 0x00, 0x20, 0xd4 /* BRK #0 */
+};
+
+const int code_syscall_size = round_up(sizeof(code_syscall), sizeof(long));
+
+static inline void __always_unused __check_code_syscall(void)
+{
+ BUILD_BUG_ON(sizeof(code_syscall) != BUILTIN_SYSCALL_SIZE);
+ BUILD_BUG_ON(!is_log2(sizeof(code_syscall)));
+}
+
+void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs)
+{
+ regs->pc = new_ip;
+ if (stack)
+ regs->sp = (unsigned long)stack;
+}
+
+bool arch_can_dump_task(pid_t pid)
+{
+ /*
+ * TODO: Add proper check here
+ */
+ return true;
+}
+
+int syscall_seized(struct parasite_ctl *ctl, int nr, unsigned long *ret,
+		unsigned long arg1,
+		unsigned long arg2,
+		unsigned long arg3,
+		unsigned long arg4,
+		unsigned long arg5,
+		unsigned long arg6)
+{
+	user_regs_struct_t regs = ctl->orig.regs;	/* scratch copy; ctl->orig.regs is not modified here */
+	int err;
+	/* Run syscall @nr with @arg1..@arg6 via @ctl: number in regs[8], arguments in regs[0..5]. */
+	regs.regs[8] = (unsigned long)nr;
+	regs.regs[0] = arg1;
+	regs.regs[1] = arg2;
+	regs.regs[2] = arg3;
+	regs.regs[3] = arg4;
+	regs.regs[4] = arg5;
+	regs.regs[5] = arg6;
+	regs.regs[6] = 0;
+	regs.regs[7] = 0;
+	/* The helper receives the address of the local register copy. */
+	err = __parasite_execute_syscall(ctl, &regs);
+	/* regs[0] of the copy is reported through @ret; the helper's status is returned as-is. */
+	*ret = regs.regs[0];
+	return err;
+}
+
+
+#define assign_reg(dst, src, e) dst->e = (__typeof__(dst->e))(src).e
+
+int get_task_regs(pid_t pid, user_regs_struct_t regs, CoreEntry *core)
+{
+	struct iovec iov;
+	struct user_fpsimd_state fpsimd;
+	int i, ret;
+	/* Dump @pid's GP and FP/SIMD registers into core->ti_aarch64; returns 0 or the non-zero ptrace() result. */
+	pr_info("Dumping GP/FPU registers for %d\n", pid);
+	/* Fetch the general-purpose set into the by-value parameter copy @regs. */
+	iov.iov_base = &regs;
+	iov.iov_len = sizeof(user_regs_struct_t);
+	if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov))) {
+		pr_perror("Failed to obtain CPU registers for %d", pid);
+		goto err;
+	}
+	/* Fetch the FP/SIMD set into the local buffer. */
+	iov.iov_base = &fpsimd;
+	iov.iov_len = sizeof(fpsimd);
+	if ((ret = ptrace(PTRACE_GETREGSET, pid, NT_PRFPREG, &iov))) {
+		pr_perror("Failed to obtain FPU registers for %d", pid);
+		goto err;
+	}
+
+
+	// Save the Aarch64 CPU state
+	for (i = 0; i < 31; ++i)
+		assign_reg(core->ti_aarch64->gpregs, regs, regs[i]);
+	assign_reg(core->ti_aarch64->gpregs, regs, sp);
+	assign_reg(core->ti_aarch64->gpregs, regs, pc);
+	assign_reg(core->ti_aarch64->gpregs, regs, pstate);
+
+
+	// Save the FP/SIMD state
+	for (i = 0; i < 32; ++i)
+	{
+		core->ti_aarch64->fpsimd->vregs[2*i] = fpsimd.vregs[i];	/* even slot: low half (restore_fpu ORs it in unshifted) */
+		core->ti_aarch64->fpsimd->vregs[2*i + 1] = fpsimd.vregs[i] >> 64;	/* odd slot: bits 64..127 */
+	}
+	assign_reg(core->ti_aarch64->fpsimd, fpsimd, fpsr);
+	assign_reg(core->ti_aarch64->fpsimd, fpsimd, fpcr);
+
+	ret = 0;
+
+err:
+	return ret;
+}
+
+int arch_alloc_thread_info(CoreEntry *core)
+{
+	ThreadInfoAarch64 *ti_aarch64;
+	UserAarch64RegsEntry *gpregs;
+	UserAarch64FpsimdContextEntry *fpsimd;
+	/* Allocate and initialise the AArch64 thread info of @core; returns 0 on success, -1 on failure. */
+	ti_aarch64 = xmalloc(sizeof(*ti_aarch64));
+	if (!ti_aarch64)
+		goto err;
+	thread_info_aarch64__init(ti_aarch64);
+	core->ti_aarch64 = ti_aarch64;
+
+	gpregs = xmalloc(sizeof(*gpregs));
+	if (!gpregs)
+		goto err;
+	user_aarch64_regs_entry__init(gpregs);
+	/* Link gpregs before allocating its regs array, so a failure below does not orphan it. */
+	ti_aarch64->gpregs = gpregs;
+	gpregs->regs = xmalloc(31*sizeof(uint64_t));
+	if (!gpregs->regs)
+		goto err;
+	gpregs->n_regs = 31;
+
+	/* FP/SIMD image: 64 words; get_task_regs() stores two words per V register for 32 registers. */
+	fpsimd = xmalloc(sizeof(*fpsimd));
+	if (!fpsimd)
+		goto err;
+	user_aarch64_fpsimd_context_entry__init(fpsimd);
+	ti_aarch64->fpsimd = fpsimd;
+	fpsimd->vregs = xmalloc(64*sizeof(fpsimd->vregs[0]));
+	fpsimd->n_vregs = 64;
+	if (!fpsimd->vregs)
+		goto err;
+
+	return 0;
+err:	/* nothing is freed here; partial state stays attached to @core (presumably for arch_free_thread_info()) */
+	return -1;
+}
+
+void arch_free_thread_info(CoreEntry *core)
+{
+	if (!CORE_THREAD_ARCH_INFO(core))	/* no thread info attached to @core: nothing to release */
+		return;
+	if (CORE_THREAD_ARCH_INFO(core)->fpsimd)
+		xfree(CORE_THREAD_ARCH_INFO(core)->fpsimd->vregs);
+	if (CORE_THREAD_ARCH_INFO(core)->gpregs)	/* NULL when allocation stopped before gpregs was linked */
+		xfree(CORE_THREAD_ARCH_INFO(core)->gpregs->regs);
+	xfree(CORE_THREAD_ARCH_INFO(core)->fpsimd);	/* assumes xfree(NULL) is a no-op, like free() */
+	xfree(CORE_THREAD_ARCH_INFO(core)->gpregs);
+	xfree(CORE_THREAD_ARCH_INFO(core));
+	CORE_THREAD_ARCH_INFO(core) = NULL;	/* leave no dangling pointer behind in @core */
+}
+
+int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core)
+{
+ int i;
+ struct fpsimd_context *fpsimd = &RT_SIGFRAME_FPU(sigframe);
+ /* Fill the FP/SIMD record of @sigframe from core->ti_aarch64->fpsimd; returns 0, or 1 if the image does not hold exactly 64 vregs words. */
+ if (core->ti_aarch64->fpsimd->n_vregs != 64)
+ return 1;
+ /* Rebuild each of the 32 128-bit registers from two image words: even index is the low half, odd index is shifted up by 64 bits. */
+ for (i = 0; i < 32; ++i)
+ fpsimd->vregs[i] = (__uint128_t)core->ti_aarch64->fpsimd->vregs[2*i] |
+ ((__uint128_t)core->ti_aarch64->fpsimd->vregs[2*i + 1] << 64);
+ assign_reg(fpsimd, *core->ti_aarch64->fpsimd, fpsr);
+ assign_reg(fpsimd, *core->ti_aarch64->fpsimd, fpcr);
+ /* Stamp the record header with its magic value and its own size. */
+ fpsimd->head.magic = FPSIMD_MAGIC;
+ fpsimd->head.size = sizeof(*fpsimd);
+
+ return 0;
+}
+
+void *mmap_seized(
+ struct parasite_ctl *ctl,
+ void *addr, size_t length, int prot,
+ int flags, int fd, off_t offset)
+{
+ unsigned long map;
+ int err;
+
+ err = syscall_seized(ctl, __NR_mmap, &map,
+ (unsigned long)addr, length, prot, flags, fd, offset);
+ if (err < 0 || (long)map < 0)
+ map = 0;
+
+ return (void *)map;
+}
+
+int restore_gpregs(struct rt_sigframe *f, UserRegsEntry *r)
+{
+#define CPREG1(d) f->uc.uc_mcontext.d = r->d
+
+ int i;
+
+ for (i = 0; i < 31; ++i)
+ CPREG1(regs[i]);
+ CPREG1(sp);
+ CPREG1(pc);
+ CPREG1(pstate);
+
+#undef CPREG1
+
+ return 0;
+}
diff --git a/criu/arch/aarch64/include/asm/atomic.h b/criu/arch/aarch64/include/asm/atomic.h
new file mode 100644
index 000000000000..0e1c04f5a714
--- /dev/null
+++ b/criu/arch/aarch64/include/asm/atomic.h
@@ -0,0 +1,98 @@
+#ifndef __CR_ATOMIC_H__
+#define __CR_ATOMIC_H__
+
+typedef struct {
+ int counter;
+} atomic_t;
+
+
+/* Copied from the Linux header arch/arm/include/asm/barrier.h */
+
+#define smp_mb() asm volatile("dmb ish" : : : "memory")
+
+
+/* Copied from the Linux kernel header arch/arm64/include/asm/atomic.h */
+
+static inline int atomic_read(const atomic_t *v)
+{
+ return (*(volatile int *)&(v)->counter);
+}
+
+static inline void atomic_set(atomic_t *v, int i)
+{
+ v->counter = i;
+}
+
+#define atomic_get atomic_read
+
+
+static inline int atomic_add_return(int i, atomic_t *v)
+{
+ unsigned long tmp;
+ int result;
+
+ asm volatile(
+"1: ldxr %w0, %2\n"
+" add %w0, %w0, %w3\n"
+" stlxr %w1, %w0, %2\n"
+" cbnz %w1, 1b"
+ : "=&r" (result), "=&r" (tmp), "+Q" (v->counter)
+ : "Ir" (i)
+ : "cc", "memory");
+
+ smp_mb();
+ return result;
+}
+
+static inline int atomic_sub_return(int i, atomic_t *v)
+{
+ unsigned long tmp;
+ int result;
+
+ asm volatile(
+"1: ldxr %w0, %2\n"
+" sub %w0, %w0, %w3\n"
+" stlxr %w1, %w0, %2\n"
+" cbnz %w1, 1b"
+ : "=&r" (result), "=&r" (tmp), "+Q" (v->counter)
+ : "Ir" (i)
+ : "cc", "memory");
+
+ smp_mb();
+ return result;
+}
+
+static inline int atomic_inc(atomic_t *v) { return atomic_add_return(1, v) - 1; }
+
+static inline int atomic_add(int val, atomic_t *v) { return atomic_add_return(val, v) - val; }
+
+static inline int atomic_dec(atomic_t *v) { return atomic_sub_return(1, v) + 1; }
+
+/* true if the result is 0, or false for all other cases. */
+#define atomic_dec_and_test(v) (atomic_sub_return(1, v) == 0)
+
+#define atomic_inc_return(v) (atomic_add_return(1, v))
+
+static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new)
+{
+ unsigned long tmp;
+ int oldval;
+
+ smp_mb();
+
+ asm volatile("// atomic_cmpxchg\n"
+"1: ldxr %w1, %2\n"
+" cmp %w1, %w3\n"
+" b.ne 2f\n"
+" stxr %w0, %w4, %2\n"
+" cbnz %w0, 1b\n"
+"2:"
+ : "=&r" (tmp), "=&r" (oldval), "+Q" (ptr->counter)
+ : "Ir" (old), "r" (new)
+ : "cc");
+
+ smp_mb();
+ return oldval;
+}
+
+#endif /* __CR_ATOMIC_H__ */
diff --git a/criu/arch/aarch64/include/asm/bitops.h b/criu/arch/aarch64/include/asm/bitops.h
new file mode 100644
index 000000000000..5a750447f25f
--- /dev/null
+++ b/criu/arch/aarch64/include/asm/bitops.h
@@ -0,0 +1,7 @@
+#ifndef __CR_ASM_BITOPS_H__
+#define __CR_ASM_BITOPS_H__
+
+#include "compiler.h"
+#include "asm-generic/bitops.h"
+
+#endif /* __CR_ASM_BITOPS_H__ */
diff --git a/criu/arch/aarch64/include/asm/bitsperlong.h b/criu/arch/aarch64/include/asm/bitsperlong.h
new file mode 100644
index 000000000000..d95727d193e8
--- /dev/null
+++ b/criu/arch/aarch64/include/asm/bitsperlong.h
@@ -0,0 +1,6 @@
+#ifndef __CR_BITSPERLONG_H__
+#define __CR_BITSPERLONG_H__
+
+#define BITS_PER_LONG 64
+
+#endif /* __CR_BITSPERLONG_H__ */
diff --git a/criu/arch/aarch64/include/asm/cpu.h b/criu/arch/aarch64/include/asm/cpu.h
new file mode 100644
index 000000000000..59118c211d10
--- /dev/null
+++ b/criu/arch/aarch64/include/asm/cpu.h
@@ -0,0 +1 @@
+#include <stdbool.h>
diff --git a/criu/arch/aarch64/include/asm/dump.h b/criu/arch/aarch64/include/asm/dump.h
new file mode 100644
index 000000000000..671c424da9d7
--- /dev/null
+++ b/criu/arch/aarch64/include/asm/dump.h
@@ -0,0 +1,14 @@
+#ifndef __CR_ASM_DUMP_H__
+#define __CR_ASM_DUMP_H__
+
+extern int get_task_regs(pid_t pid, user_regs_struct_t regs, CoreEntry *core);
+extern int arch_alloc_thread_info(CoreEntry *core);
+extern void arch_free_thread_info(CoreEntry *core);
+
+
+static inline void core_put_tls(CoreEntry *core, tls_t tls)
+{
+ core->ti_aarch64->tls = tls;
+}
+
+#endif
diff --git a/criu/arch/aarch64/include/asm/fpu.h b/criu/arch/aarch64/include/asm/fpu.h
new file mode 100644
index 000000000000..7f476d541a7d
--- /dev/null
+++ b/criu/arch/aarch64/include/asm/fpu.h
@@ -0,0 +1,4 @@
+#ifndef __CR_ASM_FPU_H__
+#define __CR_ASM_FPU_H__
+
+#endif /* __CR_ASM_FPU_H__ */
diff --git a/criu/arch/aarch64/include/asm/int.h b/criu/arch/aarch64/include/asm/int.h
new file mode 100644
index 000000000000..642804e9b485
--- /dev/null
+++ b/criu/arch/aarch64/include/asm/int.h
@@ -0,0 +1,6 @@
+#ifndef __CR_ASM_INT_H__
+#define __CR_ASM_INT_H__
+
+#include "asm-generic/int.h"
+
+#endif /* __CR_ASM_INT_H__ */
diff --git a/criu/arch/aarch64/include/asm/linkage.h b/criu/arch/aarch64/include/asm/linkage.h
new file mode 100644
index 000000000000..7380642337a0
--- /dev/null
+++ b/criu/arch/aarch64/include/asm/linkage.h
@@ -0,0 +1,24 @@
+#ifndef __CR_LINKAGE_H__
+#define __CR_LINKAGE_H__
+
+#ifdef __ASSEMBLY__
+
+#define __ALIGN .align 4, 0x00
+#define __ALIGN_STR ".align 4, 0x00"
+
+#define GLOBAL(name) \
+ .globl name; \
+ name:
+
+#define ENTRY(name) \
+ .globl name; \
+ .type name, #function; \
+ __ALIGN; \
+ name:
+
+#define END(sym) \
+ .size sym, . - sym
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* __CR_LINKAGE_H__ */
diff --git a/criu/arch/aarch64/include/asm/page.h b/criu/arch/aarch64/include/asm/page.h
new file mode 100644
index 000000000000..de1fe5428c50
--- /dev/null
+++ b/criu/arch/aarch64/include/asm/page.h
@@ -0,0 +1,21 @@
+#ifndef __CR_ASM_PAGE_H__
+#define __CR_ASM_PAGE_H__
+
+#include <unistd.h>
+
+#ifndef PAGE_SHIFT
+# define PAGE_SHIFT 12
+#endif
+
+#ifndef PAGE_SIZE
+# define PAGE_SIZE (1UL << PAGE_SHIFT)
+#endif
+
+#ifndef PAGE_MASK
+# define PAGE_MASK (~(PAGE_SIZE - 1))
+#endif
+
+#define PAGE_PFN(addr) ((addr) / PAGE_SIZE)
+#define page_size() sysconf(_SC_PAGESIZE)
+
+#endif /* __CR_ASM_PAGE_H__ */
diff --git a/criu/arch/aarch64/include/asm/parasite-syscall.h b/criu/arch/aarch64/include/asm/parasite-syscall.h
new file mode 100644
index 000000000000..0c07121da737
--- /dev/null
+++ b/criu/arch/aarch64/include/asm/parasite-syscall.h
@@ -0,0 +1,18 @@
+#ifndef __CR_ASM_PARASITE_SYSCALL_H__
+#define __CR_ASM_PARASITE_SYSCALL_H__
+
+struct parasite_ctl;
+
+#define ARCH_SI_TRAP TRAP_BRKPT
+
+
+extern const char code_syscall[];
+extern const int code_syscall_size;
+
+void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs);
+
+void *mmap_seized(struct parasite_ctl *ctl,
+ void *addr, size_t length, int prot,
+ int flags, int fd, off_t offset);
+
+#endif
diff --git a/criu/arch/aarch64/include/asm/parasite.h b/criu/arch/aarch64/include/asm/parasite.h
new file mode 100644
index 000000000000..2a1e1c12e7d7
--- /dev/null
+++ b/criu/arch/aarch64/include/asm/parasite.h
@@ -0,0 +1,11 @@
+#ifndef __ASM_PARASITE_H__
+#define __ASM_PARASITE_H__
+
+static inline void arch_get_tls(tls_t *ptls)
+{
+ tls_t tls;
+ asm("mrs %0, tpidr_el0" : "=r" (tls));
+ *ptls = tls;
+}
+
+#endif
diff --git a/criu/arch/aarch64/include/asm/processor-flags.h b/criu/arch/aarch64/include/asm/processor-flags.h
new file mode 100644
index 000000000000..c1888af36fa0
--- /dev/null
+++ b/criu/arch/aarch64/include/asm/processor-flags.h
@@ -0,0 +1,4 @@
+#ifndef __CR_PROCESSOR_FLAGS_H__
+#define __CR_PROCESSOR_FLAGS_H__
+
+#endif
diff --git a/criu/arch/aarch64/include/asm/restore.h b/criu/arch/aarch64/include/asm/restore.h
new file mode 100644
index 000000000000..69404b0e815e
--- /dev/null
+++ b/criu/arch/aarch64/include/asm/restore.h
@@ -0,0 +1,28 @@
+#ifndef __CR_ASM_RESTORE_H__
+#define __CR_ASM_RESTORE_H__
+
+#include "asm/restorer.h"
+
+#include "protobuf/core.pb-c.h"
+
+#define JUMP_TO_RESTORER_BLOB(new_sp, restore_task_exec_start, \
+ task_args) \
+ asm volatile( \
+ "and sp, %0, #~15 \n" \
+ "mov x0, %2 \n" \
+ "br %1 \n" \
+ : \
+ : "r"(new_sp), \
+ "r"(restore_task_exec_start), \
+ "r"(task_args) \
+ : "sp", "x0", "memory")
+
+static inline void core_get_tls(CoreEntry *pcore, tls_t *ptls)
+{
+ *ptls = pcore->ti_aarch64->tls;
+}
+
+
+int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core);
+
+#endif
diff --git a/criu/arch/aarch64/include/asm/restorer.h b/criu/arch/aarch64/include/asm/restorer.h
new file mode 100644
index 000000000000..583f9583b836
--- /dev/null
+++ b/criu/arch/aarch64/include/asm/restorer.h
@@ -0,0 +1,121 @@
+#ifndef __CR_ASM_RESTORER_H__
+#define __CR_ASM_RESTORER_H__
+
+#include <asm/sigcontext.h>
+#include <sys/ucontext.h>
+
+#include "asm/types.h"
+#include "protobuf/core.pb-c.h"
+
+/* Copied from the kernel header arch/arm64/include/uapi/asm/sigcontext.h */
+
+#define FPSIMD_MAGIC 0x46508001
+
+typedef struct fpsimd_context fpu_state_t;
+
+
+struct aux_context {
+ struct fpsimd_context fpsimd;
+ /* additional context to be added before "end" */
+ struct _aarch64_ctx end;
+};
+
+
+// XXX: the identifier rt_sigcontext is expected to be a struct by the CRIU code
+#define rt_sigcontext sigcontext
+
+
+#include "sigframe.h"
+
+
+/* Copied from the kernel source arch/arm64/kernel/signal.c */
+
+struct rt_sigframe {
+ siginfo_t info;
+ struct ucontext uc;
+ u64 fp;
+ u64 lr;
+};
+
+
+#define ARCH_RT_SIGRETURN(new_sp) \
+ asm volatile( \
+ "mov sp, %0 \n" \
+ "mov x8, #"__stringify(__NR_rt_sigreturn)" \n" \
+ "svc #0 \n" \
+ : \
+ : "r"(new_sp) \
+ : "sp", "x8", "memory")
+
+#define RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, \
+ thread_args, clone_restore_fn) \
+ asm volatile( \
+ "clone_emul: \n" \
+ "ldr x1, %2 \n" \
+ "and x1, x1, #~15 \n" \
+ "sub x1, x1, #16 \n" \
+ "stp %5, %6, [x1] \n" \
+ "mov x0, %1 \n" \
+ "mov x2, %3 \n" \
+ "mov x3, %4 \n" \
+ "mov x8, #"__stringify(__NR_clone)" \n" \
+ "svc #0 \n" \
+ \
+ "cbz x0, thread_run \n" \
+ \
+ "mov %0, x0 \n" \
+ "b clone_end \n" \
+ \
+ "thread_run: \n" \
+ "ldp x1, x0, [sp] \n" \
+ "br x1 \n" \
+ \
+ "clone_end: \n" \
+ : "=r"(ret) \
+ : "r"(clone_flags), \
+ "m"(new_sp), \
+ "r"(&parent_tid), \
+ "r"(&thread_args[i].pid), \
+ "r"(clone_restore_fn), \
+ "r"(&thread_args[i]) \
+ : "x0", "x1", "x2", "x3", "x8", "memory")
+
+#define ARCH_FAIL_CORE_RESTORE \
+ asm volatile( \
+ "mov sp, %0 \n" \
+ "mov x0, #0 \n" \
+ "b x0 \n" \
+ : \
+ : "r"(ret) \
+ : "sp", "x0", "memory")
+
+
+#define RT_SIGFRAME_UC(rt_sigframe) rt_sigframe->uc
+#define RT_SIGFRAME_REGIP(rt_sigframe) ((long unsigned int)(rt_sigframe)->uc.uc_mcontext.pc)
+#define RT_SIGFRAME_HAS_FPU(rt_sigframe) (1)
+#define RT_SIGFRAME_FPU(rt_sigframe) ((struct aux_context*)&(rt_sigframe)->uc.uc_mcontext.__reserved)->fpsimd
+
+#define SIGFRAME_OFFSET 0
+
+
+int restore_gpregs(struct rt_sigframe *f, UserAarch64RegsEntry *r);
+int restore_nonsigframe_gpregs(UserAarch64RegsEntry *r);
+
+static inline int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, fpu_state_t *fpu_state) { return 0; }
+
+static inline void restore_tls(tls_t *ptls)
+{
+ asm("msr tpidr_el0, %0" : : "r" (*ptls));
+}
+
+static inline int ptrace_set_breakpoint(pid_t pid, void *addr)
+{
+ return 0;
+}
+
+static inline int ptrace_flush_breakpoints(pid_t pid)
+{
+ return 0;
+}
+
+#endif
diff --git a/criu/arch/aarch64/include/asm/string.h b/criu/arch/aarch64/include/asm/string.h
new file mode 100644
index 000000000000..2c3a34fbbd3f
--- /dev/null
+++ b/criu/arch/aarch64/include/asm/string.h
@@ -0,0 +1,7 @@
+#ifndef __CR_ASM_STRING_H__
+#define __CR_ASM_STRING_H__
+
+#include "compiler.h"
+#include "asm-generic/string.h"
+
+#endif /* __CR_ASM_STRING_H__ */
diff --git a/criu/arch/aarch64/include/asm/syscall-aux.S b/criu/arch/aarch64/include/asm/syscall-aux.S
new file mode 100644
index 000000000000..00ccf79c30e3
--- /dev/null
+++ b/criu/arch/aarch64/include/asm/syscall-aux.S
@@ -0,0 +1,37 @@
+/**
+ * This source contains emulation of syscalls
+ * that are not implemented in the AArch64 Linux kernel
+ */
+
+ENTRY(sys_open)
+ mov x3, x2
+ mov x2, x1
+ mov x1, x0
+ mov x0, #-100
+ b sys_openat
+END(sys_open)
+
+
+ENTRY(sys_mkdir)
+ mov x3, x2
+ mov x2, x1
+ mov x1, x0
+ mov x0, #-100
+ b sys_mkdirat
+END(sys_mkdir)
+
+
+ENTRY(sys_rmdir)
+ mov x2, #0x200 // flags = AT_REMOVEDIR
+ mov x1, x0
+ mov x0, #-100
+ b sys_unlinkat
+END(sys_rmdir)
+
+
+ENTRY(sys_unlink)
+ mov x2, #0 // flags = 0
+ mov x1, x0
+ mov x0, #-100
+ b sys_unlinkat
+END(sys_unlink)
diff --git a/criu/arch/aarch64/include/asm/syscall-aux.h b/criu/arch/aarch64/include/asm/syscall-aux.h
new file mode 100644
index 000000000000..814c7a9dddc0
--- /dev/null
+++ b/criu/arch/aarch64/include/asm/syscall-aux.h
@@ -0,0 +1 @@
+#define __NR_openat 56
diff --git a/criu/arch/aarch64/include/asm/types.h b/criu/arch/aarch64/include/asm/types.h
new file mode 100644
index 000000000000..d6c890dc0b16
--- /dev/null
+++ b/criu/arch/aarch64/include/asm/types.h
@@ -0,0 +1,102 @@
+#ifndef __CR_ASM_TYPES_H__
+#define __CR_ASM_TYPES_H__
+
+#include <stdbool.h>
+#include <signal.h>
+#include <asm/ptrace.h>
+#include "protobuf/core.pb-c.h"
+
+#include "asm/page.h"
+#include "asm/bitops.h"
+#include "asm/int.h"
+
+
+#define SIGMAX 64
+#define SIGMAX_OLD 31
+
+typedef void rt_signalfn_t(int, siginfo_t *, void *);
+typedef rt_signalfn_t *rt_sighandler_t;
+
+typedef void rt_restorefn_t(void);
+typedef rt_restorefn_t *rt_sigrestore_t;
+
+#define _KNSIG 64
+#define _NSIG_BPW 64
+
+#define _KNSIG_WORDS (_KNSIG / _NSIG_BPW)
+
+typedef struct {
+ unsigned long sig[_KNSIG_WORDS];
+} k_rtsigset_t;
+
+static inline void ksigfillset(k_rtsigset_t *set)
+{
+ int i;
+ for (i = 0; i < _KNSIG_WORDS; i++)
+ set->sig[i] = (unsigned long)-1;
+}
+
+#define SA_RESTORER 0x00000000
+
+typedef struct {
+ rt_sighandler_t rt_sa_handler;
+ unsigned long rt_sa_flags;
+ rt_sigrestore_t rt_sa_restorer;
+ k_rtsigset_t rt_sa_mask;
+} rt_sigaction_t;
+
+/*
+ * Copied from the Linux kernel header arch/arm64/include/uapi/asm/ptrace.h
+ *
+ * A thread ARM CPU context
+ */
+
+typedef struct user_pt_regs user_regs_struct_t;
+
+
+#define ASSIGN_TYPED(a, b) do { a = (typeof(a))b; } while (0)
+#define ASSIGN_MEMBER(a,b,m) do { ASSIGN_TYPED((a)->m, (b)->m); } while (0)
+
+#define REG_RES(regs) ((u64)(regs).regs[0])
+#define REG_IP(regs) ((u64)(regs).pc)
+#define REG_SYSCALL_NR(regs) ((u64)(regs).regs[8])
+
+/*
+ * Range for task size calculated from the following Linux kernel files:
+ * arch/arm64/include/asm/memory.h
+ * arch/arm64/Kconfig
+ *
+ * TODO: handle 32 bit tasks
+ */
+#define TASK_SIZE_MIN (1UL << 39)
+#define TASK_SIZE_MAX (1UL << 48)
+
+int munmap(void *addr, size_t length);
+
+static inline unsigned long task_size() {
+ unsigned long task_size;
+ /* Walk power-of-two candidates from TASK_SIZE_MIN upwards and stop at the first one where munmap() of a single page reports an error. */
+ for (task_size = TASK_SIZE_MIN; task_size < TASK_SIZE_MAX; task_size <<= 1)
+ if (munmap((void *)task_size, page_size())) /* NOTE(review): presumably fails once the address lies outside the task's address space - confirm */
+ break;
+ /* Result: the first candidate munmap() rejected, or TASK_SIZE_MAX if none was rejected. */
+ return task_size;
+}
+
+#define AT_VECTOR_SIZE 40
+
+typedef UserAarch64RegsEntry UserRegsEntry;
+
+#define CORE_ENTRY__MARCH CORE_ENTRY__MARCH__AARCH64
+
+#define CORE_THREAD_ARCH_INFO(core) core->ti_aarch64
+
+#define TI_SP(core) ((core)->ti_aarch64->gpregs->sp)
+
+typedef uint64_t auxv_t;
+typedef uint64_t tls_t;
+
+static inline void *decode_pointer(uint64_t v) { return (void*)v; }
+static inline uint64_t encode_pointer(void *p) { return (uint64_t)p; }
+
+#endif /* __CR_ASM_TYPES_H__ */
diff --git a/criu/arch/aarch64/include/asm/vdso.h b/criu/arch/aarch64/include/asm/vdso.h
new file mode 100644
index 000000000000..d015c63877e8
--- /dev/null
+++ b/criu/arch/aarch64/include/asm/vdso.h
@@ -0,0 +1,26 @@
+#ifndef __CR_ASM_VDSO_H__
+#define __CR_ASM_VDSO_H__
+
+#include "asm/int.h"
+#include "asm-generic/vdso.h"
+
+/*
+ * This is a minimal amount of symbols
+ * we should support at the moment.
+ */
+#define VDSO_SYMBOL_MAX 4
+
+#define ARCH_VDSO_SYMBOLS \
+ "__kernel_clock_getres", \
+ "__kernel_clock_gettime", \
+ "__kernel_gettimeofday", \
+ "__kernel_rt_sigreturn"
+
+struct vdso_symtable;
+extern int vdso_redirect_calls(unsigned long base_to,
+ unsigned long base_from,
+ struct vdso_symtable *to,
+ struct vdso_symtable *from);
+extern void write_intraprocedure_branch(unsigned long to, unsigned long from);
+
+#endif /* __CR_ASM_VDSO_H__ */
diff --git a/criu/arch/aarch64/intraprocedure.S b/criu/arch/aarch64/intraprocedure.S
new file mode 100644
index 000000000000..e139dc8b573b
--- /dev/null
+++ b/criu/arch/aarch64/intraprocedure.S
@@ -0,0 +1,22 @@
+.global write_intraprocedure_branch
+
+/* to is x0, from is x1 */
+write_intraprocedure_branch:
+ /* load two 32-bit instructions */
+ ldr x2, loadbranch
+ /* store 64 bits of instructions and 64 bits of destination address */
+ stp x2, x0, [x1]
+ /* perform required cache maintenance and synchronization operations */
+ dc cvau, x1
+ dsb ish
+ ic ivau, x1
+ dsb ish
+ isb
+ ret
+
+/* intraprocedure trampoline instructions */
+loadbranch:
+ ldr x16, =destination
+ br x16
+/* label to get relative position of literal pool */
+destination:
diff --git a/criu/arch/aarch64/parasite-head.S b/criu/arch/aarch64/parasite-head.S
new file mode 100644
index 000000000000..7a359061c4b9
--- /dev/null
+++ b/criu/arch/aarch64/parasite-head.S
@@ -0,0 +1,21 @@
+#include "asm/linkage.h"
+#include "parasite.h"
+
+ .section .head.text, "ax"
+ENTRY(__export_parasite_head_start)
+ adr x2, __export_parasite_head_start // get the address of this instruction
+
+ ldr x0, __export_parasite_cmd
+
+ ldr x1, parasite_args_ptr
+ add x1, x1, x2 // fixup __export_parasite_args
+
+ bl parasite_service
+ brk #0 // the instruction BRK #0 generates the signal SIGTRAP in Linux
+
+parasite_args_ptr:
+ .quad __export_parasite_args
+
+__export_parasite_cmd:
+ .quad 0
+END(__export_parasite_head_start)
diff --git a/criu/arch/aarch64/restorer.c b/criu/arch/aarch64/restorer.c
new file mode 100644
index 000000000000..2c61e2d03109
--- /dev/null
+++ b/criu/arch/aarch64/restorer.c
@@ -0,0 +1,15 @@
+#include <unistd.h>
+
+#include "restorer.h"
+#include "asm/restorer.h"
+#include "asm/string.h"
+
+#include "syscall.h"
+#include "log.h"
+#include "asm/fpu.h"
+#include "cpu.h"
+
+int restore_nonsigframe_gpregs(UserRegsEntry *r)
+{
+ return 0;
+}
diff --git a/criu/arch/aarch64/syscalls/syscall-common.S b/criu/arch/aarch64/syscalls/syscall-common.S
new file mode 100644
index 000000000000..81ec20f5516c
--- /dev/null
+++ b/criu/arch/aarch64/syscalls/syscall-common.S
@@ -0,0 +1,19 @@
+#include "asm/linkage.h"
+
+syscall_common:
+ svc #0
+ ret
+
+
+.macro syscall name, nr
+ ENTRY(\name)
+ mov x8, \nr
+ b syscall_common
+ END(\name)
+.endm
+
+
+ENTRY(__cr_restore_rt)
+ mov x8, __NR_rt_sigreturn
+ svc #0
+END(__cr_restore_rt)
diff --git a/criu/arch/aarch64/syscalls/syscall.def b/criu/arch/aarch64/syscalls/syscall.def
new file mode 120000
index 000000000000..e9370a6e5485
--- /dev/null
+++ b/criu/arch/aarch64/syscalls/syscall.def
@@ -0,0 +1 @@
+../../arm/syscalls/syscall.def
\ No newline at end of file
diff --git a/criu/arch/aarch64/vdso-pie.c b/criu/arch/aarch64/vdso-pie.c
new file mode 100644
index 000000000000..0f06c2d191d1
--- /dev/null
+++ b/criu/arch/aarch64/vdso-pie.c
@@ -0,0 +1,35 @@
+#include <unistd.h>
+
+#include "asm/string.h"
+#include "asm/types.h"
+
+#include "syscall.h"
+#include "parasite-vdso.h"
+#include "log.h"
+#include "bug.h"
+
+#ifdef LOG_PREFIX
+# undef LOG_PREFIX
+#endif
+#define LOG_PREFIX "vdso: "
+
+int vdso_redirect_calls(unsigned long base_to, unsigned long base_from,
+ struct vdso_symtable *to,
+ struct vdso_symtable *from)
+{
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(to->symbols); i++) {
+ if (vdso_symbol_empty(&from->symbols[i]))
+ continue;
+
+ pr_debug("br: %lx/%lx -> %lx/%lx (index %d)\n",
+ base_from, from->symbols[i].offset,
+ base_to, to->symbols[i].offset, i);
+
+ write_intraprocedure_branch(base_to + to->symbols[i].offset,
+ base_from + from->symbols[i].offset);
+ }
+
+ return 0;
+}
diff --git a/criu/arch/arm/Makefile b/criu/arch/arm/Makefile
new file mode 100644
index 000000000000..5db577340a7d
--- /dev/null
+++ b/criu/arch/arm/Makefile
@@ -0,0 +1,6 @@
+builtin-name := crtools.built-in.o
+
+ccflags-y += -iquote $(obj) -iquote $(SRC_DIR) -iquote $(obj)/include -iquote $(SRC_DIR)/criu/include
+
+obj-y += cpu.o
+obj-y += crtools.o
diff --git a/criu/arch/arm/Makefile.syscalls b/criu/arch/arm/Makefile.syscalls
new file mode 100644
index 000000000000..527960602c79
--- /dev/null
+++ b/criu/arch/arm/Makefile.syscalls
@@ -0,0 +1,50 @@
+builtin-name := syscalls.built-in.o
+
+SYS-TYPES := ../../include/syscall-types.h
+SYS-CODES := ../../include/syscall-codes.h
+SYS-PROTO := ../../include/syscall.h
+
+SYS-DEF := syscall.def
+SYS-ASM-COMMON := syscall-common.S
+
+SYS-GEN := ../scripts/arm/gen-syscalls.pl
+SYS-GEN-TBL := ../scripts/arm/gen-sys-exec-tbl.pl
+
+asflags-y += -D__ASSEMBLY__ -nostdlib -fomit-frame-pointer
+asflags-y += -fpie -Wstrict-prototypes -Wa,--noexecstack
+asflags-y += -iquote $(obj) -iquote $(obj)/include -iquote $(SRC_DIR)/criu/include
+
+SYS-ASM := syscalls.S
+obj-y += $(SYS-ASM:.S=).o
+
+ARCH_BITS := 32
+
+SYS-EXEC-TBL := sys-exec-tbl.c
+
+$(obj)/$(SYS-ASM): $(obj)/$(SYS-GEN) $(obj)/syscalls/$(SYS-DEF) $(obj)/syscalls/$(SYS-ASM-COMMON)
+ $(E) " GEN " $@
+ $(Q) perl \
+ $(obj)/$(SYS-GEN) \
+ $(obj)/syscalls/$(SYS-DEF) \
+ $(obj)/$(SYS-CODES) \
+ $(obj)/$(SYS-PROTO) \
+ $(obj)/$(SYS-ASM) \
+ syscalls/$(SYS-ASM-COMMON) \
+ $(obj)/$(SYS-TYPES) \
+ $(ARCH_BITS)
+
+$(obj)/syscalls.o: $(obj)/$(SYS-ASM)
+
+$(obj)/$(SYS-EXEC-TBL): $(obj)/$(SYS-GEN-TBL) $(obj)/syscalls/$(SYS-DEF)
+ $(E) " GEN " $@
+ $(Q) perl \
+ $(obj)/$(SYS-GEN-TBL) \
+ $(obj)/syscalls/$(SYS-DEF) \
+ $(obj)/$(SYS-EXEC-TBL) \
+ $(ARCH_BITS)
+
+all-y += $(obj)/$(SYS-EXEC-TBL)
+
+cleanup-y += $(obj)/$(SYS-EXEC-TBL) $(obj)/$(SYS-ASM)
+cleanup-y += $(obj)/$(SYS-CODES)
+cleanup-y += $(obj)/$(SYS-PROTO)
diff --git a/criu/arch/arm/cpu.c b/criu/arch/arm/cpu.c
new file mode 100644
index 000000000000..040fe14fcfb7
--- /dev/null
+++ b/criu/arch/arm/cpu.c
@@ -0,0 +1,45 @@
+#undef LOG_PREFIX
+#define LOG_PREFIX "cpu: "
+
+#include <errno.h>
+#include "cpu.h"
+
+bool cpu_has_feature(unsigned int feature)
+{
+ return false;
+}
+
+int cpu_init(void)
+{
+ return 0;
+}
+
+int cpu_dump_cpuinfo(void)
+{
+ return 0;
+}
+
+int cpu_validate_cpuinfo(void)
+{
+ return 0;
+}
+
+int cpu_dump_cpuinfo_single(void)
+{
+ return -ENOTSUP;
+}
+
+int cpu_validate_image_cpuinfo_single(void)
+{
+ return -ENOTSUP;
+}
+
+int cpuinfo_dump(void)
+{
+ return -ENOTSUP;
+}
+
+int cpuinfo_check(void)
+{
+ return -ENOTSUP;
+}
diff --git a/criu/arch/arm/crtools.c b/criu/arch/arm/crtools.c
new file mode 100644
index 000000000000..8ce889463f6a
--- /dev/null
+++ b/criu/arch/arm/crtools.c
@@ -0,0 +1,248 @@
+#include <string.h>
+#include <unistd.h>
+
+#include "asm/types.h"
+#include "asm/restorer.h"
+#include "compiler.h"
+#include "ptrace.h"
+#include "asm/processor-flags.h"
+#include "protobuf.h"
+#include "protobuf/core.pb-c.h"
+#include "protobuf/creds.pb-c.h"
+#include "parasite-syscall.h"
+#include "log.h"
+#include "util.h"
+#include "cpu.h"
+#include "elf.h"
+#include "parasite-syscall.h"
+#include "restorer.h"
+#include "errno.h"
+#include "kerndat.h"
+
+
+/*
+ * Injected syscall instruction
+ */
+const char code_syscall[] = {
+ 0x00, 0x00, 0x00, 0xef, /* SVC #0 */
+ 0xf0, 0x01, 0xf0, 0xe7 /* UDF #32 */
+};
+
+const int code_syscall_size = round_up(sizeof(code_syscall), sizeof(long));
+
+static inline __always_unused void __check_code_syscall(void)
+{
+ BUILD_BUG_ON(sizeof(code_syscall) != BUILTIN_SYSCALL_SIZE);
+ BUILD_BUG_ON(!is_log2(sizeof(code_syscall)));
+}
+
+void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs)
+{
+ regs->ARM_pc = new_ip;
+ if (stack)
+ regs->ARM_sp = (unsigned long)stack;
+
+ /* Make sure flags are in known state */
+ regs->ARM_cpsr &= PSR_f | PSR_s | PSR_x | MODE32_BIT;
+}
+
+bool arch_can_dump_task(pid_t pid)
+{
+ /*
+ * TODO: Add proper check here
+ */
+ return true;
+}
+
+int syscall_seized(struct parasite_ctl *ctl, int nr, unsigned long *ret,
+		unsigned long arg1,
+		unsigned long arg2,
+		unsigned long arg3,
+		unsigned long arg4,
+		unsigned long arg5,
+		unsigned long arg6)
+{
+	user_regs_struct_t regs = ctl->orig.regs;	/* scratch copy; ctl->orig.regs is not modified here */
+	int err;
+	/* Run syscall @nr with @arg1..@arg6 via @ctl: number in r7, arguments in r0..r5. */
+	regs.ARM_r7 = (unsigned long)nr;
+	regs.ARM_r0 = arg1;
+	regs.ARM_r1 = arg2;
+	regs.ARM_r2 = arg3;
+	regs.ARM_r3 = arg4;
+	regs.ARM_r4 = arg5;
+	regs.ARM_r5 = arg6;
+	/* The helper receives the address of the local register copy. */
+	err = __parasite_execute_syscall(ctl, &regs);
+	/* r0 of the copy is reported through @ret; the helper's status is returned as-is. */
+	*ret = regs.ARM_r0;
+	return err;
+}
+
+#define assign_reg(dst, src, e) dst->e = (__typeof__(dst->e))src.ARM_##e
+
+#define PTRACE_GETVFPREGS 27
+int get_task_regs(pid_t pid, user_regs_struct_t regs, CoreEntry *core)
+{
+ struct user_vfp vfp;
+ int ret = -1;
+
+ pr_info("Dumping GP/FPU registers for %d\n", pid);
+
+ if (ptrace(PTRACE_GETVFPREGS, pid, NULL, &vfp)) {
+ pr_perror("Can't obtain FPU registers for %d", pid);
+ goto err;
+ }
+
+ /* Did we come from a system call? */
+ if ((int)regs.ARM_ORIG_r0 >= 0) {
+ /* Restart the system call */
+ switch ((long)(int)regs.ARM_r0) {
+ case -ERESTARTNOHAND:
+ case -ERESTARTSYS:
+ case -ERESTARTNOINTR:
+ regs.ARM_r0 = regs.ARM_ORIG_r0;
+ regs.ARM_pc -= 4;
+ break;
+ case -ERESTART_RESTARTBLOCK:
+ regs.ARM_r0 = __NR_restart_syscall;
+ regs.ARM_pc -= 4;
+ break;
+ }
+ }
+
+
+ // Save the ARM CPU state
+
+ assign_reg(core->ti_arm->gpregs, regs, r0);
+ assign_reg(core->ti_arm->gpregs, regs, r1);
+ assign_reg(core->ti_arm->gpregs, regs, r2);
+ assign_reg(core->ti_arm->gpregs, regs, r3);
+ assign_reg(core->ti_arm->gpregs, regs, r4);
+ assign_reg(core->ti_arm->gpregs, regs, r5);
+ assign_reg(core->ti_arm->gpregs, regs, r6);
+ assign_reg(core->ti_arm->gpregs, regs, r7);
+ assign_reg(core->ti_arm->gpregs, regs, r8);
+ assign_reg(core->ti_arm->gpregs, regs, r9);
+ assign_reg(core->ti_arm->gpregs, regs, r10);
+ assign_reg(core->ti_arm->gpregs, regs, fp);
+ assign_reg(core->ti_arm->gpregs, regs, ip);
+ assign_reg(core->ti_arm->gpregs, regs, sp);
+ assign_reg(core->ti_arm->gpregs, regs, lr);
+ assign_reg(core->ti_arm->gpregs, regs, pc);
+ assign_reg(core->ti_arm->gpregs, regs, cpsr);
+ core->ti_arm->gpregs->orig_r0 = regs.ARM_ORIG_r0;
+
+
+ // Save the VFP state
+
+ memcpy(CORE_THREAD_ARCH_INFO(core)->fpstate->vfp_regs, &vfp.fpregs, sizeof(vfp.fpregs));
+ CORE_THREAD_ARCH_INFO(core)->fpstate->fpscr = vfp.fpscr;
+
+ ret = 0;
+
+err:
+ return ret;
+}
+
+int arch_alloc_thread_info(CoreEntry *core)
+{
+	ThreadInfoArm *ti_arm;
+	UserArmRegsEntry *gpregs;
+	UserArmVfpstateEntry *fpstate;
+	/* Allocate the ARM thread info (GP and VFP register entries) for @core; returns 0 on success, -1 on failure. */
+	ti_arm = xmalloc(sizeof(*ti_arm));
+	if (!ti_arm)
+		goto err;
+	thread_info_arm__init(ti_arm);
+	core->ti_arm = ti_arm;
+	gpregs = xmalloc(sizeof(*gpregs));
+	if (!gpregs)	/* checked like ti_arm and fpstate: never hand a NULL entry to its init helper */
+		goto err;
+	user_arm_regs_entry__init(gpregs);
+	ti_arm->gpregs = gpregs;
+	fpstate = xmalloc(sizeof(*fpstate));
+	if (!fpstate)
+		goto err;
+	user_arm_vfpstate_entry__init(fpstate);
+	ti_arm->fpstate = fpstate;
+	fpstate->vfp_regs = xmalloc(32*sizeof(unsigned long long));
+	fpstate->n_vfp_regs = 32;
+	if (!fpstate->vfp_regs)
+		goto err;
+
+	return 0;
+err:	/* nothing is freed here; partial state stays attached to @core (presumably for arch_free_thread_info()) */
+	return -1;
+}
+
+void arch_free_thread_info(CoreEntry *core)
+{
+ if (CORE_THREAD_ARCH_INFO(core)) {
+ if (CORE_THREAD_ARCH_INFO(core)->fpstate) {
+ xfree(CORE_THREAD_ARCH_INFO(core)->fpstate->vfp_regs);
+ xfree(CORE_THREAD_ARCH_INFO(core)->fpstate);
+ }
+ xfree(CORE_THREAD_ARCH_INFO(core)->gpregs);
+ xfree(CORE_THREAD_ARCH_INFO(core));
+ CORE_THREAD_ARCH_INFO(core) = NULL;
+ }
+}
+
+int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core)
+{
+ struct aux_sigframe *aux = (struct aux_sigframe *)&sigframe->sig.uc.uc_regspace;
+
+ memcpy(&aux->vfp.ufp.fpregs, CORE_THREAD_ARCH_INFO(core)->fpstate->vfp_regs, sizeof(aux->vfp.ufp.fpregs));
+ aux->vfp.ufp.fpscr = CORE_THREAD_ARCH_INFO(core)->fpstate->fpscr;
+ aux->vfp.magic = VFP_MAGIC;
+ aux->vfp.size = VFP_STORAGE_SIZE;
+ return 0;
+}
+
+void *mmap_seized(struct parasite_ctl *ctl,
+ void *addr, size_t length, int prot,
+ int flags, int fd, off_t offset)
+{
+ unsigned long map;
+ int err;
+ /* Issue mmap2 through syscall_seized() on @ctl; returns the address it reported, or NULL (0) on any failure. */
+ if (offset & ~PAGE_MASK) /* offsets that are not page-aligned are rejected up front */
+ return 0;
+ /* The offset is passed shifted right by 12 (NOTE(review): mmap2 presumably takes it in 4096-byte units - see mmap2(2)). */
+ err = syscall_seized(ctl, __NR_mmap2, &map,
+ (unsigned long)addr, length, prot, flags, fd, offset >> 12);
+ if (err < 0 || map > kdat.task_size) /* a result above kdat.task_size is treated as a failure too */
+ map = 0;
+
+ return (void *)map;
+}
+
+int restore_gpregs(struct rt_sigframe *f, UserArmRegsEntry *r)
+{
+#define CPREG1(d) f->sig.uc.uc_mcontext.arm_##d = r->d
+#define CPREG2(d, s) f->sig.uc.uc_mcontext.arm_##d = r->s
+
+ CPREG1(r0);
+ CPREG1(r1);
+ CPREG1(r2);
+ CPREG1(r3);
+ CPREG1(r4);
+ CPREG1(r5);
+ CPREG1(r6);
+ CPREG1(r7);
+ CPREG1(r8);
+ CPREG1(r9);
+ CPREG1(r10);
+ CPREG1(fp);
+ CPREG1(ip);
+ CPREG1(sp);
+ CPREG1(lr);
+ CPREG1(pc);
+ CPREG1(cpsr);
+
+#undef CPREG1
+#undef CPREG2
+
+ return 0;
+}
diff --git a/criu/arch/arm/include/asm/atomic.h b/criu/arch/arm/include/asm/atomic.h
new file mode 100644
index 000000000000..cd0df377245c
--- /dev/null
+++ b/criu/arch/arm/include/asm/atomic.h
@@ -0,0 +1,131 @@
+#ifndef __CR_ATOMIC_H__
+#define __CR_ATOMIC_H__
+
+#include "asm/processor.h"
+
+typedef struct {
+ int counter;
+} atomic_t;
+
+
+/* Copied from the Linux kernel header arch/arm/include/asm/atomic.h */
+
+#if defined(CONFIG_ARMV7)
+
+#define smp_mb() __asm__ __volatile__ ("dmb" : : : "memory")
+
+static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new)
+{
+ int oldval;
+ unsigned long res;
+ /* Store @new into the counter if it equals @old; returns the value that was loaded. */
+ smp_mb();
+ prefetchw(&ptr->counter);
+ /* Repeat for as long as the conditional store leaves res non-zero. */
+ do {
+ __asm__ __volatile__("@ atomic_cmpxchg\n"
+ "ldrex %1, [%3]\n" /* oldval = counter */
+ "mov %0, #0\n" /* res = 0 */
+ "teq %1, %4\n" /* compare oldval with old */
+ "strexeq %0, %5, [%3]\n" /* if equal: try counter = new, res = store status */
+ : "=&r" (res), "=&r" (oldval), "+Qo" (ptr->counter)
+ : "r" (&ptr->counter), "Ir" (old), "r" (new)
+ : "cc");
+ } while (res);
+
+ smp_mb();
+
+ return oldval;
+}
+
+#elif defined(CONFIG_ARMV6)
+
+/* SMP isn't supported for ARMv6 */
+
+#define smp_mb() __asm__ __volatile__ ("mcr p15, 0, %0, c7, c10, 5" : : "r" (0) : "memory")
+
+static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
+{
+ int prev = v->counter;
+
+ /* Plain compare-and-store; the value read beforehand is returned. */
+ if (prev == old)
+ v->counter = new;
+
+ return prev;
+}
+
+#else
+
+#error ARM architecture version (CONFIG_ARMV*) not set or unsupported.
+
+#endif
+
+static inline int atomic_read(const atomic_t *v)
+{
+ return (*(volatile int *)&(v)->counter);
+}
+
+static inline void atomic_set(atomic_t *v, int i)
+{
+ v->counter = i;
+}
+
+#define atomic_get atomic_read
+
+static inline int atomic_add_return(int i, atomic_t *v)
+{
+ unsigned long tmp;
+ int result;
+ /* Add @i to the counter in an ldrex/strex retry loop; returns the new value. */
+ smp_mb();
+
+ __asm__ __volatile__("@ atomic_add_return\n"
+"1: ldrex %0, [%3]\n" /* result = counter */
+" add %0, %0, %4\n" /* result += i */
+" strex %1, %0, [%3]\n" /* try counter = result, tmp = store status */
+" teq %1, #0\n"
+" bne 1b\n" /* loop again while tmp != 0 */
+ : "=&r" (result), "=&r" (tmp), "+Qo" (v->counter)
+ : "r" (&v->counter), "Ir" (i)
+ : "cc");
+
+ smp_mb();
+
+ return result;
+}
+
+static inline int atomic_sub_return(int i, atomic_t *v)
+{
+ unsigned long tmp;
+ int result;
+ /* Subtract @i from the counter in an ldrex/strex retry loop; returns the new value. */
+ smp_mb();
+
+ __asm__ __volatile__("@ atomic_sub_return\n"
+"1: ldrex %0, [%3]\n" /* result = counter */
+" sub %0, %0, %4\n" /* result -= i */
+" strex %1, %0, [%3]\n" /* try counter = result, tmp = store status */
+" teq %1, #0\n"
+" bne 1b\n" /* loop again while tmp != 0 */
+ : "=&r" (result), "=&r" (tmp), "+Qo" (v->counter)
+ : "r" (&v->counter), "Ir" (i)
+ : "cc");
+
+ smp_mb();
+
+ return result;
+}
+
+static inline int atomic_inc(atomic_t *v) { return atomic_add_return(1, v) - 1; } /* returns the value before the increment */
+
+static inline int atomic_add(int val, atomic_t *v) { return atomic_add_return(val, v) - val; } /* returns the value before the addition */
+
+static inline int atomic_dec(atomic_t *v) { return atomic_sub_return(1, v) + 1; } /* returns the value before the decrement */
+
+/* true if the result is 0, or false for all other cases. */
+#define atomic_dec_and_test(v) (atomic_sub_return(1, v) == 0)
+
+#define atomic_inc_return(v) (atomic_add_return(1, v))
+
+#endif /* __CR_ATOMIC_H__ */
diff --git a/criu/arch/arm/include/asm/bitops.h b/criu/arch/arm/include/asm/bitops.h
new file mode 100644
index 000000000000..5a750447f25f
--- /dev/null
+++ b/criu/arch/arm/include/asm/bitops.h
@@ -0,0 +1,7 @@
+#ifndef __CR_ASM_BITOPS_H__
+#define __CR_ASM_BITOPS_H__
+
+#include "compiler.h"
+#include "asm-generic/bitops.h"
+
+#endif /* __CR_ASM_BITOPS_H__ */
diff --git a/criu/arch/arm/include/asm/bitsperlong.h b/criu/arch/arm/include/asm/bitsperlong.h
new file mode 100644
index 000000000000..43858b765320
--- /dev/null
+++ b/criu/arch/arm/include/asm/bitsperlong.h
@@ -0,0 +1,6 @@
+#ifndef __CR_BITSPERLONG_H__
+#define __CR_BITSPERLONG_H__
+
+#define BITS_PER_LONG 32
+
+#endif /* __CR_BITSPERLONG_H__ */
diff --git a/criu/arch/arm/include/asm/cpu.h b/criu/arch/arm/include/asm/cpu.h
new file mode 100644
index 000000000000..59118c211d10
--- /dev/null
+++ b/criu/arch/arm/include/asm/cpu.h
@@ -0,0 +1 @@
+#include <stdbool.h>
diff --git a/criu/arch/arm/include/asm/dump.h b/criu/arch/arm/include/asm/dump.h
new file mode 100644
index 000000000000..ae1588da8792
--- /dev/null
+++ b/criu/arch/arm/include/asm/dump.h
@@ -0,0 +1,14 @@
+#ifndef __CR_ASM_DUMP_H__
+#define __CR_ASM_DUMP_H__
+
+extern int get_task_regs(pid_t pid, user_regs_struct_t regs, CoreEntry *core);
+extern int arch_alloc_thread_info(CoreEntry *core);
+extern void arch_free_thread_info(CoreEntry *core);
+
+/* Store @tls in the ARM thread info (ti_arm) of @core. */
+static inline void core_put_tls(CoreEntry *core, tls_t tls)
+{
+ core->ti_arm->tls = tls;
+}
+
+#endif
diff --git a/criu/arch/arm/include/asm/fpu.h b/criu/arch/arm/include/asm/fpu.h
new file mode 100644
index 000000000000..7f476d541a7d
--- /dev/null
+++ b/criu/arch/arm/include/asm/fpu.h
@@ -0,0 +1,4 @@
+#ifndef __CR_ASM_FPU_H__
+#define __CR_ASM_FPU_H__
+
+#endif /* __CR_ASM_FPU_H__ */
diff --git a/criu/arch/arm/include/asm/int.h b/criu/arch/arm/include/asm/int.h
new file mode 100644
index 000000000000..642804e9b485
--- /dev/null
+++ b/criu/arch/arm/include/asm/int.h
@@ -0,0 +1,6 @@
+#ifndef __CR_ASM_INT_H__
+#define __CR_ASM_INT_H__
+
+#include "asm-generic/int.h"
+
+#endif /* __CR_ASM_INT_H__ */
diff --git a/criu/arch/arm/include/asm/linkage.h b/criu/arch/arm/include/asm/linkage.h
new file mode 100644
index 000000000000..7380642337a0
--- /dev/null
+++ b/criu/arch/arm/include/asm/linkage.h
@@ -0,0 +1,24 @@
+#ifndef __CR_LINKAGE_H__
+#define __CR_LINKAGE_H__
+
+#ifdef __ASSEMBLY__
+
+#define __ALIGN .align 4, 0x00
+#define __ALIGN_STR ".align 4, 0x00"
+
+#define GLOBAL(name) \
+ .globl name; \
+ name:
+
+#define ENTRY(name) \
+ .globl name; \
+ .type name, #function; \
+ __ALIGN; \
+ name:
+
+#define END(sym) \
+ .size sym, . - sym
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* __CR_LINKAGE_H__ */
diff --git a/criu/arch/arm/include/asm/page.h b/criu/arch/arm/include/asm/page.h
new file mode 100644
index 000000000000..134835556c62
--- /dev/null
+++ b/criu/arch/arm/include/asm/page.h
@@ -0,0 +1,19 @@
+#ifndef __CR_ASM_PAGE_H__
+#define __CR_ASM_PAGE_H__
+
+#ifndef PAGE_SHIFT
+# define PAGE_SHIFT 12
+#endif
+
+#ifndef PAGE_SIZE
+# define PAGE_SIZE (1UL << PAGE_SHIFT)
+#endif
+
+#ifndef PAGE_MASK
+# define PAGE_MASK (~(PAGE_SIZE - 1))
+#endif
+
+#define PAGE_PFN(addr) ((addr) / PAGE_SIZE)
+#define page_size() PAGE_SIZE
+
+#endif /* __CR_ASM_PAGE_H__ */
diff --git a/criu/arch/arm/include/asm/parasite-syscall.h b/criu/arch/arm/include/asm/parasite-syscall.h
new file mode 100644
index 000000000000..0c66bf992cad
--- /dev/null
+++ b/criu/arch/arm/include/asm/parasite-syscall.h
@@ -0,0 +1,18 @@
+#ifndef __CR_ASM_PARASITE_SYSCALL_H__
+#define __CR_ASM_PARASITE_SYSCALL_H__
+
+
+#define ARCH_SI_TRAP TRAP_BRKPT
+
+
+extern const char code_syscall[];
+extern const int code_syscall_size;
+
+
+void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs);
+
+void *mmap_seized(struct parasite_ctl *ctl,
+ void *addr, size_t length, int prot,
+ int flags, int fd, off_t offset);
+
+#endif
diff --git a/criu/arch/arm/include/asm/parasite.h b/criu/arch/arm/include/asm/parasite.h
new file mode 100644
index 000000000000..7f62bb9d27be
--- /dev/null
+++ b/criu/arch/arm/include/asm/parasite.h
@@ -0,0 +1,9 @@
+#ifndef __ASM_PARASITE_H__
+#define __ASM_PARASITE_H__
+
+static inline void arch_get_tls(tls_t *ptls)
+{
+ *ptls = ((tls_t (*)())0xffff0fe0)(); /* call the routine at 0xffff0fe0 and keep its result; presumably the kernel's __kuser_get_tls helper -- confirm */
+}
+
+#endif
diff --git a/criu/arch/arm/include/asm/processor-flags.h b/criu/arch/arm/include/asm/processor-flags.h
new file mode 100644
index 000000000000..fc00a9e64a2e
--- /dev/null
+++ b/criu/arch/arm/include/asm/processor-flags.h
@@ -0,0 +1,42 @@
+#ifndef __CR_PROCESSOR_FLAGS_H__
+#define __CR_PROCESSOR_FLAGS_H__
+
+/* Copied from the Linux kernel header arch/arm/include/uapi/asm/ptrace.h */
+
+/*
+ * PSR bits
+ */
+#define USR26_MODE 0x00000000
+#define FIQ26_MODE 0x00000001
+#define IRQ26_MODE 0x00000002
+#define SVC26_MODE 0x00000003
+#define USR_MODE 0x00000010
+#define FIQ_MODE 0x00000011
+#define IRQ_MODE 0x00000012
+#define SVC_MODE 0x00000013
+#define ABT_MODE 0x00000017
+#define UND_MODE 0x0000001b
+#define SYSTEM_MODE 0x0000001f
+#define MODE32_BIT 0x00000010
+#define MODE_MASK 0x0000001f
+#define PSR_T_BIT 0x00000020
+#define PSR_F_BIT 0x00000040
+#define PSR_I_BIT 0x00000080
+#define PSR_A_BIT 0x00000100
+#define PSR_E_BIT 0x00000200
+#define PSR_J_BIT 0x01000000
+#define PSR_Q_BIT 0x08000000
+#define PSR_V_BIT 0x10000000
+#define PSR_C_BIT 0x20000000
+#define PSR_Z_BIT 0x40000000
+#define PSR_N_BIT 0x80000000
+
+/*
+ * Groups of PSR bits
+ */
+#define PSR_f 0xff000000 /* Flags */
+#define PSR_s 0x00ff0000 /* Status */
+#define PSR_x 0x0000ff00 /* Extension */
+#define PSR_c 0x000000ff /* Control */
+
+#endif
diff --git a/criu/arch/arm/include/asm/processor.h b/criu/arch/arm/include/asm/processor.h
new file mode 100644
index 000000000000..a390cfd322ec
--- /dev/null
+++ b/criu/arch/arm/include/asm/processor.h
@@ -0,0 +1,28 @@
+#ifndef __CR_PROCESSOR_H__
+#define __CR_PROCESSOR_H__
+
+/* Copied from linux kernel arch/arm/include/asm/unified.h */
+
+#define WASM(instr) #instr
+
+/* Copied from linux kernel arch/arm/include/asm/processor.h */
+
+#define __ALT_SMP_ASM(smp, up) \
+ "9998: " smp "\n" \
+ " .pushsection \".alt.smp.init\", \"a\"\n" \
+ " .long 9998b\n" \
+ " " up "\n" \
+ " .popsection\n"
+
+static inline void prefetchw(const void *ptr)
+{
+ __asm__ __volatile__( /* no outputs; @ptr is the only input operand */
+ ".arch_extension mp\n"
+ __ALT_SMP_ASM(
+ WASM(pldw) "\t%a0", /* smp variant: emitted in place at label 9998 */
+ WASM(pld) "\t%a0" /* up variant: recorded in the .alt.smp.init section */
+ )
+ :: "p" (ptr));
+}
+
+#endif /* __CR_PROCESSOR_H__ */
diff --git a/criu/arch/arm/include/asm/restore.h b/criu/arch/arm/include/asm/restore.h
new file mode 100644
index 000000000000..a1e66a5d5aab
--- /dev/null
+++ b/criu/arch/arm/include/asm/restore.h
@@ -0,0 +1,29 @@
+#ifndef __CR_ASM_RESTORE_H__
+#define __CR_ASM_RESTORE_H__
+
+#include "asm/restorer.h"
+
+#include "protobuf/core.pb-c.h"
+
+#define JUMP_TO_RESTORER_BLOB(new_sp, restore_task_exec_start, \
+ task_args) \
+ asm volatile( \
+ "mov %%sp, %%%0 \n" \
+ "mov %%r1, %%%1 \n" \
+ "mov %%r0, %%%2 \n" \
+ "bx %%r1 \n" \
+ : \
+ : "r"(new_sp), \
+ "r"(restore_task_exec_start), \
+ "r"(task_args) \
+ : "sp", "r0", "r1", "memory")
+
+static inline void core_get_tls(CoreEntry *pcore, tls_t *ptls)
+{
+ *ptls = pcore->ti_arm->tls; /* copy the TLS value out of the ARM thread info of @pcore */
+}
+
+
+int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core);
+
+#endif
diff --git a/criu/arch/arm/include/asm/restorer.h b/criu/arch/arm/include/asm/restorer.h
new file mode 100644
index 000000000000..8acb2d3e7db0
--- /dev/null
+++ b/criu/arch/arm/include/asm/restorer.h
@@ -0,0 +1,163 @@
+#ifndef __CR_ASM_RESTORER_H__
+#define __CR_ASM_RESTORER_H__
+
+#include "asm/types.h"
+#include "protobuf/core.pb-c.h"
+
+/* Copied from the Linux kernel header arch/arm/include/asm/sigcontext.h */
+
+struct rt_sigcontext {
+ unsigned long trap_no;
+ unsigned long error_code;
+ unsigned long oldmask;
+ unsigned long arm_r0;
+ unsigned long arm_r1;
+ unsigned long arm_r2;
+ unsigned long arm_r3;
+ unsigned long arm_r4;
+ unsigned long arm_r5;
+ unsigned long arm_r6;
+ unsigned long arm_r7;
+ unsigned long arm_r8;
+ unsigned long arm_r9;
+ unsigned long arm_r10;
+ unsigned long arm_fp;
+ unsigned long arm_ip;
+ unsigned long arm_sp;
+ unsigned long arm_lr;
+ unsigned long arm_pc;
+ unsigned long arm_cpsr;
+ unsigned long fault_address;
+};
+
+/* Copied from the Linux kernel header arch/arm/include/asm/ucontext.h */
+
+#define VFP_MAGIC 0x56465001
+#define VFP_STORAGE_SIZE sizeof(struct vfp_sigframe)
+
+struct vfp_sigframe {
+ unsigned long magic;
+ unsigned long size;
+ struct user_vfp ufp;
+ struct user_vfp_exc ufp_exc;
+};
+
+typedef struct vfp_sigframe fpu_state_t;
+
+struct aux_sigframe {
+ /*
+ struct crunch_sigframe crunch;
+ struct iwmmxt_sigframe iwmmxt;
+ */
+
+ struct vfp_sigframe vfp;
+ unsigned long end_magic;
+} __attribute__((__aligned__(8)));
+
+#include "sigframe.h"
+
+struct sigframe {
+ struct rt_ucontext uc;
+ unsigned long retcode[2];
+};
+
+struct rt_sigframe {
+ struct rt_siginfo info;
+ struct sigframe sig;
+};
+
+
+#define ARCH_RT_SIGRETURN(new_sp) \
+ asm volatile( \
+ "mov %%sp, %0 \n" \
+ "mov %%r7, #"__stringify(__NR_rt_sigreturn)" \n" \
+ "svc #0 \n" \
+ : \
+ : "r"(new_sp) \
+ : "sp","memory")
+
+#define RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, \
+ thread_args, clone_restore_fn) \
+ asm volatile( \
+ "clone_emul: \n" \
+ "ldr %%r1, %2 \n" \
+ "sub %%r1, #16 \n" \
+ "mov %%r0, %%%6 \n" \
+ "str %%r0, [%%r1, #4] \n" \
+ "mov %%r0, %%%5 \n" \
+ "str %%r0, [%%r1] \n" \
+ "mov %%r0, %%%1 \n" \
+ "mov %%r2, %%%3 \n" \
+ "mov %%r3, %%%4 \n" \
+ "mov %%r7, #"__stringify(__NR_clone)" \n" \
+ "svc #0 \n" \
+ \
+ "cmp %%r0, #0 \n" \
+ "beq thread_run \n" \
+ \
+ "mov %%%0, %%r0 \n" \
+ "b clone_end \n" \
+ \
+ "thread_run: \n" \
+ "pop { %%r1 } \n" \
+ "pop { %%r0 } \n" \
+ "bx %%r1 \n" \
+ \
+ "clone_end: \n" \
+ : "=r"(ret) \
+ : "r"(clone_flags), \
+ "m"(new_sp), \
+ "r"(&parent_tid), \
+ "r"(&thread_args[i].pid), \
+ "r"(clone_restore_fn), \
+ "r"(&thread_args[i]) \
+ : "r0", "r1", "r2", "r3", "r7", "memory")
+
+#define ARCH_FAIL_CORE_RESTORE \
+ asm volatile( \
+ "mov %%sp, %0 \n" \
+ "mov %%r0, #0 \n" \
+ "bx %%r0 \n" \
+ : \
+ : "r"(ret) \
+ : "memory")
+
+/* Accessors on a struct rt_sigframe pointer; the argument is parenthesized so any pointer expression works. */
+#define RT_SIGFRAME_UC(rt_sigframe) (rt_sigframe)->sig.uc
+#define RT_SIGFRAME_REGIP(rt_sigframe) (rt_sigframe)->sig.uc.uc_mcontext.arm_ip
+#define RT_SIGFRAME_HAS_FPU(rt_sigframe) 1
+#define RT_SIGFRAME_FPU(rt_sigframe) ((struct aux_sigframe *)&(rt_sigframe)->sig.uc.uc_regspace)->vfp
+
+#define SIGFRAME_OFFSET 0
+
+
+int restore_gpregs(struct rt_sigframe *f, UserArmRegsEntry *r);
+int restore_nonsigframe_gpregs(UserArmRegsEntry *r);
+
+static inline int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, fpu_state_t *fpu_state) { return 0; }
+
+static inline void restore_tls(tls_t *ptls) {
+ asm (
+ "mov %%r7, #15 \n"
+ "lsl %%r7, #16 \n"
+ "mov %%r0, #5 \n"
+ "add %%r7, %%r0 \n" /* r7 = (15 << 16) + 5 = 0xF0005, i.e. __ARM_NR_set_tls */
+ "ldr %%r0, [%0] \n" /* r0 = *ptls */
+ "svc #0 \n"
+ :
+ : "r"(ptls)
+ : "r0", "r7"
+ );
+}
+
+static inline int ptrace_set_breakpoint(pid_t pid, void *addr)
+{
+ return 0; /* stub: @pid and @addr are ignored, 0 is always returned */
+}
+
+static inline int ptrace_flush_breakpoints(pid_t pid)
+{
+ return 0; /* stub: @pid is ignored, 0 is always returned */
+}
+
+#endif
diff --git a/criu/arch/arm/include/asm/string.h b/criu/arch/arm/include/asm/string.h
new file mode 100644
index 000000000000..2c3a34fbbd3f
--- /dev/null
+++ b/criu/arch/arm/include/asm/string.h
@@ -0,0 +1,7 @@
+#ifndef __CR_ASM_STRING_H__
+#define __CR_ASM_STRING_H__
+
+#include "compiler.h"
+#include "asm-generic/string.h"
+
+#endif /* __CR_ASM_STRING_H__ */
diff --git a/criu/arch/arm/include/asm/syscall-aux.S b/criu/arch/arm/include/asm/syscall-aux.S
new file mode 100644
index 000000000000..8bc01c3eccb2
--- /dev/null
+++ b/criu/arch/arm/include/asm/syscall-aux.S
@@ -0,0 +1,13 @@
+nr_sys_mmap:
+ .long 192
+
+ENTRY(sys_mmap)
+ push {%r4, %r5, %r7, %lr}
+ ldr %r4, [%sp, #16]
+ ldr %r5, [%sp, #20]
+ lsr %r5, #12
+ adr %r7, nr_sys_mmap
+ ldr %r7, [%r7]
+ svc 0x00000000
+ pop {%r4, %r5, %r7, %pc}
+END(sys_mmap)
diff --git a/criu/arch/arm/include/asm/syscall-aux.h b/criu/arch/arm/include/asm/syscall-aux.h
new file mode 100644
index 000000000000..ec8c2d38352a
--- /dev/null
+++ b/criu/arch/arm/include/asm/syscall-aux.h
@@ -0,0 +1,8 @@
+#define __NR_mmap2 192
+
+#define __ARM_NR_BASE 0x0f0000
+#define __ARM_NR_breakpoint (__ARM_NR_BASE+1)
+#define __ARM_NR_cacheflush (__ARM_NR_BASE+2)
+#define __ARM_NR_usr26 (__ARM_NR_BASE+3)
+#define __ARM_NR_usr32 (__ARM_NR_BASE+4)
+#define __ARM_NR_set_tls (__ARM_NR_BASE+5)
diff --git a/criu/arch/arm/include/asm/types.h b/criu/arch/arm/include/asm/types.h
new file mode 100644
index 000000000000..d1d6cc3a2fa5
--- /dev/null
+++ b/criu/arch/arm/include/asm/types.h
@@ -0,0 +1,137 @@
+#ifndef __CR_ASM_TYPES_H__
+#define __CR_ASM_TYPES_H__
+
+#include <stdbool.h>
+#include <signal.h>
+#include "protobuf/core.pb-c.h"
+
+#include "asm/page.h"
+#include "asm/bitops.h"
+#include "asm/int.h"
+
+#define SIGMAX 64
+#define SIGMAX_OLD 31
+
+#define MAJOR(dev) ((dev)>>8)
+#define MINOR(dev) ((dev) & 0xff)
+
+typedef void rt_signalfn_t(int, siginfo_t *, void *);
+typedef rt_signalfn_t *rt_sighandler_t;
+
+typedef void rt_restorefn_t(void);
+typedef rt_restorefn_t *rt_sigrestore_t;
+
+#define _KNSIG 64
+#define _NSIG_BPW 32
+/* Number of unsigned long words held by a k_rtsigset_t. */
+#define _KNSIG_WORDS (_KNSIG / _NSIG_BPW)
+
+typedef struct {
+ unsigned long sig[_KNSIG_WORDS];
+} k_rtsigset_t;
+
+static inline void ksigfillset(k_rtsigset_t *set)
+{
+ int word = _KNSIG_WORDS;
+ while (word-- > 0) /* walk the words from last to first */
+ set->sig[word] = ~0UL; /* all bits set */
+}
+
+#define SA_RESTORER 0x04000000
+
+typedef struct {
+ rt_sighandler_t rt_sa_handler;
+ unsigned long rt_sa_flags;
+ rt_sigrestore_t rt_sa_restorer;
+ k_rtsigset_t rt_sa_mask;
+} rt_sigaction_t;
+
+/*
+ * Copied from the Linux kernel header arch/arm/include/asm/ptrace.h
+ *
+ * A thread ARM CPU context
+ */
+
+typedef struct {
+ long uregs[18];
+} user_regs_struct_t;
+
+#define ARM_cpsr uregs[16]
+#define ARM_pc uregs[15]
+#define ARM_lr uregs[14]
+#define ARM_sp uregs[13]
+#define ARM_ip uregs[12]
+#define ARM_fp uregs[11]
+#define ARM_r10 uregs[10]
+#define ARM_r9 uregs[9]
+#define ARM_r8 uregs[8]
+#define ARM_r7 uregs[7]
+#define ARM_r6 uregs[6]
+#define ARM_r5 uregs[5]
+#define ARM_r4 uregs[4]
+#define ARM_r3 uregs[3]
+#define ARM_r2 uregs[2]
+#define ARM_r1 uregs[1]
+#define ARM_r0 uregs[0]
+#define ARM_ORIG_r0 uregs[17]
+
+
+/* Copied from arch/arm/include/asm/user.h */
+
+struct user_vfp {
+ unsigned long long fpregs[32];
+ unsigned long fpscr;
+};
+
+struct user_vfp_exc {
+ unsigned long fpexc;
+ unsigned long fpinst;
+ unsigned long fpinst2;
+};
+
+#define ASSIGN_TYPED(a, b) do { (a) = (typeof(a))(b); } while (0) /* b is parenthesized so the cast covers the whole expression */
+#define ASSIGN_MEMBER(a,b,m) do { ASSIGN_TYPED((a)->m, (b)->m); } while (0) /* copy member m from *b into *a, cast to the destination type */
+
+#define REG_RES(regs) ((regs).ARM_r0)
+#define REG_IP(regs) ((regs).ARM_pc)
+#define REG_SYSCALL_NR(regs) ((regs).ARM_r7)
+
+/*
+ * Range for task size calculated from the following Linux kernel files:
+ * arch/arm/include/asm/memory.h
+ * arch/arm/Kconfig (PAGE_OFFSET values in Memory split section)
+ */
+#define TASK_SIZE_MIN 0x3f000000
+#define TASK_SIZE_MAX 0xbf000000
+#define SZ_1G 0x40000000
+
+int munmap(void *addr, size_t length);
+
+static inline unsigned long task_size(void)
+{
+ unsigned long limit = TASK_SIZE_MIN;
+
+ /* Step up by SZ_1G until munmap() of the candidate page fails or TASK_SIZE_MAX is reached. */
+ while (limit < TASK_SIZE_MAX && !munmap((void *)limit, page_size()))
+ limit += SZ_1G;
+
+ return limit;
+}
+
+#define AT_VECTOR_SIZE 40
+
+typedef UserArmRegsEntry UserRegsEntry;
+
+#define CORE_ENTRY__MARCH CORE_ENTRY__MARCH__ARM
+
+#define CORE_THREAD_ARCH_INFO(core) core->ti_arm
+
+#define TI_SP(core) ((core)->ti_arm->gpregs->sp)
+
+typedef u32 auxv_t;
+typedef u32 tls_t;
+
+static inline void *decode_pointer(u64 v) { return (void*)(u32)v; }
+static inline u64 encode_pointer(void *p) { return (u32)p; }
+
+#endif /* __CR_ASM_TYPES_H__ */
diff --git a/criu/arch/arm/parasite-head.S b/criu/arch/arm/parasite-head.S
new file mode 100644
index 000000000000..b15fcbae275b
--- /dev/null
+++ b/criu/arch/arm/parasite-head.S
@@ -0,0 +1,23 @@
+#include "asm/linkage.h"
+#include "parasite.h"
+
+ .section .head.text, "ax"
+ENTRY(__export_parasite_head_start)
+ sub %r2, %pc, #8 @ get the address of this instruction
+
+ adr %r0, __export_parasite_cmd
+ ldr %r0, [%r0]
+
+ adr %r1, parasite_args_ptr
+ ldr %r1, [%r1]
+ add %r1, %r1, %r2 @ fixup __export_parasite_args
+
+ bl parasite_service
+ .byte 0xf0, 0x01, 0xf0, 0xe7 @ the instruction UDF #32 generates the signal SIGTRAP in Linux
+
+parasite_args_ptr:
+ .long __export_parasite_args
+
+__export_parasite_cmd:
+ .long 0
+END(__export_parasite_head_start)
diff --git a/criu/arch/arm/restorer.c b/criu/arch/arm/restorer.c
new file mode 100644
index 000000000000..786feeeb31bd
--- /dev/null
+++ b/criu/arch/arm/restorer.c
@@ -0,0 +1,15 @@
+#include <unistd.h>
+
+#include "restorer.h"
+#include "asm/restorer.h"
+#include "asm/string.h"
+
+#include "syscall.h"
+#include "log.h"
+#include "asm/fpu.h"
+#include "cpu.h"
+
+int restore_nonsigframe_gpregs(UserArmRegsEntry *r)
+{
+ return 0; /* stub: @r is not used, 0 is always returned */
+}
diff --git a/criu/arch/arm/syscalls/syscall-common.S b/criu/arch/arm/syscalls/syscall-common.S
new file mode 100644
index 000000000000..c3cbf71050f3
--- /dev/null
+++ b/criu/arch/arm/syscalls/syscall-common.S
@@ -0,0 +1,34 @@
+#include "asm/linkage.h"
+
+@ We use the register R8 unlike libc that uses R12.
+@ This avoids corruption of the register by the stub
+@ for the syscall sys_munmap() when syscalls are hooked
+@ by ptrace(). However we have to make sure that
+@ the compiler doesn't use the register on the route
+@ between parasite_service() and sys_munmap().
+
+syscall_common:
+ ldr %r7, [%r7]
+ add %r8, %sp, #24
+ ldm %r8, {%r4, %r5, %r6}
+ svc 0x00000000
+ pop {%r4, %r5, %r6, %r7, %r8, %pc}
+
+
+.macro syscall name, nr
+ .nr_\name :
+ .long \nr
+
+ ENTRY(\name)
+ push {%r4, %r5, %r6, %r7, %r8, %lr}
+ adr %r7, .nr_\name
+ b syscall_common
+ END(\name)
+.endm
+
+
+ENTRY(__cr_restore_rt)
+ adr %r7, .nr_sys_rt_sigreturn
+ ldr %r7, [%r7]
+ svc #0
+END(__cr_restore_rt)
diff --git a/criu/arch/arm/syscalls/syscall.def b/criu/arch/arm/syscalls/syscall.def
new file mode 100644
index 000000000000..5d57169acb14
--- /dev/null
+++ b/criu/arch/arm/syscalls/syscall.def
@@ -0,0 +1,107 @@
+#
+# System calls table; please make sure the table contains only the syscalls
+# actually used somewhere in the project.
+#
+# The template is (name and arguments are optional if you need only __NR_x
+# defined, but no real entry point in the syscalls lib).
+#
+# name/alias code64 code32 arguments
+# -----------------------------------------------------------------------
+#
+read 63 3 (int fd, void *buf, unsigned long count)
+write 64 4 (int fd, const void *buf, unsigned long count)
+open ! 5 (const char *filename, unsigned long flags, unsigned long mode)
+close 57 6 (int fd)
+lseek 62 19 (int fd, unsigned long offset, unsigned long origin)
+mmap 222 ! (void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset)
+mprotect 226 125 (const void *addr, unsigned long len, unsigned long prot)
+munmap 215 91 (void *addr, unsigned long len)
+brk 214 45 (void *addr)
+rt_sigaction sigaction 134 174 (int signum, const rt_sigaction_t *act, rt_sigaction_t *oldact, size_t sigsetsize)
+rt_sigprocmask sigprocmask 135 175 (int how, k_rtsigset_t *set, k_rtsigset_t *old, size_t sigsetsize)
+rt_sigreturn 139 173 (void)
+ioctl 29 54 (unsigned int fd, unsigned int cmd, unsigned long arg)
+pread64 67 180 (unsigned int fd, char *buf, size_t count, loff_t pos)
+ptrace 117 26 (long request, pid_t pid, void *addr, void *data)
+mremap 216 163 (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flag, unsigned long new_addr)
+mincore 232 219 (void *addr, unsigned long size, unsigned char *vec)
+madvise 233 220 (unsigned long start, size_t len, int behavior)
+shmat 196 305 (int shmid, void *shmaddr, int shmflag)
+pause 1061 29 (void)
+nanosleep 101 162 (struct timespec *req, struct timespec *rem)
+getitimer 102 105 (int which, const struct itimerval *val)
+setitimer 103 104 (int which, const struct itimerval *val, struct itimerval *old)
+getpid 172 20 (void)
+socket 198 281 (int domain, int type, int protocol)
+connect 203 283 (int sockfd, struct sockaddr *addr, int addrlen)
+sendto 206 290 (int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len)
+recvfrom 207 292 (int sockfd, void *ubuf, size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len)
+sendmsg 211 296 (int sockfd, const struct msghdr *msg, int flags)
+recvmsg 212 297 (int sockfd, struct msghdr *msg, int flags)
+shutdown 210 293 (int sockfd, int how)
+bind 235 282 (int sockfd, const struct sockaddr *addr, int addrlen)
+setsockopt 208 294 (int sockfd, int level, int optname, const void *optval, socklen_t optlen)
+getsockopt 209 295 (int sockfd, int level, int optname, const void *optval, socklen_t *optlen)
+clone 220 120 (unsigned long flags, void *child_stack, void *parent_tid, void *child_tid)
+exit 93 1 (unsigned long error_code)
+wait4 260 114 (int pid, int *status, int options, struct rusage *ru)
+waitid 95 280 (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru)
+kill 129 37 (long pid, int sig)
+fcntl 25 55 (int fd, int type, long arg)
+flock 32 143 (int fd, unsigned long cmd)
+mkdir ! 39 (const char *name, int mode)
+rmdir ! 40 (const char *name)
+unlink ! 10 (char *pathname)
+readlinkat 78 332 (int fd, const char *path, char *buf, int bufsize)
+umask 166 60 (int mask)
+getgroups 158 205 (int gsize, unsigned int *groups)
+setgroups 159 206 (int gsize, unsigned int *groups)
+setresuid 147 164 (int uid, int euid, int suid)
+getresuid 148 165 (int *uid, int *euid, int *suid)
+setresgid 149 170 (int gid, int egid, int sgid)
+getresgid 150 171 (int *gid, int *egid, int *sgid)
+getpgid 155 132 (pid_t pid)
+setfsuid 151 138 (int fsuid)
+setfsgid 152 139 (int fsgid)
+getsid 156 147 (void)
+capget 90 184 (struct cap_header *h, struct cap_data *d)
+capset 91 185 (struct cap_header *h, struct cap_data *d)
+rt_sigqueueinfo 138 178 (pid_t pid, int sig, siginfo_t *info)
+setpriority 140 97 (int which, int who, int nice)
+sched_setscheduler 119 156 (int pid, int policy, struct sched_param *p)
+sigaltstack 132 186 (const void *uss, void *uoss)
+personality 92 136 (unsigned int personality)
+prctl 167 172 (int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5)
+arch_prctl ! 17 (int option, unsigned long addr)
+setrlimit 164 75 (int resource, struct krlimit *rlim)
+mount 40 21 (char *dev_nmae, char *dir_name, char *type, unsigned long flags, void *data)
+umount2 39 52 (char *name, int flags)
+gettid 178 224 (void)
+futex 98 240 (u32 *uaddr, int op, u32 val, struct timespec *utime, u32 *uaddr2, u32 val3)
+set_tid_address 96 256 (int *tid_addr)
+restart_syscall 128 0 (void)
+timer_create 107 257 (clockid_t which_clock, struct sigevent *timer_event_spec, kernel_timer_t *created_timer_id)
+timer_settime 110 258 (kernel_timer_t timer_id, int flags, const struct itimerspec *new_setting, struct itimerspec *old_setting)
+timer_gettime 108 259 (int timer_id, const struct itimerspec *setting)
+timer_getoverrun 109 260 (int timer_id)
+timer_delete 111 261 (kernel_timer_t timer_id)
+clock_gettime 113 263 (const clockid_t which_clock, const struct timespec *tp)
+exit_group 94 248 (int error_code)
+set_robust_list 99 338 (struct robust_list_head *head, size_t len)
+get_robust_list 100 339 (int pid, struct robust_list_head **head_ptr, size_t *len_ptr)
+signalfd4 74 355 (int fd, k_rtsigset_t *mask, size_t sizemask, int flags)
+rt_tgsigqueueinfo 240 363 (pid_t tgid, pid_t pid, int sig, siginfo_t *info)
+vmsplice 75 343 (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags)
+timerfd_settime 86 353 (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr)
+fanotify_init 262 367 (unsigned int flags, unsigned int event_f_flags)
+fanotify_mark 263 368 (int fanotify_fd, unsigned int flags, u64 mask, int dfd, const char *pathname)
+open_by_handle_at 265 371 (int mountdirfd, struct file_handle *handle, int flags)
+setns 268 375 (int fd, int nstype)
+kcmp 272 378 (pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2)
+openat 56 322 (int dirfd, const char *pathname, int flags, mode_t mode)
+mkdirat 34 323 (int dirfd, const char *pathname, mode_t mode)
+unlinkat 35 328 (int dirfd, const char *pathname, int flags)
+memfd_create 279 385 (const char *name, unsigned int flags)
+io_setup 0 243 (unsigned nr_events, aio_context_t *ctx)
+io_getevents 4 245 (aio_context_t ctx, long min_nr, long nr, struct io_event *evs, struct timespec *tmo)
+seccomp 277 383 (unsigned int op, unsigned int flags, const char *uargs)
diff --git a/criu/arch/arm/uidiv.S b/criu/arch/arm/uidiv.S
new file mode 100644
index 000000000000..e77f6100c784
--- /dev/null
+++ b/criu/arch/arm/uidiv.S
@@ -0,0 +1,186 @@
+.globl __aeabi_uidiv
+
+work .req r4 @ XXXX is this safe ?
+dividend .req r0
+divisor .req r1
+overdone .req r2
+result .req r2
+curbit .req r3
+
+#define LSYM(x) x
+
+.macro THUMB_DIV_MOD_BODY modulo
+ @ Load the constant 0x10000000 into our work register.
+ mov work, #1
+ lsl work, #28
+LSYM(Loop1):
+ @ Unless the divisor is very big, shift it up in multiples of
+ @ four bits, since this is the amount of unwinding in the main
+ @ division loop. Continue shifting until the divisor is
+ @ larger than the dividend.
+ cmp divisor, work
+ bhs LSYM(Lbignum)
+ cmp divisor, dividend
+ bhs LSYM(Lbignum)
+ lsl divisor, #4
+ lsl curbit, #4
+ b LSYM(Loop1)
+LSYM(Lbignum):
+ @ Set work to 0x80000000
+ lsl work, #3
+LSYM(Loop2):
+ @ For very big divisors, we must shift it a bit at a time, or
+ @ we will be in danger of overflowing.
+ cmp divisor, work
+ bhs LSYM(Loop3)
+ cmp divisor, dividend
+ bhs LSYM(Loop3)
+ lsl divisor, #1
+ lsl curbit, #1
+ b LSYM(Loop2)
+LSYM(Loop3):
+ @ Test for possible subtractions ...
+ .if \modulo
+ @ ... On the final pass, this may subtract too much from the dividend,
+ @ so keep track of which subtractions are done, we can fix them up
+ @ afterwards.
+ mov overdone, #0
+ cmp dividend, divisor
+ blo LSYM(Lover1)
+ sub dividend, dividend, divisor
+LSYM(Lover1):
+ lsr work, divisor, #1
+ cmp dividend, work
+ blo LSYM(Lover2)
+ sub dividend, dividend, work
+ mov ip, curbit
+ mov work, #1
+ ror curbit, work
+ orr overdone, curbit
+ mov curbit, ip
+LSYM(Lover2):
+ lsr work, divisor, #2
+ cmp dividend, work
+ blo LSYM(Lover3)
+ sub dividend, dividend, work
+ mov ip, curbit
+ mov work, #2
+ ror curbit, work
+ orr overdone, curbit
+ mov curbit, ip
+LSYM(Lover3):
+ lsr work, divisor, #3
+ cmp dividend, work
+ blo LSYM(Lover4)
+ sub dividend, dividend, work
+ mov ip, curbit
+ mov work, #3
+ ror curbit, work
+ orr overdone, curbit
+ mov curbit, ip
+LSYM(Lover4):
+ mov ip, curbit
+ .else
+ @ ... and note which bits are done in the result. On the final pass,
+ @ this may subtract too much from the dividend, but the result will be ok,
+ @ since the "bit" will have been shifted out at the bottom.
+ cmp dividend, divisor
+ blo LSYM(Lover1)
+ sub dividend, dividend, divisor
+ orr result, result, curbit
+LSYM(Lover1):
+ lsr work, divisor, #1
+ cmp dividend, work
+ blo LSYM(Lover2)
+ sub dividend, dividend, work
+ lsr work, curbit, #1
+ orr result, work
+LSYM(Lover2):
+ lsr work, divisor, #2
+ cmp dividend, work
+ blo LSYM(Lover3)
+ sub dividend, dividend, work
+ lsr work, curbit, #2
+ orr result, work
+LSYM(Lover3):
+ lsr work, divisor, #3
+ cmp dividend, work
+ blo LSYM(Lover4)
+ sub dividend, dividend, work
+ lsr work, curbit, #3
+ orr result, work
+LSYM(Lover4):
+ .endif
+
+ cmp dividend, #0 @ Early termination?
+ beq LSYM(Lover5)
+ lsr curbit, #4 @ No, any more bits to do?
+ beq LSYM(Lover5)
+ lsr divisor, #4
+ b LSYM(Loop3)
+LSYM(Lover5):
+ .if \modulo
+ @ Any subtractions that we should not have done will be recorded in
+ @ the top three bits of "overdone". Exactly which were not needed
+ @ are governed by the position of the bit, stored in ip.
+ mov work, #0xe
+ lsl work, #28
+ and overdone, work
+ beq LSYM(Lgot_result)
+
+ @ If we terminated early, because dividend became zero, then the
+ @ bit in ip will not be in the bottom nibble, and we should not
+ @ perform the additions below. We must test for this though
+ @ (rather relying upon the TSTs to prevent the additions) since
+ @ the bit in ip could be in the top two bits which might then match
+ @ with one of the smaller RORs.
+ mov curbit, ip
+ mov work, #0x7
+ tst curbit, work
+ beq LSYM(Lgot_result)
+
+ mov curbit, ip
+ mov work, #3
+ ror curbit, work
+ tst overdone, curbit
+ beq LSYM(Lover6)
+ lsr work, divisor, #3
+ add dividend, work
+LSYM(Lover6):
+ mov curbit, ip
+ mov work, #2
+ ror curbit, work
+ tst overdone, curbit
+ beq LSYM(Lover7)
+ lsr work, divisor, #2
+ add dividend, work
+LSYM(Lover7):
+ mov curbit, ip
+ mov work, #1
+ ror curbit, work
+ tst overdone, curbit
+ beq LSYM(Lgot_result)
+ lsr work, divisor, #1
+ add dividend, work
+ .endif
+LSYM(Lgot_result):
+.endm
+
+
+ .thumb
+ .text
+
+__aeabi_uidiv:
+ mov curbit, #1
+ mov result, #0
+
+ push { work }
+ cmp dividend, divisor
+ blo LSYM(Lgot_result)
+
+ THUMB_DIV_MOD_BODY 0
+
+ mov r0, result
+ pop { work }
+
+ bx lr
diff --git a/criu/arch/ppc64/Makefile b/criu/arch/ppc64/Makefile
new file mode 100644
index 000000000000..5db577340a7d
--- /dev/null
+++ b/criu/arch/ppc64/Makefile
@@ -0,0 +1,6 @@
+builtin-name := crtools.built-in.o
+
+ccflags-y += -iquote $(obj) -iquote $(SRC_DIR) -iquote $(obj)/include -iquote $(SRC_DIR)/criu/include
+
+obj-y += cpu.o
+obj-y += crtools.o
diff --git a/criu/arch/ppc64/Makefile.syscalls b/criu/arch/ppc64/Makefile.syscalls
new file mode 100644
index 000000000000..1b28ce7dc0df
--- /dev/null
+++ b/criu/arch/ppc64/Makefile.syscalls
@@ -0,0 +1,50 @@
+builtin-name := syscalls.built-in.o
+
+SYS-TYPES := ../../include/syscall-types.h
+SYS-CODES := ../../include/syscall-codes.h
+SYS-PROTO := ../../include/syscall.h
+SYS-DEF := syscall-ppc64.tbl
+SYS-ASM-COMMON := syscall-common-ppc64.S
+
+asflags-y += -D__ASSEMBLY__ -nostdlib -fomit-frame-pointer
+asflags-y += -fpie -Wstrict-prototypes -Wa,--noexecstack
+asflags-y += -iquote $(obj) -iquote $(obj)/include -iquote $(SRC_DIR)/criu/include
+
+SYS-ASM := syscalls.S
+obj-y += $(SYS-ASM:.S=).o
+
+$(obj)/$(SYS-CODES): $(obj)/syscalls/$(SYS-DEF)
+ $(E) " GEN " $@
+ $(Q) echo "/* Autogenerated, don't edit */" > $@
+ $(Q) echo "#ifndef __ASM_CR_SYSCALL_CODES_H__" >> $@
+ $(Q) echo "#define __ASM_CR_SYSCALL_CODES_H__" >> $@
+ $(Q) cat $< | awk '/^__NR/{print "#define", $$1, $$2}' >> $@
+ $(Q) echo "#endif /* __ASM_CR_SYSCALL_CODES_H__ */" >> $@
+cleanup-y += $(obj)/$(SYS-CODES)
+
+$(obj)/$(SYS-PROTO): $(obj)/syscalls/$(SYS-DEF)
+ $(E) " GEN " $@
+ $(Q) echo "/* Autogenerated, don't edit */" > $@
+ $(Q) echo "#ifndef __ASM_CR_SYSCALL_PROTO_H__" >> $@
+ $(Q) echo "#define __ASM_CR_SYSCALL_PROTO_H__" >> $@
+ $(Q) echo "#include \"syscall-codes.h\"" >> $@
+ $(Q) echo "#include \"syscall-types.h\"" >> $@
+ $(Q) cat $< | awk '/^__NR/{print "extern long", $$3, substr($$0, index($$0,$$4)), ";"}' >> $@
+ $(Q) echo "#endif /* __ASM_CR_SYSCALL_PROTO_H__ */" >> $@
+cleanup-y += $(obj)/$(SYS-PROTO)
+
+$(obj)/$(SYS-ASM): $(obj)/syscalls/$(SYS-DEF) $(obj)/syscalls/$(SYS-ASM-COMMON) $(obj)/$(SYS-CODES) $(obj)/$(SYS-PROTO)
+ $(E) " GEN " $@
+ $(Q) echo "/* Autogenerated, don't edit */" > $@
+ $(Q) echo "#include \"syscall-codes.h\"" >> $@
+ $(Q) echo "#include \"syscalls/$(SYS-ASM-COMMON)\"" >> $@
+ $(Q) cat $< | awk '/^__NR/{print "SYSCALL(", $$3, ",", $$2, ")"}' >> $@
+cleanup-y += $(obj)/$(SYS-ASM)
+
+SYS-EXEC-TBL := sys-exec-tbl.c
+$(obj)/$(SYS-EXEC-TBL): $(obj)/syscalls/$(SYS-DEF) $(obj)/$(SYS-CODES) $(obj)/$(SYS-PROTO)
+ $(E) " GEN " $@
+ $(Q) echo "/* Autogenerated, don't edit */" > $@
+ $(Q) cat $< | awk '/^__NR/{print "SYSCALL(", substr($$3, 5), ",", $$2, ")"}' >> $@
+cleanup-y += $(obj)/$(SYS-EXEC-TBL)
+all-y += $(obj)/$(SYS-EXEC-TBL)
diff --git a/criu/arch/ppc64/cpu.c b/criu/arch/ppc64/cpu.c
new file mode 100644
index 000000000000..d84a782c9047
--- /dev/null
+++ b/criu/arch/ppc64/cpu.c
@@ -0,0 +1,149 @@
+#undef LOG_PREFIX
+#define LOG_PREFIX "cpu: "
+
+#include <sys/auxv.h>
+#include <errno.h>
+#include <asm/cputable.h>
+
+#include "asm/types.h"
+#include "asm/cpu.h"
+
+#include "cr_options.h"
+#include "proc_parse.h"
+#include "util.h"
+#include "log.h"
+#include "cpu.h"
+
+#include "protobuf.h"
+#include "protobuf/cpuinfo.pb-c.h"
+
+static uint64_t hwcap[2];
+
+#ifdef __LITTLE_ENDIAN__
+#define CURRENT_ENDIANNESS CPUINFO_PPC64_ENTRY__ENDIANNESS__LITTLEENDIAN
+#else /* big-endian build: same enum, spelled consistently with the LE branch */
+#define CURRENT_ENDIANNESS CPUINFO_PPC64_ENTRY__ENDIANNESS__BIGENDIAN
+#endif /* __LITTLE_ENDIAN__ */
+
+int cpu_init(void)
+{
+ hwcap[0] = getauxval(AT_HWCAP);
+ hwcap[1] = getauxval(AT_HWCAP2);
+ /* A zero word in either slot is treated as a failed read. */
+ if (!hwcap[0] || !hwcap[1]) {
+ pr_err("Can't read the hardware capabilities\n");
+ return -1;
+ }
+ return 0;
+}
+
+int cpu_dump_cpuinfo(void)
+{
+ CpuinfoEntry cpu_info = CPUINFO_ENTRY__INIT;
+ CpuinfoPpc64Entry cpu_ppc64_info = CPUINFO_PPC64_ENTRY__INIT;
+ CpuinfoPpc64Entry *cpu_ppc64_info_ptr = &cpu_ppc64_info;
+ struct cr_img *img;
+ int ret = -1;
+
+ img = open_image(CR_FD_CPUINFO, O_DUMP);
+ if (!img)
+ return -1;
+
+ cpu_info.ppc64_entry = &cpu_ppc64_info_ptr;
+ cpu_info.n_ppc64_entry = 1;
+
+ cpu_ppc64_info.endian = CURRENT_ENDIANNESS;
+ cpu_ppc64_info.n_hwcap = 2;
+ cpu_ppc64_info.hwcap = hwcap;
+
+ ret = pb_write_one(img, &cpu_info, PB_CPUINFO);
+
+ close_image(img);
+ return ret;
+}
+
+int cpu_validate_cpuinfo(void)
+{
+ CpuinfoEntry *cpu_info;
+ CpuinfoPpc64Entry *cpu_ppc64_entry;
+ struct cr_img *img;
+ int ret = -1;
+ img = open_image(CR_FD_CPUINFO, O_RSTR);
+ if (!img)
+ return -1;
+
+ if (pb_read_one(img, &cpu_info, PB_CPUINFO) < 0)
+ goto error;
+
+ if (cpu_info->n_ppc64_entry != 1) {
+ pr_err("No PPC64 related entry in image\n");
+ goto error;
+ }
+ cpu_ppc64_entry = cpu_info->ppc64_entry[0];
+
+ if (cpu_ppc64_entry->endian != CURRENT_ENDIANNESS) {
+ pr_err("Bad endianness\n");
+ goto error;
+ }
+
+ if (cpu_ppc64_entry->n_hwcap != 2) {
+ pr_err("Hardware capabilities information missing\n");
+ goto error;
+ }
+
+#define CHECK_FEATURE(s,f) do { \
+ if ((cpu_ppc64_entry->hwcap[s] & f) && !(hwcap[s] & f)) { \
+ pr_err("CPU Feature %s required by image " \
+ "is not supported on host.\n", #f); \
+ goto error; \
+ } \
+ } while(0)
+
+#define REQUIRE_FEATURE(s,f) do { \
+ if (!(cpu_ppc64_entry->hwcap[s] & f)) { \
+ pr_err("CPU Feature %s missing in image.\n", #f); \
+ goto error; \
+ } \
+ } while(0)
+ /* Bits that must be set in the image's hwcap words. */
+ REQUIRE_FEATURE(0, PPC_FEATURE_64);
+ REQUIRE_FEATURE(0, PPC_FEATURE_HAS_FPU);
+ REQUIRE_FEATURE(0, PPC_FEATURE_HAS_MMU);
+ REQUIRE_FEATURE(0, PPC_FEATURE_HAS_VSX);
+ REQUIRE_FEATURE(1, PPC_FEATURE2_ARCH_2_07);
+ /* Bits that, when set in the image, must also be set on this host. */
+ CHECK_FEATURE(0, PPC_FEATURE_TRUE_LE);
+ CHECK_FEATURE(1, PPC_FEATURE2_HTM);
+ CHECK_FEATURE(1, PPC_FEATURE2_DSCR);
+ CHECK_FEATURE(1, PPC_FEATURE2_EBB);
+ CHECK_FEATURE(1, PPC_FEATURE2_ISEL);
+ CHECK_FEATURE(1, PPC_FEATURE2_TAR);
+ CHECK_FEATURE(1, PPC_FEATURE2_VEC_CRYPTO);
+
+ ret = 0;
+error:
+ close_image(img);
+ return ret;
+}
+
+int cpuinfo_dump(void)
+{
+ if (cpu_init())
+ return -1;
+
+ if (cpu_dump_cpuinfo())
+ return -1;
+
+ return 0;
+}
+
+int cpuinfo_check(void)
+{
+ if (cpu_init())
+ return -1;
+
+ if (cpu_validate_cpuinfo())
+ return 1;
+
+ return 0;
+}
diff --git a/criu/arch/ppc64/crtools.c b/criu/arch/ppc64/crtools.c
new file mode 100644
index 000000000000..b1f4f6381591
--- /dev/null
+++ b/criu/arch/ppc64/crtools.c
@@ -0,0 +1,524 @@
+#include <string.h>
+#include <unistd.h>
+#include <elf.h>
+#include <sys/user.h>
+#include <asm/unistd.h>
+
+#include "asm/types.h"
+#include "asm/fpu.h"
+#include "asm/restorer.h"
+
+#include "cr_options.h"
+#include "compiler.h"
+#include "ptrace.h"
+#include "parasite-syscall.h"
+#include "log.h"
+#include "util.h"
+#include "cpu.h"
+#include "errno.h"
+
+#include "protobuf.h"
+#include "protobuf/core.pb-c.h"
+#include "protobuf/creds.pb-c.h"
+
+#define MSR_VEC (1<<25)
+#define MSR_VSX (1<<23)
+
+/*
+ * Injected syscall instruction
+ */
+const u32 code_syscall[] = {
+ 0x44000002, /* sc */
+ 0x0fe00000 /* twi 31,0,0 */
+};
+
+const int code_syscall_size = sizeof(code_syscall);
+
+static inline void __check_code_syscall(void)
+{
+ BUILD_BUG_ON(sizeof(code_syscall) != BUILTIN_SYSCALL_SIZE);
+ BUILD_BUG_ON(!is_log2(sizeof(code_syscall)));
+}
+
+void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs)
+{
+ /*
+ * OpenPOWER ABI requires that r12 is set to the calling function address
+ * to compute the TOC pointer.
+ */
+ regs->gpr[12] = new_ip;
+ regs->nip = new_ip;
+ if (stack)
+ regs->gpr[1] = (unsigned long) stack;
+ regs->trap = 0;
+}
+
+bool arch_can_dump_task(pid_t pid)
+{
+ /*
+ * TODO: We should detect 32bit task when BE support is done.
+ */
+ return true;
+}
+
+int syscall_seized(struct parasite_ctl *ctl, int nr, unsigned long *ret,
+ unsigned long arg1,
+ unsigned long arg2,
+ unsigned long arg3,
+ unsigned long arg4,
+ unsigned long arg5,
+ unsigned long arg6)
+{
+ user_regs_struct_t regs = ctl->orig.regs;
+ int err;
+ /* Syscall number is placed in r0, the six arguments in r3..r8. */
+ regs.gpr[0] = (unsigned long)nr;
+ regs.gpr[3] = arg1;
+ regs.gpr[4] = arg2;
+ regs.gpr[5] = arg3;
+ regs.gpr[6] = arg4;
+ regs.gpr[7] = arg5;
+ regs.gpr[8] = arg6;
+
+ err = __parasite_execute_syscall(ctl, &regs);
+ /* r3 is handed back via *ret even when err is non-zero. */
+ *ret = regs.gpr[3];
+ return err;
+}
+
+/* This is the layout of the POWER7 VSX registers and the way they
+ * overlap with the existing FPR and VMX registers.
+ *
+ * VSR doubleword 0 VSR doubleword 1
+ * ----------------------------------------------------------------
+ * VSR[0] | FPR[0] | |
+ * ----------------------------------------------------------------
+ * VSR[1] | FPR[1] | |
+ * ----------------------------------------------------------------
+ * | ... | |
+ * ----------------------------------------------------------------
+ * VSR[30] | FPR[30] | |
+ * ----------------------------------------------------------------
+ * VSR[31] | FPR[31] | |
+ * ----------------------------------------------------------------
+ * VSR[32] | VR[0] |
+ * ----------------------------------------------------------------
+ * VSR[33] | VR[1] |
+ * ----------------------------------------------------------------
+ * | ... |
+ * ----------------------------------------------------------------
+ * VSR[62] | VR[30] |
+ * ----------------------------------------------------------------
+ * VSR[63] | VR[31] |
+ * ----------------------------------------------------------------
+ *
+ * PTRACE_GETFPREGS returns FPR[0..31] + FPSCR
+ * PTRACE_GETVRREGS returns VR[0..31] + VSCR + VRSAVE
+ * PTRACE_GETVSRREGS returns VSR[0..31]
+ *
+ * PTRACE_GETVSRREGS and PTRACE_GETFPREGS are required since we need
+ * to save FPSCR too.
+ */
+static int get_fpu_regs(pid_t pid, CoreEntry *core)
+{
+ uint64_t fpregs[NFPREG];
+ UserPpc64FpstateEntry *fpe;
+ int i;
+
+ if (ptrace(PTRACE_GETFPREGS, pid, 0, (void *)&fpregs) < 0) {
+ pr_perror("Couldn't get floating-point registers");
+ return -1;
+ }
+
+ fpe = xmalloc(sizeof(UserPpc64FpstateEntry));
+ if (!fpe)
+ return -1;
+ user_ppc64_fpstate_entry__init(fpe);
+
+ fpe->n_fpregs = NFPREG;
+ fpe->fpregs = xmalloc(fpe->n_fpregs * sizeof(fpe->fpregs[0]));
+ if (!fpe->fpregs) {
+ xfree(fpe);
+ return -1;
+ }
+
+ /* FPSCR is the last (33rd) register in the set */
+ for (i = 0; i < NFPREG; i++)
+ fpe->fpregs[i] = fpregs[i];
+
+ core->ti_ppc64->fpstate = fpe;
+ return 0;
+}
+
+static void put_fpu_regs(mcontext_t *mc, UserPpc64FpstateEntry *fpe)
+{
+ int i;
+ uint64_t *mcfp = (uint64_t *)mc->fp_regs;
+
+ for (i = 0; i < fpe->n_fpregs; i++)
+ mcfp[i] = fpe->fpregs[i];
+}
+
+static int get_altivec_regs(pid_t pid, CoreEntry *core)
+{
+ /* The kernel returns :
+ * 32 Vector registers (128bit)
+ * VSCR (32bit) stored in a 128bit entry (odd)
+ * VRSAVE (32bit) stored at the end.
+ *
+ * Kernel setup_sigcontext's comment mentions:
+ * "Userland shall check AT_HWCAP to know whether it can rely on the
+ * v_regs pointer or not"
+ */
+ unsigned char vrregs[33 * 16 + 4];
+ UserPpc64VrstateEntry *vse;
+ uint64_t *p64;
+ uint32_t *p32;
+ int i;
+
+ if (ptrace(PTRACE_GETVRREGS, pid, 0, (void*)&vrregs) < 0) {
+ /* PTRACE_GETVRREGS returns EIO if Altivec is not supported.
+ * This should not happen if msr_vec is set. */
+ if (errno != EIO) {
+ pr_perror("Couldn't get Altivec registers");
+ return -1;
+ }
+ pr_debug("Altivec not supported\n");
+ return 0;
+ }
+
+ pr_debug("Dumping Altivec registers\n");
+
+ vse = xmalloc(sizeof(*vse));
+ if (!vse)
+ return -1;
+ user_ppc64_vrstate_entry__init(vse);
+
+ vse->n_vrregs = 33 * 2; /* protocol buffer store 64bit entries */
+ vse->vrregs = xmalloc(vse->n_vrregs * sizeof(vse->vrregs[0]));
+ if (!vse->vrregs) {
+ xfree(vse);
+ return -1;
+ }
+
+ /* Vectors are 2*64bits entries */
+ for (i = 0; i < 33; i++) {
+ p64 = (uint64_t*) &vrregs[i * 2 * sizeof(uint64_t)];
+ vse->vrregs[i*2] = p64[0];
+ vse->vrregs[i*2 + 1] = p64[1];
+ }
+
+ p32 = (uint32_t*) &vrregs[33 * 2 * sizeof(uint64_t)];
+ vse->vrsave = *p32;
+
+ core->ti_ppc64->vrstate = vse;
+
+ /*
+ * Force the MSR_VEC bit of the restored MSR otherwise the kernel
+ * will not restore them from the signal frame.
+ */
+ core->ti_ppc64->gpregs->msr |= MSR_VEC;
+
+ return 0;
+}
+
+static int put_altivec_regs(mcontext_t *mc, UserPpc64VrstateEntry *vse)
+{
+ vrregset_t *v_regs = (vrregset_t *)(((unsigned long)mc->vmx_reserve + 15) & ~0xful);
+
+ pr_debug("Restoring Altivec registers\n");
+
+ if (vse->n_vrregs != 33*2) {
+ pr_err("Corrupted Altivec dump data\n");
+ return -1;
+ }
+
+ /* Note that this should only be done in the case MSR_VEC is set but
+ * this is not a big deal to do that in all cases.
+ */
+ memcpy(&v_regs->vrregs[0][0], vse->vrregs, sizeof(uint64_t) * 2 * 33);
+ /* vscr has been restored with the previous memcpy which copied 32
+ * 128bits registers + a 128bits field containing the vscr value in
+ * the low part.
+ */
+
+ v_regs->vrsave = vse->vrsave;
+ mc->v_regs = v_regs;
+
+ return 0;
+}
+
+/*
+ * Since the FPR[0-31] is stored in the first double word of VSR[0-31] and
+ * FPR are saved through the FP state, there is no need to save the upper part
+ * of the first 32 VSX registers.
+ * Furthermore, the 32 last VSX registers are also the 32 Altivec registers
+ * already saved, so no need to save them.
+ * As a consequence, only the doubleword 1 of the 32 first VSX registers have
+ * to be saved (the ones returned by PTRACE_GETVSRREGS).
+ */
+static int get_vsx_regs(pid_t pid, CoreEntry *core)
+{
+ UserPpc64VsxstateEntry *vse;
+ uint64_t vsregs[32];
+ int i;
+
+ if (ptrace(PTRACE_GETVSRREGS, pid, 0, (void*)&vsregs) < 0) {
+ /*
+ * EIO is returned in the case PTRACE_GETVSRREGS is not
+ * supported.
+ */
+ if (errno == EIO) {
+ pr_debug("VSX register's dump not supported.\n");
+ return 0;
+ }
+ pr_perror("Couldn't get VSX registers");
+ return -1;
+ }
+
+ pr_debug("Dumping VSX registers\n");
+
+ vse = xmalloc(sizeof(*vse));
+ if (!vse)
+ return -1;
+ user_ppc64_vsxstate_entry__init(vse);
+
+ vse->n_vsxregs = 32;
+ vse->vsxregs = xmalloc(vse->n_vsxregs * sizeof(vse->vsxregs[0]));
+ if (!vse->vsxregs) {
+ xfree(vse);
+ return -1;
+ }
+
+ for (i = 0; i < vse->n_vsxregs; i++)
+ vse->vsxregs[i] = vsregs[i];
+
+ core->ti_ppc64->vsxstate = vse;
+
+ /*
+ * Force the MSR_VSX bit of the restored MSR otherwise the kernel
+ * will not restore them from the signal frame.
+ */
+ core->ti_ppc64->gpregs->msr |= MSR_VSX;
+ return 0;
+}
+
+static int put_vsx_regs(mcontext_t *mc, UserPpc64VsxstateEntry *vse)
+{
+ uint64_t *buf;
+ int i;
+
+ pr_debug("Restoring VSX registers\n");
+ if (!mc->v_regs) {
+ /* VSX implies Altivec so v_regs should be set */
+ pr_err("Internal error\n");
+ return -1;
+ }
+
+ /* point after the Altivec registers */
+ buf = (uint64_t*) (mc->v_regs + 1);
+
+ /* Copy the value saved by get_vsx_regs in the sigframe */
+ for (i=0; i<vse->n_vsxregs; i++)
+ buf[i] = vse->vsxregs[i];
+
+ return 0;
+}
+
+int get_task_regs(pid_t pid, user_regs_struct_t regs, CoreEntry *core)
+{
+ int i;
+
+ pr_info("Dumping GP/FPU registers for %d\n", pid);
+
+ /*
+ * This is inspired by kernel function check_syscall_restart in
+ * arch/powerpc/kernel/signal.c
+ */
+#ifndef TRAP
+#define TRAP(r) ((r).trap & ~0xF)
+#endif
+
+ if (TRAP(regs) == 0x0C00 && regs.ccr & 0x10000000) {
+ /* Restart the system call */
+ switch (regs.gpr[3]) {
+ case ERESTARTNOHAND:
+ case ERESTARTSYS:
+ case ERESTARTNOINTR:
+ regs.gpr[3] = regs.orig_gpr3;
+ regs.nip -= 4;
+ break;
+ case ERESTART_RESTARTBLOCK:
+ regs.gpr[0] = __NR_restart_syscall;
+ regs.nip -= 4;
+ break;
+ }
+ }
+
+ /* Resetting trap since we are now coming from user space. */
+ regs.trap = 0;
+
+#define assign_reg(dst, src, e) do { \
+ dst->e = (__typeof__(dst->e))src.e; \
+} while (0)
+
+ for (i=0; i<32; i++)
+ assign_reg(core->ti_ppc64->gpregs, regs, gpr[i]);
+
+ assign_reg(core->ti_ppc64->gpregs, regs, nip);
+ assign_reg(core->ti_ppc64->gpregs, regs, msr);
+ assign_reg(core->ti_ppc64->gpregs, regs, orig_gpr3);
+ assign_reg(core->ti_ppc64->gpregs, regs, ctr);
+ assign_reg(core->ti_ppc64->gpregs, regs, link);
+ assign_reg(core->ti_ppc64->gpregs, regs, xer);
+ assign_reg(core->ti_ppc64->gpregs, regs, ccr);
+ assign_reg(core->ti_ppc64->gpregs, regs, trap);
+#undef assign_reg
+
+ if (get_fpu_regs(pid, core))
+ return -1;
+
+ if (get_altivec_regs(pid, core))
+ return -1;
+
+ /*
+ * Don't save the VSX registers if Altivec registers are not
+ * supported
+ */
+ if (CORE_THREAD_ARCH_INFO(core)->vrstate && get_vsx_regs(pid, core))
+ return -1;
+
+ return 0;
+}
+
+int arch_alloc_thread_info(CoreEntry *core)
+{
+ ThreadInfoPpc64 *ti_ppc64;
+ UserPpc64RegsEntry *regs;
+
+ ti_ppc64 = xmalloc(sizeof(*ti_ppc64));
+ if(!ti_ppc64)
+ goto err;
+ thread_info_ppc64__init(ti_ppc64);
+ CORE_THREAD_ARCH_INFO(core) = ti_ppc64;
+
+ /* user_ppc64_regs_entry */
+ regs = xmalloc(sizeof(*regs));
+ if (!regs)
+ goto err;
+ user_ppc64_regs_entry__init(regs);
+ /* Attach now: if the gpr allocation fails, regs stays reachable from core. */
+ ti_ppc64->gpregs = regs;
+
+ regs->gpr = xmalloc(32*sizeof(uint64_t));
+ if (!regs->gpr)
+ goto err;
+ regs->n_gpr = 32;
+
+ return 0;
+err:
+ return -1;
+}
+
+void arch_free_thread_info(CoreEntry *core)
+{
+ if (CORE_THREAD_ARCH_INFO(core)) {
+ if (CORE_THREAD_ARCH_INFO(core)->fpstate) {
+ xfree(CORE_THREAD_ARCH_INFO(core)->fpstate->fpregs);
+ xfree(CORE_THREAD_ARCH_INFO(core)->fpstate);
+ }
+ if (CORE_THREAD_ARCH_INFO(core)->vrstate) {
+ xfree(CORE_THREAD_ARCH_INFO(core)->vrstate->vrregs);
+ xfree(CORE_THREAD_ARCH_INFO(core)->vrstate);
+ }
+ if (CORE_THREAD_ARCH_INFO(core)->vsxstate) {
+ xfree(CORE_THREAD_ARCH_INFO(core)->vsxstate->vsxregs);
+ xfree(CORE_THREAD_ARCH_INFO(core)->vsxstate);
+ }
+ xfree(CORE_THREAD_ARCH_INFO(core)->gpregs->gpr); /* NOTE(review): gpregs is dereferenced without a NULL check, unlike the optional states above */
+ xfree(CORE_THREAD_ARCH_INFO(core)->gpregs);
+ xfree(CORE_THREAD_ARCH_INFO(core));
+ CORE_THREAD_ARCH_INFO(core) = NULL; /* a second call on the same core becomes a no-op */
+ }
+}
+
+int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core)
+{
+ int ret = 0;
+ if (CORE_THREAD_ARCH_INFO(core)->fpstate)
+ put_fpu_regs(&sigframe->uc.uc_mcontext,
+ CORE_THREAD_ARCH_INFO(core)->fpstate);
+
+ if (CORE_THREAD_ARCH_INFO(core)->vrstate)
+ ret = put_altivec_regs(&sigframe->uc.uc_mcontext,
+ CORE_THREAD_ARCH_INFO(core)->vrstate);
+ else if (core->ti_ppc64->gpregs->msr & MSR_VEC) {
+ pr_err("Internal error\n");
+ ret = -1;
+ }
+
+ if (!ret && CORE_THREAD_ARCH_INFO(core)->vsxstate)
+ ret = put_vsx_regs(&sigframe->uc.uc_mcontext,
+ CORE_THREAD_ARCH_INFO(core)->vsxstate);
+ else if (core->ti_ppc64->gpregs->msr & MSR_VSX) {
+ pr_err("Internal error\n");
+ ret = -1;
+ }
+
+ return ret;
+}
+
+/*
+ * The signal frame has been built using local addresses. Since it has to be
+ * used in the context of the checkpointed process, the v_regs pointer in the
+ * signal frame must be updated to match the address in the remote stack.
+ */
+int sigreturn_prep_fpu_frame(struct rt_sigframe *frame, mcontext_t *rcontext)
+{
+ mcontext_t *lcontext = &frame->uc.uc_mcontext;
+
+ if (lcontext->v_regs) {
+ uint64_t offset = (uint64_t)(lcontext->v_regs) - (uint64_t)lcontext;
+ lcontext->v_regs = (vrregset_t *)((uint64_t)rcontext + offset);
+
+ pr_debug("Updated v_regs:%llx (rcontext:%llx)\n",
+ (unsigned long long) lcontext->v_regs,
+ (unsigned long long) rcontext);
+ }
+ return 0;
+}
+
+int restore_gpregs(struct rt_sigframe *f, UserPpc64RegsEntry *r)
+{
+ int i;
+
+ /* r0 to r31 */
+ for (i=0; i<32; i++)
+ f->uc.uc_mcontext.gp_regs[i] = r->gpr[i];
+
+ f->uc.uc_mcontext.gp_regs[PT_NIP] = r->nip;
+ f->uc.uc_mcontext.gp_regs[PT_MSR] = r->msr;
+ f->uc.uc_mcontext.gp_regs[PT_ORIG_R3] = r->orig_gpr3;
+ f->uc.uc_mcontext.gp_regs[PT_CTR] = r->ctr;
+ f->uc.uc_mcontext.gp_regs[PT_LNK] = r->link;
+ f->uc.uc_mcontext.gp_regs[PT_XER] = r->xer;
+ f->uc.uc_mcontext.gp_regs[PT_CCR] = r->ccr;
+ f->uc.uc_mcontext.gp_regs[PT_TRAP] = r->trap;
+
+ return 0;
+}
+
+void *mmap_seized(struct parasite_ctl *ctl,
+ void *addr, size_t length, int prot,
+ int flags, int fd, off_t offset)
+{
+ unsigned long map = 0;
+ int err;
+
+ err = syscall_seized(ctl, __NR_mmap, &map,
+ (unsigned long)addr, length, prot, flags, fd, offset);
+ if (err < 0 || (long)map < 0)
+ map = 0;
+
+ return (void *)map;
+}
diff --git a/criu/arch/ppc64/include/asm/atomic.h b/criu/arch/ppc64/include/asm/atomic.h
new file mode 100644
index 000000000000..4fa33b1c7005
--- /dev/null
+++ b/criu/arch/ppc64/include/asm/atomic.h
@@ -0,0 +1,112 @@
+#ifndef __CR_ATOMIC_H__
+#define __CR_ATOMIC_H__
+
+/*
+ * PowerPC atomic operations
+ *
+ * Copied from kernel header file arch/powerpc/include/asm/atomic.h
+ */
+
+typedef struct {
+ int counter;
+} atomic_t;
+
+#include "asm/cmpxchg.h"
+
+#define PPC_ATOMIC_ENTRY_BARRIER "lwsync \n"
+#define PPC_ATOMIC_EXIT_BARRIER "sync \n"
+
+#define ATOMIC_INIT(i) { (i) }
+
+static __inline__ int atomic_read(const atomic_t *v)
+{
+ int t;
+
+ __asm__ __volatile__("lwz%U1%X1 %0,%1" : "=r"(t) : "m"(v->counter));
+
+ return t;
+}
+
+static __inline__ void atomic_set(atomic_t *v, int i)
+{
+ __asm__ __volatile__("stw%U0%X0 %1,%0" : "=m"(v->counter) : "r"(i));
+}
+
+#define ATOMIC_OP(op, asm_op) \
+static __inline__ void atomic_##op(int a, atomic_t *v) \
+{ \
+ int t; \
+ \
+ __asm__ __volatile__( \
+"1: lwarx %0,0,%3 # atomic_" #op "\n" \
+ #asm_op " %0,%2,%0\n" \
+" stwcx. %0,0,%3 \n" \
+" bne- 1b\n" \
+ : "=&r" (t), "+m" (v->counter) \
+ : "r" (a), "r" (&v->counter) \
+ : "cc"); \
+} \
+
+ATOMIC_OP(add, add)
+ATOMIC_OP(sub, subf)
+
+#undef ATOMIC_OP
+
+static __inline__ void atomic_inc(atomic_t *v)
+{
+ int t;
+
+ __asm__ __volatile__(
+"1: lwarx %0,0,%2 # atomic_inc\n\
+ addic %0,%0,1\n"
+" stwcx. %0,0,%2 \n\
+ bne- 1b"
+ : "=&r" (t), "+m" (v->counter)
+ : "r" (&v->counter)
+ : "cc", "xer");
+}
+
+static __inline__ int atomic_inc_return(atomic_t *v)
+{
+ int t;
+
+ __asm__ __volatile__(
+ PPC_ATOMIC_ENTRY_BARRIER \
+"1: lwarx %0,0,%1 # atomic_inc_return\n\
+ addic %0,%0,1\n"
+" stwcx. %0,0,%1 \n\
+ bne- 1b \n" \
+ PPC_ATOMIC_EXIT_BARRIER
+ : "=&r" (t)
+ : "r" (&v->counter)
+ : "cc", "xer", "memory");
+
+ return t;
+}
+
+/*
+ * atomic_dec - decrement atomic variable
+ * @v: pointer of type atomic_t
+ *
+ * Atomically decrements @v by 1. Nothing is returned, and unlike
+ * atomic_inc_return() the sequence is not wrapped in the
+ * PPC_ATOMIC_ENTRY_BARRIER / PPC_ATOMIC_EXIT_BARRIER pair.
+ */
+
+static __inline__ void atomic_dec(atomic_t *v)
+{
+ int t;
+
+ __asm__ __volatile__(
+"1: lwarx %0,0,%2 # atomic_dec\n\
+ addic %0,%0,-1\n"
+" stwcx. %0,0,%2\n\
+ bne- 1b"
+ : "=&r" (t), "+m" (v->counter)
+ : "r" (&v->counter)
+ : "cc", "xer");
+}
+
+#define atomic_cmpxchg(v, o, n) (cmpxchg(&((v)->counter), (o), (n)))
+
+#endif /* __CR_ATOMIC_H__ */
diff --git a/criu/arch/ppc64/include/asm/bitops.h b/criu/arch/ppc64/include/asm/bitops.h
new file mode 100644
index 000000000000..910971981ca9
--- /dev/null
+++ b/criu/arch/ppc64/include/asm/bitops.h
@@ -0,0 +1,174 @@
+#ifndef __CR_BITOPS_H__
+#define __CR_BITOPS_H__
+/*
+ * PowerPC atomic bit operations.
+ *
+ * Merged version by David Gibson <david at gibson.dropbear.id.au>.
+ * Based on ppc64 versions by: Dave Engebretsen, Todd Inglett, Don
+ * Reed, Pat McCarthy, Peter Bergner, Anton Blanchard. They
+ * originally took it from the ppc32 code.
+ *
+ * Within a word, bits are numbered LSB first. Lots of places make
+ * this assumption by directly testing bits with (val & (1<<nr)).
+ * This can cause confusion for large (> 1 word) bitmaps on a
+ * big-endian system because, unlike little endian, the number of each
+ * bit depends on the word size.
+ *
+ * The bitop functions are defined to work on unsigned longs, so for a
+ * ppc64 system the bits end up numbered:
+ * |63..............0|127............64|191...........128|255...........192|
+ * and on ppc32:
+ * |31.....0|63....32|95....64|127...96|159..128|191..160|223..192|255..224|
+ *
+ * There are a few little-endian macros used mostly for filesystem
+ * bitmaps, these work on similar bit arrays layouts, but
+ * byte-oriented:
+ * |7...0|15...8|23...16|31...24|39...32|47...40|55...48|63...56|
+ *
+ * The main difference is that bit 3-5 (64b) or 3-4 (32b) in the bit
+ * number field needs to be reversed compared to the big-endian bit
+ * fields. This can be achieved by XOR with 0x38 (64b) or 0x18 (32b).
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * --
+ * Copied from the kernel file arch/powerpc/include/asm/bitops.h
+ */
+
+#include "compiler.h"
+
+#include "asm/bitsperlong.h"
+
+#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
+#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_LONG)
+
+#define DECLARE_BITMAP(name,bits) \
+ unsigned long name[BITS_TO_LONGS(bits)]
+
+#define __stringify_in_c(...) #__VA_ARGS__
+#define stringify_in_c(...) __stringify_in_c(__VA_ARGS__) " "
+
+#define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG))
+#define BIT_WORD(nr) ((nr) / BITS_PER_LONG)
+
+/* PPC bit number conversion */
+#define PPC_BITLSHIFT(be) (BITS_PER_LONG - 1 - (be))
+#define PPC_BIT(bit) (1UL << PPC_BITLSHIFT(bit))
+#define PPC_BITMASK(bs, be) ((PPC_BIT(bs) - PPC_BIT(be)) | PPC_BIT(bs))
+
+
+/* Macro for generating the ***_bits() functions */
+#define DEFINE_BITOP(fn, op) \
+static __inline__ void fn(unsigned long mask, \
+ volatile unsigned long *_p) \
+{ \
+ unsigned long old; \
+ unsigned long *p = (unsigned long *)_p; \
+ __asm__ __volatile__ ( \
+"1: ldarx %0,0,%3,0\n" \
+ stringify_in_c(op) "%0,%0,%2\n" \
+ "stdcx. %0,0,%3\n" \
+ "bne- 1b\n" \
+ : "=&r" (old), "+m" (*p) \
+ : "r" (mask), "r" (p) \
+ : "cc", "memory"); \
+}
+
+DEFINE_BITOP(set_bits, or)
+DEFINE_BITOP(clear_bits, andc)
+DEFINE_BITOP(change_bits, xor)
+
+static __inline__ void set_bit(int nr, volatile unsigned long *addr)
+{
+ set_bits(BIT_MASK(nr), addr + BIT_WORD(nr));
+}
+
+static __inline__ void clear_bit(int nr, volatile unsigned long *addr)
+{
+ clear_bits(BIT_MASK(nr), addr + BIT_WORD(nr));
+}
+
+static __inline__ void change_bit(int nr, volatile unsigned long *addr)
+{
+ change_bits(BIT_MASK(nr), addr + BIT_WORD(nr));
+}
+
+static inline int test_bit(int nr, const volatile unsigned long *addr)
+{
+ return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1)));
+}
+
+/*
+ * Return the zero-based bit position (LE, not IBM bit numbering) of
+ * the most significant 1-bit in a double word.
+ */
+static __inline__ __attribute__((const))
+int __ilog2(unsigned long x)
+{
+ int lz;
+
+ asm ("cntlzd %0,%1" : "=r" (lz) : "r" (x));
+ return BITS_PER_LONG - 1 - lz;
+}
+
+
+static __inline__ unsigned long __ffs(unsigned long x)
+{
+ return __ilog2(x & -x);
+}
+
+
+#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG)
+/*
+ * Find the next set bit in a memory region.
+ */
+static inline
+unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
+ unsigned long offset)
+{
+ const unsigned long *p = addr + BITOP_WORD(offset);
+ unsigned long result = offset & ~(BITS_PER_LONG-1);
+ unsigned long tmp;
+
+ if (offset >= size)
+ return size;
+ size -= result;
+ offset %= BITS_PER_LONG;
+ if (offset) {
+ tmp = *(p++);
+ tmp &= (~0UL << offset);
+ if (size < BITS_PER_LONG)
+ goto found_first;
+ if (tmp)
+ goto found_middle;
+ size -= BITS_PER_LONG;
+ result += BITS_PER_LONG;
+ }
+ while (size & ~(BITS_PER_LONG-1)) {
+ if ((tmp = *(p++)))
+ goto found_middle;
+ result += BITS_PER_LONG;
+ size -= BITS_PER_LONG;
+ }
+ if (!size)
+ return result;
+ tmp = *p;
+
+found_first:
+ tmp &= (~0UL >> (BITS_PER_LONG - size));
+ if (tmp == 0UL) /* Are any bits set? */
+ return result + size; /* Nope. */
+found_middle:
+ return result + __ffs(tmp);
+}
+
+#define for_each_bit(i, bitmask) \
+ for (i = find_next_bit(bitmask, sizeof(bitmask), 0); \
+ i < sizeof(bitmask); \
+ i = find_next_bit(bitmask, sizeof(bitmask), i + 1))
+
+
+#endif /* __CR_BITOPS_H__ */
diff --git a/criu/arch/ppc64/include/asm/bitsperlong.h b/criu/arch/ppc64/include/asm/bitsperlong.h
new file mode 100644
index 000000000000..d95727d193e8
--- /dev/null
+++ b/criu/arch/ppc64/include/asm/bitsperlong.h
@@ -0,0 +1,6 @@
+#ifndef __CR_BITSPERLONG_H__
+#define __CR_BITSPERLONG_H__
+
+#define BITS_PER_LONG 64
+
+#endif /* __CR_BITSPERLONG_H__ */
diff --git a/criu/arch/ppc64/include/asm/cmpxchg.h b/criu/arch/ppc64/include/asm/cmpxchg.h
new file mode 100644
index 000000000000..b93fbdef06c7
--- /dev/null
+++ b/criu/arch/ppc64/include/asm/cmpxchg.h
@@ -0,0 +1,96 @@
+#ifndef __CR_CMPXCHG_H__
+#define __CR_CMPXCHG_H__
+
+/*
+ * Copied from kernel header file arch/powerpc/include/asm/cmpxchg.h
+ */
+
+#define PPC_ACQUIRE_BARRIER "isync \n"
+#define PPC_RELEASE_BARRIER "lwsync \n"
+
+/*
+ * Compare and exchange - if *p == old, set it to new,
+ * and return the old value of *p.
+ */
+
+static __always_inline unsigned long
+__cmpxchg_u32(volatile unsigned int *p, unsigned long old, unsigned long new)
+{
+ unsigned int prev;
+
+ __asm__ __volatile__ (
+ PPC_RELEASE_BARRIER \
+"1: lwarx %0,0,%2 # __cmpxchg_u32\n\
+ cmpw 0,%0,%3\n\
+ bne- 2f\n"
+" stwcx. %4,0,%2\n\
+ bne- 1b \n" \
+ PPC_ACQUIRE_BARRIER
+ "\n\
+2:"
+ : "=&r" (prev), "+m" (*p)
+ : "r" (p), "r" (old), "r" (new)
+ : "cc", "memory");
+
+ return prev;
+}
+
+static __always_inline unsigned long
+__cmpxchg_u64(volatile unsigned long *p, unsigned long old, unsigned long new)
+{
+ unsigned long prev;
+
+ __asm__ __volatile__ (
+ PPC_RELEASE_BARRIER \
+"1: ldarx %0,0,%2 # __cmpxchg_u64\n\
+ cmpd 0,%0,%3\n\
+ bne- 2f\n\
+ stdcx. %4,0,%2\n\
+ bne- 1b \n" \
+ PPC_ACQUIRE_BARRIER
+ "\n\
+2:"
+ : "=&r" (prev), "+m" (*p)
+ : "r" (p), "r" (old), "r" (new)
+ : "cc", "memory");
+
+ return prev;
+}
+
+/* This function doesn't exist, so you'll get a linker error
+ if something tries to do an invalid cmpxchg(). */
+#ifdef CR_DEBUG
+static inline void __cmpxchg_called_with_bad_pointer(void)
+{
+ __asm__ __volatile__ (
+ "1: twi 31,0,0 # trap\n"
+ " b 1b"
+ : : : "memory");
+}
+#else
+extern void __cmpxchg_called_with_bad_pointer(void);
+#endif
+
+static __always_inline unsigned long
+__cmpxchg(volatile void *ptr, unsigned long old, unsigned long new,
+ unsigned int size)
+{
+ switch (size) {
+ case 4:
+ return __cmpxchg_u32(ptr, old, new);
+ case 8:
+ return __cmpxchg_u64(ptr, old, new);
+ }
+ __cmpxchg_called_with_bad_pointer();
+ return old;
+}
+
+#define cmpxchg(ptr, o, n) \
+ ({ \
+ __typeof__(*(ptr)) _o_ = (o); \
+ __typeof__(*(ptr)) _n_ = (n); \
+ (__typeof__(*(ptr))) __cmpxchg((ptr), (unsigned long)_o_, \
+ (unsigned long)_n_, sizeof(*(ptr))); \
+ })
+
+#endif /* __CR_CMPXCHG_H__ */
diff --git a/criu/arch/ppc64/include/asm/cpu.h b/criu/arch/ppc64/include/asm/cpu.h
new file mode 100644
index 000000000000..59118c211d10
--- /dev/null
+++ b/criu/arch/ppc64/include/asm/cpu.h
@@ -0,0 +1 @@
+#include <stdbool.h>
diff --git a/criu/arch/ppc64/include/asm/dump.h b/criu/arch/ppc64/include/asm/dump.h
new file mode 100644
index 000000000000..1505fd2983b0
--- /dev/null
+++ b/criu/arch/ppc64/include/asm/dump.h
@@ -0,0 +1,11 @@
+#ifndef __CR_ASM_DUMP_H__
+#define __CR_ASM_DUMP_H__
+
+extern int get_task_regs(pid_t pid, user_regs_struct_t regs, CoreEntry *core);
+extern int arch_alloc_thread_info(CoreEntry *core);
+extern void arch_free_thread_info(CoreEntry *core);
+
+
+#define core_put_tls(core, tls)
+
+#endif
diff --git a/criu/arch/ppc64/include/asm/fpu.h b/criu/arch/ppc64/include/asm/fpu.h
new file mode 100644
index 000000000000..7f476d541a7d
--- /dev/null
+++ b/criu/arch/ppc64/include/asm/fpu.h
@@ -0,0 +1,4 @@
+#ifndef __CR_ASM_FPU_H__
+#define __CR_ASM_FPU_H__
+
+#endif /* __CR_ASM_FPU_H__ */
diff --git a/criu/arch/ppc64/include/asm/int.h b/criu/arch/ppc64/include/asm/int.h
new file mode 100644
index 000000000000..642804e9b485
--- /dev/null
+++ b/criu/arch/ppc64/include/asm/int.h
@@ -0,0 +1,6 @@
+#ifndef __CR_ASM_INT_H__
+#define __CR_ASM_INT_H__
+
+#include "asm-generic/int.h"
+
+#endif /* __CR_ASM_INT_H__ */
diff --git a/criu/arch/ppc64/include/asm/linkage.h b/criu/arch/ppc64/include/asm/linkage.h
new file mode 100644
index 000000000000..506edc7114d4
--- /dev/null
+++ b/criu/arch/ppc64/include/asm/linkage.h
@@ -0,0 +1,301 @@
+/*
+ * Various PowerPc assembly definitions
+ *
+ * Copied from the kernel file arch/powerpc/include/asm/ppc_asm.h
+ *
+ * Copyright (C) 1995-1999 Gary Thomas, Paul Mackerras, Cort Dougan.
+ */
+#ifndef __CR_LINKAGE_H__
+#define __CR_LINKAGE_H__
+
+#ifdef __ASSEMBLY__
+
+#define GLOBAL(name) \
+ .globl name; \
+ name:
+
+#define ENTRY(name) \
+ .globl name; \
+ .type name, @function; \
+ name:
+
+#define END(sym) \
+ .size sym, . - sym
+
+
+#define STACKFRAMESIZE 256
+#define __STK_REG(i) (112 + ((i)-14)*8)
+#define STK_REG(i) __STK_REG(__REG_##i)
+
+/* The boring bits... */
+
+/* Condition Register Bit Fields */
+
+#define cr0 0
+#define cr1 1
+#define cr2 2
+#define cr3 3
+#define cr4 4
+#define cr5 5
+#define cr6 6
+#define cr7 7
+
+
+/*
+ * General Purpose Registers (GPRs)
+ *
+ * The lower case r0-r31 should be used in preference to the upper
+ * case R0-R31 as they provide more error checking in the assembler.
+ * Use R0-31 only when really necessary.
+ */
+
+#define r0 %r0
+#define r1 %r1
+#define r2 %r2
+#define r3 %r3
+#define r4 %r4
+#define r5 %r5
+#define r6 %r6
+#define r7 %r7
+#define r8 %r8
+#define r9 %r9
+#define r10 %r10
+#define r11 %r11
+#define r12 %r12
+#define r13 %r13
+#define r14 %r14
+#define r15 %r15
+#define r16 %r16
+#define r17 %r17
+#define r18 %r18
+#define r19 %r19
+#define r20 %r20
+#define r21 %r21
+#define r22 %r22
+#define r23 %r23
+#define r24 %r24
+#define r25 %r25
+#define r26 %r26
+#define r27 %r27
+#define r28 %r28
+#define r29 %r29
+#define r30 %r30
+#define r31 %r31
+
+
+/* Floating Point Registers (FPRs) */
+
+#define fr0 0
+#define fr1 1
+#define fr2 2
+#define fr3 3
+#define fr4 4
+#define fr5 5
+#define fr6 6
+#define fr7 7
+#define fr8 8
+#define fr9 9
+#define fr10 10
+#define fr11 11
+#define fr12 12
+#define fr13 13
+#define fr14 14
+#define fr15 15
+#define fr16 16
+#define fr17 17
+#define fr18 18
+#define fr19 19
+#define fr20 20
+#define fr21 21
+#define fr22 22
+#define fr23 23
+#define fr24 24
+#define fr25 25
+#define fr26 26
+#define fr27 27
+#define fr28 28
+#define fr29 29
+#define fr30 30
+#define fr31 31
+
+/* AltiVec Registers (VPRs) */
+
+#define vr0 0
+#define vr1 1
+#define vr2 2
+#define vr3 3
+#define vr4 4
+#define vr5 5
+#define vr6 6
+#define vr7 7
+#define vr8 8
+#define vr9 9
+#define vr10 10
+#define vr11 11
+#define vr12 12
+#define vr13 13
+#define vr14 14
+#define vr15 15
+#define vr16 16
+#define vr17 17
+#define vr18 18
+#define vr19 19
+#define vr20 20
+#define vr21 21
+#define vr22 22
+#define vr23 23
+#define vr24 24
+#define vr25 25
+#define vr26 26
+#define vr27 27
+#define vr28 28
+#define vr29 29
+#define vr30 30
+#define vr31 31
+
+/* VSX Registers (VSRs) */
+
+#define vsr0 0
+#define vsr1 1
+#define vsr2 2
+#define vsr3 3
+#define vsr4 4
+#define vsr5 5
+#define vsr6 6
+#define vsr7 7
+#define vsr8 8
+#define vsr9 9
+#define vsr10 10
+#define vsr11 11
+#define vsr12 12
+#define vsr13 13
+#define vsr14 14
+#define vsr15 15
+#define vsr16 16
+#define vsr17 17
+#define vsr18 18
+#define vsr19 19
+#define vsr20 20
+#define vsr21 21
+#define vsr22 22
+#define vsr23 23
+#define vsr24 24
+#define vsr25 25
+#define vsr26 26
+#define vsr27 27
+#define vsr28 28
+#define vsr29 29
+#define vsr30 30
+#define vsr31 31
+#define vsr32 32
+#define vsr33 33
+#define vsr34 34
+#define vsr35 35
+#define vsr36 36
+#define vsr37 37
+#define vsr38 38
+#define vsr39 39
+#define vsr40 40
+#define vsr41 41
+#define vsr42 42
+#define vsr43 43
+#define vsr44 44
+#define vsr45 45
+#define vsr46 46
+#define vsr47 47
+#define vsr48 48
+#define vsr49 49
+#define vsr50 50
+#define vsr51 51
+#define vsr52 52
+#define vsr53 53
+#define vsr54 54
+#define vsr55 55
+#define vsr56 56
+#define vsr57 57
+#define vsr58 58
+#define vsr59 59
+#define vsr60 60
+#define vsr61 61
+#define vsr62 62
+#define vsr63 63
+
+/* SPE Registers (EVPRs) */
+
+#define evr0 0
+#define evr1 1
+#define evr2 2
+#define evr3 3
+#define evr4 4
+#define evr5 5
+#define evr6 6
+#define evr7 7
+#define evr8 8
+#define evr9 9
+#define evr10 10
+#define evr11 11
+#define evr12 12
+#define evr13 13
+#define evr14 14
+#define evr15 15
+#define evr16 16
+#define evr17 17
+#define evr18 18
+#define evr19 19
+#define evr20 20
+#define evr21 21
+#define evr22 22
+#define evr23 23
+#define evr24 24
+#define evr25 25
+#define evr26 26
+#define evr27 27
+#define evr28 28
+#define evr29 29
+#define evr30 30
+#define evr31 31
+
+/* some stab codes */
+#define N_FUN 36
+#define N_RSYM 64
+#define N_SLINE 68
+#define N_SO 100
+
+#define __REG_R0 0
+#define __REG_R1 1
+#define __REG_R2 2
+#define __REG_R3 3
+#define __REG_R4 4
+#define __REG_R5 5
+#define __REG_R6 6
+#define __REG_R7 7
+#define __REG_R8 8
+#define __REG_R9 9
+#define __REG_R10 10
+#define __REG_R11 11
+#define __REG_R12 12
+#define __REG_R13 13
+#define __REG_R14 14
+#define __REG_R15 15
+#define __REG_R16 16
+#define __REG_R17 17
+#define __REG_R18 18
+#define __REG_R19 19
+#define __REG_R20 20
+#define __REG_R21 21
+#define __REG_R22 22
+#define __REG_R23 23
+#define __REG_R24 24
+#define __REG_R25 25
+#define __REG_R26 26
+#define __REG_R27 27
+#define __REG_R28 28
+#define __REG_R29 29
+#define __REG_R30 30
+#define __REG_R31 31
+
+
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* __CR_LINKAGE_H__ */
diff --git a/criu/arch/ppc64/include/asm/page.h b/criu/arch/ppc64/include/asm/page.h
new file mode 100644
index 000000000000..9d10455f1c47
--- /dev/null
+++ b/criu/arch/ppc64/include/asm/page.h
@@ -0,0 +1,25 @@
+#ifndef __CR_ASM_PAGE_H__
+#define __CR_ASM_PAGE_H__
+
+#include <unistd.h>
+
+/*
+ * Default config for Pseries is to use 64K pages.
+ * See kernel file arch/powerpc/configs/pseries_*defconfig
+ */
+#ifndef PAGE_SHIFT
+# define PAGE_SHIFT 16
+#endif
+
+#ifndef PAGE_SIZE
+# define PAGE_SIZE (1UL << PAGE_SHIFT)
+#endif
+
+#ifndef PAGE_MASK
+# define PAGE_MASK (~(PAGE_SIZE - 1))
+#endif
+
+#define PAGE_PFN(addr) ((addr) / PAGE_SIZE)
+#define page_size() sysconf(_SC_PAGESIZE)
+
+#endif /* __CR_ASM_PAGE_H__ */
diff --git a/criu/arch/ppc64/include/asm/parasite-syscall.h b/criu/arch/ppc64/include/asm/parasite-syscall.h
new file mode 100644
index 000000000000..7665e207b75e
--- /dev/null
+++ b/criu/arch/ppc64/include/asm/parasite-syscall.h
@@ -0,0 +1,17 @@
+#ifndef __CR_ASM_PARASITE_SYSCALL_H__
+#define __CR_ASM_PARASITE_SYSCALL_H__
+
+struct parasite_ctl;
+
+#define ARCH_SI_TRAP TRAP_BRKPT
+
+extern const char code_syscall[];
+extern const int code_syscall_size;
+
+void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs);
+
+void *mmap_seized(struct parasite_ctl *ctl,
+ void *addr, size_t length, int prot,
+ int flags, int fd, off_t offset);
+
+#endif
diff --git a/criu/arch/ppc64/include/asm/parasite.h b/criu/arch/ppc64/include/asm/parasite.h
new file mode 100644
index 000000000000..fdbc340b05e2
--- /dev/null
+++ b/criu/arch/ppc64/include/asm/parasite.h
@@ -0,0 +1,7 @@
+#ifndef __ASM_PARASITE_H__
+#define __ASM_PARASITE_H__
+
+/* TLS is accessed through r13, which is already processed */
+static inline void arch_get_tls(tls_t *ptls) { (void)ptls; }
+
+#endif
diff --git a/criu/arch/ppc64/include/asm/prlimit.h b/criu/arch/ppc64/include/asm/prlimit.h
new file mode 100644
index 000000000000..6746ba0e6f19
--- /dev/null
+++ b/criu/arch/ppc64/include/asm/prlimit.h
@@ -0,0 +1,14 @@
+#ifndef __CR_PRLIMIT_H__
+#define __CR_PRLIMIT_H__
+
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include "config.h"
+
+#ifndef CONFIG_HAS_PRLIMIT
+extern int prlimit(pid_t pid, int resource, const struct rlimit *new_rlimit, struct rlimit *old_rlimit);
+#endif
+
+#endif /* __CR_PRLIMIT_H__ */
diff --git a/criu/arch/ppc64/include/asm/processor-flags.h b/criu/arch/ppc64/include/asm/processor-flags.h
new file mode 100644
index 000000000000..c1888af36fa0
--- /dev/null
+++ b/criu/arch/ppc64/include/asm/processor-flags.h
@@ -0,0 +1,4 @@
+#ifndef __CR_PROCESSOR_FLAGS_H__
+#define __CR_PROCESSOR_FLAGS_H__
+
+#endif
diff --git a/criu/arch/ppc64/include/asm/restore.h b/criu/arch/ppc64/include/asm/restore.h
new file mode 100644
index 000000000000..325ff96e1018
--- /dev/null
+++ b/criu/arch/ppc64/include/asm/restore.h
@@ -0,0 +1,31 @@
+#ifndef __CR_ASM_RESTORE_H__
+#define __CR_ASM_RESTORE_H__
+
+#include "asm/restorer.h"
+
+#include "protobuf/core.pb-c.h"
+
+/*
+ * Switch r1 to new_sp, put task_args in r3 and branch through CTR to
+ * restore_task_exec_start, whose address is also left in r12.
+ */
+#define JUMP_TO_RESTORER_BLOB(new_sp, restore_task_exec_start, \
+ task_args) \
+ asm volatile( \
+ "mr 1,%0 \n" \
+ "mr 12,%1 \n" \
+ "mtctr 12 \n" \
+ "mr 3,%2 \n" \
+ "bctr \n" \
+ : \
+ : "r"(new_sp), \
+ "r"((unsigned long)restore_task_exec_start), \
+ "r"(task_args) \
+ : "sp", "1", "2", "3", "12", "memory")
+
+/* There is nothing to do since TLS is accessed through r13 */
+#define core_get_tls(pcore, ptls)
+
+int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core);
+
+#endif /* __CR_ASM_RESTORE_H__ */
diff --git a/criu/arch/ppc64/include/asm/restorer.h b/criu/arch/ppc64/include/asm/restorer.h
new file mode 100644
index 000000000000..e728f133535e
--- /dev/null
+++ b/criu/arch/ppc64/include/asm/restorer.h
@@ -0,0 +1,133 @@
+#ifndef __CR_ASM_RESTORER_H__
+#define __CR_ASM_RESTORER_H__
+
+#include <asm/ptrace.h>
+#include <asm/elf.h>
+#include <asm/types.h>
+
+/*
+ * sigcontext structure defined in file
+ * /usr/include/powerpc64le-linux-gnu/bits/sigcontext.h,
+ * included from /usr/include/signal.h
+ *
+ * Kernel definition can be found in arch/powerpc/include/uapi/asm/sigcontext.h
+ */
+#include <signal.h>
+
+// XXX: the identifier rt_sigcontext is expected to be a struct by the CRIU code
+#define rt_sigcontext sigcontext
+
+#include "sigframe.h"
+#define SIGFRAME_OFFSET 0
+
+/* Copied from the Linux kernel header arch/powerpc/include/asm/ptrace.h */
+#define USER_REDZONE_SIZE 512
+
+/* Copied from the Linux kernel source file arch/powerpc/kernel/signal_64.c */
+#define TRAMP_SIZE 6
+
+/*
+ * ucontext defined in /usr/include/powerpc64le-linux-gnu/sys/ucontext.h
+ */
+struct rt_sigframe {
+ /* sys_rt_sigreturn requires the ucontext be the first field */
+ struct ucontext uc;
+#if 1
+ /*
+ * XXX: Assuming that transactional memory is turned on by default in
+ * most Linux distributions.
+ */
+ struct ucontext uc_transact;
+#endif
+ unsigned long _unused[2];
+ unsigned int tramp[TRAMP_SIZE];
+ struct rt_siginfo *pinfo;
+ void *puc;
+ struct rt_siginfo info;
+ /* New 64 bit little-endian ABI allows redzone of 512 bytes below sp */
+ char abigap[USER_REDZONE_SIZE];
+} __attribute__ ((aligned (16)));
+
+#define ARCH_RT_SIGRETURN(new_sp) \
+ asm volatile( \
+ "mr 1, %0 \n" \
+ "li 0, "__stringify(__NR_rt_sigreturn)" \n" \
+ "sc \n" \
+ : \
+ : "r"(new_sp) \
+ : "1", "memory")
+
+/*
+ * Clone trampoline
+ * NOTE: the macro body uses thread_args[i], so 'i' must be in the caller's scope.
+ * See glibc sysdeps/powerpc/powerpc64/sysdep.h for FRAME_MIN_SIZE defines
+ */
+#if _CALL_ELF != 2
+#error Only supporting ABIv2.
+#else
+#define FRAME_MIN_SIZE_PARM 96
+#endif
+#define RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, \
+ thread_args, clone_restore_fn) \
+ asm volatile( \
+ "clone_emul: \n" \
+ "/* Save fn, args, stack across syscall. */ \n" \
+ "mr 14, %5 /* clone_restore_fn in r14 */ \n" \
+ "mr 15, %6 /* &thread_args[i] in r15 */ \n" \
+ "mr 3, %1 /* clone_flags */ \n" \
+ "ld 4, %2 /* new_sp */ \n" \
+ "mr 5, %3 /* &parent_tid */ \n" \
+ "li 6, 0 /* tls = 0 ? */ \n" \
+ "mr 7, %4 /* &thread_args[i].pid */ \n" \
+ "li 0,"__stringify(__NR_clone)" \n" \
+ "sc \n" \
+ "/* Check for child process. */ \n" \
+ "cmpdi cr1,3,0 \n" \
+ "crandc cr1*4+eq,cr1*4+eq,cr0*4+so \n" \
+ "bne- cr1,clone_end \n" \
+ "/* child */ \n" \
+ "addi 14, 14, 8 /* jump over r2 fixup */ \n" \
+ "mtctr 14 \n" \
+ "mr 3,15 \n" \
+ "bctr \n" \
+ "clone_end: \n" \
+ "mr %0,3 \n" \
+ : "=r"(ret) /* %0 */ \
+ : "r"(clone_flags), /* %1 */ \
+ "m"(new_sp), /* %2 */ \
+ "r"(&parent_tid), /* %3 */ \
+ "r"(&thread_args[i].pid), /* %4 */ \
+ "r"(clone_restore_fn), /* %5 */ \
+ "r"(&thread_args[i]) /* %6 */ \
+ : "memory","0","3","4","5","6","7","14","15")
+/* Accessors into struct rt_sigframe; each argument is a pointer to one. */
+#define RT_SIGFRAME_UC(rt_sigframe) ((rt_sigframe)->uc)
+#define RT_SIGFRAME_REGIP(rt_sigframe) ((long unsigned int)(rt_sigframe)->uc.uc_mcontext.gp_regs[PT_NIP])
+#define RT_SIGFRAME_HAS_FPU(rt_sigframe) (1)
+#define RT_SIGFRAME_FPU(rt_sigframe) ((rt_sigframe)->uc.uc_mcontext)
+
+int restore_gpregs(struct rt_sigframe *f, UserPpc64RegsEntry *r);
+int restore_nonsigframe_gpregs(UserPpc64RegsEntry *r);
+
+/* Nothing to do, TLS is accessed through r13 */
+static inline void restore_tls(tls_t *ptls) { (void)ptls; }
+
+static inline int ptrace_set_breakpoint(pid_t pid, void *addr)
+{
+ return 0;
+}
+
+static inline int ptrace_flush_breakpoints(pid_t pid)
+{
+ return 0;
+}
+
+int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe,
+ mcontext_t *sigcontext);
+
+/*
+ * Defined in arch/ppc64/restorer.c
+ */
+unsigned long sys_shmat(int shmid, const void *shmaddr, int shmflg);
+
+#endif /*__CR_ASM_RESTORER_H__*/
diff --git a/criu/arch/ppc64/include/asm/string.h b/criu/arch/ppc64/include/asm/string.h
new file mode 100644
index 000000000000..4531b3ba6c26
--- /dev/null
+++ b/criu/arch/ppc64/include/asm/string.h
@@ -0,0 +1,28 @@
+#ifndef __CR_ASM_STRING_H__
+#define __CR_ASM_STRING_H__
+
+#include "compiler.h"
+
+#define HAS_BUILTIN_MEMCPY
+#define HAS_BUILTIN_MEMCMP
+
+#include "asm-generic/string.h"
+
+#ifdef CR_NOGLIBC
+extern void memcpy_power7(void *to, const void *from, unsigned long n);
+static inline void *builtin_memcpy(void *to, const void *from, unsigned long n)
+{
+ if (n)
+ memcpy_power7(to, from, n);
+ return to;
+}
+extern int builtin_memcmp(const void *cs, const void *ct, size_t count);
+#else
+/*
+ * When building with the C library, call its services
+ */
+#define builtin_memcpy memcpy
+#define builtin_memcmp memcmp
+#endif
+
+#endif /* __CR_ASM_STRING_H__ */
diff --git a/criu/arch/ppc64/include/asm/types.h b/criu/arch/ppc64/include/asm/types.h
new file mode 100644
index 000000000000..3412dc75d6a0
--- /dev/null
+++ b/criu/arch/ppc64/include/asm/types.h
@@ -0,0 +1,113 @@
+#ifndef __CR_ASM_TYPES_H__
+#define __CR_ASM_TYPES_H__
+
+#include <stdbool.h>
+#include <signal.h>
+#include "protobuf/core.pb-c.h"
+
+#include "asm/page.h"
+#include "asm/bitops.h"
+#include "asm/int.h"
+
+/*
+ * Copied from kernel header include/uapi/asm-generic/signal-defs.h
+ */
+typedef void rt_signalfn_t(int, siginfo_t *, void *);
+typedef rt_signalfn_t *rt_sighandler_t;
+
+typedef void rt_restorefn_t(void);
+typedef rt_restorefn_t *rt_sigrestore_t;
+
+#define SIGMAX_OLD 31
+#define SIGMAX 64
+
+/*Copied from the Linux kernel arch/powerpc/include/uapi/asm/signal.h */
+#define _KNSIG 64
+#define _NSIG_BPW 64
+#define _KNSIG_WORDS (_KNSIG / _NSIG_BPW)
+
+typedef struct {
+ uint64_t sig[_KNSIG_WORDS];
+} k_rtsigset_t;
+
+static inline void ksigfillset(k_rtsigset_t *set)
+{
+ int i;
+ for (i = 0; i < _KNSIG_WORDS; i++)
+ set->sig[i] = (unsigned long)-1;
+}
+
+/* Copied from the Linux kernel arch/powerpc/include/uapi/asm/signal.h */
+#define SA_RESTORER 0x04000000U
+
+typedef struct {
+ rt_sighandler_t rt_sa_handler;
+ unsigned long rt_sa_flags;
+ rt_sigrestore_t rt_sa_restorer;
+ k_rtsigset_t rt_sa_mask; /* mask last for extensibility */
+} rt_sigaction_t;
+
+/*
+ * Copied from kernel header arch/powerpc/include/uapi/asm/ptrace.h
+ */
+typedef struct {
+ unsigned long gpr[32];
+ unsigned long nip;
+ unsigned long msr;
+ unsigned long orig_gpr3; /* Used for restarting system calls */
+ unsigned long ctr;
+ unsigned long link;
+ unsigned long xer;
+ unsigned long ccr;
+ unsigned long softe; /* Soft enabled/disabled */
+ unsigned long trap; /* Reason for being here */
+ /* N.B. for critical exceptions on 4xx, the dar and dsisr
+ fields are overloaded to hold srr0 and srr1. */
+ unsigned long dar; /* Fault registers */
+ unsigned long dsisr; /* on 4xx/Book-E used for ESR */
+ unsigned long result; /* Result of a system call */
+} user_regs_struct_t;
+
+typedef UserPpc64RegsEntry UserRegsEntry;
+
+#define CORE_ENTRY__MARCH CORE_ENTRY__MARCH__PPC64
+/* Assign b to a, casting the whole expression b to the type of a. */
+#define ASSIGN_TYPED(a, b) do { (a) = (typeof(a))(b); } while (0)
+#define ASSIGN_MEMBER(a,b,m) do { ASSIGN_TYPED((a)->m, (b)->m); } while (0)
+
+#define REG_RES(regs) ((u64)(regs).gpr[3])
+#define REG_IP(regs) ((u64)(regs).nip)
+#define REG_SYSCALL_NR(regs) ((u64)(regs).gpr[0])
+
+/* ppc64 thread-info member of the core entry that 'core' points to. */
+#define CORE_THREAD_ARCH_INFO(core) ((core)->ti_ppc64)
+
+/*
+ * Copied from the following kernel header files :
+ * include/linux/auxvec.h
+ * arch/powerpc/include/uapi/asm/auxvec.h
+ * include/linux/mm_types.h
+ */
+#define AT_VECTOR_SIZE_BASE 20
+#define AT_VECTOR_SIZE_ARCH 6
+#define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1))
+
+typedef uint64_t auxv_t;
+
+/* Not used but the structure parasite_dump_thread needs a tls_t field */
+typedef uint64_t tls_t;
+
+/*
+ * Copied from the Linux kernel arch/powerpc/include/asm/processor.h
+ *
+ * NOTE: 32bit tasks are not supported.
+ */
+#define TASK_SIZE_USER64 (0x0000400000000000UL)
+#define TASK_SIZE TASK_SIZE_USER64
+/* Returns TASK_SIZE, i.e. the 64-bit user task size (no 32-bit tasks). */
+static inline unsigned long task_size(void) { return TASK_SIZE; }
+
+static inline void *decode_pointer(uint64_t v) { return (void*)v; }
+static inline uint64_t encode_pointer(void *p) { return (uint64_t)p; }
+
+#endif /* __CR_ASM_TYPES_H__ */
diff --git a/criu/arch/ppc64/include/asm/vdso.h b/criu/arch/ppc64/include/asm/vdso.h
new file mode 100644
index 000000000000..ed94e4cf0160
--- /dev/null
+++ b/criu/arch/ppc64/include/asm/vdso.h
@@ -0,0 +1,34 @@
+#ifndef __CR_ASM_VDSO_H__
+#define __CR_ASM_VDSO_H__
+
+#include "asm/int.h"
+#include "asm-generic/vdso.h"
+
+/* This definition is used in pie/util-vdso.c to initialize the vdso symbol
+ * name string table 'vdso_symbols'
+ *
+ * Poke from kernel file arch/powerpc/kernel/vdso64/vdso64.lds.S
+ *
+ * Note that '__kernel_datapage_offset' is not a service but mostly a data
+ * inside the text page which should not be used as is from user space.
+ */
+#define VDSO_SYMBOL_MAX 10
+#define ARCH_VDSO_SYMBOLS \
+ "__kernel_clock_getres", \
+ "__kernel_clock_gettime", \
+ "__kernel_get_syscall_map", \
+ "__kernel_get_tbfreq", \
+ "__kernel_getcpu", \
+ "__kernel_gettimeofday", \
+ "__kernel_sigtramp_rt64", \
+ "__kernel_sync_dicache", \
+ "__kernel_sync_dicache_p5", \
+ "__kernel_time"
+
+struct vdso_symtable;
+extern int vdso_redirect_calls(unsigned long base_to,
+ unsigned long base_from,
+ struct vdso_symtable *to,
+ struct vdso_symtable *from);
+
+#endif /* __CR_ASM_VDSO_H__ */
diff --git a/criu/arch/ppc64/memcmp_64.S b/criu/arch/ppc64/memcmp_64.S
new file mode 100644
index 000000000000..16c2b0cd8280
--- /dev/null
+++ b/criu/arch/ppc64/memcmp_64.S
@@ -0,0 +1,236 @@
+/*
+ * Author: Anton Blanchard <anton at au.ibm.com>
+ * Copyright 2015 IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * --
+ * Copied from the Linux file arch/powerpc/lib/memcmp_64.S
+ */
+#include "asm/linkage.h"
+
+#define off8 r6
+#define off16 r7
+#define off24 r8
+
+#define rA r9
+#define rB r10
+#define rC r11
+#define rD r27
+#define rE r28
+#define rF r29
+#define rG r30
+#define rH r31
+
+#ifdef __LITTLE_ENDIAN__
+#define LD ldbrx
+#else
+#define LD ldx
+#endif
+
+ENTRY(builtin_memcmp)
+ cmpdi cr1,r5,0
+
+ /* Use the short loop if both strings are not 8B aligned */
+ or r6,r3,r4
+ andi. r6,r6,7
+
+ /* Use the short loop if length is less than 32B */
+ cmpdi cr6,r5,31
+
+ beq cr1,.Lzero
+ bne .Lshort
+ bgt cr6,.Llong
+
+.Lshort:
+ mtctr r5
+
+1: lbz rA,0(r3)
+ lbz rB,0(r4)
+ subf. rC,rB,rA
+ bne .Lnon_zero
+ bdz .Lzero
+
+ lbz rA,1(r3)
+ lbz rB,1(r4)
+ subf. rC,rB,rA
+ bne .Lnon_zero
+ bdz .Lzero
+
+ lbz rA,2(r3)
+ lbz rB,2(r4)
+ subf. rC,rB,rA
+ bne .Lnon_zero
+ bdz .Lzero
+
+ lbz rA,3(r3)
+ lbz rB,3(r4)
+ subf. rC,rB,rA
+ bne .Lnon_zero
+
+ addi r3,r3,4
+ addi r4,r4,4
+
+ bdnz 1b
+
+.Lzero:
+ li r3,0
+ blr
+
+.Lnon_zero:
+ mr r3,rC
+ blr
+
+.Llong:
+ li off8,8
+ li off16,16
+ li off24,24
+
+ std r31,-8(r1)
+ std r30,-16(r1)
+ std r29,-24(r1)
+ std r28,-32(r1)
+ std r27,-40(r1)
+
+ srdi r0,r5,5
+ mtctr r0
+ andi. r5,r5,31
+
+ LD rA,0,r3
+ LD rB,0,r4
+
+ LD rC,off8,r3
+ LD rD,off8,r4
+
+ LD rE,off16,r3
+ LD rF,off16,r4
+
+ LD rG,off24,r3
+ LD rH,off24,r4
+ cmpld cr0,rA,rB
+
+ addi r3,r3,32
+ addi r4,r4,32
+
+ bdz .Lfirst32
+
+ LD rA,0,r3
+ LD rB,0,r4
+ cmpld cr1,rC,rD
+
+ LD rC,off8,r3
+ LD rD,off8,r4
+ cmpld cr6,rE,rF
+
+ LD rE,off16,r3
+ LD rF,off16,r4
+ cmpld cr7,rG,rH
+ bne cr0,.LcmpAB
+
+ LD rG,off24,r3
+ LD rH,off24,r4
+ cmpld cr0,rA,rB
+ bne cr1,.LcmpCD
+
+ addi r3,r3,32
+ addi r4,r4,32
+
+ bdz .Lsecond32
+
+ .balign 16
+
+1: LD rA,0,r3
+ LD rB,0,r4
+ cmpld cr1,rC,rD
+ bne cr6,.LcmpEF
+
+ LD rC,off8,r3
+ LD rD,off8,r4
+ cmpld cr6,rE,rF
+ bne cr7,.LcmpGH
+
+ LD rE,off16,r3
+ LD rF,off16,r4
+ cmpld cr7,rG,rH
+ bne cr0,.LcmpAB
+
+ LD rG,off24,r3
+ LD rH,off24,r4
+ cmpld cr0,rA,rB
+ bne cr1,.LcmpCD
+
+ addi r3,r3,32
+ addi r4,r4,32
+
+ bdnz 1b
+
+.Lsecond32:
+ cmpld cr1,rC,rD
+ bne cr6,.LcmpEF
+
+ cmpld cr6,rE,rF
+ bne cr7,.LcmpGH
+
+ cmpld cr7,rG,rH
+ bne cr0,.LcmpAB
+
+ bne cr1,.LcmpCD
+ bne cr6,.LcmpEF
+ bne cr7,.LcmpGH
+
+.Ltail:
+ ld r31,-8(r1)
+ ld r30,-16(r1)
+ ld r29,-24(r1)
+ ld r28,-32(r1)
+ ld r27,-40(r1)
+
+ cmpdi r5,0
+ beq .Lzero
+ b .Lshort
+
+.Lfirst32:
+ cmpld cr1,rC,rD
+ cmpld cr6,rE,rF
+ cmpld cr7,rG,rH
+
+ bne cr0,.LcmpAB
+ bne cr1,.LcmpCD
+ bne cr6,.LcmpEF
+ bne cr7,.LcmpGH
+
+ b .Ltail
+
+.LcmpAB:
+ li r3,1
+ bgt cr0,.Lout
+ li r3,-1
+ b .Lout
+
+.LcmpCD:
+ li r3,1
+ bgt cr1,.Lout
+ li r3,-1
+ b .Lout
+
+.LcmpEF:
+ li r3,1
+ bgt cr6,.Lout
+ li r3,-1
+ b .Lout
+
+.LcmpGH:
+ li r3,1
+ bgt cr7,.Lout
+ li r3,-1
+
+.Lout:
+ ld r31,-8(r1)
+ ld r30,-16(r1)
+ ld r29,-24(r1)
+ ld r28,-32(r1)
+ ld r27,-40(r1)
+ blr
diff --git a/criu/arch/ppc64/memcpy_power7.S b/criu/arch/ppc64/memcpy_power7.S
new file mode 100644
index 000000000000..a29d0e8f2ada
--- /dev/null
+++ b/criu/arch/ppc64/memcpy_power7.S
@@ -0,0 +1,213 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2012
+ *
+ * Author: Anton Blanchard <anton at au.ibm.com>
+ *
+ * --
+ * Copied from the kernel file arch/powerpc/lib/memcpy_power7.S
+ * Altivec support has been removed so we don't taint restored process.
+ */
+#include "asm/linkage.h"
+
+/*
+ * When building the parasite code, the compiler may rely on the C library
+ * service memcpy to initialise big local variable in the stack.
+ */
+ENTRY(memcpy)
+ENTRY(memcpy_power7)
+ cmpldi r5,16
+ std r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
+ blt .Lshort_copy
+
+.Lnonvmx_copy:
+ /* Get the source 8B aligned */
+ neg r6,r4
+ mtocrf 0x01,r6
+ clrldi r6,r6,(64-3)
+
+ bf cr7*4+3,1f
+ lbz r0,0(r4)
+ addi r4,r4,1
+ stb r0,0(r3)
+ addi r3,r3,1
+
+1: bf cr7*4+2,2f
+ lhz r0,0(r4)
+ addi r4,r4,2
+ sth r0,0(r3)
+ addi r3,r3,2
+
+2: bf cr7*4+1,3f
+ lwz r0,0(r4)
+ addi r4,r4,4
+ stw r0,0(r3)
+ addi r3,r3,4
+
+3: sub r5,r5,r6
+ cmpldi r5,128
+ blt 5f
+
+ mflr r0
+ stdu r1,-STACKFRAMESIZE(r1)
+ std r14,STK_REG(R14)(r1)
+ std r15,STK_REG(R15)(r1)
+ std r16,STK_REG(R16)(r1)
+ std r17,STK_REG(R17)(r1)
+ std r18,STK_REG(R18)(r1)
+ std r19,STK_REG(R19)(r1)
+ std r20,STK_REG(R20)(r1)
+ std r21,STK_REG(R21)(r1)
+ std r22,STK_REG(R22)(r1)
+ std r0,STACKFRAMESIZE+16(r1)
+
+ srdi r6,r5,7
+ mtctr r6
+
+ /* Now do cacheline (128B) sized loads and stores. */
+ .align 5
+4:
+ ld r0,0(r4)
+ ld r6,8(r4)
+ ld r7,16(r4)
+ ld r8,24(r4)
+ ld r9,32(r4)
+ ld r10,40(r4)
+ ld r11,48(r4)
+ ld r12,56(r4)
+ ld r14,64(r4)
+ ld r15,72(r4)
+ ld r16,80(r4)
+ ld r17,88(r4)
+ ld r18,96(r4)
+ ld r19,104(r4)
+ ld r20,112(r4)
+ ld r21,120(r4)
+ addi r4,r4,128
+ std r0,0(r3)
+ std r6,8(r3)
+ std r7,16(r3)
+ std r8,24(r3)
+ std r9,32(r3)
+ std r10,40(r3)
+ std r11,48(r3)
+ std r12,56(r3)
+ std r14,64(r3)
+ std r15,72(r3)
+ std r16,80(r3)
+ std r17,88(r3)
+ std r18,96(r3)
+ std r19,104(r3)
+ std r20,112(r3)
+ std r21,120(r3)
+ addi r3,r3,128
+ bdnz 4b
+
+ clrldi r5,r5,(64-7)
+
+ ld r14,STK_REG(R14)(r1)
+ ld r15,STK_REG(R15)(r1)
+ ld r16,STK_REG(R16)(r1)
+ ld r17,STK_REG(R17)(r1)
+ ld r18,STK_REG(R18)(r1)
+ ld r19,STK_REG(R19)(r1)
+ ld r20,STK_REG(R20)(r1)
+ ld r21,STK_REG(R21)(r1)
+ ld r22,STK_REG(R22)(r1)
+ addi r1,r1,STACKFRAMESIZE
+
+ /* Up to 127B to go */
+5: srdi r6,r5,4
+ mtocrf 0x01,r6
+
+6: bf cr7*4+1,7f
+ ld r0,0(r4)
+ ld r6,8(r4)
+ ld r7,16(r4)
+ ld r8,24(r4)
+ ld r9,32(r4)
+ ld r10,40(r4)
+ ld r11,48(r4)
+ ld r12,56(r4)
+ addi r4,r4,64
+ std r0,0(r3)
+ std r6,8(r3)
+ std r7,16(r3)
+ std r8,24(r3)
+ std r9,32(r3)
+ std r10,40(r3)
+ std r11,48(r3)
+ std r12,56(r3)
+ addi r3,r3,64
+
+ /* Up to 63B to go */
+7: bf cr7*4+2,8f
+ ld r0,0(r4)
+ ld r6,8(r4)
+ ld r7,16(r4)
+ ld r8,24(r4)
+ addi r4,r4,32
+ std r0,0(r3)
+ std r6,8(r3)
+ std r7,16(r3)
+ std r8,24(r3)
+ addi r3,r3,32
+
+ /* Up to 31B to go */
+8: bf cr7*4+3,9f
+ ld r0,0(r4)
+ ld r6,8(r4)
+ addi r4,r4,16
+ std r0,0(r3)
+ std r6,8(r3)
+ addi r3,r3,16
+
+9: clrldi r5,r5,(64-4)
+
+ /* Up to 15B to go */
+.Lshort_copy:
+ mtocrf 0x01,r5
+ bf cr7*4+0,12f
+ lwz r0,0(r4) /* Less chance of a reject with word ops */
+ lwz r6,4(r4)
+ addi r4,r4,8
+ stw r0,0(r3)
+ stw r6,4(r3)
+ addi r3,r3,8
+
+12: bf cr7*4+1,13f
+ lwz r0,0(r4)
+ addi r4,r4,4
+ stw r0,0(r3)
+ addi r3,r3,4
+
+13: bf cr7*4+2,14f
+ lhz r0,0(r4)
+ addi r4,r4,2
+ sth r0,0(r3)
+ addi r3,r3,2
+
+14: bf cr7*4+3,15f
+ lbz r0,0(r4)
+ stb r0,0(r3)
+
+15: ld r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
+ blr
+
+.Lunwind_stack_nonvmx_copy:
+ addi r1,r1,STACKFRAMESIZE
+ b .Lnonvmx_copy
+
diff --git a/criu/arch/ppc64/misc.S b/criu/arch/ppc64/misc.S
new file mode 100644
index 000000000000..4ee188d554d3
--- /dev/null
+++ b/criu/arch/ppc64/misc.S
@@ -0,0 +1,197 @@
+/*
+ * This is from linux/arch/powerpc/lib/crtsavres.S:
+ *
+ * Special support for eabi and SVR4
+ *
+ * Copyright (C) 1995, 1996, 1998, 2000, 2001 Free Software Foundation, Inc.
+ * Copyright 2008 Freescale Semiconductor, Inc.
+ * Written By Michael Meissner
+ *
+ * Based on gcc/config/rs6000/crtsavres.asm from gcc
+ * 64 bit additions from reading the PPC elf64abi document.
+ *
+ * This file is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2, or (at your option) any
+ * later version.
+ *
+ * In addition to the permissions in the GNU General Public License, the
+ * Free Software Foundation gives you unlimited permission to link the
+ * compiled version of this file with other programs, and to distribute
+ * those programs without any restriction coming from the use of this
+ * file. (The General Public License restrictions do apply in other
+ * respects; for example, they cover modification of the file, and
+ * distribution when not linked into another program.)
+ *
+ * This file is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; see the file COPYING. If not, write to
+ * the Free Software Foundation, 51 Franklin Street, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ *
+ * As a special exception, if you link this library with files
+ * compiled with GCC to produce an executable, this does not cause
+ * the resulting executable to be covered by the GNU General Public License.
+ * This exception does not however invalidate any other reasons why
+ * the executable file might be covered by the GNU General Public License.
+ */
+
+#define r0 0
+#define r1 1
+#define r2 2
+#define r3 3
+#define r4 4
+#define r5 5
+#define r6 6
+#define r7 7
+#define r8 8
+#define r9 9
+#define r10 10
+#define r11 11
+#define r12 12
+#define r13 13
+#define r14 14
+#define r15 15
+#define r16 16
+#define r17 17
+#define r18 18
+#define r19 19
+#define r20 20
+#define r21 21
+#define r22 22
+#define r23 23
+#define r24 24
+#define r25 25
+#define r26 26
+#define r27 27
+#define r28 28
+#define r29 29
+#define r30 30
+#define r31 31
+
+ .text
+
+.globl _savegpr0_14
+_savegpr0_14:
+ std r14,-144(r1)
+.globl _savegpr0_15
+_savegpr0_15:
+ std r15,-136(r1)
+.globl _savegpr0_16
+_savegpr0_16:
+ std r16,-128(r1)
+.globl _savegpr0_17
+_savegpr0_17:
+ std r17,-120(r1)
+.globl _savegpr0_18
+_savegpr0_18:
+ std r18,-112(r1)
+.globl _savegpr0_19
+_savegpr0_19:
+ std r19,-104(r1)
+.globl _savegpr0_20
+_savegpr0_20:
+ std r20,-96(r1)
+.globl _savegpr0_21
+_savegpr0_21:
+ std r21,-88(r1)
+.globl _savegpr0_22
+_savegpr0_22:
+ std r22,-80(r1)
+.globl _savegpr0_23
+_savegpr0_23:
+ std r23,-72(r1)
+.globl _savegpr0_24
+_savegpr0_24:
+ std r24,-64(r1)
+.globl _savegpr0_25
+_savegpr0_25:
+ std r25,-56(r1)
+.globl _savegpr0_26
+_savegpr0_26:
+ std r26,-48(r1)
+.globl _savegpr0_27
+_savegpr0_27:
+ std r27,-40(r1)
+.globl _savegpr0_28
+_savegpr0_28:
+ std r28,-32(r1)
+.globl _savegpr0_29
+_savegpr0_29:
+ std r29,-24(r1)
+.globl _savegpr0_30
+_savegpr0_30:
+ std r30,-16(r1)
+.globl _savegpr0_31
+_savegpr0_31:
+ std r31,-8(r1)
+ std r0,16(r1)
+ blr
+
+.globl _restgpr0_14
+_restgpr0_14:
+ ld r14,-144(r1)
+.globl _restgpr0_15
+_restgpr0_15:
+ ld r15,-136(r1)
+.globl _restgpr0_16
+_restgpr0_16:
+ ld r16,-128(r1)
+.globl _restgpr0_17
+_restgpr0_17:
+ ld r17,-120(r1)
+.globl _restgpr0_18
+_restgpr0_18:
+ ld r18,-112(r1)
+.globl _restgpr0_19
+_restgpr0_19:
+ ld r19,-104(r1)
+.globl _restgpr0_20
+_restgpr0_20:
+ ld r20,-96(r1)
+.globl _restgpr0_21
+_restgpr0_21:
+ ld r21,-88(r1)
+.globl _restgpr0_22
+_restgpr0_22:
+ ld r22,-80(r1)
+.globl _restgpr0_23
+_restgpr0_23:
+ ld r23,-72(r1)
+.globl _restgpr0_24
+_restgpr0_24:
+ ld r24,-64(r1)
+.globl _restgpr0_25
+_restgpr0_25:
+ ld r25,-56(r1)
+.globl _restgpr0_26
+_restgpr0_26:
+ ld r26,-48(r1)
+.globl _restgpr0_27
+_restgpr0_27:
+ ld r27,-40(r1)
+.globl _restgpr0_28
+_restgpr0_28:
+ ld r28,-32(r1)
+.globl _restgpr0_29
+_restgpr0_29:
+ ld r0,16(r1)
+ ld r29,-24(r1)
+ mtlr r0
+ ld r30,-16(r1)
+ ld r31,-8(r1)
+ blr
+
+.globl _restgpr0_30
+_restgpr0_30:
+ ld r30,-16(r1)
+.globl _restgpr0_31
+_restgpr0_31:
+ ld r0,16(r1)
+ ld r31,-8(r1)
+ mtlr r0
+ blr
diff --git a/criu/arch/ppc64/parasite-head.S b/criu/arch/ppc64/parasite-head.S
new file mode 100644
index 000000000000..a1c189fe94ea
--- /dev/null
+++ b/criu/arch/ppc64/parasite-head.S
@@ -0,0 +1,46 @@
+#include "asm/linkage.h"
+#include "parasite.h"
+
+ .section .head.text
+ .align 8
+
+ENTRY(__export_parasite_head_start)
+
+ // int __used parasite_service(unsigned int cmd, void *args)
+ // cmd = r3 = *__export_parasite_cmd (u32 ?)
+ // args = r4 = @parasite_args_ptr + @pc
+ bl 0f
+0: mflr r2
+
+#define LOAD_REG_ADDR(reg, name) \
+ addis reg,r2,(name - 0b)@ha; \
+ addi reg,r2,(name - 0b)@l;
+
+ LOAD_REG_ADDR(r3,__export_parasite_cmd)
+ lwz r3,0(r3)
+
+ LOAD_REG_ADDR(r4,parasite_args_ptr)
+ ld r4,0(r4)
+
+ LOAD_REG_ADDR(r12,parasite_service_ptr)
+ ld r12,0(r12)
+ mtctr r12
+
+ bctrl // call parasite_service
+ twi 31,0,0 // Should generate SIGTRAP
+
+parasite_args_ptr:
+ .quad __export_parasite_args
+
+parasite_service_ptr:
+ // We want to run the function prologue to set r2.
+ // Since the relocation will prefer the local entry
+ // point, we force it to the global one which is 2
+ // instructions above the local one.
+ // FIXME: There should be a way to specify the global entry here.
+ .quad parasite_service - 8
+
+__export_parasite_cmd:
+ .long 0
+
+END(__export_parasite_head_start)
diff --git a/criu/arch/ppc64/restorer.c b/criu/arch/ppc64/restorer.c
new file mode 100644
index 000000000000..665676045d3f
--- /dev/null
+++ b/criu/arch/ppc64/restorer.c
@@ -0,0 +1,31 @@
+#include <unistd.h>
+
+#include "restorer.h"
+#include "asm/restorer.h"
+#include "asm/fpu.h"
+
+#include "syscall.h"
+#include "log.h"
+
+int restore_nonsigframe_gpregs(UserPpc64RegsEntry *r)
+{
+ return 0; /* no-op here: 'r' is ignored and success is always reported */
+}
+
+unsigned long sys_shmat(int shmid, const void *shmaddr, int shmflg)
+{
+	unsigned long attach_addr;
+	int err;
+
+	/* shmat goes through sys_ipc; the address comes back via 'third' */
+	err = sys_ipc(21 /*SHMAT */, shmid /* first */, shmflg /* second */,
+		      (unsigned long)&attach_addr /* third */,
+		      shmaddr /* ptr */, 0 /* fifth not used */);
+
+	/* A non-zero status is handed back to the caller, widened
+	 * to unsigned long, in place of an address. */
+	if (err)
+		return (unsigned long) err;
+
+	return attach_addr;
+}
diff --git a/criu/arch/ppc64/syscalls/syscall-common-ppc64.S b/criu/arch/ppc64/syscalls/syscall-common-ppc64.S
new file mode 100644
index 000000000000..e18d6adf419e
--- /dev/null
+++ b/criu/arch/ppc64/syscalls/syscall-common-ppc64.S
@@ -0,0 +1,24 @@
+#include "asm/linkage.h"
+#include <asm/unistd.h> /* for __NR_ipc */
+
+#define SYSCALL(name, opcode) \
+ ENTRY(name); \
+ li r0, opcode; \
+ b __syscall_common; \
+ END(name)
+
+ .text
+ .align 4
+
+ENTRY(__syscall_common)
+ sc /* syscall number is in r0, loaded by the stub that branched here */
+ bnslr+ /* if no error return to LR */
+ neg r3,r3 /* r3 = -r3 to return -errno value */
+ blr
+END(__syscall_common)
+
+ENTRY(__cr_restore_rt)
+ li r0, __NR_rt_sigreturn
+ b __syscall_common
+END(__cr_restore_rt)
+
diff --git a/criu/arch/ppc64/syscalls/syscall-ppc64.tbl b/criu/arch/ppc64/syscalls/syscall-ppc64.tbl
new file mode 100644
index 000000000000..331937973f72
--- /dev/null
+++ b/criu/arch/ppc64/syscalls/syscall-ppc64.tbl
@@ -0,0 +1,105 @@
+#
+# System calls table, please make sure the table consists only of the syscalls
+# really used somewhere in the project.
+#
+# The template is (name and arguments are optional if you need only __NR_x
+# defined, but no real entry point in syscalls lib).
+#
+# name code name arguments
+# -----------------------------------------------------------------------
+#
+__NR_read 3 sys_read (int fd, void *buf, unsigned long count)
+__NR_write 4 sys_write (int fd, const void *buf, unsigned long count)
+__NR_open 5 sys_open (const char *filename, unsigned long flags, unsigned long mode)
+__NR_close 6 sys_close (int fd)
+__NR_lseek 19 sys_lseek (int fd, unsigned long offset, unsigned long origin)
+__NR_mmap 90 sys_mmap (void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset)
+__NR_mprotect 125 sys_mprotect (const void *addr, unsigned long len, unsigned long prot)
+__NR_munmap 91 sys_munmap (void *addr, unsigned long len)
+__NR_brk 45 sys_brk (void *addr)
+__NR_rt_sigaction 173 sys_sigaction (int signum, const rt_sigaction_t *act, rt_sigaction_t *oldact, size_t sigsetsize)
+__NR_rt_sigprocmask 174 sys_sigprocmask (int how, k_rtsigset_t *set, k_rtsigset_t *old, size_t sigsetsize)
+__NR_rt_sigreturn 172 sys_rt_sigreturn (void)
+__NR_ioctl 54 sys_ioctl (unsigned int fd, unsigned int cmd, unsigned long arg)
+__NR_pread64 179 sys_pread (unsigned int fd, char *buf, size_t count, loff_t pos)
+__NR_ptrace 26 sys_ptrace (long request, pid_t pid, void *addr, void *data)
+__NR_mremap 163 sys_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr)
+__NR_mincore 206 sys_mincore (void *addr, unsigned long size, unsigned char *vec)
+__NR_madvise 205 sys_madvise (unsigned long start, size_t len, int behavior)
+__NR_pause 29 sys_pause (void)
+__NR_nanosleep 162 sys_nanosleep (struct timespec *req, struct timespec *rem)
+__NR_getitimer 105 sys_getitimer (int which, const struct itimerval *val)
+__NR_setitimer 104 sys_setitimer (int which, const struct itimerval *val, struct itimerval *old)
+__NR_getpid 20 sys_getpid (void)
+__NR_socket 326 sys_socket (int domain, int type, int protocol)
+__NR_connect 328 sys_connect (int sockfd, struct sockaddr *addr, int addrlen)
+__NR_sendto 335 sys_sendto (int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len)
+__NR_recvfrom 337 sys_recvfrom (int sockfd, void *ubuf, size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len)
+__NR_sendmsg 341 sys_sendmsg (int sockfd, const struct msghdr *msg, int flags)
+__NR_recvmsg 342 sys_recvmsg (int sockfd, struct msghdr *msg, int flags)
+__NR_shutdown 338 sys_shutdown (int sockfd, int how)
+__NR_bind 327 sys_bind (int sockfd, const struct sockaddr *addr, int addrlen)
+__NR_setsockopt 339 sys_setsockopt (int sockfd, int level, int optname, const void *optval, socklen_t optlen)
+__NR_getsockopt 340 sys_getsockopt (int sockfd, int level, int optname, const void *optval, socklen_t *optlen)
+__NR_clone 120 sys_clone (unsigned long flags, void *child_stack, void *parent_tid, void *child_tid)
+__NR_exit 1 sys_exit (unsigned long error_code)
+__NR_wait4 114 sys_wait4 (int pid, int *status, int options, struct rusage *ru)
+__NR_kill 37 sys_kill (long pid, int sig)
+__NR_fcntl 55 sys_fcntl (int fd, int type, long arg)
+__NR_flock 143 sys_flock (int fd, unsigned long cmd)
+__NR_mkdir 39 sys_mkdir (const char *name, int mode)
+__NR_rmdir 40 sys_rmdir (const char *name)
+__NR_unlink 10 sys_unlink (char *pathname)
+__NR_readlinkat 296 sys_readlinkat (int fd, const char *path, char *buf, int bufsize)
+__NR_umask 60 sys_umask (int mask)
+__NR_getgroups 80 sys_getgroups (int gsize, unsigned int *groups)
+__NR_setgroups 81 sys_setgroups (int gsize, unsigned int *groups)
+__NR_setresuid 164 sys_setresuid (int uid, int euid, int suid)
+__NR_getresuid 165 sys_getresuid (int *uid, int *euid, int *suid)
+__NR_setresgid 169 sys_setresgid (int gid, int egid, int sgid)
+__NR_getresgid 170 sys_getresgid (int *gid, int *egid, int *sgid)
+__NR_getpgid 132 sys_getpgid (pid_t pid)
+__NR_setfsuid 138 sys_setfsuid (int fsuid)
+__NR_setfsgid 139 sys_setfsgid (int fsgid)
+__NR_getsid 147 sys_getsid (void)
+__NR_capget 183 sys_capget (struct cap_header *h, struct cap_data *d)
+__NR_capset 184 sys_capset (struct cap_header *h, struct cap_data *d)
+__NR_rt_sigqueueinfo 177 sys_rt_sigqueueinfo (pid_t pid, int sig, siginfo_t *info)
+__NR_sigaltstack 185 sys_sigaltstack (const void *uss, void *uoss)
+__NR_personality 136 sys_personality (unsigned int personality)
+__NR_setpriority 97 sys_setpriority (int which, int who, int nice)
+__NR_sched_setscheduler 156 sys_sched_setscheduler (int pid, int policy, struct sched_param *p)
+__NR_prctl 171 sys_prctl (int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5)
+__NR_setrlimit 75 sys_setrlimit (int resource, struct krlimit *rlim)
+__NR_mount 21 sys_mount (char *dev_nmae, char *dir_name, char *type, unsigned long flags, void *data)
+__NR_umount2 52 sys_umount2 (char *name, int flags)
+__NR_gettid 207 sys_gettid (void)
+__NR_futex 221 sys_futex (u32 *uaddr, int op, u32 val, struct timespec *utime, u32 *uaddr2, u32 val3)
+__NR_set_tid_address 232 sys_set_tid_address (int *tid_addr)
+__NR_restart_syscall 0 sys_restart_syscall (void)
+__NR_sys_timer_create 240 sys_timer_create (clockid_t which_clock, struct sigevent *timer_event_spec, kernel_timer_t *created_timer_id)
+__NR_sys_timer_settime 241 sys_timer_settime (kernel_timer_t timer_id, int flags, const struct itimerspec *new_setting, struct itimerspec *old_setting)
+__NR_sys_timer_gettime 242 sys_timer_gettime (int timer_id, const struct itimerspec *setting)
+__NR_sys_timer_getoverrun 243 sys_timer_getoverrun (int timer_id)
+__NR_sys_timer_delete 244 sys_timer_delete (kernel_timer_t timer_id)
+__NR_clock_gettime 246 sys_clock_gettime (const clockid_t which_clock, const struct timespec *tp)
+__NR_exit_group 234 sys_exit_group (int error_code)
+__NR_waitid 272 sys_waitid (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru)
+__NR_set_robust_list 300 sys_set_robust_list (struct robust_list_head *head, size_t len)
+__NR_get_robust_list 299 sys_get_robust_list (int pid, struct robust_list_head **head_ptr, size_t *len_ptr)
+__NR_vmsplice 285 sys_vmsplice (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags)
+__NR_openat 286 sys_openat (int dfd, const char *filename, int flags, int mode)
+__NR_timerfd_settime 311 sys_timerfd_settime (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr)
+__NR_signalfd4 313 sys_signalfd (int fd, k_rtsigset_t *mask, size_t sizemask, int flags)
+__NR_rt_tgsigqueueinfo 322 sys_rt_tgsigqueueinfo (pid_t tgid, pid_t pid, int sig, siginfo_t *info)
+__NR_fanotify_init 323 sys_fanotify_init (unsigned int flags, unsigned int event_f_flags)
+__NR_fanotify_mark 324 sys_fanotify_mark (int fanotify_fd, unsigned int flags, u64 mask, int dfd, const char *pathname)
+__NR_prlimit64 325 sys_prlimit64 (pid_t pid, unsigned int resource, const struct rlimit64 *new_rlim, struct rlimit64 *old_rlim)
+__NR_open_by_handle_at 346 sys_open_by_handle_at (int mountdirfd, struct file_handle *handle, int flags)
+__NR_setns 350 sys_setns (int fd, int nstype)
+__NR_kcmp 354 sys_kcmp (pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2)
+__NR_seccomp 358 sys_seccomp (unsigned int op, unsigned int flags, const char *uargs)
+__NR_memfd_create 360 sys_memfd_create (const char *name, unsigned int flags)
+__NR_io_setup 227 sys_io_setup (unsigned nr_events, aio_context_t *ctx_idp)
+__NR_io_getevents 229 sys_io_getevents (aio_context_t ctx_id, long min_nr, long nr, struct io_event *events, struct timespec *timeout)
+__NR_ipc 117 sys_ipc (unsigned int call, int first, unsigned long second, unsigned long third, const void *ptr, long fifth)
diff --git a/criu/arch/ppc64/vdso-pie.c b/criu/arch/ppc64/vdso-pie.c
new file mode 100644
index 000000000000..30437d5cc686
--- /dev/null
+++ b/criu/arch/ppc64/vdso-pie.c
@@ -0,0 +1,155 @@
+#include <unistd.h>
+
+#include "asm/string.h"
+#include "asm/types.h"
+
+#include "syscall.h"
+#include "parasite-vdso.h"
+#include "log.h"
+#include "bug.h"
+
+#ifdef LOG_PREFIX
+# undef LOG_PREFIX
+#endif
+#define LOG_PREFIX "vdso: "
+
+/* These symbols are defined in vdso-trampoline.S */
+extern char *vdso_trampoline, *vdso_trampoline_end;
+
+static inline void invalidate_caches(unsigned long at)
+{
+ asm volatile("isync \n" \
+ "li 3,0 \n" /* r3 = 0, used as the first operand of dcbf/icbi below */ \
+ "dcbf 3,%0 \n" /* data cache block flush for the block holding 'at' */ \
+ "sync \n" \
+ "icbi 3,%0 \n" /* instruction cache block invalidate for the same address */ \
+ "isync \n" \
+ : /* no output */ \
+ : "r"(at) /* NOTE(review): only one address is passed; confirm callers never patch across a cache block boundary */ \
+ :"memory", "r3");
+}
+
+/* This is the size of the trampoline call :
+ * mflr r0
+ * bl trampoline
+ * <64 bit address>
+ */
+#define TRAMP_CALL_SIZE (2*sizeof(uint32_t) + sizeof(uint64_t))
+
+/*
+ * put_trampoline does 2 things :
+ *
+ * 1. it looks for a place in the checkpointed vDSO where to put the
+ * trampoline code (see vdso-trampoline.S).
+ *
+ * 2. for each symbol from the checkpointed vDSO, it checks that there is
+ * enough room to put the call to the vDSO trampoline (see
+ * TRAMP_CALL_SIZE's comment above).
+ * This is done by checking that there are no interesting symbols in the range
+ * of current one's offset -> (current one's offset + TRAMP_CALL_SIZE).
+ * Unfortunately the symbols are not sorted by address so we have to look
+ * for the complete table all the time. Since the vDSO is small, this is
+ * not a big issue.
+ */
+static unsigned long put_trampoline(unsigned long at, struct vdso_symtable *sym)
+{
+	int i, j;
+	unsigned long size;
+	unsigned long trampoline = 0;
+
+	/* Room needed for the trampoline code itself */
+	size = (unsigned long)&vdso_trampoline_end
+		- (unsigned long)&vdso_trampoline;
+
+	for (i = 0; i < ARRAY_SIZE(sym->symbols); i++) {
+		unsigned long end, next = 0;
+
+		if (vdso_symbol_empty(&sym->symbols[i]))
+			continue;
+
+		pr_debug("Checking '%s' at %lx\n", sym->symbols[i].name,
+			 sym->symbols[i].offset);
+		/* First byte past the trampoline call patched into this symbol */
+		end = sym->symbols[i].offset + TRAMP_CALL_SIZE;
+
+		/*
+		 * Find the nearest following symbol we are interested in; the
+		 * table is unsorted, so scan it all before deciding anything.
+		 */
+		for (j = 0; j < ARRAY_SIZE(sym->symbols); j++) {
+			if (i == j || vdso_symbol_empty(&sym->symbols[j]))
+				continue;
+
+			if (sym->symbols[j].offset <= sym->symbols[i].offset)
+				/* this symbol does not follow the current one */
+				continue;
+
+			if (!next || sym->symbols[j].offset < next)
+				next = sym->symbols[j].offset;
+		}
+
+		if (!next)
+			/* nothing follows this symbol, no bound to check */
+			continue;
+
+		if (end > next) {
+			/* we cannot even put the trampoline call for this symbol */
+			pr_err("Can't handle small vDSO symbol %s\n",
+			       sym->symbols[i].name);
+			return 0;
+		}
+
+		if (trampoline || (next - end) <= size)
+			/* already placed, or not enough room here */
+			continue;
+
+		trampoline = at + end;
+		pr_debug("Puting vDSO trampoline in %s at %lx\n",
+			 sym->symbols[i].name, trampoline);
+		builtin_memcpy((void *)trampoline, &vdso_trampoline, size);
+		invalidate_caches(trampoline);
+	}
+
+	return trampoline;
+}
+
+static inline void put_trampoline_call(unsigned long at, unsigned long to,
+				       unsigned long tr)
+{
+	uint32_t *insn = (uint32_t *)at;
+
+	insn[0] = 0x7C0802a6; /* mflr r0 */
+	insn[1] = 0x48000001 | ((long)(tr-at-4) & 0x3fffffc); /* bl tr */
+	*(uint64_t *)&insn[2] = to; /* the address to read by the trampoline */
+
+	invalidate_caches(at);
+}
+
+int vdso_redirect_calls(unsigned long base_to,
+			unsigned long base_from,
+			struct vdso_symtable *to,
+			struct vdso_symtable *from)
+{
+	unsigned long tramp_addr = put_trampoline(base_from, from);
+	unsigned int i;
+
+	/* put_trampoline() hands back 0 when it could not do its job */
+	if (tramp_addr == 0)
+		return 1;
+
+	for (i = 0; i < ARRAY_SIZE(to->symbols); i++) {
+		unsigned long src = from->symbols[i].offset;
+		unsigned long dst = to->symbols[i].offset;
+
+		if (vdso_symbol_empty(&from->symbols[i]))
+			continue;
+
+		pr_debug("br: %lx/%lx -> %lx/%lx (index %d) '%s'\n",
+			 base_from, src, base_to, dst, i,
+			 from->symbols[i].name);
+
+		put_trampoline_call(base_from + src, base_to + dst, tramp_addr);
+	}
+
+	return 0;
+}
diff --git a/criu/arch/ppc64/vdso-trampoline.S b/criu/arch/ppc64/vdso-trampoline.S
new file mode 100644
index 000000000000..e910e7ab99a4
--- /dev/null
+++ b/criu/arch/ppc64/vdso-trampoline.S
@@ -0,0 +1,11 @@
+#include "asm/linkage.h"
+
+ .section .text
+
+GLOBAL(vdso_trampoline)
+ mflr r12 /* r12 = vdso_ptr's address (LR as left by the 'bl' that got us here) */
+ mtlr r0 /* restore lr from r0, where the call stub's 'mflr r0' kept it */
+ ld r12,0(r12) /* read the value stored in vdso_ptr */
+ mtctr r12 /* branch to it */
+ bctr
+GLOBAL(vdso_trampoline_end)
diff --git a/criu/arch/scripts/arm/gen-sys-exec-tbl.pl b/criu/arch/scripts/arm/gen-sys-exec-tbl.pl
new file mode 100755
index 000000000000..a3037b78c34e
--- /dev/null
+++ b/criu/arch/scripts/arm/gen-sys-exec-tbl.pl
@@ -0,0 +1,39 @@
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+
+my $in = $ARGV[0];
+my $tblout = $ARGV[1];
+my $bits = $ARGV[2];
+
+my $code = "code$bits";
+
+open TBLOUT, ">", $tblout or die $!;
+open IN, "<", $in or die $!;
+
+print TBLOUT "/* Autogenerated, don't edit */\n";
+
+for (<IN>) {
+ if ($_ =~ /\#/) {
+ next; # any line containing a '#' is skipped
+ }
+
+ my $sys_name;
+ my $sys_num;
+
+ if (/(?<name>\S+)\s+(?<alias>\S+)\s+(?<code64>\d+|\!)\s+(?<code32>(?:\d+|\!))\s+\((?<args>.+)\)/) {
+ $sys_name = $+{alias}; # five-column form: the alias names the entry
+ } elsif (/(?<name>\S+)\s+(?<code64>\d+|\!)\s+(?<code32>(?:\d+|\!))\s+\((?<args>.+)\)/) {
+ $sys_name = $+{name}; # four-column form: no alias given
+ } else {
+ unlink $tblout; # do not leave a partial table behind
+ die "Invalid syscall definition file: invalid entry $_\n";
+ }
+
+ $sys_num = $+{$code}; # $code is "code32" or "code64", picked by the third argument
+
+ if ($sys_num ne "!") { # "!" in the selected column: emit nothing for this entry
+ print TBLOUT "SYSCALL($sys_name, $sys_num)\n";
+ }
+}
diff --git a/criu/arch/scripts/arm/gen-syscalls.pl b/criu/arch/scripts/arm/gen-syscalls.pl
new file mode 100755
index 000000000000..6fb8f3bf2071
--- /dev/null
+++ b/criu/arch/scripts/arm/gen-syscalls.pl
@@ -0,0 +1,95 @@
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+
+my $in = $ARGV[0]; # syscall definition table to read
+my $codesout = $ARGV[1]; # output: header with the __NR_* defines
+my $codes = $ARGV[1];
+$codes =~ s/.*include\///g; # keep only the part after "include/" for use in #include lines
+my $protosout = $ARGV[2]; # output: header with the sys_* prototypes
+my $protos = $ARGV[2];
+$protos =~ s/.*include\///g;
+my $asmout = $ARGV[3]; # output: assembly file with one "syscall" line per entry
+my $asmcommon = $ARGV[4]; # file #included by the generated assembly
+my $prototypes = $ARGV[5]; # header #included by the generated prototypes header
+$prototypes =~ s/.*include\///g;
+my $bits = $ARGV[6]; # selects the code column through "code$bits"
+
+my $codesdef = $codes;
+$codesdef =~ tr/.-/_/; # '.' and '-' become '_' to form the include-guard macro name
+my $protosdef = $protos;
+$protosdef =~ tr/.-/_/;
+my $code = "code$bits";
+my $need_aux = 0; # set once an entry has "!" in the selected column
+
+unlink $codesout;
+unlink $protosout;
+unlink $asmout;
+
+open CODESOUT, ">", $codesout or die $!;
+open PROTOSOUT, ">", $protosout or die $!;
+open ASMOUT, ">", $asmout or die $!;
+open IN, "<", $in or die $!;
+
+print CODESOUT <<"END";
+/* Autogenerated, don't edit */
+#ifndef $codesdef
+#define $codesdef
+END
+
+print PROTOSOUT <<"END";
+/* Autogenerated, don't edit */
+#ifndef $protosdef
+#define $protosdef
+#include "$prototypes"
+#include "$codes"
+END
+
+print ASMOUT <<"END";
+/* Autogenerated, don't edit */
+#include "$codes"
+#include "$asmcommon"
+END
+
+
+for (<IN>) {
+ if ($_ =~ /\#/) {
+ next; # any line containing a '#' is skipped
+ }
+
+ my $code_macro;
+ my $sys_name;
+
+ if (/(?<name>\S+)\s+(?<alias>\S+)\s+(?<code64>\d+|\!)\s+(?<code32>(?:\d+|\!))\s+\((?<args>.+)\)/) {
+ $code_macro = "__NR_$+{name}";
+ $sys_name = "sys_$+{alias}"; # five-column form: the alias names the wrapper
+ } elsif (/(?<name>\S+)\s+(?<code64>\d+|\!)\s+(?<code32>(?:\d+|\!))\s+\((?<args>.+)\)/) {
+ $code_macro = "__NR_$+{name}";
+ $sys_name = "sys_$+{name}"; # four-column form: wrapper named after the syscall
+ } else {
+ unlink $codesout; # do not leave partial outputs behind
+ unlink $protosout;
+ unlink $asmout;
+
+ die "Invalid syscall definition file: invalid entry $_\n";
+ }
+
+ if ($+{$code} ne "!") {
+ print CODESOUT "#define $code_macro $+{$code}\n";
+ print ASMOUT "syscall $sys_name, $code_macro\n";
+
+ } else {
+ $need_aux = 1; # "!" entry: nothing emitted here, the syscall-aux files get included at the end
+ }
+
+ print PROTOSOUT "extern long $sys_name($+{args});\n"; # the prototype is emitted in both cases
+}
+
+if ($need_aux == 1) {
+ print ASMOUT "#include \"asm/syscall-aux.S\"\n";
+ print CODESOUT "#include \"asm/syscall-aux.h\"\n";
+}
+# Close the include guards; the generated headers must end with a newline.
+print CODESOUT "#endif /* $codesdef */\n";
+print PROTOSOUT "#endif /* $protosdef */\n";
diff --git a/criu/arch/x86/Makefile b/criu/arch/x86/Makefile
new file mode 100644
index 000000000000..369b41f28b6a
--- /dev/null
+++ b/criu/arch/x86/Makefile
@@ -0,0 +1,7 @@
+builtin-name := crtools.built-in.o
+
+ccflags-y += -iquote $(obj) -iquote $(SRC_DIR) -iquote $(obj)/include -iquote $(SRC_DIR)/criu/include
+
+obj-y += cpu.o
+obj-y += crtools.o
+obj-y += prlimit.o
diff --git a/criu/arch/x86/Makefile.syscalls b/criu/arch/x86/Makefile.syscalls
new file mode 100644
index 000000000000..24841797bf5b
--- /dev/null
+++ b/criu/arch/x86/Makefile.syscalls
@@ -0,0 +1,66 @@
+builtin-name := syscalls.built-in.o
+
+SYS-TYPES := ../../include/syscall-types.h
+SYS-CODES := ../../include/syscall-codes.h
+SYS-PROTO := ../../include/syscall.h
+
+ifeq ($(ARCH),x86)
+ SYS-DEF := syscall_64.tbl
+ SYS-ASM-COMMON := syscall-common-x86-64.S
+ asflags-y += -fpie -Wstrict-prototypes -Wa,--noexecstack
+else
+ SYS-DEF := syscall_32.tbl
+ SYS-ASM-COMMON := syscall-common-x86-32.S
+ asflags-y += -fno-pic -Wstrict-prototypes -Wa,--noexecstack
+ obj-y += syscalls/syscall32.o
+
+$(obj)/syscalls/syscall32.o: $(obj)/$(SYS-CODES) $(obj)/$(SYS-PROTO)
+endif
+
+asflags-y += -D__ASSEMBLY__ -nostdlib -fomit-frame-pointer
+asflags-y += -iquote $(obj) -iquote $(obj)/include -iquote $(SRC_DIR)/criu/include
+
+SYS-ASM := syscalls.S
+obj-y += $(SYS-ASM:.S=).o
+
+$(obj)/$(SYS-CODES): $(obj)/syscalls/$(SYS-DEF)
+ $(E) " GEN " $@
+ $(Q) echo "/* Autogenerated, don't edit */" > $@
+ $(Q) echo "#ifndef __ASM_CR_SYSCALL_CODES_H__" >> $@
+ $(Q) echo "#define __ASM_CR_SYSCALL_CODES_H__" >> $@
+ $(Q) cat $< | awk '/^__NR/{print "#define", $$1, $$2}' >> $@
+ $(Q) echo "#endif /* __ASM_CR_SYSCALL_CODES_H__ */" >> $@
+cleanup-y += $(obj)/$(SYS-CODES)
+
+$(obj)/$(SYS-PROTO): $(obj)/syscalls/$(SYS-DEF)
+ $(E) " GEN " $@
+ $(Q) echo "/* Autogenerated, don't edit */" > $@
+ $(Q) echo "#ifndef __ASM_CR_SYSCALL_PROTO_H__" >> $@
+ $(Q) echo "#define __ASM_CR_SYSCALL_PROTO_H__" >> $@
+ $(Q) echo "#ifndef CR_NOGLIBC" >> $@
+ $(Q) echo "# error This file should only be used in the parasite code" >> $@
+ $(Q) echo "#endif" >> $@
+ $(Q) echo "#include \"syscall-codes.h\"" >> $@
+ $(Q) echo "#include \"syscall-types.h\"" >> $@
+ifneq ($(ARCH),x86)
+ $(Q) echo "#include \"asm/syscall32.h\"" >> $@
+endif
+ $(Q) cat $< | awk '/^__NR/{print "extern long", $$3, substr($$0, index($$0,$$4)), ";"}' >> $@
+ $(Q) echo "#endif /* __ASM_CR_SYSCALL_PROTO_H__ */" >> $@
+cleanup-y += $(obj)/$(SYS-PROTO)
+
+$(obj)/$(SYS-ASM): $(obj)/syscalls/$(SYS-DEF) $(obj)/syscalls/$(SYS-ASM-COMMON) $(obj)/$(SYS-CODES) $(obj)/$(SYS-PROTO)
+ $(E) " GEN " $@
+ $(Q) echo "/* Autogenerated, don't edit */" > $@
+ $(Q) echo "#include \"syscall-codes.h\"" >> $@
+ $(Q) echo "#include \"syscalls/$(SYS-ASM-COMMON)\"" >> $@
+ $(Q) cat $< | awk '/^__NR/{print "SYSCALL(", $$3, ",", $$2, ")"}' >> $@
+cleanup-y += $(obj)/$(SYS-ASM)
+
+SYS-EXEC-TBL := sys-exec-tbl.c
+$(obj)/$(SYS-EXEC-TBL): $(obj)/syscalls/$(SYS-DEF) $(obj)/$(SYS-CODES) $(obj)/$(SYS-PROTO)
+ $(E) " GEN " $@
+ $(Q) echo "/* Autogenerated, don't edit */" > $@
+ $(Q) cat $< | awk '/^__NR/{print "SYSCALL(", substr($$3, 5), ",", $$2, ")"}' >> $@
+cleanup-y += $(obj)/$(SYS-EXEC-TBL)
+all-y += $(obj)/$(SYS-EXEC-TBL)
diff --git a/criu/arch/x86/cpu.c b/criu/arch/x86/cpu.c
new file mode 100644
index 000000000000..d703e68e3621
--- /dev/null
+++ b/criu/arch/x86/cpu.c
@@ -0,0 +1,491 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+
+#include <sys/types.h>
+
+#include "asm/bitops.h"
+#include "asm/types.h"
+#include "asm/cpu.h"
+#include "asm/fpu.h"
+
+#include "compiler.h"
+
+#include "cr_options.h"
+#include "proc_parse.h"
+#include "util.h"
+#include "log.h"
+
+#include "cpu.h"
+
+#include "protobuf.h"
+#include "protobuf/cpuinfo.pb-c.h"
+
+#undef LOG_PREFIX
+#define LOG_PREFIX "cpu: "
+
+static struct cpuinfo_x86 rt_cpu_info;
+
+static void set_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature)
+{
+ if (likely(feature < NCAPINTS_BITS)) /* out-of-range feature numbers are silently ignored */
+ set_bit(feature, (unsigned long *)c->x86_capability);
+}
+
+static void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature)
+{
+ if (likely(feature < NCAPINTS_BITS)) /* out-of-range feature numbers are silently ignored */
+ clear_bit(feature, (unsigned long *)c->x86_capability);
+}
+
+static int test_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature)
+{
+	if (!likely(feature < NCAPINTS_BITS))
+		return 0; /* out-of-range feature numbers read as "not set" */
+	return test_bit(feature, (unsigned long *)c->x86_capability);
+}
+
+bool cpu_has_feature(unsigned int feature)
+{
+	return test_cpu_cap(&rt_cpu_info, feature) != 0; /* looks at rt_cpu_info only */
+}
+
+static int cpu_init_cpuid(struct cpuinfo_x86 *c)
+{
+ /*
+ * See cpu_detect() in the kernel, also
+ * read cpuid specs not only from general
+ * SDM but for extended instructions set
+ * reference.
+ */
+
+ /* Get vendor name */
+ cpuid(0x00000000,
+ (unsigned int *)&c->cpuid_level,
+ (unsigned int *)&c->x86_vendor_id[0],
+ (unsigned int *)&c->x86_vendor_id[8],
+ (unsigned int *)&c->x86_vendor_id[4]);
+
+ if (!strcmp(c->x86_vendor_id, "GenuineIntel")) {
+ c->x86_vendor = X86_VENDOR_INTEL;
+ } else if (!strcmp(c->x86_vendor_id, "AuthenticAMD")) {
+ c->x86_vendor = X86_VENDOR_AMD;
+ } else {
+ pr_err("Unsupported CPU vendor %s\n",
+ c->x86_vendor_id);
+ return -1;
+ }
+
+ c->x86_family = 4;
+
+ /* Intel-defined flags: level 0x00000001 */
+ if (c->cpuid_level >= 0x00000001) {
+ u32 eax, ebx, ecx, edx;
+
+ cpuid(0x00000001, &eax, &ebx, &ecx, &edx);
+ c->x86_family = (eax >> 8) & 0xf;
+ c->x86_model = (eax >> 4) & 0xf;
+ c->x86_mask = eax & 0xf;
+
+ if (c->x86_family == 0xf)
+ c->x86_family += (eax >> 20) & 0xff;
+ if (c->x86_family >= 0x6)
+ c->x86_model += ((eax >> 16) & 0xf) << 4;
+
+ c->x86_capability[0] = edx;
+ c->x86_capability[4] = ecx;
+ }
+
+ /* Additional Intel-defined flags: level 0x00000007 */
+ if (c->cpuid_level >= 0x00000007) {
+ u32 eax, ebx, ecx, edx;
+
+ cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);
+ c->x86_capability[9] = ebx;
+ c->x86_capability[11] = ecx;
+ }
+
+ /* Extended state features: level 0x0000000d */
+ if (c->cpuid_level >= 0x0000000d) {
+ u32 eax, ebx, ecx, edx;
+
+ cpuid_count(0x0000000d, 1, &eax, &ebx, &ecx, &edx);
+ c->x86_capability[10] = eax;
+ }
+
+ /* AMD-defined flags: level 0x80000001 */
+ c->extended_cpuid_level = cpuid_eax(0x80000000);
+
+ if ((c->extended_cpuid_level & 0xffff0000) == 0x80000000) {
+ if (c->extended_cpuid_level >= 0x80000001) {
+ c->x86_capability[1] = cpuid_edx(0x80000001);
+ c->x86_capability[6] = cpuid_ecx(0x80000001);
+ }
+ }
+
+ /*
+ * We don't care about scattered features for now,
+ * otherwise look into init_scattered_cpuid_features()
+ * in kernel.
+ */
+
+ if (c->extended_cpuid_level >= 0x80000004) {
+ unsigned int *v;
+ char *p, *q;
+ v = (unsigned int *)c->x86_model_id;
+ cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
+ cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
+ cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
+ c->x86_model_id[48] = 0;
+
+ /*
+ * Intel chips right-justify this string for some dumb reason;
+ * undo that brain damage:
+ */
+ p = q = &c->x86_model_id[0];
+ while (*p == ' ')
+ p++;
+ if (p != q) {
+ while (*p)
+ *q++ = *p++;
+ while (q <= &c->x86_model_id[48])
+ *q++ = '\0'; /* Zero-pad the rest */
+ }
+ }
+
+ /* On x86-64 NOPL is always present */
+ set_cpu_cap(c, X86_FEATURE_NOPL);
+
+ switch (c->x86_vendor) {
+ case X86_VENDOR_INTEL:
+ /*
+ * Strictly speaking we need to read MSR_IA32_MISC_ENABLE
+ * here but on ring3 it's impossible.
+ */
+ if (c->x86_family == 15) {
+ clear_cpu_cap(c, X86_FEATURE_REP_GOOD);
+ clear_cpu_cap(c, X86_FEATURE_ERMS);
+ } else if (c->x86_family == 6) {
+ /* On x86-64 rep is fine */
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+ }
+
+ /* See filter_cpuid_features in kernel */
+ if ((s32)c->cpuid_level < (s32)0x0000000d)
+ clear_cpu_cap(c, X86_FEATURE_XSAVE);
+ break;
+ case X86_VENDOR_AMD:
+ /*
+ * Bit 31 in normal CPUID used for nonstandard 3DNow ID;
+ * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
+ */
+ clear_cpu_cap(c, 0 * 32 + 31);
+ if (c->x86_family >= 0x10)
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+ if (c->x86_family == 0xf) {
+ u32 level;
+
+ /* On C+ stepping K8 rep microcode works well for copy/memset */
+ level = cpuid_eax(1);
+ if ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
+ set_cpu_cap(c, X86_FEATURE_REP_GOOD);
+ }
+ break;
+ }
+
+ return 0;
+}
+
+int cpu_init(void)
+{
+	if (cpu_init_cpuid(&rt_cpu_info) != 0)
+		return -1;
+
+	BUILD_BUG_ON(sizeof(struct xsave_struct) != XSAVE_SIZE);
+	BUILD_BUG_ON(sizeof(struct i387_fxsave_struct) != FXSAVE_SIZE);
+
+	/*
+	 * A CPU that reports an FPU is accepted only
+	 * when it reports fxsave support as well; one
+	 * without an FPU passes this check.
+	 */
+	if (cpu_has_feature(X86_FEATURE_FPU) &&
+	    !cpu_has_feature(X86_FEATURE_FXSR)) {
+		pr_err("missing support fxsave/restore insns\n");
+		return -1;
+	}
+
+	pr_debug("fpu:%d fxsr:%d xsave:%d\n",
+		 !!cpu_has_feature(X86_FEATURE_FPU),
+		 !!cpu_has_feature(X86_FEATURE_FXSR),
+		 !!cpu_has_feature(X86_FEATURE_XSAVE));
+
+	return 0;
+}
+
+int cpu_dump_cpuinfo(void)
+{
+	CpuinfoX86Entry x86 = CPUINFO_X86_ENTRY__INIT;
+	CpuinfoX86Entry *x86_list = &x86;
+	CpuinfoEntry entry = CPUINFO_ENTRY__INIT;
+	struct cr_img *img;
+	int ret;
+
+	img = open_image(CR_FD_CPUINFO, O_DUMP);
+	if (!img)
+		return -1;
+
+	entry.n_x86_entry = 1;
+	entry.x86_entry = &x86_list;
+
+	if (rt_cpu_info.x86_vendor == X86_VENDOR_INTEL)
+		x86.vendor_id = CPUINFO_X86_ENTRY__VENDOR__INTEL;
+	else
+		x86.vendor_id = CPUINFO_X86_ENTRY__VENDOR__AMD;
+
+	x86.cpu_family = rt_cpu_info.x86_family;
+	x86.model = rt_cpu_info.x86_model;
+	x86.stepping = rt_cpu_info.x86_mask;
+	x86.capability_ver = 1;
+	x86.capability = (void *)rt_cpu_info.x86_capability;
+	x86.n_capability = ARRAY_SIZE(rt_cpu_info.x86_capability);
+
+	/* The model name is optional: set only when the string is not empty */
+	if (rt_cpu_info.x86_model_id[0])
+		x86.model_id = rt_cpu_info.x86_model_id;
+
+	ret = pb_write_one(img, &entry, PB_CPUINFO) < 0 ? -1 : 0;
+	close_image(img);
+	return ret;
+}
+
+#define __ins_bit(__l, __v) (1u << ((__v) - 32u * (__l)))
+
+static u32 x86_ins_capability_mask[NCAPINTS] = {
+ [0] =
+ __ins_bit(0, X86_FEATURE_FPU) |
+ __ins_bit(0, X86_FEATURE_TSC) |
+ __ins_bit(0, X86_FEATURE_CX8) |
+ __ins_bit(0, X86_FEATURE_SEP) |
+ __ins_bit(0, X86_FEATURE_CMOV) |
+ __ins_bit(0, X86_FEATURE_CLFLUSH) |
+ __ins_bit(0, X86_FEATURE_MMX) |
+ __ins_bit(0, X86_FEATURE_FXSR) |
+ __ins_bit(0, X86_FEATURE_XMM) |
+ __ins_bit(0, X86_FEATURE_XMM2),
+
+ [1] =
+ __ins_bit(1, X86_FEATURE_SYSCALL) |
+ __ins_bit(1, X86_FEATURE_MMXEXT) |
+ __ins_bit(1, X86_FEATURE_RDTSCP) |
+ __ins_bit(1, X86_FEATURE_3DNOWEXT) |
+ __ins_bit(1, X86_FEATURE_3DNOW),
+
+ [3] =
+ __ins_bit(3, X86_FEATURE_REP_GOOD) |
+ __ins_bit(3, X86_FEATURE_NOPL),
+
+ [4] =
+ __ins_bit(4, X86_FEATURE_XMM3) |
+ __ins_bit(4, X86_FEATURE_PCLMULQDQ) |
+ __ins_bit(4, X86_FEATURE_MWAIT) |
+ __ins_bit(4, X86_FEATURE_SSSE3) |
+ __ins_bit(4, X86_FEATURE_CX16) |
+ __ins_bit(4, X86_FEATURE_XMM4_1) |
+ __ins_bit(4, X86_FEATURE_XMM4_2) |
+ __ins_bit(4, X86_FEATURE_MOVBE) |
+ __ins_bit(4, X86_FEATURE_POPCNT) |
+ __ins_bit(4, X86_FEATURE_AES) |
+ __ins_bit(4, X86_FEATURE_XSAVE) |
+ __ins_bit(4, X86_FEATURE_OSXSAVE) |
+ __ins_bit(4, X86_FEATURE_AVX) |
+ __ins_bit(4, X86_FEATURE_F16C) |
+ __ins_bit(4, X86_FEATURE_RDRAND),
+
+ [6] =
+ __ins_bit(6, X86_FEATURE_ABM) |
+ __ins_bit(6, X86_FEATURE_SSE4A) |
+ __ins_bit(6, X86_FEATURE_MISALIGNSSE) |
+ __ins_bit(6, X86_FEATURE_3DNOWPREFETCH) |
+ __ins_bit(6, X86_FEATURE_XOP) |
+ __ins_bit(6, X86_FEATURE_FMA4) |
+ __ins_bit(6, X86_FEATURE_TBM),
+
+ [9] =
+ __ins_bit(9, X86_FEATURE_FSGSBASE) |
+ __ins_bit(9, X86_FEATURE_BMI1) |
+ __ins_bit(9, X86_FEATURE_HLE) |
+ __ins_bit(9, X86_FEATURE_AVX2) |
+ __ins_bit(9, X86_FEATURE_BMI2) |
+ __ins_bit(9, X86_FEATURE_ERMS) |
+ __ins_bit(9, X86_FEATURE_RTM) |
+ __ins_bit(9, X86_FEATURE_MPX) |
+ __ins_bit(9, X86_FEATURE_AVX512F) |
+ __ins_bit(9, X86_FEATURE_AVX512DQ) |
+ __ins_bit(9, X86_FEATURE_RDSEED) |
+ __ins_bit(9, X86_FEATURE_ADX) |
+ __ins_bit(9, X86_FEATURE_CLFLUSHOPT) |
+ __ins_bit(9, X86_FEATURE_AVX512PF) |
+ __ins_bit(9, X86_FEATURE_AVX512ER) |
+ __ins_bit(9, X86_FEATURE_AVX512CD) |
+ __ins_bit(9, X86_FEATURE_SHA) |
+ __ins_bit(9, X86_FEATURE_AVX512BW) |
+ __ins_bit(9, X86_FEATURE_AVXVL),
+
+ [10] =
+ __ins_bit(10, X86_FEATURE_XSAVEOPT) |
+ __ins_bit(10, X86_FEATURE_XSAVEC) |
+ __ins_bit(10, X86_FEATURE_XGETBV1) |
+ __ins_bit(10, X86_FEATURE_XSAVES),
+
+ [11] =
+ __ins_bit(11, X86_FEATURE_PREFETCHWT1),
+};
+
+#undef __ins_bit
+
+static int cpu_validate_ins_features(CpuinfoX86Entry *img_x86_entry)
+{
+	size_t word;
+
+	for (word = 0; word < ARRAY_SIZE(rt_cpu_info.x86_capability); word++) {
+		u32 mask = x86_ins_capability_mask[word];
+		u32 wanted = img_x86_entry->capability[word] & mask;
+		u32 present = rt_cpu_info.x86_capability[word] & mask;
+
+		/*
+		 * The host may offer more instruction features than
+		 * the image carries, but must not lack any of them.
+		 */
+		if ((wanted & present) != wanted) {
+			pr_err("CPU instruction capabilities do not match run time\n");
+			return -1;
+		}
+	}
+	return 0;
+}
+
+static int cpu_validate_features(CpuinfoX86Entry *img_x86_entry)
+{
+	/* Returns 0 when the image's features pass the opts.cpu_cap check, -1 otherwise */
+
+	if (img_x86_entry->n_capability != ARRAY_SIZE(rt_cpu_info.x86_capability)) {
+		/*
+		 * Image carries different number of bits.
+		 * Simply reject, we can't guarantee anything
+		 * in such case.
+		 */
+		pr_err("Size of features in image mismatch "
+		       "one provided by run time CPU (%u:%u)\n",
+		       (unsigned)img_x86_entry->n_capability,
+		       (unsigned)ARRAY_SIZE(rt_cpu_info.x86_capability));
+		return -1;
+	}
+
+	if (opts.cpu_cap == CPU_CAP_FPU) {
+		/*
+		 * If we're requested to check FPU only ignore
+		 * any other bit. It's up to a user if the
+		 * rest of mismatches won't cause problems.
+		 */
+
+#define __mismatch_fpu_bit(__bit)					\
+		(test_bit(__bit, (void *)img_x86_entry->capability) &&	\
+		 !cpu_has_feature(__bit))
+		if (__mismatch_fpu_bit(X86_FEATURE_FPU) ||
+		    __mismatch_fpu_bit(X86_FEATURE_FXSR) ||
+		    __mismatch_fpu_bit(X86_FEATURE_XSAVE)) {
+			pr_err("FPU feature required by image "
+			       "is not supported on host.\n");
+			return -1;
+		} else
+			return 0;
+#undef __mismatch_fpu_bit
+	}
+
+	/* Capability on instructions level only. */
+	if (opts.cpu_cap == CPU_CAP_INS)
+		return cpu_validate_ins_features(img_x86_entry);
+
+	/*
+	 * Strict capability mode. Everything must match.
+	 */
+	if (memcmp(img_x86_entry->capability, rt_cpu_info.x86_capability,
+		   sizeof(rt_cpu_info.x86_capability))) {
+		pr_err("CPU capabilites do not match run time\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+int cpu_validate_cpuinfo(void)
+{
+	CpuinfoX86Entry *img_x86_entry;
+	CpuinfoEntry *img_cpu_info = NULL; /* stays NULL until pb_read_one() delivers an entry */
+	struct cr_img *img;
+	int ret = -1;
+
+	img = open_image(CR_FD_CPUINFO, O_RSTR);
+	if (!img)
+		return -1;
+
+	if (pb_read_one(img, &img_cpu_info, PB_CPUINFO) < 0)
+		goto err;
+
+	if (img_cpu_info->n_x86_entry != 1) {
+		pr_err("No x86 related cpuinfo in image, "
+		       "corruption (n_x86_entry = %zu)\n",
+		       img_cpu_info->n_x86_entry);
+		goto err;
+	}
+
+	img_x86_entry = img_cpu_info->x86_entry[0];
+	if (img_x86_entry->vendor_id != CPUINFO_X86_ENTRY__VENDOR__INTEL &&
+	    img_x86_entry->vendor_id != CPUINFO_X86_ENTRY__VENDOR__AMD) {
+		pr_err("Unknown cpu vendor %d\n", img_x86_entry->vendor_id);
+		goto err;
+	}
+	if (img_x86_entry->n_capability != ARRAY_SIZE(rt_cpu_info.x86_capability)) {
+		pr_err("Image carries %u words while %u expected\n",
+		       (unsigned)img_x86_entry->n_capability,
+		       (unsigned)ARRAY_SIZE(rt_cpu_info.x86_capability));
+		goto err;
+	}
+	ret = cpu_validate_features(img_x86_entry);
+err:
+	if (img_cpu_info) /* release the message unpacked by pb_read_one() */
+		cpuinfo_entry__free_unpacked(img_cpu_info, NULL);
+	close_image(img);
+	return ret;
+}
+
+int cpuinfo_dump(void)
+{
+	/* Probe the running CPU first; dump only when that succeeded */
+	if (cpu_init() || cpu_dump_cpuinfo())
+		return -1;
+
+	return 0;
+}
+
+int cpuinfo_check(void)
+{
+	int failed;
+
+	if (cpu_init() != 0)
+		return 1;
+
+	/*
+	 * With no capability mode requested, select
+	 * CPU_CAP_ALL; an explicit request (such as
+	 * instructions only) is kept as is.
+	 */
+	if (opts.cpu_cap == 0)
+		opts.cpu_cap = CPU_CAP_ALL;
+
+	failed = cpu_validate_cpuinfo() != 0;
+	return failed;
+}
diff --git a/criu/arch/x86/crtools.c b/criu/arch/x86/crtools.c
new file mode 100644
index 000000000000..f713b0d3fd40
--- /dev/null
+++ b/criu/arch/x86/crtools.c
@@ -0,0 +1,572 @@
+#include <string.h>
+#include <unistd.h>
+#include <elf.h>
+#include <sys/user.h>
+#include <sys/mman.h>
+
+#include "asm/processor-flags.h"
+#include "asm/restorer.h"
+#include "asm/types.h"
+#include "asm/fpu.h"
+
+#include "cr_options.h"
+#include "compiler.h"
+#include "ptrace.h"
+#include "parasite-syscall.h"
+#include "restorer.h"
+#include "log.h"
+#include "util.h"
+#include "cpu.h"
+#include "errno.h"
+
+#include "protobuf.h"
+#include "protobuf/core.pb-c.h"
+#include "protobuf/creds.pb-c.h"
+
+/*
+ * Injected syscall instruction
+ */
+const char code_syscall[] = {
+ 0x0f, 0x05, /* syscall */
+ 0xcc, 0xcc, 0xcc, 0xcc, 0xcc, 0xcc /* int 3, ... */
+};
+
+const int code_syscall_size = round_up(sizeof(code_syscall), sizeof(long));
+
+static inline __always_unused void __check_code_syscall(void)
+{
+ BUILD_BUG_ON(sizeof(code_syscall) != BUILTIN_SYSCALL_SIZE);
+ BUILD_BUG_ON(!is_log2(sizeof(code_syscall)));
+}
+
+void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs)
+{
+ regs->ip = new_ip;
+ if (stack)
+ regs->sp = (unsigned long) stack;
+
+ /* Avoid end of syscall processing */
+ regs->orig_ax = -1;
+
+ /* Make sure flags are in known state */
+ regs->flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_DF | X86_EFLAGS_IF);
+}
+
+static int task_in_compat_mode(pid_t pid)
+{
+ unsigned long cs, ds;
+
+ errno = 0;
+ cs = ptrace(PTRACE_PEEKUSER, pid, offsetof(user_regs_struct_t, cs), 0);
+ if (errno != 0) {
+ pr_perror("Can't get CS register for %d", pid);
+ return -1;
+ }
+
+ errno = 0;
+ ds = ptrace(PTRACE_PEEKUSER, pid, offsetof(user_regs_struct_t, ds), 0);
+ if (errno != 0) {
+ pr_perror("Can't get DS register for %d", pid);
+ return -1;
+ }
+
+ /* It's x86-32 or x32 */
+ return cs != 0x33 || ds == 0x2b;
+}
+
+bool arch_can_dump_task(pid_t pid)
+{
+ if (task_in_compat_mode(pid)) {
+ pr_err("Can't dump task %d running in 32-bit mode\n", pid);
+ return false;
+ }
+
+ return true;
+}
+
+int syscall_seized(struct parasite_ctl *ctl, int nr, unsigned long *ret,
+ unsigned long arg1,
+ unsigned long arg2,
+ unsigned long arg3,
+ unsigned long arg4,
+ unsigned long arg5,
+ unsigned long arg6)
+{
+ user_regs_struct_t regs = ctl->orig.regs;
+ int err;
+ /* Work on a copy of ctl->orig.regs: ax takes the syscall number, args go in di, si, dx, r10, r8, r9. */
+ regs.ax = (unsigned long)nr;
+ regs.di = arg1;
+ regs.si = arg2;
+ regs.dx = arg3;
+ regs.r10 = arg4;
+ regs.r8 = arg5;
+ regs.r9 = arg6;
+
+ err = __parasite_execute_syscall(ctl, &regs);
+ /* Hand back whatever ax holds after the call via *ret; err is the helper's own status. */
+ *ret = regs.ax;
+ return err;
+}
+
+int get_task_regs(pid_t pid, user_regs_struct_t regs, CoreEntry *core)
+{
+ struct xsave_struct xsave = { };
+
+ struct iovec iov;
+ int ret = -1;
+
+ pr_info("Dumping GP/FPU registers for %d\n", pid);
+
+ /* Did we come from a system call? */
+ if ((int)regs.orig_ax >= 0) {
+ /* Restart the system call */
+ switch ((long)(int)regs.ax) {
+ case -ERESTARTNOHAND:
+ case -ERESTARTSYS:
+ case -ERESTARTNOINTR:
+ regs.ax = regs.orig_ax;
+ regs.ip -= 2;
+ break;
+ case -ERESTART_RESTARTBLOCK:
+ pr_warn("Will restore %d with interrupted system call\n", pid);
+ regs.ax = -EINTR;
+ break;
+ }
+ }
+
+#define assign_reg(dst, src, e) do { dst->e = (__typeof__(dst->e))src.e; } while (0)
+#define assign_array(dst, src, e) memcpy(dst->e, &src.e, sizeof(src.e))
+
+ assign_reg(core->thread_info->gpregs, regs, r15);
+ assign_reg(core->thread_info->gpregs, regs, r14);
+ assign_reg(core->thread_info->gpregs, regs, r13);
+ assign_reg(core->thread_info->gpregs, regs, r12);
+ assign_reg(core->thread_info->gpregs, regs, bp);
+ assign_reg(core->thread_info->gpregs, regs, bx);
+ assign_reg(core->thread_info->gpregs, regs, r11);
+ assign_reg(core->thread_info->gpregs, regs, r10);
+ assign_reg(core->thread_info->gpregs, regs, r9);
+ assign_reg(core->thread_info->gpregs, regs, r8);
+ assign_reg(core->thread_info->gpregs, regs, ax);
+ assign_reg(core->thread_info->gpregs, regs, cx);
+ assign_reg(core->thread_info->gpregs, regs, dx);
+ assign_reg(core->thread_info->gpregs, regs, si);
+ assign_reg(core->thread_info->gpregs, regs, di);
+ assign_reg(core->thread_info->gpregs, regs, orig_ax);
+ assign_reg(core->thread_info->gpregs, regs, ip);
+ assign_reg(core->thread_info->gpregs, regs, cs);
+ assign_reg(core->thread_info->gpregs, regs, flags);
+ assign_reg(core->thread_info->gpregs, regs, sp);
+ assign_reg(core->thread_info->gpregs, regs, ss);
+ assign_reg(core->thread_info->gpregs, regs, fs_base);
+ assign_reg(core->thread_info->gpregs, regs, gs_base);
+ assign_reg(core->thread_info->gpregs, regs, ds);
+ assign_reg(core->thread_info->gpregs, regs, es);
+ assign_reg(core->thread_info->gpregs, regs, fs);
+ assign_reg(core->thread_info->gpregs, regs, gs);
+
+#ifndef PTRACE_GETREGSET
+# define PTRACE_GETREGSET 0x4204
+#endif
+
+ if (!cpu_has_feature(X86_FEATURE_FPU))
+ goto out;
+
+ /*
+ * FPU fetched either via fxsave or via xsave,
+ * thus decode it accordingly.
+ */
+
+ if (cpu_has_feature(X86_FEATURE_XSAVE)) {
+ iov.iov_base = &xsave;
+ iov.iov_len = sizeof(xsave);
+
+ if (ptrace(PTRACE_GETREGSET, pid, (unsigned int)NT_X86_XSTATE, &iov) < 0) {
+ pr_perror("Can't obtain FPU registers for %d", pid);
+ goto err;
+ }
+ } else {
+ if (ptrace(PTRACE_GETFPREGS, pid, NULL, &xsave)) {
+ pr_perror("Can't obtain FPU registers for %d", pid);
+ goto err;
+ }
+ }
+
+ assign_reg(core->thread_info->fpregs, xsave.i387, cwd);
+ assign_reg(core->thread_info->fpregs, xsave.i387, swd);
+ assign_reg(core->thread_info->fpregs, xsave.i387, twd);
+ assign_reg(core->thread_info->fpregs, xsave.i387, fop);
+ assign_reg(core->thread_info->fpregs, xsave.i387, rip);
+ assign_reg(core->thread_info->fpregs, xsave.i387, rdp);
+ assign_reg(core->thread_info->fpregs, xsave.i387, mxcsr);
+ assign_reg(core->thread_info->fpregs, xsave.i387, mxcsr_mask);
+
+ /* Make sure we have enough space */
+ BUG_ON(core->thread_info->fpregs->n_st_space != ARRAY_SIZE(xsave.i387.st_space));
+ BUG_ON(core->thread_info->fpregs->n_xmm_space != ARRAY_SIZE(xsave.i387.xmm_space));
+
+ assign_array(core->thread_info->fpregs, xsave.i387, st_space);
+ assign_array(core->thread_info->fpregs, xsave.i387, xmm_space);
+
+ if (cpu_has_feature(X86_FEATURE_XSAVE)) {
+ BUG_ON(core->thread_info->fpregs->xsave->n_ymmh_space != ARRAY_SIZE(xsave.ymmh.ymmh_space));
+
+ assign_reg(core->thread_info->fpregs->xsave, xsave.xsave_hdr, xstate_bv);
+ assign_array(core->thread_info->fpregs->xsave, xsave.ymmh, ymmh_space);
+ }
+
+#undef assign_reg
+#undef assign_array
+
+out:
+ ret = 0;
+
+err:
+ return ret;
+}
+
+int arch_alloc_thread_info(CoreEntry *core)
+{
+ size_t sz;
+ bool with_fpu, with_xsave = false;
+ void *m;
+ ThreadInfoX86 *ti = NULL;
+
+
+ with_fpu = cpu_has_feature(X86_FEATURE_FPU);
+
+ sz = sizeof(ThreadInfoX86) + sizeof(UserX86RegsEntry);
+ if (with_fpu) {
+ sz += sizeof(UserX86FpregsEntry);
+ with_xsave = cpu_has_feature(X86_FEATURE_XSAVE);
+ if (with_xsave)
+ sz += sizeof(UserX86XsaveEntry);
+ }
+
+ m = xmalloc(sz);
+ if (!m)
+ return -1;
+
+ ti = core->thread_info = xptr_pull(&m, ThreadInfoX86);
+ thread_info_x86__init(ti);
+ ti->gpregs = xptr_pull(&m, UserX86RegsEntry);
+ user_x86_regs_entry__init(ti->gpregs);
+
+ if (with_fpu) {
+ UserX86FpregsEntry *fpregs;
+
+ fpregs = ti->fpregs = xptr_pull(&m, UserX86FpregsEntry);
+ user_x86_fpregs_entry__init(fpregs);
+
+ /* These are numbers from kernel */
+ fpregs->n_st_space = 32;
+ fpregs->n_xmm_space = 64;
+
+ fpregs->st_space = xzalloc(pb_repeated_size(fpregs, st_space));
+ fpregs->xmm_space = xzalloc(pb_repeated_size(fpregs, xmm_space));
+
+ if (!fpregs->st_space || !fpregs->xmm_space)
+ goto err;
+
+ if (with_xsave) {
+ UserX86XsaveEntry *xsave;
+
+ xsave = fpregs->xsave = xptr_pull(&m, UserX86XsaveEntry);
+ user_x86_xsave_entry__init(xsave);
+
+ xsave->n_ymmh_space = 64;
+ xsave->ymmh_space = xzalloc(pb_repeated_size(xsave, ymmh_space));
+ if (!xsave->ymmh_space)
+ goto err;
+ }
+ }
+
+ return 0;
+err:
+ return -1;
+}
+
+void arch_free_thread_info(CoreEntry *core)
+{
+ if (!core->thread_info)
+ return;
+
+ if (core->thread_info->fpregs->xsave)
+ xfree(core->thread_info->fpregs->xsave->ymmh_space);
+ xfree(core->thread_info->fpregs->st_space);
+ xfree(core->thread_info->fpregs->xmm_space);
+ xfree(core->thread_info);
+}
+
+static bool valid_xsave_frame(CoreEntry *core)
+{
+ struct xsave_struct *x = NULL; /* only named inside ARRAY_SIZE() below */
+ /* Reject image entries whose FPU arrays are smaller than the runtime xsave layout. */
+ if (core->thread_info->fpregs->n_st_space < ARRAY_SIZE(x->i387.st_space)) {
+ pr_err("Corruption in FPU st_space area "
+ "(got %li but %li expected)\n",
+ (long)core->thread_info->fpregs->n_st_space,
+ (long)ARRAY_SIZE(x->i387.st_space));
+ return false;
+ }
+
+ if (core->thread_info->fpregs->n_xmm_space < ARRAY_SIZE(x->i387.xmm_space)) {
+ pr_err("Corruption in FPU xmm_space area "
+ "(got %li but %li expected)\n",
+ (long)core->thread_info->fpregs->n_xmm_space,
+ (long)ARRAY_SIZE(x->i387.xmm_space));
+ return false;
+ }
+
+ if (cpu_has_feature(X86_FEATURE_XSAVE)) {
+ if (core->thread_info->fpregs->xsave &&
+ core->thread_info->fpregs->xsave->n_ymmh_space < ARRAY_SIZE(x->ymmh.ymmh_space)) {
+ pr_err("Corruption in FPU ymmh_space area "
+ "(got %li but %li expected)\n",
+ (long)core->thread_info->fpregs->xsave->n_ymmh_space,
+ (long)ARRAY_SIZE(x->ymmh.ymmh_space));
+ return false;
+ }
+ } else {
+ /*
+ * If the image has an xsave area present, then the CPU we're
+ * restoring on must have the X86_FEATURE_XSAVE feature, unless
+ * that requirement is explicitly relaxed in options.
+ */
+ if (core->thread_info->fpregs->xsave) {
+ if (opts.cpu_cap & CPU_CAP_FPU) {
+ pr_err("FPU xsave area present, "
+ "but host cpu doesn't support it\n");
+ return false;
+ } else
+ pr_warn_once("FPU is about to restore ignoring ymm state!\n");
+ }
+ }
+
+ return true;
+}
+
+static void show_rt_xsave_frame(struct xsave_struct *x)
+{
+ struct fpx_sw_bytes *fpx = (void *)&x->i387.sw_reserved;
+ struct xsave_hdr_struct *xsave_hdr = &x->xsave_hdr;
+ struct i387_fxsave_struct *i387 = &x->i387;
+
+ pr_debug("xsave runtime structure\n");
+ pr_debug("-----------------------\n");
+
+ pr_debug("cwd:%x swd:%x twd:%x fop:%x mxcsr:%x mxcsr_mask:%x\n",
+ (int)i387->cwd, (int)i387->swd, (int)i387->twd,
+ (int)i387->fop, (int)i387->mxcsr, (int)i387->mxcsr_mask);
+
+ pr_debug("magic1:%x extended_size:%x xstate_bv:%lx xstate_size:%x\n",
+ fpx->magic1, fpx->extended_size, (long)fpx->xstate_bv, fpx->xstate_size);
+
+ pr_debug("xstate_bv: %lx\n", (long)xsave_hdr->xstate_bv);
+
+ pr_debug("-----------------------\n");
+}
+
+int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core)
+{
+ fpu_state_t *fpu_state = &sigframe->fpu_state;
+ struct xsave_struct *x = &fpu_state->xsave;
+
+ /*
+ * If no FPU information provided -- we're restoring
+ * old image which has no FPU support, or the dump simply
+ * has no FPU support at all.
+ */
+ if (!core->thread_info->fpregs) {
+ fpu_state->has_fpu = false;
+ return 0;
+ }
+
+ if (!valid_xsave_frame(core))
+ return -1;
+
+ fpu_state->has_fpu = true;
+
+#define assign_reg(dst, src, e) do { dst.e = (__typeof__(dst.e))src->e; } while (0)
+#define assign_array(dst, src, e) memcpy(dst.e, (src)->e, sizeof(dst.e))
+
+ assign_reg(x->i387, core->thread_info->fpregs, cwd);
+ assign_reg(x->i387, core->thread_info->fpregs, swd);
+ assign_reg(x->i387, core->thread_info->fpregs, twd);
+ assign_reg(x->i387, core->thread_info->fpregs, fop);
+ assign_reg(x->i387, core->thread_info->fpregs, rip);
+ assign_reg(x->i387, core->thread_info->fpregs, rdp);
+ assign_reg(x->i387, core->thread_info->fpregs, mxcsr);
+ assign_reg(x->i387, core->thread_info->fpregs, mxcsr_mask);
+
+ assign_array(x->i387, core->thread_info->fpregs, st_space);
+ assign_array(x->i387, core->thread_info->fpregs, xmm_space);
+
+ if (cpu_has_feature(X86_FEATURE_XSAVE)) {
+ struct fpx_sw_bytes *fpx_sw = (void *)&x->i387.sw_reserved;
+ void *magic2;
+
+ x->xsave_hdr.xstate_bv = XSTATE_FP | XSTATE_SSE | XSTATE_YMM;
+
+ /*
+ * fpregs->xsave pointer might not present on image so we
+ * simply clear out all ymm registers.
+ */
+ if (core->thread_info->fpregs->xsave)
+ assign_array(x->ymmh, core->thread_info->fpregs->xsave, ymmh_space);
+
+ fpx_sw->magic1 = FP_XSTATE_MAGIC1;
+ fpx_sw->xstate_bv = XSTATE_FP | XSTATE_SSE | XSTATE_YMM;
+ fpx_sw->xstate_size = sizeof(struct xsave_struct);
+ fpx_sw->extended_size = sizeof(struct xsave_struct) + FP_XSTATE_MAGIC2_SIZE;
+
+ /*
+ * This should be at the end of xsave frame.
+ */
+ magic2 = fpu_state->__pad + sizeof(struct xsave_struct);
+ *(u32 *)magic2 = FP_XSTATE_MAGIC2;
+ }
+
+ show_rt_xsave_frame(x);
+
+#undef assign_reg
+#undef assign_array
+
+ return 0;
+}
+
+void *mmap_seized(struct parasite_ctl *ctl,
+ void *addr, size_t length, int prot,
+ int flags, int fd, off_t offset)
+{
+ unsigned long map;
+ int err;
+
+ err = syscall_seized(ctl, __NR_mmap, &map,
+ (unsigned long)addr, length, prot, flags, fd, offset);
+ if (err < 0)
+ return NULL;
+
+ if (IS_ERR_VALUE(map)) {
+ if (map == -EACCES && (prot & PROT_WRITE) && (prot & PROT_EXEC))
+ pr_warn("mmap(PROT_WRITE | PROT_EXEC) failed for %d, "
+ "check selinux execmem policy\n", ctl->pid.real);
+ return NULL;
+ }
+
+ return (void *)map;
+}
+
+int restore_gpregs(struct rt_sigframe *f, UserX86RegsEntry *r)
+{
+#define CPREG1(d) f->uc.uc_mcontext.d = r->d
+#define CPREG2(d, s) f->uc.uc_mcontext.d = r->s
+
+#ifdef CONFIG_X86_64
+ CPREG1(r8);
+ CPREG1(r9);
+ CPREG1(r10);
+ CPREG1(r11);
+ CPREG1(r12);
+ CPREG1(r13);
+ CPREG1(r14);
+ CPREG1(r15);
+#endif
+
+ CPREG2(rdi, di);
+ CPREG2(rsi, si);
+ CPREG2(rbp, bp);
+ CPREG2(rbx, bx);
+ CPREG2(rdx, dx);
+ CPREG2(rax, ax);
+ CPREG2(rcx, cx);
+ CPREG2(rsp, sp);
+ CPREG2(rip, ip);
+ CPREG2(eflags, flags);
+
+ CPREG1(cs);
+ CPREG1(ss);
+
+#ifdef CONFIG_X86_32
+ CPREG1(gs);
+ CPREG1(fs);
+ CPREG1(es);
+ CPREG1(ds);
+#endif
+
+ return 0;
+}
+
+int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, fpu_state_t *fpu_state)
+{
+ unsigned long addr = (unsigned long)(void *)&fpu_state->xsave;
+
+ if ((addr % 64ul) == 0ul) {
+ sigframe->uc.uc_mcontext.fpstate = &fpu_state->xsave;
+ } else {
+ pr_err("Unaligned address passed: %lx\n", addr);
+ return -1;
+ }
+
+ return 0;
+}
+
+/* Copied from the gdb header gdb/nat/x86-dregs.h */
+
+/* Debug registers' indices. */
+#define DR_FIRSTADDR 0
+#define DR_LASTADDR 3
+#define DR_NADDR 4 /* The number of debug address registers. */
+#define DR_STATUS 6 /* Index of debug status register (DR6). */
+#define DR_CONTROL 7 /* Index of debug control register (DR7). */
+
+#define DR_LOCAL_ENABLE_SHIFT 0 /* Extra shift to the local enable bit. */
+#define DR_GLOBAL_ENABLE_SHIFT 1 /* Extra shift to the global enable bit. */
+#define DR_ENABLE_SIZE 2 /* Two enable bits per debug register. */
+
+/* Locally enable the break/watchpoint in the I'th debug register. */
+#define X86_DR_LOCAL_ENABLE(i) (1 << (DR_LOCAL_ENABLE_SHIFT + DR_ENABLE_SIZE * (i)))
+
+int ptrace_set_breakpoint(pid_t pid, void *addr)
+{
+ int ret;
+
+ /* Set a breakpoint */
+ if (ptrace(PTRACE_POKEUSER, pid,
+ offsetof(struct user, u_debugreg[DR_FIRSTADDR]),
+ addr)) {
+ pr_perror("Unable to setup a breakpoint into %d", pid);
+ return -1;
+ }
+
+ /* Enable the breakpoint */
+ if (ptrace(PTRACE_POKEUSER, pid,
+ offsetof(struct user, u_debugreg[DR_CONTROL]),
+ X86_DR_LOCAL_ENABLE(DR_FIRSTADDR))) {
+ pr_perror("Unable to enable the breakpoint for %d", pid);
+ return -1;
+ }
+
+ ret = ptrace(PTRACE_CONT, pid, NULL, NULL);
+ if (ret) {
+ pr_perror("Unable to restart the stopped tracee process %d", pid);
+ return -1;
+ }
+
+ return 1;
+}
+
+int ptrace_flush_breakpoints(pid_t pid)
+{
+ /* Disable the breakpoint */
+ if (ptrace(PTRACE_POKEUSER, pid,
+ offsetof(struct user, u_debugreg[DR_CONTROL]),
+ 0)) {
+ pr_perror("Unable to disable the breakpoint for %d", pid);
+ return -1;
+ }
+
+ return 0;
+}
+
diff --git a/criu/arch/x86/include/asm/atomic.h b/criu/arch/x86/include/asm/atomic.h
new file mode 100644
index 000000000000..d447b65cb4c6
--- /dev/null
+++ b/criu/arch/x86/include/asm/atomic.h
@@ -0,0 +1,78 @@
+#ifndef __CR_ATOMIC_H__
+#define __CR_ATOMIC_H__
+
+#include "asm/cmpxchg.h"
+
+#define LOCK_PREFIX "\n\tlock; "
+
+typedef struct {
+ int counter;
+} atomic_t;
+
+#define ATOMIC_INIT(i) { (i) }
+
+static inline int atomic_read(const atomic_t *v)
+{
+ return (*(volatile int *)&(v)->counter);
+}
+
+static inline void atomic_set(atomic_t *v, int i)
+{
+ v->counter = i;
+}
+
+static inline void atomic_add(int i, atomic_t *v)
+{
+ asm volatile(LOCK_PREFIX "addl %1,%0"
+ : "+m" (v->counter)
+ : "ir" (i));
+}
+
+static inline void atomic_sub(int i, atomic_t *v)
+{
+ asm volatile(LOCK_PREFIX "subl %1,%0"
+ : "+m" (v->counter)
+ : "ir" (i));
+}
+
+static inline void atomic_inc(atomic_t *v)
+{
+ asm volatile(LOCK_PREFIX "incl %0"
+ : "+m" (v->counter));
+}
+
+static inline void atomic_dec(atomic_t *v)
+{
+ asm volatile(LOCK_PREFIX "decl %0"
+ : "+m" (v->counter));
+}
+
+static inline int atomic_dec_and_test(atomic_t *v)
+{
+ unsigned char c;
+
+ asm volatile(LOCK_PREFIX "decl %0; sete %1"
+ : "+m" (v->counter), "=qm" (c)
+ : : "memory");
+ return c != 0;
+}
+
+static inline int atomic_add_return(int i, atomic_t *v)
+{
+ return i + xadd(&v->counter, i);
+}
+
+static inline int atomic_sub_return(int i, atomic_t *v)
+{
+ return atomic_add_return(-i, v);
+}
+
+#define atomic_inc_return(v) (atomic_add_return(1, v))
+#define atomic_dec_return(v) (atomic_sub_return(1, v))
+
+static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
+{
+ return cmpxchg(&v->counter, old, new);
+}
+
+#endif /* __CR_ATOMIC_H__ */
diff --git a/criu/arch/x86/include/asm/bitops.h b/criu/arch/x86/include/asm/bitops.h
new file mode 100644
index 000000000000..7d6283183953
--- /dev/null
+++ b/criu/arch/x86/include/asm/bitops.h
@@ -0,0 +1,113 @@
+#ifndef __CR_BITOPS_H__
+#define __CR_BITOPS_H__
+
+#include "asm/bitsperlong.h"
+
+#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
+#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_LONG)
+
+#define DECLARE_BITMAP(name, bits) \
+ unsigned long name[BITS_TO_LONGS(bits)]
+
+#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 1)
+/* Technically wrong, but this avoids compilation errors on some gcc
+ versions. */
+#define BITOP_ADDR(x) "=m" (*(volatile long *) (x))
+#else
+#define BITOP_ADDR(x) "+m" (*(volatile long *) (x))
+#endif
+
+#define ADDR BITOP_ADDR(addr)
+
+static inline void set_bit(int nr, volatile unsigned long *addr)
+{
+ asm volatile("btsl %1,%0" : ADDR : "Ir" (nr) : "memory");
+}
+
+static inline void change_bit(int nr, volatile unsigned long *addr)
+{
+ asm volatile("btcl %1,%0" : ADDR : "Ir" (nr));
+}
+
+static inline int test_bit(int nr, volatile const unsigned long *addr)
+{
+ int oldbit;
+
+ asm volatile("bt %2,%1\n\t"
+ "sbb %0,%0"
+ : "=r" (oldbit)
+ : "m" (*(unsigned long *)addr), "Ir" (nr));
+
+ return oldbit;
+}
+
+static inline void clear_bit(int nr, volatile unsigned long *addr)
+{
+ asm volatile("btrl %1,%0" : ADDR : "Ir" (nr));
+}
+
+/**
+ * __ffs - find first set bit in word
+ * @word: The word to search
+ *
+ * Undefined if no bit exists, so code should check against 0 first.
+ */
+static inline unsigned long __ffs(unsigned long word)
+{
+ asm("bsf %1,%0"
+ : "=r" (word)
+ : "rm" (word));
+ return word;
+}
+
+#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG)
+
+/*
+ * Find the next set bit in a memory region.
+ */
+static inline
+unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
+ unsigned long offset)
+{
+ const unsigned long *p = addr + BITOP_WORD(offset);
+ unsigned long result = offset & ~(BITS_PER_LONG-1);
+ unsigned long tmp;
+
+ if (offset >= size)
+ return size;
+ size -= result;
+ offset %= BITS_PER_LONG;
+ if (offset) {
+ tmp = *(p++);
+ tmp &= (~0UL << offset);
+ if (size < BITS_PER_LONG)
+ goto found_first;
+ if (tmp)
+ goto found_middle;
+ size -= BITS_PER_LONG;
+ result += BITS_PER_LONG;
+ }
+ while (size & ~(BITS_PER_LONG-1)) {
+ if ((tmp = *(p++)))
+ goto found_middle;
+ result += BITS_PER_LONG;
+ size -= BITS_PER_LONG;
+ }
+ if (!size)
+ return result;
+ tmp = *p;
+
+found_first:
+ tmp &= (~0UL >> (BITS_PER_LONG - size));
+ if (tmp == 0UL) /* Are any bits set? */
+ return result + size; /* Nope. */
+found_middle:
+ return result + __ffs(tmp);
+}
+
+#define for_each_bit(i, bitmask) \
+ for (i = find_next_bit(bitmask, sizeof(bitmask), 0); \
+ i < sizeof(bitmask); \
+ i = find_next_bit(bitmask, sizeof(bitmask), i + 1))
+
+#endif /* __CR_BITOPS_H__ */
diff --git a/criu/arch/x86/include/asm/bitsperlong.h b/criu/arch/x86/include/asm/bitsperlong.h
new file mode 100644
index 000000000000..7e0a71e8d71d
--- /dev/null
+++ b/criu/arch/x86/include/asm/bitsperlong.h
@@ -0,0 +1,10 @@
+#ifndef __CR_BITSPERLONG_H__
+#define __CR_BITSPERLONG_H__
+
+#ifdef CONFIG_X86_64
+# define BITS_PER_LONG 64
+#else
+# define BITS_PER_LONG 32
+#endif
+
+#endif /* __CR_BITSPERLONG_H__ */
diff --git a/criu/arch/x86/include/asm/cmpxchg.h b/criu/arch/x86/include/asm/cmpxchg.h
new file mode 100644
index 000000000000..600d0a7fff84
--- /dev/null
+++ b/criu/arch/x86/include/asm/cmpxchg.h
@@ -0,0 +1,105 @@
+#ifndef __CR_CMPXCHG_H__
+#define __CR_CMPXCHG_H__
+
+#include "asm/int.h"
+
+#define __X86_CASE_B 1
+#define __X86_CASE_W 2
+#define __X86_CASE_L 4
+#define __X86_CASE_Q 8
+
+/*
+ * An exchange-type operation, which takes a value and a pointer, and
+ * returns the old value. Make sure you never reach non-case statement
+ * here, otherwise behaviour is undefined.
+ */
+#define __xchg_op(ptr, arg, op, lock) \
+ ({ \
+ __typeof__ (*(ptr)) __ret = (arg); \
+ switch (sizeof(*(ptr))) { \
+ case __X86_CASE_B: \
+ asm volatile (lock #op "b %b0, %1\n" \
+ : "+q" (__ret), "+m" (*(ptr)) \
+ : : "memory", "cc"); \
+ break; \
+ case __X86_CASE_W: \
+ asm volatile (lock #op "w %w0, %1\n" \
+ : "+r" (__ret), "+m" (*(ptr)) \
+ : : "memory", "cc"); \
+ break; \
+ case __X86_CASE_L: \
+ asm volatile (lock #op "l %0, %1\n" \
+ : "+r" (__ret), "+m" (*(ptr)) \
+ : : "memory", "cc"); \
+ break; \
+ case __X86_CASE_Q: \
+ asm volatile (lock #op "q %q0, %1\n" \
+ : "+r" (__ret), "+m" (*(ptr)) \
+ : : "memory", "cc"); \
+ break; \
+ } \
+ __ret; \
+ })
+
+#define __xadd(ptr, inc, lock) __xchg_op((ptr), (inc), xadd, lock)
+#define xadd(ptr, inc) __xadd((ptr), (inc), "lock ;")
+
+/* Borrowed from linux kernel arch/x86/include/asm/cmpxchg.h */
+
+/*
+ * Atomic compare and exchange. Compare OLD with MEM, if identical,
+ * store NEW in MEM. Return the initial value in MEM. Success is
+ * indicated by comparing RETURN with OLD.
+ */
+#define __raw_cmpxchg(ptr, old, new, size, lock) \
+({ \
+ __typeof__(*(ptr)) __ret; \
+ __typeof__(*(ptr)) __old = (old); \
+ __typeof__(*(ptr)) __new = (new); \
+ switch (size) { \
+ case __X86_CASE_B: \
+ { \
+ volatile u8 *__ptr = (volatile u8 *)(ptr); \
+ asm volatile(lock "cmpxchgb %2,%1" \
+ : "=a" (__ret), "+m" (*__ptr) \
+ : "q" (__new), "0" (__old) \
+ : "memory"); \
+ break; \
+ } \
+ case __X86_CASE_W: \
+ { \
+ volatile u16 *__ptr = (volatile u16 *)(ptr); \
+ asm volatile(lock "cmpxchgw %2,%1" \
+ : "=a" (__ret), "+m" (*__ptr) \
+ : "r" (__new), "0" (__old) \
+ : "memory"); \
+ break; \
+ } \
+ case __X86_CASE_L: \
+ { \
+ volatile u32 *__ptr = (volatile u32 *)(ptr); \
+ asm volatile(lock "cmpxchgl %2,%1" \
+ : "=a" (__ret), "+m" (*__ptr) \
+ : "r" (__new), "0" (__old) \
+ : "memory"); \
+ break; \
+ } \
+ case __X86_CASE_Q: \
+ { \
+ volatile u64 *__ptr = (volatile u64 *)(ptr); \
+ asm volatile(lock "cmpxchgq %2,%1" \
+ : "=a" (__ret), "+m" (*__ptr) \
+ : "r" (__new), "0" (__old) \
+ : "memory"); \
+ break; \
+ } \
+ } \
+ __ret; \
+})
+
+#define __cmpxchg(ptr, old, new, size) \
+ __raw_cmpxchg((ptr), (old), (new), (size), LOCK_PREFIX)
+#define cmpxchg(ptr, old, new) \
+ __cmpxchg(ptr, old, new, sizeof(*(ptr)))
+
+#endif /* __CR_CMPXCHG_H__ */
diff --git a/criu/arch/x86/include/asm/cpu.h b/criu/arch/x86/include/asm/cpu.h
new file mode 100644
index 000000000000..6f49229d6396
--- /dev/null
+++ b/criu/arch/x86/include/asm/cpu.h
@@ -0,0 +1,207 @@
+#ifndef __CR_ASM_CPU_H__
+#define __CR_ASM_CPU_H__
+
+#include "asm/types.h"
+
+/*
+ * Adopted from linux kernel and enhanced from Intel/AMD manuals.
+ */
+
+#define NCAPINTS (12) /* N 32-bit words worth of info */
+#define NCAPINTS_BITS (NCAPINTS * 32)
+
+#define X86_FEATURE_FPU (0*32+ 0) /* Onboard FPU */
+#define X86_FEATURE_VME (0*32+ 1) /* Virtual 8086 Mode Enhancements */
+#define X86_FEATURE_DE (0*32+ 2) /* Debugging Extensions */
+#define X86_FEATURE_PSE (0*32+ 3) /* Page Size Extension */
+#define X86_FEATURE_TSC (0*32+ 4) /* Time Stamp Counter */
+#define X86_FEATURE_MSR (0*32+ 5) /* Model Specific Registers RDMSR and WRMSR Instructions */
+#define X86_FEATURE_PAE (0*32+ 6) /* Physical Address Extension */
+#define X86_FEATURE_MCE (0*32+ 7) /* Machine Check Exception */
+#define X86_FEATURE_CX8 (0*32+ 8) /* CMPXCHG8 instruction */
+#define X86_FEATURE_APIC (0*32+ 9) /* APIC On-Chip */
+#define X86_FEATURE_SEP (0*32+11) /* SYSENTER and SYSEXIT Instructions */
+#define X86_FEATURE_MTRR (0*32+12) /* Memory Type Range Registers */
+#define X86_FEATURE_PGE (0*32+13) /* PTE Global Bit */
+#define X86_FEATURE_MCA (0*32+14) /* Machine Check Architecture */
+#define X86_FEATURE_CMOV (0*32+15) /* CMOV instructions (plus FCMOVcc, FCOMI with FPU) */
+#define X86_FEATURE_PAT (0*32+16) /* Page Attribute Table */
+#define X86_FEATURE_PSE36 (0*32+17) /* 36-Bit Page Size Extension */
+#define X86_FEATURE_PSN (0*32+18) /* Processor Serial Number */
+#define X86_FEATURE_DS (0*32+21) /* Debug Store */
+#define X86_FEATURE_CLFLUSH (0*32+19) /* CLFLUSH instruction */
+#define X86_FEATURE_ACPI (0*32+22) /* Thermal Monitor and Software Controlled Clock Facilities */
+#define X86_FEATURE_MMX (0*32+23) /* Multimedia Extensions */
+#define X86_FEATURE_FXSR (0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */
+#define X86_FEATURE_XMM (0*32+25) /* "sse" */
+#define X86_FEATURE_XMM2 (0*32+26) /* "sse2" */
+#define X86_FEATURE_SS (0*32+27) /* Self Snoop */
+#define X86_FEATURE_HTT (0*32+28) /* Multi-Threading */
+#define X86_FEATURE_TM (0*32+29) /* Thermal Monitor */
+#define X86_FEATURE_PBE (0*32+31) /* Pending Break Enable */
+
+#define X86_FEATURE_SYSCALL (1*32+11) /* SYSCALL/SYSRET */
+#define X86_FEATURE_MMXEXT (1*32+22) /* AMD MMX extensions */
+#define X86_FEATURE_RDTSCP (1*32+27) /* RDTSCP */
+#define X86_FEATURE_3DNOWEXT (1*32+30) /* AMD 3DNow! extensions */
+#define X86_FEATURE_3DNOW (1*32+31) /* 3DNow! */
+
+#define X86_FEATURE_REP_GOOD (3*32+16) /* rep microcode works well */
+#define X86_FEATURE_NOPL (3*32+20) /* The NOPL (0F 1F) instructions */
+
+#define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */
+#define X86_FEATURE_PCLMULQDQ (4*32+ 1) /* PCLMULQDQ instruction */
+#define X86_FEATURE_DTES64 (4*32+ 2) /* 64-bit DS Area */
+#define X86_FEATURE_MWAIT (4*32+ 3) /* "monitor" Monitor/Mwait support */
+#define X86_FEATURE_DSCPL (4*32+ 4) /* CPL Qualified Debug Store */
+#define X86_FEATURE_VMX (4*32+ 5) /* Virtual Machine Extensions */
+#define X86_FEATURE_SMX (4*32+ 6) /* Safer Mode Extensions */
+#define X86_FEATURE_EST (4*32+ 7) /* Enhanced Intel SpeedStep technology */
+#define X86_FEATURE_TM2 (4*32+ 8) /* Thermal Monitor 2 */
+#define X86_FEATURE_SSSE3 (4*32+ 9) /* Supplemental SSE-3 */
+#define X86_FEATURE_CNXTID (4*32+10) /* L1 Context ID */
+#define X86_FEATURE_FMA (4*32+12) /* Fused multiply-add */
+#define X86_FEATURE_CX16 (4*32+13) /* CMPXCHG16B */
+#define X86_FEATURE_XTPR_UCTL (4*32+14) /* xTPR Update Control */
+#define X86_FEATURE_PDCM (4*32+15) /* Perfmon and Debug Capability */
+#define X86_FEATURE_PCID (4*32+17) /* Process-context identifiers */
+#define X86_FEATURE_DCA (4*32+18) /* Ability to prefetch data from a memory mapped device */
+#define X86_FEATURE_XMM4_1 (4*32+19) /* "sse4_1" SSE-4.1 */
+#define X86_FEATURE_XMM4_2 (4*32+20) /* "sse4_2" SSE-4.2 */
+#define X86_FEATURE_X2APIC (4*32+21) /* x2APIC */
+#define X86_FEATURE_MOVBE (4*32+22) /* MOVBE instruction */
+#define X86_FEATURE_POPCNT (4*32+23) /* POPCNT instruction */
+#define X86_FEATURE_TSCDL (4*32+24) /* Local APIC timer supports one-shot operation using a TSC deadline value */
+#define X86_FEATURE_AES (4*32+25) /* AES instructions */
+#define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
+#define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */
+#define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */
+#define X86_FEATURE_F16C (4*32+29) /* 16-bit fp conversions */
+#define X86_FEATURE_RDRAND (4*32+30) /* The RDRAND instruction */
+
+#define X86_FEATURE_ABM (6*32+ 5) /* Advanced bit manipulation */
+#define X86_FEATURE_SSE4A (6*32+ 6) /* SSE-4A */
+#define X86_FEATURE_MISALIGNSSE (6*32+ 7) /* Misaligned SSE mode */
+#define X86_FEATURE_3DNOWPREFETCH (6*32+ 8) /* 3DNow prefetch instructions */
+#define X86_FEATURE_XOP (6*32+11) /* extended AVX instructions */
+#define X86_FEATURE_FMA4 (6*32+16) /* 4 operands MAC instructions */
+#define X86_FEATURE_TBM (6*32+21) /* trailing bit manipulations */
+
+#define X86_FEATURE_FSGSBASE (9*32+ 0) /* Supports RDFSBASE/RDGSBASE/WRFSBASE/WRGSBASE */
+#define X86_FEATURE_BMI1 (9*32+ 3) /* 1st group bit manipulation extensions */
+#define X86_FEATURE_HLE (9*32+ 4) /* Hardware Lock Elision */
+#define X86_FEATURE_AVX2 (9*32+ 5) /* AVX2 instructions */
+#define X86_FEATURE_SMEP (9*32+ 7) /* Supervisor Mode Execution Protection */
+#define X86_FEATURE_BMI2 (9*32+ 8) /* 2nd group bit manipulation extensions */
+#define X86_FEATURE_ERMS (9*32+ 9) /* Enhanced REP MOVSB/STOSB */
+#define X86_FEATURE_INVPCID (9*32+10) /* Invalidate Processor Context ID */
+#define X86_FEATURE_RTM (9*32+11) /* Restricted Transactional Memory */
+#define X86_FEATURE_MPX (9*32+14) /* Memory Protection Extension */
+#define X86_FEATURE_AVX512F (9*32+16) /* AVX-512 Foundation */
+#define X86_FEATURE_AVX512DQ (9*32+17) /* AVX-512 Doubleword and Quadword instructions */
+#define X86_FEATURE_RDSEED (9*32+18) /* The RDSEED instruction */
+#define X86_FEATURE_ADX (9*32+19) /* The ADCX and ADOX instructions */
+#define X86_FEATURE_SMAP (9*32+20) /* Supervisor Mode Access Prevention */
+#define X86_FEATURE_CLFLUSHOPT (9*32+23) /* CLFLUSHOPT instruction */
+#define X86_FEATURE_IPT (9*32+25) /* Intel Processor Trace */
+#define X86_FEATURE_AVX512PF (9*32+26) /* AVX-512 Prefetch */
+#define X86_FEATURE_AVX512ER (9*32+27) /* AVX-512 Exponential and Reciprocal */
+#define X86_FEATURE_AVX512CD (9*32+28) /* AVX-512 Conflict Detection */
+#define X86_FEATURE_SHA (9*32+29) /* Intel SHA extensions */
+#define X86_FEATURE_AVX512BW (9*32+30) /* AVX-512 Byte and Word instructions */
+#define X86_FEATURE_AVXVL (9*32+31) /* AVX-512 Vector Length extensions */
+
+#define X86_FEATURE_XSAVEOPT (10*32+0) /* XSAVEOPT */
+#define X86_FEATURE_XSAVEC (10*32+1) /* XSAVEC */
+#define X86_FEATURE_XGETBV1 (10*32+2) /* XGETBV with ECX = 1 */
+#define X86_FEATURE_XSAVES (10*32+3) /* XSAVES/XRSTORS */
+
+/*
+ * Node 11 is our own; the kernel has no such entry.
+ */
+#define X86_FEATURE_PREFETCHWT1 (11*32+0) /* The PREFETCHWT1 instruction */
+
+static inline void native_cpuid(unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ /* ecx is often an input as well as an output. */
+ asm volatile("cpuid"
+ : "=a" (*eax),
+ "=b" (*ebx),
+ "=c" (*ecx),
+ "=d" (*edx)
+ : "0" (*eax), "2" (*ecx)
+ : "memory");
+}
+
+static inline void cpuid(unsigned int op,
+ unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ *eax = op;
+ *ecx = 0;
+ native_cpuid(eax, ebx, ecx, edx);
+}
+
+static inline void cpuid_count(unsigned int op, int count,
+ unsigned int *eax, unsigned int *ebx,
+ unsigned int *ecx, unsigned int *edx)
+{
+ *eax = op;
+ *ecx = count;
+ native_cpuid(eax, ebx, ecx, edx);
+}
+
+static inline unsigned int cpuid_eax(unsigned int op)
+{
+ unsigned int eax, ebx, ecx, edx;
+
+ cpuid(op, &eax, &ebx, &ecx, &edx);
+ return eax;
+}
+
+static inline unsigned int cpuid_ecx(unsigned int op)
+{
+ unsigned int eax, ebx, ecx, edx;
+
+ cpuid(op, &eax, &ebx, &ecx, &edx);
+ return ecx;
+}
+
+static inline unsigned int cpuid_edx(unsigned int op)
+{
+ unsigned int eax, ebx, ecx, edx;
+
+ cpuid(op, &eax, &ebx, &ecx, &edx);
+ return edx;
+}
+
+#define X86_FEATURE_VERSION 1
+
+enum {
+ X86_VENDOR_INTEL = 0,
+ X86_VENDOR_AMD = 1,
+
+ X86_VENDOR_MAX
+};
+
+struct cpuinfo_x86 {
+ u8 x86_family;
+ u8 x86_vendor;
+ u8 x86_model;
+ u8 x86_mask;
+ u32 x86_capability[NCAPINTS];
+ u32 extended_cpuid_level;
+ int cpuid_level;
+ char x86_vendor_id[16];
+ char x86_model_id[64];
+};
+
+extern bool cpu_has_feature(unsigned int feature);
+extern int cpu_init(void);
+extern int cpu_dump_cpuinfo(void);
+extern int cpu_validate_cpuinfo(void);
+extern int cpuinfo_dump(void);
+extern int cpuinfo_check(void);
+
+#endif /* __CR_ASM_CPU_H__ */
diff --git a/criu/arch/x86/include/asm/dump.h b/criu/arch/x86/include/asm/dump.h
new file mode 100644
index 000000000000..1505fd2983b0
--- /dev/null
+++ b/criu/arch/x86/include/asm/dump.h
@@ -0,0 +1,11 @@
+#ifndef __CR_ASM_DUMP_H__
+#define __CR_ASM_DUMP_H__
+
+extern int get_task_regs(pid_t pid, user_regs_struct_t regs, CoreEntry *core);
+extern int arch_alloc_thread_info(CoreEntry *core);
+extern void arch_free_thread_info(CoreEntry *core);
+
+
+#define core_put_tls(core, tls)
+
+#endif
diff --git a/criu/arch/x86/include/asm/fpu.h b/criu/arch/x86/include/asm/fpu.h
new file mode 100644
index 000000000000..be168324bc96
--- /dev/null
+++ b/criu/arch/x86/include/asm/fpu.h
@@ -0,0 +1,102 @@
+#ifndef __CR_ASM_FPU_H__
+#define __CR_ASM_FPU_H__
+
+#include <sys/types.h>
+#include <stdbool.h>
+
+#include "compiler.h"
+#include "asm/int.h"
+
+#define FP_MIN_ALIGN_BYTES 64
+
+#define FP_XSTATE_MAGIC1 0x46505853U
+#define FP_XSTATE_MAGIC2 0x46505845U
+#define FP_XSTATE_MAGIC2_SIZE sizeof(FP_XSTATE_MAGIC2)
+
+#define XSTATE_FP 0x1
+#define XSTATE_SSE 0x2
+#define XSTATE_YMM 0x4
+
+#define FXSAVE_SIZE 512
+#define XSAVE_SIZE 832
+
+struct fpx_sw_bytes {
+ u32 magic1;
+ u32 extended_size;
+ u64 xstate_bv;
+ u32 xstate_size;
+ u32 padding[7];
+};
+
+struct i387_fxsave_struct {
+ u16 cwd; /* Control Word */
+ u16 swd; /* Status Word */
+ u16 twd; /* Tag Word */
+ u16 fop; /* Last Instruction Opcode */
+ union {
+ struct {
+ u64 rip; /* Instruction Pointer */
+ u64 rdp; /* Data Pointer */
+ };
+ struct {
+ u32 fip; /* FPU IP Offset */
+ u32 fcs; /* FPU IP Selector */
+ u32 foo; /* FPU Operand Offset */
+ u32 fos; /* FPU Operand Selector */
+ };
+ };
+ u32 mxcsr; /* MXCSR Register State */
+ u32 mxcsr_mask; /* MXCSR Mask */
+
+ /* 8*16 bytes for each FP-reg = 128 bytes */
+ u32 st_space[32];
+
+ /* 16*16 bytes for each XMM-reg = 256 bytes */
+ u32 xmm_space[64];
+
+ u32 padding[12];
+
+ union {
+ u32 padding1[12];
+ u32 sw_reserved[12];
+ };
+
+} __aligned(16);
+
+struct xsave_hdr_struct {
+ u64 xstate_bv;
+ u64 reserved1[2];
+ u64 reserved2[5];
+} __packed;
+
+struct ymmh_struct {
+ u32 ymmh_space[64];
+} __packed;
+
+/*
+ * i387 area, then xsave header, then ymmh area; cpu requires 64 byte alignment
+ */
+struct xsave_struct {
+ struct i387_fxsave_struct i387;
+ struct xsave_hdr_struct xsave_hdr;
+ struct ymmh_struct ymmh;
+} __aligned(FP_MIN_ALIGN_BYTES) __packed;
+
+/*
+ * This one is used in restorer.
+ */
+typedef struct {
+ /*
+ * The FPU xsave area must be contiguous and FP_MIN_ALIGN_BYTES
+ * aligned, thus make sure the compiler won't insert any hole here.
+ */
+
+ union {
+ struct xsave_struct xsave;
+ unsigned char __pad[sizeof(struct xsave_struct) + FP_XSTATE_MAGIC2_SIZE]; /* reserves FP_XSTATE_MAGIC2_SIZE bytes past the xsave area */
+ };
+
+ bool has_fpu;
+} fpu_state_t;
+
+#endif /* __CR_ASM_FPU_H__ */
diff --git a/criu/arch/x86/include/asm/int.h b/criu/arch/x86/include/asm/int.h
new file mode 100644
index 000000000000..642804e9b485
--- /dev/null
+++ b/criu/arch/x86/include/asm/int.h
@@ -0,0 +1,6 @@
+#ifndef __CR_ASM_INT_H__
+#define __CR_ASM_INT_H__
+
+#include "asm-generic/int.h"
+
+#endif /* __CR_ASM_INT_H__ */
diff --git a/criu/arch/x86/include/asm/linkage.h b/criu/arch/x86/include/asm/linkage.h
new file mode 100644
index 000000000000..5e0948f07ee6
--- /dev/null
+++ b/criu/arch/x86/include/asm/linkage.h
@@ -0,0 +1,24 @@
+#ifndef __CR_LINKAGE_H__
+#define __CR_LINKAGE_H__
+
+#ifdef __ASSEMBLY__
+
+#define __ALIGN .align 4, 0x90
+#define __ALIGN_STR ".align 4, 0x90"
+
+#define GLOBAL(name) \
+ .globl name; \
+ name:
+
+#define ENTRY(name) \
+ .globl name; \
+ .type name, @function; \
+ __ALIGN; \
+ name:
+
+#define END(sym) \
+ .size sym, . - sym
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* __CR_LINKAGE_H__ */
diff --git a/criu/arch/x86/include/asm/page.h b/criu/arch/x86/include/asm/page.h
new file mode 100644
index 000000000000..134835556c62
--- /dev/null
+++ b/criu/arch/x86/include/asm/page.h
@@ -0,0 +1,19 @@
+#ifndef __CR_ASM_PAGE_H__
+#define __CR_ASM_PAGE_H__
+
+#ifndef PAGE_SHIFT
+# define PAGE_SHIFT 12
+#endif
+
+#ifndef PAGE_SIZE
+# define PAGE_SIZE (1UL << PAGE_SHIFT)
+#endif
+
+#ifndef PAGE_MASK
+# define PAGE_MASK (~(PAGE_SIZE - 1))
+#endif
+
+#define PAGE_PFN(addr) ((addr) / PAGE_SIZE)
+#define page_size() PAGE_SIZE
+
+#endif /* __CR_ASM_PAGE_H__ */
diff --git a/criu/arch/x86/include/asm/parasite-syscall.h b/criu/arch/x86/include/asm/parasite-syscall.h
new file mode 100644
index 000000000000..4d56cb07220c
--- /dev/null
+++ b/criu/arch/x86/include/asm/parasite-syscall.h
@@ -0,0 +1,20 @@
+#ifndef __CR_ASM_PARASITE_SYSCALL_H__
+#define __CR_ASM_PARASITE_SYSCALL_H__
+
+#include "asm/types.h"
+
+struct parasite_ctl;
+
+#define ARCH_SI_TRAP SI_KERNEL
+
+
+extern const char code_syscall[];
+extern const int code_syscall_size;
+
+void parasite_setup_regs(unsigned long new_ip, void *stack, user_regs_struct_t *regs);
+
+void *mmap_seized(struct parasite_ctl *ctl,
+ void *addr, size_t length, int prot,
+ int flags, int fd, off_t offset);
+
+#endif
diff --git a/criu/arch/x86/include/asm/parasite.h b/criu/arch/x86/include/asm/parasite.h
new file mode 100644
index 000000000000..669ae63e26cc
--- /dev/null
+++ b/criu/arch/x86/include/asm/parasite.h
@@ -0,0 +1,10 @@
+#ifndef __ASM_PARASITE_H__
+#define __ASM_PARASITE_H__
+
+#ifdef CONFIG_X86_32
+# define __parasite_entry __attribute__((regparm(3)))
+#endif
+
+static inline void arch_get_tls(tls_t *ptls) { (void)ptls; }
+
+#endif
diff --git a/criu/arch/x86/include/asm/prlimit.h b/criu/arch/x86/include/asm/prlimit.h
new file mode 100644
index 000000000000..6746ba0e6f19
--- /dev/null
+++ b/criu/arch/x86/include/asm/prlimit.h
@@ -0,0 +1,14 @@
+#ifndef __CR_PRLIMIT_H__
+#define __CR_PRLIMIT_H__
+
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include "config.h"
+
+#ifndef CONFIG_HAS_PRLIMIT
+extern int prlimit(pid_t pid, int resource, const struct rlimit *new_rlimit, struct rlimit *old_rlimit);
+#endif
+
+#endif /* __CR_PRLIMIT_H__ */
diff --git a/criu/arch/x86/include/asm/processor-flags.h b/criu/arch/x86/include/asm/processor-flags.h
new file mode 100644
index 000000000000..9f1bccdbece8
--- /dev/null
+++ b/criu/arch/x86/include/asm/processor-flags.h
@@ -0,0 +1,28 @@
+#ifndef __CR_PROCESSOR_FLAGS_H__
+#define __CR_PROCESSOR_FLAGS_H__
+
+/* Taken from linux kernel headers */
+
+/*
+ * EFLAGS bits
+ */
+#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */
+#define X86_EFLAGS_BIT1 0x00000002 /* Bit 1 - always on */
+#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */
+#define X86_EFLAGS_AF 0x00000010 /* Auxiliary carry Flag */
+#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */
+#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */
+#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */
+#define X86_EFLAGS_IF 0x00000200 /* Interrupt Flag */
+#define X86_EFLAGS_DF 0x00000400 /* Direction Flag */
+#define X86_EFLAGS_OF 0x00000800 /* Overflow Flag */
+#define X86_EFLAGS_IOPL 0x00003000 /* IOPL mask */
+#define X86_EFLAGS_NT 0x00004000 /* Nested Task */
+#define X86_EFLAGS_RF 0x00010000 /* Resume Flag */
+#define X86_EFLAGS_VM 0x00020000 /* Virtual Mode */
+#define X86_EFLAGS_AC 0x00040000 /* Alignment Check */
+#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */
+#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */
+#define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */
+
+#endif /* __CR_PROCESSOR_FLAGS_H__ */
diff --git a/criu/arch/x86/include/asm/restore.h b/criu/arch/x86/include/asm/restore.h
new file mode 100644
index 000000000000..9d39b2bbae50
--- /dev/null
+++ b/criu/arch/x86/include/asm/restore.h
@@ -0,0 +1,36 @@
+#ifndef __CR_ASM_RESTORE_H__
+#define __CR_ASM_RESTORE_H__
+
+#include "asm/restorer.h"
+
+#include "protobuf/core.pb-c.h"
+
+#ifdef CONFIG_X86_64
+#define JUMP_TO_RESTORER_BLOB(new_sp, restore_task_exec_start, \
+ task_args) \
+ asm volatile( \
+ "movq %0, %%rbx \n" \
+ "movq %1, %%rax \n" \
+ "movq %2, %%rdi \n" \
+ "movq %%rbx, %%rsp \n" \
+ "callq *%%rax \n" \
+ : \
+ : "g"(new_sp), \
+ "g"(restore_task_exec_start), \
+ "g"(task_args) \
+ : "rsp", "rdi", "rsi", "rbx", "rax", "memory")
+#else /* CONFIG_X86_64 */
+#define JUMP_TO_RESTORER_BLOB(new_sp, restore_task_exec_start, \
+ task_args) \
+ (void)new_sp; \
+ (void)restore_task_exec_start; \
+ (void)task_args; \
+ ;
+#endif /* CONFIG_X86_64 */
+
+#define core_get_tls(pcore, ptls)
+
+
+int restore_fpu(struct rt_sigframe *sigframe, CoreEntry *core);
+
+#endif
diff --git a/criu/arch/x86/include/asm/restorer.h b/criu/arch/x86/include/asm/restorer.h
new file mode 100644
index 000000000000..2021c41f5bb4
--- /dev/null
+++ b/criu/arch/x86/include/asm/restorer.h
@@ -0,0 +1,181 @@
+#ifndef __CR_ASM_RESTORER_H__
+#define __CR_ASM_RESTORER_H__
+
+#include "asm/types.h"
+#include "asm/fpu.h"
+#include "protobuf/core.pb-c.h"
+
+struct pt_regs {
+ unsigned long r15;
+ unsigned long r14;
+ unsigned long r13;
+ unsigned long r12;
+ unsigned long bp;
+ unsigned long bx;
+
+ unsigned long r11;
+ unsigned long r10;
+ unsigned long r9;
+ unsigned long r8;
+ unsigned long ax;
+ unsigned long cx;
+ unsigned long dx;
+ unsigned long si;
+ unsigned long di;
+ unsigned long orig_ax;
+
+ unsigned long ip;
+ unsigned long cs;
+ unsigned long flags;
+ unsigned long sp;
+ unsigned long ss;
+};
+
+struct rt_sigcontext {
+ unsigned long r8;
+ unsigned long r9;
+ unsigned long r10;
+ unsigned long r11;
+ unsigned long r12;
+ unsigned long r13;
+ unsigned long r14;
+ unsigned long r15;
+ unsigned long rdi;
+ unsigned long rsi;
+ unsigned long rbp;
+ unsigned long rbx;
+ unsigned long rdx;
+ unsigned long rax;
+ unsigned long rcx;
+ unsigned long rsp;
+ unsigned long rip;
+ unsigned long eflags;
+ unsigned short cs;
+ unsigned short gs;
+ unsigned short fs;
+ unsigned short ss;
+ unsigned long err;
+ unsigned long trapno;
+ unsigned long oldmask;
+ unsigned long cr2;
+ void *fpstate;
+ unsigned long reserved1[8];
+};
+
+#include "sigframe.h"
+
+struct rt_sigframe {
+ char *pretcode;
+ struct rt_ucontext uc;
+ struct rt_siginfo info;
+
+ fpu_state_t fpu_state;
+};
+
+#ifdef CONFIG_X86_64
+#define ARCH_RT_SIGRETURN(new_sp) \
+ asm volatile( \
+ "movq %0, %%rax \n" \
+ "movq %%rax, %%rsp \n" \
+ "movl $"__stringify(__NR_rt_sigreturn)", %%eax \n" \
+ "syscall \n" \
+ : \
+ : "r"(new_sp) \
+ : "rax","rsp","memory")
+
+#define RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, \
+ thread_args, clone_restore_fn) \
+ asm volatile( \
+ "clone_emul: \n" \
+ "movq %2, %%rsi \n" \
+ "subq $16, %%rsi \n" \
+ "movq %6, %%rdi \n" \
+ "movq %%rdi, 8(%%rsi) \n" \
+ "movq %5, %%rdi \n" \
+ "movq %%rdi, 0(%%rsi) \n" \
+ "movq %1, %%rdi \n" \
+ "movq %3, %%rdx \n" \
+ "movq %4, %%r10 \n" \
+ "movl $"__stringify(__NR_clone)", %%eax \n" \
+ "syscall \n" \
+ \
+ "testq %%rax,%%rax \n" \
+ "jz thread_run \n" \
+ \
+ "movq %%rax, %0 \n" \
+ "jmp clone_end \n" \
+ \
+ "thread_run: \n" \
+ "xorq %%rbp, %%rbp \n" \
+ "popq %%rax \n" \
+ "popq %%rdi \n" \
+ "callq *%%rax \n" \
+ \
+ "clone_end: \n" \
+ : "=r"(ret) \
+ : "g"(clone_flags), \
+ "g"(new_sp), \
+ "g"(&parent_tid), \
+ "g"(&thread_args[i].pid), \
+ "g"(clone_restore_fn), \
+ "g"(&thread_args[i]) \
+ : "rax", "rcx", "rdi", "rsi", "rdx", "r10", "r11", "memory")
+
+#define ARCH_FAIL_CORE_RESTORE \
+ asm volatile( \
+ "movq %0, %%rsp \n" \
+ "movq 0, %%rax \n" \
+ "jmp *%%rax \n" \
+ : \
+ : "r"(ret) \
+ : "memory")
+#else /* CONFIG_X86_64 */
+#define ARCH_RT_SIGRETURN(new_sp) \
+ asm volatile( \
+ "movl %0, %%eax \n" \
+ "movl %%eax, %%esp \n" \
+ "movl $"__stringify(__NR_rt_sigreturn)", %%eax \n" \
+ "int $0x80 \n" \
+ : \
+ : "r"(new_sp) \
+ : "eax","esp","memory")
+
+#define RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, \
+ thread_args, clone_restore_fn) \
+ (void)ret; \
+ (void)clone_flags; \
+ (void)new_sp; \
+ (void)parent_tid; \
+ (void)thread_args; \
+ (void)clone_restore_fn; \
+ ;
+#define ARCH_FAIL_CORE_RESTORE \
+ asm volatile( \
+ "movl %0, %%esp \n" \
+ "xorl %%eax, %%eax \n" \
+ "jmp *%%eax \n" \
+ : \
+ : "r"(ret) \
+ : "memory")
+#endif /* CONFIG_X86_64 */
+
+#define RT_SIGFRAME_UC(rt_sigframe) (rt_sigframe)->uc /* accessors below all take a struct rt_sigframe pointer expression */
+#define RT_SIGFRAME_REGIP(rt_sigframe) (rt_sigframe)->uc.uc_mcontext.rip
+#define RT_SIGFRAME_HAS_FPU(rt_sigframe) (rt_sigframe)->fpu_state.has_fpu
+#define RT_SIGFRAME_FPU(rt_sigframe) (rt_sigframe)->fpu_state
+
+#define SIGFRAME_OFFSET 8
+
+
+int restore_gpregs(struct rt_sigframe *f, UserX86RegsEntry *r);
+int restore_nonsigframe_gpregs(UserX86RegsEntry *r);
+
+int sigreturn_prep_fpu_frame(struct rt_sigframe *sigframe, fpu_state_t *fpu_state);
+
+static inline void restore_tls(tls_t *ptls) { (void)ptls; }
+
+int ptrace_set_breakpoint(pid_t pid, void *addr);
+int ptrace_flush_breakpoints(pid_t pid);
+
+
+#endif
diff --git a/criu/arch/x86/include/asm/string.h b/criu/arch/x86/include/asm/string.h
new file mode 100644
index 000000000000..e1d875e45f95
--- /dev/null
+++ b/criu/arch/x86/include/asm/string.h
@@ -0,0 +1,24 @@
+#ifndef __CR_ASM_STRING_H__
+#define __CR_ASM_STRING_H__
+
+#define HAS_BUILTIN_MEMCPY
+
+#include "compiler.h"
+#include "asm-generic/string.h"
+
+static always_inline void *builtin_memcpy(void *to, const void *from, unsigned int n)
+{
+ int d0, d1, d2; /* dummy outputs bound to the count, destination and source registers */
+ asm volatile("rep ; movsl \n" /* copy n / 4 four-byte words */
+ "movl %4,%%ecx \n" /* reload the full byte count n */
+ "andl $3,%%ecx \n" /* keep only the 0..3 leftover bytes */
+ "jz 1f \n" /* no tail to copy */
+ "rep ; movsb \n" /* copy the remaining bytes one by one */
+ "1:"
+ : "=&c" (d0), "=&D" (d1), "=&S" (d2)
+ : "0" (n / 4), "g" (n), "1" ((long)to), "2" ((long)from)
+ : "memory");
+ return to;
+}
+
+#endif /* __CR_ASM_STRING_H__ */
diff --git a/criu/arch/x86/include/asm/syscall32.h b/criu/arch/x86/include/asm/syscall32.h
new file mode 100644
index 000000000000..b0d5cb71d3a5
--- /dev/null
+++ b/criu/arch/x86/include/asm/syscall32.h
@@ -0,0 +1,25 @@
+#ifndef __CR_SYSCALL32_H__
+#define __CR_SYSCALL32_H__
+
+extern long sys_socket(int domain, int type, int protocol);
+extern long sys_connect(int sockfd, struct sockaddr *addr, int addrlen);
+extern long sys_sendto(int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len);
+extern long sys_recvfrom(int sockfd, void *ubuf, size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len);
+extern long sys_sendmsg(int sockfd, const struct msghdr *msg, int flags);
+extern long sys_recvmsg(int sockfd, struct msghdr *msg, int flags);
+extern long sys_shutdown(int sockfd, int how);
+extern long sys_bind(int sockfd, const struct sockaddr *addr, int addrlen);
+extern long sys_setsockopt(int sockfd, int level, int optname, const void *optval, unsigned int optlen);
+extern long sys_getsockopt(int sockfd, int level, int optname, const void *optval, unsigned int *optlen);
+extern long sys_shmat(int shmid, void *shmaddr, int shmflag);
+extern long sys_pread(unsigned int fd, char *ubuf, u32 count, u64 pos);
+
+/*
+ * On x86_32 the kernel's __NR_mmap is the legacy old_mmap system
+ * call. Since we do not use it yet, let's simply define our own
+ * __NR_mmap alias for __NR_mmap2, which lets us unify the code
+ * between the 32-bit and 64-bit versions.
+ */
+#define __NR_mmap __NR_mmap2
+
+#endif /* __CR_SYSCALL32_H__ */
diff --git a/criu/arch/x86/include/asm/types.h b/criu/arch/x86/include/asm/types.h
new file mode 100644
index 000000000000..b2d018983ffa
--- /dev/null
+++ b/criu/arch/x86/include/asm/types.h
@@ -0,0 +1,142 @@
+#ifndef __CR_ASM_TYPES_H__
+#define __CR_ASM_TYPES_H__
+
+#include <stdbool.h>
+#include <signal.h>
+
+#include "asm/page.h"
+#include "asm/bitops.h"
+#include "asm/int.h"
+#include "asm/prlimit.h"
+
+#include "protobuf/core.pb-c.h"
+
+#define SIGMAX 64
+#define SIGMAX_OLD 31
+
+#define MAJOR(dev) ((dev)>>8)
+#define MINOR(dev) ((dev) & 0xff)
+
+typedef void rt_signalfn_t(int, siginfo_t *, void *);
+typedef rt_signalfn_t *rt_sighandler_t;
+
+typedef void rt_restorefn_t(void);
+typedef rt_restorefn_t *rt_sigrestore_t;
+
+#define _KNSIG 64
+# define _NSIG_BPW 64
+
+#define _KNSIG_WORDS (_KNSIG / _NSIG_BPW)
+
+typedef struct {
+ unsigned long sig[_KNSIG_WORDS];
+} k_rtsigset_t;
+
+static inline void ksigfillset(k_rtsigset_t *set)
+{
+ int word = _KNSIG_WORDS; /* set every bit of every mask word, last word first */
+ while (word-- > 0)
+ set->sig[word] = ~0UL;
+}
+
+#define SA_RESTORER 0x04000000
+
+typedef struct {
+ rt_sighandler_t rt_sa_handler;
+ unsigned long rt_sa_flags;
+ rt_sigrestore_t rt_sa_restorer;
+ k_rtsigset_t rt_sa_mask;
+} rt_sigaction_t;
+
+typedef struct {
+ unsigned int entry_number;
+ unsigned int base_addr;
+ unsigned int limit;
+ unsigned int seg_32bit:1;
+ unsigned int contents:2;
+ unsigned int read_exec_only:1;
+ unsigned int limit_in_pages:1;
+ unsigned int seg_not_present:1;
+ unsigned int useable:1;
+ unsigned int lm:1;
+} user_desc_t;
+
+typedef struct {
+ unsigned long r15;
+ unsigned long r14;
+ unsigned long r13;
+ unsigned long r12;
+ unsigned long bp;
+ unsigned long bx;
+ unsigned long r11;
+ unsigned long r10;
+ unsigned long r9;
+ unsigned long r8;
+ unsigned long ax;
+ unsigned long cx;
+ unsigned long dx;
+ unsigned long si;
+ unsigned long di;
+ unsigned long orig_ax;
+ unsigned long ip;
+ unsigned long cs;
+ unsigned long flags;
+ unsigned long sp;
+ unsigned long ss;
+ unsigned long fs_base;
+ unsigned long gs_base;
+ unsigned long ds;
+ unsigned long es;
+ unsigned long fs;
+ unsigned long gs;
+} user_regs_struct_t;
+
+typedef struct {
+ unsigned short cwd;
+ unsigned short swd;
+ unsigned short twd; /* Note this is not the same as
+ the 32bit/x87/FSAVE twd */
+ unsigned short fop;
+ u64 rip;
+ u64 rdp;
+ u32 mxcsr;
+ u32 mxcsr_mask;
+ u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
+ u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
+ u32 padding[24];
+} user_fpregs_struct_t;
+
+#define ASSIGN_TYPED(a, b) do { (a) = (typeof(a))(b); } while (0) /* cast all of b to a's type, then assign */
+#define ASSIGN_MEMBER(a,b,m) do { ASSIGN_TYPED((a)->m, (b)->m); } while (0) /* copy member m from *b into *a */
+
+#ifdef CONFIG_X86_64
+# define TASK_SIZE ((1UL << 47) - PAGE_SIZE)
+#else
+/*
+ * Task size may be limited to 3G but we need a
+ * higher limit, because it's backward compatible.
+ */
+# define TASK_SIZE (0xffffe000)
+#endif
+
+static inline unsigned long task_size(void) { return TASK_SIZE; } /* (void) gives a real prototype; returns TASK_SIZE */
+
+typedef u64 auxv_t;
+typedef u32 tls_t;
+
+#define REG_RES(regs) ((regs).ax)
+#define REG_IP(regs) ((regs).ip)
+#define REG_SYSCALL_NR(regs) ((regs).orig_ax)
+
+#define CORE_ENTRY__MARCH CORE_ENTRY__MARCH__X86_64
+
+#define AT_VECTOR_SIZE 44
+
+#define CORE_THREAD_ARCH_INFO(core) (core)->thread_info /* argument parenthesized so any pointer expression is accepted */
+
+typedef UserX86RegsEntry UserRegsEntry;
+
+static inline u64 encode_pointer(void *p) { long raw = (long)p; return (u64)raw; } /* pointer -> long -> u64 */
+static inline void *decode_pointer(u64 v) { long raw = (long)v; return (void *)raw; } /* u64 -> long -> pointer */
+
+#endif /* __CR_ASM_TYPES_H__ */
diff --git a/criu/arch/x86/include/asm/vdso.h b/criu/arch/x86/include/asm/vdso.h
new file mode 100644
index 000000000000..a1cc9bb9751b
--- /dev/null
+++ b/criu/arch/x86/include/asm/vdso.h
@@ -0,0 +1,30 @@
+#ifndef __CR_ASM_VDSO_H__
+#define __CR_ASM_VDSO_H__
+
+#include "asm/int.h"
+#include "asm-generic/vdso.h"
+
+/* This definition is used in pie/util-vdso.c to initialize the vdso symbol
+ * name string table 'vdso_symbols'
+ */
+
+/*
+ * This is a minimal amount of symbols
+ * we should support at the moment.
+ */
+#define VDSO_SYMBOL_MAX 4
+
+#define ARCH_VDSO_SYMBOLS \
+ "__vdso_clock_gettime", \
+ "__vdso_getcpu", \
+ "__vdso_gettimeofday", \
+ "__vdso_time"
+
+
+struct vdso_symtable;
+extern int vdso_redirect_calls(unsigned long base_to,
+ unsigned long base_from,
+ struct vdso_symtable *to,
+ struct vdso_symtable *from);
+
+#endif /* __CR_ASM_VDSO_H__ */
diff --git a/criu/arch/x86/parasite-head.S b/criu/arch/x86/parasite-head.S
new file mode 100644
index 000000000000..5fb00a5c87c1
--- /dev/null
+++ b/criu/arch/x86/parasite-head.S
@@ -0,0 +1,40 @@
+#include "asm/linkage.h"
+#include "parasite.h"
+
+ .section .head.text, "ax"
+
+#ifdef CONFIG_X86_64
+
+ENTRY(__export_parasite_head_start)
+ subq $16, %rsp
+ andq $~15, %rsp
+ pushq $0
+ movq %rsp, %rbp
+ movl __export_parasite_cmd(%rip), %edi
+ leaq __export_parasite_args(%rip), %rsi
+ call parasite_service
+ int $0x03
+ .align 8
+__export_parasite_cmd:
+ .long 0
+END(__export_parasite_head_start)
+
+#else /* CONFIG_X86_64 */
+
+ENTRY(__export_parasite_head_start)
+ subl $16, %esp
+ andl $~15, %esp /* keep the stack 16-byte aligned */
+ pushl $0
+ movl %esp, %ebp
+ call 1f /* pushes the address of label 1 */
+1: popl %ecx /* %ecx = runtime address of label 1 */
+ movl (__export_parasite_cmd-1b)(%ecx), %eax /* command value, as the 64-bit entry loads it */
+ leal (__export_parasite_args-1b)(%ecx), %edx /* address of the argument area */
+ call parasite_service
+ int $0x03
+ .align 8
+GLOBAL(__export_parasite_cmd)
+ .long 0
+END(__export_parasite_head_start)
+
+#endif /* CONFIG_X86_64 */
diff --git a/criu/arch/x86/prlimit.c b/criu/arch/x86/prlimit.c
new file mode 100644
index 000000000000..8bc4aba9f6a6
--- /dev/null
+++ b/criu/arch/x86/prlimit.c
@@ -0,0 +1,68 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+
+#include "asm/types.h"
+#include "asm/prlimit.h"
+
+#include "compiler.h"
+#include "config.h"
+
+#ifndef CONFIG_HAS_PRLIMIT
+
+#ifndef RLIM64_INFINITY
+# define RLIM64_INFINITY (~0ULL)
+#endif
+/* prlimit() fallback on top of sys_prlimit64(): returns 0, or -1 with errno set. */
+int prlimit(pid_t pid, int resource, const struct rlimit *new_rlimit, struct rlimit *old_rlimit)
+{
+ struct rlimit64 new_rlimit64_mem;
+ struct rlimit64 old_rlimit64_mem;
+ struct rlimit64 *new_rlimit64 = NULL;
+ struct rlimit64 *old_rlimit64 = NULL;
+ int ret;
+
+ if (old_rlimit)
+ old_rlimit64 = &old_rlimit64_mem;
+ /* Widen the requested limits, mapping RLIM_INFINITY to RLIM64_INFINITY. */
+ if (new_rlimit) {
+ if (new_rlimit->rlim_cur == RLIM_INFINITY)
+ new_rlimit64_mem.rlim_cur = RLIM64_INFINITY;
+ else
+ new_rlimit64_mem.rlim_cur = new_rlimit->rlim_cur;
+ if (new_rlimit->rlim_max == RLIM_INFINITY)
+ new_rlimit64_mem.rlim_max = RLIM64_INFINITY;
+ else
+ new_rlimit64_mem.rlim_max = new_rlimit->rlim_max;
+ new_rlimit64 = &new_rlimit64_mem;
+ }
+
+ ret = sys_prlimit64(pid, resource, new_rlimit64, old_rlimit64);
+ /* Narrow the old limits; a changed value after the store means it did not fit. */
+ if (ret == 0 && old_rlimit) {
+ old_rlimit->rlim_cur = old_rlimit64_mem.rlim_cur;
+ if (old_rlimit->rlim_cur != old_rlimit64_mem.rlim_cur) {
+ if (!new_rlimit && old_rlimit64_mem.rlim_cur != RLIM64_INFINITY) { /* pure query of a finite value: report it */
+ errno = EOVERFLOW;
+ return -1;
+ }
+ old_rlimit->rlim_cur = RLIM_INFINITY; /* limits already applied, or truly infinite: saturate */
+ }
+ old_rlimit->rlim_max = old_rlimit64_mem.rlim_max;
+ if (old_rlimit->rlim_max != old_rlimit64_mem.rlim_max) {
+ if (!new_rlimit && old_rlimit64_mem.rlim_max != RLIM64_INFINITY) { /* same rule for the hard limit */
+ errno = EOVERFLOW;
+ return -1;
+ }
+ old_rlimit->rlim_max = RLIM_INFINITY;
+ }
+ } else if (ret) {
+ errno = -ret; /* raw syscall result is a negated errno */
+ ret = -1;
+ }
+
+ return ret;
+}
+
+#endif /* CONFIG_HAS_PRLIMIT */
diff --git a/criu/arch/x86/restorer.c b/criu/arch/x86/restorer.c
new file mode 100644
index 000000000000..364b156be91e
--- /dev/null
+++ b/criu/arch/x86/restorer.c
@@ -0,0 +1,33 @@
+#include <asm/prctl.h>
+#include <unistd.h>
+
+#include "restorer.h"
+#include "asm/restorer.h"
+#include "asm/fpu.h"
+
+#include "syscall.h"
+#include "log.h"
+#include "cpu.h"
+
+int restore_nonsigframe_gpregs(UserX86RegsEntry *r)
+{
+#ifdef CONFIG_X86_64
+ unsigned long base;
+ long err;
+
+ base = r->fs_base;
+ err = sys_arch_prctl(ARCH_SET_FS, base);
+ if (err != 0) {
+ pr_info("SET_FS fail %ld\n", err);
+ return -1;
+ }
+ /* FS base is in place; now apply the GS base the same way. */
+ base = r->gs_base;
+ err = sys_arch_prctl(ARCH_SET_GS, base);
+ if (err != 0) {
+ pr_info("SET_GS fail %ld\n", err);
+ return -1;
+ }
+#endif
+ return 0;
+}
diff --git a/criu/arch/x86/syscalls/syscall-common-x86-32.S b/criu/arch/x86/syscalls/syscall-common-x86-32.S
new file mode 100644
index 000000000000..ae6d594dc4fe
--- /dev/null
+++ b/criu/arch/x86/syscalls/syscall-common-x86-32.S
@@ -0,0 +1,36 @@
+#include "asm/linkage.h"
+
+#define SYSCALL(name, opcode) \
+ ENTRY(name); \
+ movl $opcode, %eax; \
+ jmp __syscall_common; \
+ END(name)
+
+ENTRY(__syscall_common)
+ pushl %ebx
+ pushl %esi
+ pushl %edi
+ pushl %ebp
+
+#define __arg(n) (4 * (n) + 20)(%esp)
+ movl __arg(0),%ebx
+ movl __arg(1),%ecx
+ movl __arg(2),%edx
+ movl __arg(3),%esi
+ movl __arg(4),%edi
+ movl __arg(5),%ebp
+#undef __arg
+
+ int $0x80
+
+ popl %ebp
+ popl %edi
+ popl %esi
+ popl %ebx
+ ret
+END(__syscall_common)
+
+ENTRY(__cr_restore_rt)
+ movl $__NR_rt_sigreturn, %eax
+ jmp __syscall_common
+END(__cr_restore_rt)
diff --git a/criu/arch/x86/syscalls/syscall-common-x86-64.S b/criu/arch/x86/syscalls/syscall-common-x86-64.S
new file mode 100644
index 000000000000..b93c31288a20
--- /dev/null
+++ b/criu/arch/x86/syscalls/syscall-common-x86-64.S
@@ -0,0 +1,21 @@
+#include "asm/linkage.h"
+
+#define SYSCALL(name, opcode) \
+ ENTRY(name); \
+ movl $opcode, %eax; \
+ jmp __syscall_common; \
+ END(name)
+
+ .text
+ .align 4
+
+ENTRY(__syscall_common)
+ movq %rcx, %r10
+ syscall
+ ret
+END(__syscall_common)
+
+ENTRY(__cr_restore_rt)
+ movq $__NR_rt_sigreturn, %rax
+ syscall
+END(__cr_restore_rt)
diff --git a/criu/arch/x86/syscalls/syscall32.c b/criu/arch/x86/syscalls/syscall32.c
new file mode 100644
index 000000000000..b68ef09572f3
--- /dev/null
+++ b/criu/arch/x86/syscalls/syscall32.c
@@ -0,0 +1,85 @@
+#include "asm/types.h"
+#include "syscall.h"
+
+#define SYS_SOCKET 1 /* sys_socket(2) */
+#define SYS_BIND 2 /* sys_bind(2) */
+#define SYS_CONNECT 3 /* sys_connect(2) */
+#define SYS_SENDTO 11 /* sys_sendto(2) */
+#define SYS_RECVFROM 12 /* sys_recvfrom(2) */
+#define SYS_SHUTDOWN 13 /* sys_shutdown(2) */
+#define SYS_SETSOCKOPT 14 /* sys_setsockopt(2) */
+#define SYS_GETSOCKOPT 15 /* sys_getsockopt(2) */
+#define SYS_SENDMSG 16 /* sys_sendmsg(2) */
+#define SYS_RECVMSG 17 /* sys_recvmsg(2) */
+
+long sys_socket(int domain, int type, int protocol)
+{
+ u32 args[3] = { (u32)domain, (u32)type, (u32)protocol }; /* packed argument block for socketcall */
+ return sys_socketcall(SYS_SOCKET, (unsigned long *)args);
+}
+
+long sys_connect(int sockfd, struct sockaddr *addr, int addrlen)
+{
+ u32 args[3] = { (u32)sockfd, (u32)addr, (u32)addrlen }; /* packed argument block for socketcall */
+ return sys_socketcall(SYS_CONNECT, (unsigned long *)args);
+}
+
+long sys_sendto(int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len)
+{
+ u32 args[6] = { (u32)sockfd, (u32)buff, (u32)len, (u32)flags, (u32)addr, (u32)addr_len };
+ return sys_socketcall(SYS_SENDTO, (unsigned long *)args);
+}
+
+long sys_recvfrom(int sockfd, void *ubuf, size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len)
+{
+ u32 args[6] = { (u32)sockfd, (u32)ubuf, (u32)size, (u32)flags, (u32)addr, (u32)addr_len };
+ return sys_socketcall(SYS_RECVFROM, (unsigned long *)args);
+}
+
+long sys_sendmsg(int sockfd, const struct msghdr *msg, int flags)
+{
+ u32 args[3] = { (u32)sockfd, (u32)msg, (u32)flags }; /* packed argument block for socketcall */
+ return sys_socketcall(SYS_SENDMSG, (unsigned long *)args);
+}
+
+long sys_recvmsg(int sockfd, struct msghdr *msg, int flags)
+{
+ u32 args[3] = { (u32)sockfd, (u32)msg, (u32)flags }; /* packed argument block for socketcall */
+ return sys_socketcall(SYS_RECVMSG, (unsigned long *)args);
+}
+
+long sys_shutdown(int sockfd, int how)
+{
+ u32 args[2] = { (u32)sockfd, (u32)how }; /* packed argument block for socketcall */
+ return sys_socketcall(SYS_SHUTDOWN, (unsigned long *)args);
+}
+
+long sys_bind(int sockfd, const struct sockaddr *addr, int addrlen)
+{
+ u32 args[3] = { (u32)sockfd, (u32)addr, (u32)addrlen }; /* packed argument block for socketcall */
+ return sys_socketcall(SYS_BIND, (unsigned long *)args);
+}
+
+long sys_setsockopt(int sockfd, int level, int optname, const void *optval, unsigned int optlen)
+{
+ u32 args[5] = { (u32)sockfd, (u32)level, (u32)optname, (u32)optval, (u32)optlen };
+ return sys_socketcall(SYS_SETSOCKOPT, (unsigned long *)args);
+}
+
+long sys_getsockopt(int sockfd, int level, int optname, const void *optval, unsigned int *optlen)
+{
+ u32 args[5] = { (u32)sockfd, (u32)level, (u32)optname, (u32)optval, (u32)optlen };
+ return sys_socketcall(SYS_GETSOCKOPT, (unsigned long *)args);
+}
+
+#define SHMAT 21
+
+long sys_shmat(int shmid, void *shmaddr, int shmflag)
+{ /* NOTE(review): "third" is passed as 0; confirm how ipc(SHMAT) hands back the attach address */
+ return sys_ipc(SHMAT, shmid, shmflag, 0, shmaddr, 0);
+}
+
+long sys_pread(unsigned int fd, char *ubuf, u32 count, u64 pos)
+{
+ return sys_pread64(fd, ubuf, count, (u32)pos, (u32)(pos >> 32)); /* low 32 bits of pos, then high 32 bits */
+}
diff --git a/criu/arch/x86/syscalls/syscall_32.tbl b/criu/arch/x86/syscalls/syscall_32.tbl
new file mode 100644
index 000000000000..04d10d119c3f
--- /dev/null
+++ b/criu/arch/x86/syscalls/syscall_32.tbl
@@ -0,0 +1,92 @@
+#
+# System calls table. Please make sure the table consists only of the syscalls
+# really used somewhere in the project.
+#
+# __NR_name code name arguments
+# -------------------------------------------------------------------------------------------------------------------------------------------------------------
+__NR_restart_syscall 0 sys_restart_syscall (void)
+__NR_exit 1 sys_exit (unsigned long error_code)
+__NR_read 3 sys_read (int fd, void *buf, unsigned long count)
+__NR_write 4 sys_write (int fd, const void *buf, unsigned long count)
+__NR_open 5 sys_open (const char *filename, int flags, unsigned int mode)
+__NR_close 6 sys_close (int fd)
+__NR_unlink 10 sys_unlink (char *pathname)
+__NR_lseek 19 sys_lseek (int fd, s32 offset, unsigned int origin)
+__NR_getpid 20 sys_getpid (void)
+__NR_mount 21 sys_mount (const char *dev_name, const char *dir_name, const char *type, unsigned long flags, const void *data)
+__NR_ptrace 26 sys_ptrace (long request, pid_t pid, void *addr, void *data)
+__NR_kill 37 sys_kill (long pid, int sig)
+__NR_mkdir 39 sys_mkdir (const char *name, int mode)
+__NR_rmdir 40 sys_rmdir (const char *name)
+__NR_brk 45 sys_brk (void *addr)
+__NR_umount2 52 sys_umount2 (char *name, int flags)
+__NR_ioctl 54 sys_ioctl (unsigned int fd, unsigned int cmd, unsigned long arg)
+__NR_fcntl 55 sys_fcntl (unsigned int fd, unsigned int cmd, unsigned long arg)
+__NR_umask 60 sys_umask (int mask)
+__NR_setrlimit 75 sys_setrlimit (unsigned int resource, struct krlimit *rlim)
+__NR_munmap 91 sys_munmap (void *addr, unsigned long len)
+__NR_setpriority 97 sys_setpriority (int which, int who, int nice)
+__NR_socketcall 102 sys_socketcall (int call, unsigned long *args)
+__NR_setitimer 104 sys_setitimer (int which, struct itimerval *in, struct itimerval *out)
+__NR_getitimer 105 sys_getitimer (int which, struct itimerval *it)
+__NR_wait4 114 sys_wait4 (pid_t pid, int *stat_addr, int options, struct rusage *ru)
+__NR_ipc 117 sys_ipc (unsigned int call, int first, unsigned long second, unsigned long third, void *ptr, long fifth)
+__NR_clone 120 sys_clone (unsigned long flags, void *child_stack, void *parent_tid, void *child_tid)
+__NR_mprotect 125 sys_mprotect (const void *addr, unsigned long len, unsigned long prot)
+__NR_getpgid 132 sys_getpgid (pid_t pid)
+__NR_personality 136 sys_personality (unsigned int personality)
+__NR_flock 143 sys_flock (int fd, unsigned long cmd)
+__NR_getsid 147 sys_getsid (void)
+__NR_sched_setscheduler 156 sys_sched_setscheduler (int pid, int policy, struct sched_param *p)
+__NR_nanosleep 162 sys_nanosleep (struct timespec *rqtp, struct timespec *rmtp)
+__NR_mremap 163 sys_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr)
+__NR_prctl 172 sys_prctl (int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5)
+__NR_rt_sigreturn 173 sys_rt_sigreturn (void)
+__NR_rt_sigaction 174 sys_sigaction (int signum, const rt_sigaction_t *act, rt_sigaction_t *oldact, size_t sigsetsize)
+__NR_rt_sigprocmask 175 sys_sigprocmask (int how, k_rtsigset_t *set, k_rtsigset_t *oset, size_t sigsetsize)
+__NR_rt_sigqueueinfo 178 sys_rt_sigqueueinfo (pid_t pid, int sig, siginfo_t *uinfo)
+__NR_pread64 180 sys_pread64 (unsigned int fd, char *ubuf, u32 count, u32 poslo, u32 poshi)
+__NR_capget 184 sys_capget (struct cap_header *h, struct cap_data *d)
+__NR_capset 185 sys_capset (struct cap_header *h, struct cap_data *d)
+__NR_sigaltstack 186 sys_sigaltstack (const void *uss_ptr, void *uoss_ptr)
+__NR_mmap2 192 sys_mmap (void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long pgoff)
+__NR_getgroups32 205 sys_getgroups (int gsize, unsigned int *groups)
+__NR_setgroups32 206 sys_setgroups (int gsize, unsigned int *groups)
+__NR_setresuid32 208 sys_setresuid (int uid, int euid, int suid)
+__NR_getresuid32 209 sys_getresuid (int *uid, int *euid, int *suid)
+__NR_setresgid32 210 sys_setresgid (int gid, int egid, int sgid)
+__NR_getresgid32 211 sys_getresgid (int *gid, int *egid, int *sgid)
+__NR_setfsuid32 215 sys_setfsuid (int fsuid)
+__NR_setfsgid32 216 sys_setfsgid (int fsgid)
+__NR_mincore 218 sys_mincore (void *addr, unsigned long size, unsigned char *vec)
+__NR_madvise 219 sys_madvise (unsigned long start, size_t len, int behavior)
+__NR_gettid 224 sys_gettid (void)
+__NR_futex 240 sys_futex (u32 *uaddr, int op, u32 val, struct timespec *utime, u32 *uaddr2, u32 val3)
+__NR_set_thread_area 243 sys_set_thread_area (user_desc_t *info)
+__NR_get_thread_area 244 sys_get_thread_area (user_desc_t *info)
+__NR_io_setup 245 sys_io_setup (unsigned nr_reqs, aio_context_t *ctx32p)
+__NR_io_getevents 247 sys_io_getevents (aio_context_t ctx_id, long min_nr, long nr, struct io_event *events, struct timespec *timeout)
+__NR_exit_group 252 sys_exit_group (int error_code)
+__NR_set_tid_address 258 sys_set_tid_address (int *tid_addr)
+__NR_timer_create 259 sys_timer_create (clockid_t which_clock, struct sigevent *timer_event_spec, kernel_timer_t *created_timer_id)
+__NR_timer_settime 260 sys_timer_settime (kernel_timer_t timer_id, int flags, struct itimerspec *new, struct itimerspec *old)
+__NR_timer_gettime 261 sys_timer_gettime (int timer_id, struct itimerspec *setting)
+__NR_timer_getoverrun 262 sys_timer_getoverrun (int timer_id)
+__NR_timer_delete 263 sys_timer_delete (kernel_timer_t timer_id)
+__NR_clock_gettime 265 sys_clock_gettime (int which_clock, struct timespec *tp)
+__NR_seccomp 354 sys_seccomp (unsigned int op, unsigned int flags, const char *uargs)
+__NR_waitid 284 sys_waitid (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru)
+__NR_readlinkat 305 sys_readlinkat (int fd, const char *path, char *buf, int bufsize)
+__NR_set_robust_list 311 sys_set_robust_list (struct robust_list_head *head, size_t len)
+__NR_get_robust_list 312 sys_get_robust_list (int pid, struct robust_list_head **head_ptr, size_t *len_ptr)
+__NR_vmsplice 316 sys_vmsplice (int fd, const struct iovec *iov, unsigned int nr_segs, unsigned int flags)
+__NR_signalfd 321 sys_signalfd (int ufd, const k_rtsigset_t *sigmask, size_t sigsetsize)
+__NR_timerfd_settime 325 sys_timerfd_settime (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr)
+__NR_rt_tgsigqueueinfo 335 sys_rt_tgsigqueueinfo (pid_t tgid, pid_t pid, int sig, siginfo_t *uinfo)
+__NR_fanotify_init 338 sys_fanotify_init (unsigned int flags, unsigned int event_f_flags)
+__NR_fanotify_mark 339 sys_fanotify_mark (int fanotify_fd, unsigned int flag, u32 mask, int dfd, const char *pathname)
+__NR_prlimit64 340 sys_prlimit64 (pid_t pid, unsigned int resource, const struct rlimit64 *new_rlim, struct rlimit64 *old_rlim)
+__NR_open_by_handle_at 342 sys_open_by_handle_at (int mountdirfd, struct file_handle *handle, int flags)
+__NR_setns 346 sys_setns (int fd, int nstype)
+__NR_kcmp 349 sys_kcmp (pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2)
+__NR_memfd_create 356 sys_memfd_create (const char *name, unsigned int flags)
diff --git a/criu/arch/x86/syscalls/syscall_64.tbl b/criu/arch/x86/syscalls/syscall_64.tbl
new file mode 100644
index 000000000000..5c32d4c46c75
--- /dev/null
+++ b/criu/arch/x86/syscalls/syscall_64.tbl
@@ -0,0 +1,103 @@
+#
+# System calls table, please make sure the table consists only of the
+# syscalls really used somewhere in the project.
+#
+# __NR_name code name arguments
+# -------------------------------------------------------------------------------------------------------------------------------------------------------------
+__NR_read 0 sys_read (int fd, void *buf, unsigned long count)
+__NR_write 1 sys_write (int fd, const void *buf, unsigned long count)
+__NR_open 2 sys_open (const char *filename, unsigned long flags, unsigned long mode)
+__NR_close 3 sys_close (int fd)
+__NR_lseek 8 sys_lseek (int fd, unsigned long offset, unsigned long origin)
+__NR_mmap 9 sys_mmap (void *addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long offset)
+__NR_mprotect 10 sys_mprotect (const void *addr, unsigned long len, unsigned long prot)
+__NR_munmap 11 sys_munmap (void *addr, unsigned long len)
+__NR_brk 12 sys_brk (void *addr)
+__NR_rt_sigaction 13 sys_sigaction (int signum, const rt_sigaction_t *act, rt_sigaction_t *oldact, size_t sigsetsize)
+__NR_rt_sigprocmask 14 sys_sigprocmask (int how, k_rtsigset_t *set, k_rtsigset_t *old, size_t sigsetsize)
+__NR_rt_sigreturn 15 sys_rt_sigreturn (void)
+__NR_ioctl 16 sys_ioctl (unsigned int fd, unsigned int cmd, unsigned long arg)
+__NR_pread64 17 sys_pread (unsigned int fd, char *buf, size_t count, loff_t pos)
+__NR_mremap 25 sys_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr)
+__NR_mincore 27 sys_mincore (void *addr, unsigned long size, unsigned char *vec)
+__NR_madvise 28 sys_madvise (unsigned long start, size_t len, int behavior)
+__NR_shmat 30 sys_shmat (int shmid, void *shmaddr, int shmflag)
+__NR_nanosleep 35 sys_nanosleep (struct timespec *req, struct timespec *rem)
+__NR_getitimer 36 sys_getitimer (int which, const struct itimerval *val)
+__NR_setitimer 38 sys_setitimer (int which, const struct itimerval *val, struct itimerval *old)
+__NR_getpid 39 sys_getpid (void)
+__NR_socket 41 sys_socket (int domain, int type, int protocol)
+__NR_connect 42 sys_connect (int sockfd, struct sockaddr *addr, int addrlen)
+__NR_sendto 44 sys_sendto (int sockfd, void *buff, size_t len, unsigned int flags, struct sockaddr *addr, int addr_len)
+__NR_recvfrom 45 sys_recvfrom (int sockfd, void *ubuf, size_t size, unsigned int flags, struct sockaddr *addr, int *addr_len)
+__NR_sendmsg 46 sys_sendmsg (int sockfd, const struct msghdr *msg, int flags)
+__NR_recvmsg 47 sys_recvmsg (int sockfd, struct msghdr *msg, int flags)
+__NR_shutdown 48 sys_shutdown (int sockfd, int how)
+__NR_bind 49 sys_bind (int sockfd, const struct sockaddr *addr, int addrlen)
+__NR_setsockopt 54 sys_setsockopt (int sockfd, int level, int optname, const void *optval, socklen_t optlen)
+__NR_getsockopt 55 sys_getsockopt (int sockfd, int level, int optname, const void *optval, socklen_t *optlen)
+__NR_clone 56 sys_clone (unsigned long flags, void *child_stack, void *parent_tid, void *child_tid)
+__NR_exit 60 sys_exit (unsigned long error_code)
+__NR_wait4 61 sys_wait4 (int pid, int *status, int options, struct rusage *ru)
+__NR_kill 62 sys_kill (long pid, int sig)
+__NR_fcntl 72 sys_fcntl (int fd, int type, long arg)
+__NR_flock 73 sys_flock (int fd, unsigned long cmd)
+__NR_mkdir 83 sys_mkdir (const char *name, int mode)
+__NR_rmdir 84 sys_rmdir (const char *name)
+__NR_unlink 87 sys_unlink (char *pathname)
+__NR_umask 95 sys_umask (int mask)
+__NR_ptrace 101 sys_ptrace (long request, pid_t pid, void *addr, void *data)
+__NR_getgroups 115 sys_getgroups (int gsize, unsigned int *groups)
+__NR_setgroups 116 sys_setgroups (int gsize, unsigned int *groups)
+__NR_setresuid 117 sys_setresuid (int uid, int euid, int suid)
+__NR_getresuid 118 sys_getresuid (int *uid, int *euid, int *suid)
+__NR_setresgid 119 sys_setresgid (int gid, int egid, int sgid)
+__NR_getresgid 120 sys_getresgid (int *gid, int *egid, int *sgid)
+__NR_getpgid 121 sys_getpgid (pid_t pid)
+__NR_setfsuid 122 sys_setfsuid (int fsuid)
+__NR_setfsgid 123 sys_setfsgid (int fsgid)
+__NR_getsid 124 sys_getsid (void)
+__NR_capget 125 sys_capget (struct cap_header *h, struct cap_data *d)
+__NR_capset 126 sys_capset (struct cap_header *h, struct cap_data *d)
+__NR_rt_sigqueueinfo 129 sys_rt_sigqueueinfo (pid_t pid, int sig, siginfo_t *info)
+__NR_sigaltstack 131 sys_sigaltstack (const void *uss, void *uoss)
+__NR_personality 135 sys_personality (unsigned int personality)
+__NR_setpriority 141 sys_setpriority (int which, int who, int nice)
+__NR_sched_setscheduler 144 sys_sched_setscheduler (int pid, int policy, struct sched_param *p)
+__NR_prctl 157 sys_prctl (int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5)
+__NR_arch_prctl 158 sys_arch_prctl (int option, unsigned long addr)
+__NR_setrlimit 160 sys_setrlimit (int resource, struct krlimit *rlim)
+__NR_mount 165 sys_mount (char *dev_nmae, char *dir_name, char *type, unsigned long flags, void *data)
+__NR_umount2 166 sys_umount2 (char *name, int flags)
+__NR_gettid 186 sys_gettid (void)
+__NR_futex 202 sys_futex (u32 *uaddr, int op, u32 val, struct timespec *utime, u32 *uaddr2, u32 val3)
+__NR_set_thread_area 205 sys_set_thread_area (user_desc_t *info)
+__NR_io_setup 206 sys_io_setup (unsigned nr_events, aio_context_t *ctx)
+__NR_io_getevents 208 sys_io_getevents (aio_context_t ctx, long min_nr, long nr, struct io_event *evs, struct timespec *tmo)
+__NR_get_thread_area 211 sys_get_thread_area (user_desc_t *info)
+__NR_set_tid_address 218 sys_set_tid_address (int *tid_addr)
+__NR_restart_syscall 219 sys_restart_syscall (void)
+__NR_sys_timer_create 222 sys_timer_create (clockid_t which_clock, struct sigevent *timer_event_spec, kernel_timer_t *created_timer_id)
+__NR_sys_timer_settime 223 sys_timer_settime (kernel_timer_t timer_id, int flags, const struct itimerspec *new_setting, struct itimerspec *old_setting)
+__NR_sys_timer_gettime 224 sys_timer_gettime (int timer_id, const struct itimerspec *setting)
+__NR_sys_timer_getoverrun 225 sys_timer_getoverrun (int timer_id)
+__NR_sys_timer_delete 226 sys_timer_delete (kernel_timer_t timer_id)
+__NR_clock_gettime 228 sys_clock_gettime (const clockid_t which_clock, const struct timespec *tp)
+__NR_exit_group 231 sys_exit_group (int error_code)
+__NR_openat 257 sys_openat (int dfd, const char *filename, int flags, int mode)
+__NR_waitid 247 sys_waitid (int which, pid_t pid, struct siginfo *infop, int options, struct rusage *ru)
+__NR_readlinkat 267 sys_readlinkat (int fd, const char *path, char *buf, int bufsize)
+__NR_set_robust_list 273 sys_set_robust_list (struct robust_list_head *head, size_t len)
+__NR_get_robust_list 274 sys_get_robust_list (int pid, struct robust_list_head **head_ptr, size_t *len_ptr)
+__NR_seccomp 317 sys_seccomp (unsigned int op, unsigned int flags, const char *uargs)
+__NR_vmsplice 278 sys_vmsplice (int fd, const struct iovec *iov, unsigned long nr_segs, unsigned int flags)
+__NR_timerfd_settime 286 sys_timerfd_settime (int ufd, int flags, const struct itimerspec *utmr, struct itimerspec *otmr)
+__NR_signalfd4 289 sys_signalfd (int fd, k_rtsigset_t *mask, size_t sizemask, int flags)
+__NR_rt_tgsigqueueinfo 297 sys_rt_tgsigqueueinfo (pid_t tgid, pid_t pid, int sig, siginfo_t *info)
+__NR_fanotify_init 300 sys_fanotify_init (unsigned int flags, unsigned int event_f_flags)
+__NR_fanotify_mark 301 sys_fanotify_mark (int fanotify_fd, unsigned int flags, u64 mask, int dfd, const char *pathname)
+__NR_prlimit64 302 sys_prlimit64 (pid_t pid, unsigned int resource, const struct rlimit64 *new_rlim, struct rlimit64 *old_rlim)
+__NR_open_by_handle_at 304 sys_open_by_handle_at (int mountdirfd, struct file_handle *handle, int flags)
+__NR_setns 308 sys_setns (int fd, int nstype)
+__NR_kcmp 312 sys_kcmp (pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2)
+__NR_memfd_create 319 sys_memfd_create (const char *name, unsigned int flags)
diff --git a/criu/arch/x86/vdso-pie.c b/criu/arch/x86/vdso-pie.c
new file mode 100644
index 000000000000..b1e087cd8837
--- /dev/null
+++ b/criu/arch/x86/vdso-pie.c
@@ -0,0 +1,59 @@
+#include <unistd.h>
+
+#include "asm/string.h"
+#include "asm/types.h"
+
+#include "syscall.h"
+#include "parasite-vdso.h"
+#include "log.h"
+#include "bug.h"
+
+#ifdef LOG_PREFIX
+# undef LOG_PREFIX
+#endif
+#define LOG_PREFIX "vdso: "
+
+#ifdef CONFIG_X86_64
+/*
+ * Image of the jump sequence that vdso_redirect_calls() copies over a
+ * vdso symbol. Declared __packed; field order is the byte order of the
+ * sequence.
+ */
+typedef struct {
+ u16 movabs; /* initialised to 0xb848 by vdso_redirect_calls() */
+ u64 imm64; /* destination address, filled in per symbol */
+ u16 jmp_rax; /* initialised to 0xe0ff by vdso_redirect_calls() */
+ u32 guards; /* initialised to 0xcccccccc by vdso_redirect_calls() */
+} __packed jmp_t;
+
+/*
+ * For every non-empty symbol of @from, copy a jmp_t sequence over the
+ * symbol's address (@base_from + its offset). The immediate of the
+ * sequence is the address of the same-index symbol in @to
+ * (@base_to + its offset).
+ *
+ * Always returns 0.
+ */
+int vdso_redirect_calls(unsigned long base_to, unsigned long base_from,
+			struct vdso_symtable *to,
+			struct vdso_symtable *from)
+{
+	unsigned int idx;
+	jmp_t stub = {
+		.movabs		= 0xb848,
+		.jmp_rax	= 0xe0ff,
+		.guards		= 0xcccccccc,
+	};
+
+	for (idx = 0; idx < ARRAY_SIZE(to->symbols); idx++) {
+		void *patch_at;
+
+		if (!vdso_symbol_empty(&from->symbols[idx])) {
+			pr_debug("jmp: %lx/%lx -> %lx/%lx (index %d)\n",
+				 base_from, from->symbols[idx].offset,
+				 base_to, to->symbols[idx].offset, idx);
+
+			/* Only the target differs between symbols. */
+			stub.imm64 = base_to + to->symbols[idx].offset;
+
+			patch_at = (void *)(base_from + from->symbols[idx].offset);
+			builtin_memcpy(patch_at, &stub, sizeof(stub));
+		}
+	}
+
+	return 0;
+}
+
+#else /* CONFIG_X86_64 */
+
+/*
+ * Variant built when CONFIG_X86_64 is not defined: nothing is patched,
+ * the call just reports success.
+ */
+int vdso_redirect_calls(unsigned long base_to, unsigned long base_from,
+ struct vdso_symtable *to,
+ struct vdso_symtable *from)
+{
+ return 0;
+}
+
+#endif /* CONFIG_X86_64 */
diff --git a/criu/bfd.c b/criu/bfd.c
new file mode 100644
index 000000000000..66c318c6ee5b
--- /dev/null
+++ b/criu/bfd.c
@@ -0,0 +1,327 @@
+#include <unistd.h>
+#include <stdbool.h>
+#include <string.h>
+#include <stdio.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <sys/uio.h>
+#include <errno.h>
+
+#include "bug.h"
+#include "log.h"
+#include "bfd.h"
+#include "list.h"
+#include "util.h"
+#include "xmalloc.h"
+#include "asm/page.h"
+
+#undef LOG_PREFIX
+#define LOG_PREFIX "bfd: "
+
+/*
+ * Kernel doesn't produce more than one page of
+ * data per one read call on proc files.
+ */
+#define BUFSIZE (PAGE_SIZE)
+
+/*
+ * Descriptor of one BUFSIZE-byte buffer. buf_get() creates these in
+ * batches and hands them out; buf_put() returns them to the bufs list.
+ */
+struct bfd_buf {
+ char *mem; /* start of this buffer inside the mmap'ed batch */
+ struct list_head l; /* link in the bufs list while the buffer is spare */
+};
+
+static LIST_HEAD(bufs);
+
+#define BUFBATCH (16)
+
+/*
+ * Attach a spare buffer to @xb.
+ *
+ * When the bufs list is empty, a batch of BUFBATCH buffers of BUFSIZE
+ * bytes each is mmap'ed and one descriptor per buffer is put on the
+ * list. If descriptor allocation fails part way, the buffers obtained
+ * so far are kept and the unused tail of the mapping is released; if
+ * not even one descriptor could be allocated the whole mapping is
+ * released.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int buf_get(struct xbuf *xb)
+{
+	struct bfd_buf *b;
+
+	if (list_empty(&bufs)) {
+		void *mem;
+		int i;
+
+		/* Anonymous mapping: pass fd -1, as portable code should. */
+		mem = mmap(NULL, BUFBATCH * BUFSIZE, PROT_READ | PROT_WRITE,
+			   MAP_PRIVATE | MAP_ANON, -1, 0);
+		if (mem == MAP_FAILED) {
+			pr_perror("No buf");
+			return -1;
+		}
+
+		for (i = 0; i < BUFBATCH; i++) {
+			b = xmalloc(sizeof(*b));
+			if (!b) {
+				if (i == 0) {
+					pr_err("No buffer for bfd\n");
+					/* Nothing references the batch, don't leak it. */
+					munmap(mem, BUFBATCH * BUFSIZE);
+					return -1;
+				}
+
+				pr_warn("BFD buffers partial refil!\n");
+				/* Pages past the last descriptor are unreachable. */
+				munmap(mem + i * BUFSIZE, (BUFBATCH - i) * BUFSIZE);
+				break;
+			}
+
+			b->mem = mem + i * BUFSIZE;
+			list_add_tail(&b->l, &bufs);
+		}
+	}
+
+	b = list_first_entry(&bufs, struct bfd_buf, l);
+	list_del_init(&b->l);
+
+	xb->mem = b->mem;
+	xb->data = xb->mem;
+	xb->sz = 0;
+	xb->buf = b;
+	return 0;
+}
+
+/*
+ * Detach the buffer from @xb and put it back on the bufs list.
+ */
+static void buf_put(struct xbuf *xb)
+{
+	struct bfd_buf *spare = xb->buf;
+
+	/*
+	 * The memory stays mapped: the next bfdopen call
+	 * picks the buffer up again.
+	 */
+	list_add(&spare->l, &bufs);
+
+	xb->buf = NULL;
+	xb->mem = NULL;
+	xb->data = NULL;
+}
+
+/*
+ * Give @f a buffer and record whether it is opened for writing.
+ * If no buffer can be obtained, f->fd is closed and -1 is returned;
+ * otherwise returns 0.
+ */
+static int bfdopen(struct bfd *f, bool writable)
+{
+	int err;
+
+	err = buf_get(&f->b);
+	if (err != 0) {
+		close(f->fd);
+		return -1;
+	}
+
+	f->writable = writable;
+	return 0;
+}
+
+/* Set up buffering on @f for reading; see bfdopen() for return values. */
+int bfdopenr(struct bfd *f)
+{
+	const bool writable = false;
+
+	return bfdopen(f, writable);
+}
+
+/* Set up buffering on @f for writing; see bfdopen() for return values. */
+int bfdopenw(struct bfd *f)
+{
+	const bool writable = true;
+
+	return bfdopen(f, writable);
+}
+
+static int bflush(struct bfd *bfd);
+static bool flush_failed = false;
+
+/*
+ * Report whether any bclose() so far failed to flush its buffer:
+ * -1 if one did, 0 otherwise.
+ */
+int bfd_flush_images(void)
+{
+	if (flush_failed)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Close @f: flush pending data of a writable buffered bfd, give the
+ * buffer back and close the descriptor.
+ */
+void bclose(struct bfd *f)
+{
+	if (bfd_buffered(f)) {
+		bool failed = f->writable && bflush(f) < 0;
+
+		if (failed) {
+			/*
+			 * There is no return value to carry the error,
+			 * so remember it in a static flag which
+			 * bfd_flush_images() reports later on.
+			 */
+			flush_failed = true;
+			pr_perror("Error flushing image");
+		}
+
+		buf_put(&f->b);
+	}
+
+	close_safe(&f->fd);
+}
+
+/*
+ * Move the unread bytes to the start of the buffer and read more data
+ * after them.
+ *
+ * Returns 1 if new bytes arrived, 0 if read() returned 0, -1 on a read
+ * error.
+ */
+static int brefill(struct bfd *f)
+{
+	struct xbuf *xb = &f->b;
+	int got;
+
+	memmove(xb->mem, xb->data, xb->sz);
+	xb->data = xb->mem;
+
+	got = read(f->fd, xb->mem + xb->sz, BUFSIZE - xb->sz);
+	if (got < 0) {
+		pr_perror("Error reading file");
+		return -1;
+	}
+
+	if (got > 0) {
+		xb->sz += got;
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Find the first occurrence of @c within the first @len bytes of @str.
+ * Returns a pointer to it, or NULL when it is not there.
+ */
+static char *strnchr(char *str, unsigned int len, char c)
+{
+	unsigned int pos;
+
+	for (pos = 0; pos < len; pos++) {
+		if (str[pos] == c)
+			return str + pos;
+	}
+
+	return NULL;
+}
+
+/* Read up to the next '\n' from @f; see breadchr() for return values. */
+char *breadline(struct bfd *f)
+{
+	const char eol = '\n';
+
+	return breadchr(f, eol);
+}
+
+/*
+ * Return the next chunk of @f's buffer that ends with @c, with that
+ * delimiter replaced by '\0'.
+ *
+ * Returns:
+ *  - a pointer into the bfd buffer on success;
+ *  - NULL when no bytes are left;
+ *  - ERR_PTR(-EIO) when refilling the buffer failed, or when a whole
+ *    BUFSIZE worth of bytes was buffered without meeting @c.
+ *
+ * Bytes that are not followed by @c after one refill are still
+ * returned as a chunk of their own.
+ */
+char *breadchr(struct bfd *f, char c)
+{
+	struct xbuf *b = &f->b;
+	bool refilled = false;
+	char *n;
+	unsigned int ss = 0;
+
+again:
+	n = strnchr(b->data + ss, b->sz - ss, c);
+	if (n) {
+		char *ret;
+
+		ret = b->data;
+		b->data = n + 1; /* skip the delimiter found */
+		*n = '\0';
+		b->sz -= (b->data - ret);
+		return ret;
+	}
+
+	if (refilled) {
+		if (!b->sz)
+			return NULL;
+
+		/*
+		 * brefill() has moved the data to the very start of
+		 * the buffer, so with BUFSIZE bytes in it there is no
+		 * room left for the terminator written below.
+		 */
+		if (b->sz == BUFSIZE) {
+			pr_err("The bfd buffer is too small\n");
+			return ERR_PTR(-EIO);
+		}
+
+		/*
+		 * Last bytes may lack the delimiter at the
+		 * end, need to report this as full
+		 * line anyway
+		 */
+		b->data[b->sz] = '\0';
+
+		/*
+		 * The b->data still points to old data,
+		 * but we say that no bytes left there
+		 * so next call to breadline will not
+		 * "find" these bytes again.
+		 */
+		b->sz = 0;
+		return b->data;
+	}
+
+	/*
+	 * small optimization -- we've scanned b->sz
+	 * symbols already, no need to re-scan them after
+	 * the buffer refill.
+	 */
+	ss = b->sz;
+
+	/* no full line in the buffer -- refill one */
+	if (brefill(f) < 0)
+		return ERR_PTR(-EIO);
+
+	refilled = true;
+
+	goto again;
+}
+
+/*
+ * Write the buffered bytes of @bfd out with a single write() call.
+ * Returns 0 when the buffer is empty or fully written, -1 when write()
+ * did not accept exactly b->sz bytes.
+ */
+static int bflush(struct bfd *bfd)
+{
+	struct xbuf *b = &bfd->b;
+	int ret;
+
+	if (b->sz) {
+		ret = write(bfd->fd, b->data, b->sz);
+		if (ret != b->sz)
+			return -1;
+
+		b->sz = 0;
+	}
+
+	return 0;
+}
+
+/*
+ * Append @size bytes from @buf to the buffer of @bfd.
+ *
+ * The buffer is flushed first if the new bytes would not fit. A chunk
+ * bigger than BUFSIZE bypasses the buffer and goes straight to write().
+ * Returns the number of bytes taken, or a negative value on error.
+ */
+static int __bwrite(struct bfd *bfd, const void *buf, int size)
+{
+	struct xbuf *xb = &bfd->b;
+
+	if (xb->sz + size > BUFSIZE) {
+		int err = bflush(bfd);
+
+		if (err < 0)
+			return err;
+	}
+
+	if (size > BUFSIZE)
+		return write(bfd->fd, buf, size);
+
+	memcpy(xb->data + xb->sz, buf, size);
+	xb->sz += size;
+
+	return size;
+}
+
+/*
+ * Write @size bytes from @buf to @bfd: through the buffer when @bfd is
+ * buffered, with a plain write() otherwise.
+ */
+int bwrite(struct bfd *bfd, const void *buf, int size)
+{
+	if (bfd_buffered(bfd))
+		return __bwrite(bfd, buf, size);
+
+	return write(bfd->fd, buf, size);
+}
+
+/*
+ * Write @cnt iovec segments to @bfd.
+ *
+ * An unbuffered @bfd is served by writev(). Otherwise the segments are
+ * fed to __bwrite() one by one; the loop stops after the first segment
+ * that was taken only partially. Returns the total number of bytes
+ * taken, or the negative value __bwrite() failed with.
+ */
+int bwritev(struct bfd *bfd, const struct iovec *iov, int cnt)
+{
+	int seg, total = 0;
+
+	if (!bfd_buffered(bfd))
+		return writev(bfd->fd, iov, cnt);
+
+	for (seg = 0; seg < cnt; seg++) {
+		const struct iovec *v = &iov[seg];
+		int ret;
+
+		ret = __bwrite(bfd, (const void *)v->iov_base, v->iov_len);
+		if (ret < 0)
+			return ret;
+
+		total += ret;
+		if (ret < v->iov_len)
+			break;
+	}
+
+	return total;
+}
+
+/*
+ * Read up to @size bytes from @bfd into @buf.
+ *
+ * An unbuffered @bfd is served by read(). Otherwise bytes are copied
+ * out of the buffer, refilling it until @size bytes were delivered or
+ * brefill() stops returning a positive value. Returns the number of
+ * bytes copied, or the negative brefill() result on a read error.
+ */
+int bread(struct bfd *bfd, void *buf, int size)
+{
+	struct xbuf *xb = &bfd->b;
+	int state, done = 0;
+
+	if (!bfd_buffered(bfd))
+		return read(bfd->fd, buf, size);
+
+	do {
+		int chunk = size - done;
+
+		/* Cannot take more than the buffer currently holds. */
+		if (chunk > xb->sz)
+			chunk = xb->sz;
+
+		if (chunk) {
+			memcpy(buf + done, xb->data, chunk);
+			xb->data += chunk;
+			xb->sz -= chunk;
+			done += chunk;
+		}
+
+		if (done < size) {
+			state = brefill(bfd);
+		} else {
+			BUG_ON(done > size);
+			state = 0;
+		}
+	} while (state > 0);
+
+	return state < 0 ? state : done;
+}
diff --git a/criu/bitmap.c b/criu/bitmap.c
new file mode 100644
index 000000000000..65a501e728cb
--- /dev/null
+++ b/criu/bitmap.c
@@ -0,0 +1,54 @@
+#include "asm/bitsperlong.h"
+
+#define BIT_WORD(nr) ((nr) / BITS_PER_LONG)
+
+#define BITMAP_FIRST_WORD_MASK(start) (~0ul << ((start) % BITS_PER_LONG))
+
+#define BITMAP_LAST_WORD_MASK(nbits) \
+( \
+ ((nbits) % BITS_PER_LONG) ? \
+ (1ul << ((nbits) % BITS_PER_LONG)) - 1 : ~0ul \
+)
+
+#define small_const_nbits(nbits) \
+ (__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG)
+
+/*
+ * Set @nr consecutive bits in @map, starting at bit index @start.
+ */
+void bitmap_set(unsigned long *map, int start, int nr)
+{
+	const int end = start + nr;
+	unsigned long *word = map + BIT_WORD(start);
+	unsigned long mask = BITMAP_FIRST_WORD_MASK(start);
+	int chunk = BITS_PER_LONG - (start % BITS_PER_LONG);
+
+	/* Steps that reach a word boundary; the first may start mid-word. */
+	for (; nr - chunk >= 0; word++) {
+		*word |= mask;
+		nr -= chunk;
+		chunk = BITS_PER_LONG;
+		mask = ~0UL;
+	}
+
+	/* Leftover bits that stop short of a word boundary. */
+	if (nr)
+		*word |= mask & BITMAP_LAST_WORD_MASK(end);
+}
+
+/*
+ * Clear @nr consecutive bits in @map, starting at bit index @start.
+ */
+void bitmap_clear(unsigned long *map, int start, int nr)
+{
+	const int end = start + nr;
+	unsigned long *word = map + BIT_WORD(start);
+	unsigned long mask = BITMAP_FIRST_WORD_MASK(start);
+	int chunk = BITS_PER_LONG - (start % BITS_PER_LONG);
+
+	/* Steps that reach a word boundary; the first may start mid-word. */
+	for (; nr - chunk >= 0; word++) {
+		*word &= ~mask;
+		nr -= chunk;
+		chunk = BITS_PER_LONG;
+		mask = ~0UL;
+	}
+
+	/* Leftover bits that stop short of a word boundary. */
+	if (nr)
+		*word &= ~(mask & BITMAP_LAST_WORD_MASK(end));
+}
diff --git a/criu/cgroup.c b/criu/cgroup.c
new file mode 100644
index 000000000000..704f144f0001
--- /dev/null
+++ b/criu/cgroup.c
@@ -0,0 +1,1571 @@
+#define LOG_PREFIX "cg: "
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <ftw.h>
+#include <libgen.h>
+#include "list.h"
+#include "xmalloc.h"
+#include "cgroup.h"
+#include "cr_options.h"
+#include "pstree.h"
+#include "proc_parse.h"
+#include "util.h"
+#include "imgset.h"
+#include "util-pie.h"
+#include "namespaces.h"
+#include "seize.h"
+#include "protobuf.h"
+#include "protobuf/core.pb-c.h"
+#include "protobuf/cgroup.pb-c.h"
+
+/*
+ * These string arrays have the names of all the properties that will be
+ * restored. To add a property for a cgroup type, add it to the
+ * corresponding char array above the NULL terminator. If you are adding
+ * a new cgroup family altogether, you must also edit get_known_properties().
+ * Currently the code only supports properties with 1 value.
+ */
+
+/* Properties dumped for the "cpu" controller, see get_known_properties(). */
+static const char *cpu_props[] = {
+ "cpu.shares",
+ "cpu.cfs_period_us",
+ "cpu.cfs_quota_us",
+ "cpu.rt_period_us",
+ "cpu.rt_runtime_us",
+ "notify_on_release",
+ NULL
+};
+
+/* Properties dumped for the "memory" controller, see get_known_properties(). */
+static const char *memory_props[] = {
+ /* limit_in_bytes and memsw.limit_in_bytes must be set in this order */
+ "memory.limit_in_bytes",
+ "memory.memsw.limit_in_bytes",
+ "memory.use_hierarchy",
+ "notify_on_release",
+ NULL
+};
+
+/* Properties dumped for the "cpuset" controller, see get_known_properties(). */
+static const char *cpuset_props[] = {
+ /*
+ * cpuset.cpus and cpuset.mems must be set before the process moves
+ * into its cgroup; they are "initialized" below to whatever the root
+ * values are in copy_special_cg_props so as not to cause ENOSPC when
+ * values are restored via this code.
+ */
+ "cpuset.cpus",
+ "cpuset.mems",
+ "cpuset.memory_migrate",
+ "cpuset.cpu_exclusive",
+ "cpuset.mem_exclusive",
+ "cpuset.mem_hardwall",
+ "cpuset.memory_spread_page",
+ "cpuset.memory_spread_slab",
+ "cpuset.sched_load_balance",
+ "cpuset.sched_relax_domain_level",
+ "notify_on_release",
+ NULL
+};
+
+/* Properties dumped for the "blkio" controller, see get_known_properties(). */
+static const char *blkio_props[] = {
+ "blkio.weight",
+ "notify_on_release",
+ NULL
+};
+
+/*
+ * Properties dumped for the "freezer" controller, see
+ * get_known_properties(). "freezer.state" is not listed here; it is
+ * added separately by add_freezer_state().
+ */
+static const char *freezer_props[] = {
+ "notify_on_release",
+ NULL
+};
+
+/*
+ * Properties dumped by add_cgroup_properties() regardless of the
+ * controller name. For "cgroup.procs" and "tasks" read_cgroup_prop()
+ * records only mode/uid/gid and stores an empty value.
+ */
+static const char *global_props[] = {
+ "cgroup.clone_children",
+ "notify_on_release",
+ "cgroup.procs",
+ "tasks",
+ NULL
+};
+
+/*
+ * This structure describes set of controller groups
+ * a task lives in. The cg_ctl entries are stored in
+ * the @ctls list sorted by the .name field and then
+ * by the .path field.
+ */
+
+struct cg_set {
+ u32 id; /* taken from the cg_set_ids counter in get_cg_set() */
+ struct list_head l; /* link in the cg_sets list */
+ unsigned int n_ctls; /* number of entries on @ctls, as passed to get_cg_set() */
+ struct list_head ctls; /* list of struct cg_ctl, linked via their .l */
+};
+
+static LIST_HEAD(cg_sets); /* all struct cg_set created by get_cg_set() */
+static unsigned int n_sets; /* bumped per new cg_set; also bounds the rst_sets scan */
+static CgSetEntry **rst_sets; /* searched by find_rst_set_by_id() */
+static unsigned int n_controllers;
+static CgControllerEntry **controllers;
+static char *cg_yard;
+static struct cg_set *root_cgset; /* Set root item lives in */
+static struct cg_set *criu_cgset; /* Set criu process lives in */
+static u32 cg_set_ids = 1; /* next id handed out by get_cg_set() */
+
+static LIST_HEAD(cgroups); /* struct cg_controller entries, linked via their .l */
+static unsigned int n_cgroups; /* bumped when collect_cgroups() adds a controller */
+
+/*
+ * Look through the first n_sets entries of rst_sets for the one whose
+ * id equals @id. Returns it, or NULL if there is none.
+ */
+static CgSetEntry *find_rst_set_by_id(u32 id)
+{
+	int idx = 0;
+
+	while (idx < n_sets) {
+		CgSetEntry *entry = rst_sets[idx];
+
+		if (entry->id == id)
+			return entry;
+		idx++;
+	}
+
+	return NULL;
+}
+
+#define CGCMP_MATCH 1 /* check for exact match */
+#define CGCMP_ISSUB 2 /* check set is subset of ctls */
+
+/*
+ * Walk set->ctls and @ctls in lockstep and compare the entries pairwise.
+ *
+ * Names must be equal for every pair. On top of that:
+ *  - CGCMP_MATCH: the paths must be equal;
+ *  - CGCMP_ISSUB: strstartswith(set's path, @ctls' path) must hold.
+ *
+ * Returns true only if all pairs pass and both lists end together.
+ */
+static bool cg_set_compare(struct cg_set *set, struct list_head *ctls, int what)
+{
+	struct list_head *pos_set = &set->ctls;
+	struct list_head *pos_ctls = ctls;
+
+	for (;;) {
+		struct cg_ctl *a, *b;
+		bool end_set = (pos_set->next == &set->ctls);
+		bool end_ctls = (pos_ctls->next == ctls);
+
+		/* One list ran out: it's a match only if both did. */
+		if (end_set || end_ctls)
+			return end_set && end_ctls;
+
+		a = list_first_entry(pos_set, struct cg_ctl, l);
+		b = list_first_entry(pos_ctls, struct cg_ctl, l);
+
+		if (strcmp(a->name, b->name) != 0)
+			return false;
+
+		if (what == CGCMP_MATCH) {
+			if (strcmp(a->path, b->path) != 0)
+				return false;
+		} else if (what == CGCMP_ISSUB) {
+			if (!strstartswith(a->path, b->path))
+				return false;
+		}
+
+		pos_set = pos_set->next;
+		pos_ctls = pos_ctls->next;
+	}
+}
+
+/*
+ * Find the cg_set whose controllers exactly match @ctls, or create one.
+ *
+ * When an existing set is found, @ctls is passed to put_ctls() and the
+ * set is returned. Otherwise a new set is allocated, gets the next id,
+ * takes over the entries of @ctls and is appended to cg_sets.
+ *
+ * Returns the set, or NULL if the allocation failed.
+ */
+static struct cg_set *get_cg_set(struct list_head *ctls, unsigned int n_ctls)
+{
+	struct cg_set *cs;
+	struct cg_ctl *ctl;
+
+	list_for_each_entry(cs, &cg_sets, l) {
+		if (!cg_set_compare(cs, ctls, CGCMP_MATCH))
+			continue;
+
+		pr_debug(" `- Existing css %d found\n", cs->id);
+		put_ctls(ctls);
+		return cs;
+	}
+
+	pr_debug(" `- New css ID %d\n", cg_set_ids);
+	cs = xmalloc(sizeof(*cs));
+	if (!cs)
+		return NULL;
+
+	cs->id = cg_set_ids++;
+	INIT_LIST_HEAD(&cs->ctls);
+	list_splice(ctls, &cs->ctls);
+	cs->n_ctls = n_ctls;
+	list_add_tail(&cs->l, &cg_sets);
+	n_sets++;
+
+	/* Don't bother walking the list if debug output is off. */
+	if (pr_quelled(LOG_DEBUG))
+		return cs;
+
+	list_for_each_entry(ctl, &cs->ctls, l)
+		pr_debug(" `- [%s] -> [%s]\n", ctl->name, ctl->path);
+
+	return cs;
+}
+
+/*
+ * Allocate a cg_controller that holds a single controller name (a copy
+ * of @name) and an empty list of heads.
+ *
+ * Returns the new controller, or NULL if any allocation failed.
+ */
+struct cg_controller *new_controller(const char *name)
+{
+	struct cg_controller *ctrl;
+
+	ctrl = xmalloc(sizeof(*ctrl));
+	if (!ctrl)
+		goto err;
+
+	ctrl->controllers = xmalloc(sizeof(char *));
+	if (!ctrl->controllers)
+		goto err_free_ctrl;
+
+	ctrl->controllers[0] = xstrdup(name);
+	if (!ctrl->controllers[0])
+		goto err_free_names;
+
+	ctrl->n_controllers = 1;
+
+	ctrl->n_heads = 0;
+	INIT_LIST_HEAD(&ctrl->heads);
+
+	return ctrl;
+
+err_free_names:
+	xfree(ctrl->controllers);
+err_free_ctrl:
+	xfree(ctrl);
+err:
+	return NULL;
+}
+
+/*
+ * Fill the cgroups list and n_cgroups via collect_controllers().
+ * Returns 0 on success, -1 if collect_controllers() reported an error.
+ */
+int parse_cg_info(void)
+{
+	int ret = collect_controllers(&cgroups, &n_cgroups);
+
+	return ret < 0 ? -1 : 0;
+}
+
+/* Check whether @item is one of the entries of the comma separated @list. */
+static bool cg_list_has_item(const char *list, const char *item)
+{
+	size_t item_len = strlen(item);
+
+	for (;;) {
+		const char *comma = strchr(list, ',');
+		size_t len = comma ? (size_t)(comma - list) : strlen(list);
+
+		if (len == item_len && !strncmp(list, item, len))
+			return true;
+		if (!comma)
+			return false;
+
+		list = comma + 1;
+	}
+}
+
+/*
+ * Check that co-mounted controllers from /proc/cgroups (e.g. cpu and cpuacct)
+ * are contained in a comma separated string (e.g. from /proc/self/cgroup or
+ * mount options).
+ *
+ * Every one of the @n_controllers names in @controllers must be equal to
+ * a whole entry of @name: "cpu" is found in "cpu,cpuacct" but neither in
+ * "cpuset" nor in "name=cpu". An empty controller name never matches.
+ *
+ * Returns true if all names were found and @n_controllers is not zero.
+ */
+static bool cgroup_contains(char **controllers, unsigned int n_controllers, char *name)
+{
+	unsigned int i;
+
+	if (n_controllers == 0)
+		return false;
+
+	for (i = 0; i < n_controllers; i++) {
+		if (controllers[i][0] == '\0')
+			return false;
+		if (!cg_list_has_item(name, controllers[i]))
+			return false;
+	}
+
+	return true;
+}
+
+/* This is for use in add_cgroup() as additional arguments for the ftw()
+ * callback */
+static struct cg_controller *current_controller; /* controller the walked tree is added to */
+static unsigned int path_pref_len; /* length of the path prefix add_cgroup() chops off */
+
+/* Result codes of find_dir() */
+#define EXACT_MATCH 0
+#define PARENT_MATCH 1
+#define NO_MATCH 2
+
+/*
+ * Search the directory tree rooted at @dirs for @path.
+ *
+ * Returns:
+ *  - EXACT_MATCH, with *@rdir set to the directory whose path equals
+ *    @path;
+ *  - PARENT_MATCH, with *@rdir set to the deepest directory for which
+ *    strstartswith(@path, its path) holds;
+ *  - NO_MATCH, with *@rdir left untouched.
+ */
+static int find_dir(const char *path, struct list_head *dirs, struct cgroup_dir **rdir)
+{
+	struct cgroup_dir *dir;
+
+	list_for_each_entry(dir, dirs, siblings) {
+		int sub;
+
+		if (!strcmp(dir->path, path)) {
+			*rdir = dir;
+			return EXACT_MATCH;
+		}
+
+		if (!strstartswith(path, dir->path))
+			continue;
+
+		/* Try to find something closer among the children. */
+		sub = find_dir(path, &dir->children, rdir);
+		if (sub != NO_MATCH)
+			return sub;
+
+		*rdir = dir;
+		return PARENT_MATCH;
+	}
+
+	return NO_MATCH;
+}
+
+/*
+ * Drop one trailing '\n' from @str, if there is one.
+ * Returns @str itself.
+ */
+static inline char *strip(char *str)
+{
+	size_t len = strlen(str);
+
+	if (len > 0 && str[len - 1] == '\n')
+		str[len - 1] = '\0';
+
+	return str;
+}
+
+/*
+ * Fill @property from the cgroup file @fullpath: its mode, uid and gid
+ * and, except for "cgroup.procs" and "tasks", its contents.
+ *
+ * Only the first 1023 bytes of the file are read. A value that
+ * strtoll() parses as LLONG_MAX is stored as "-1". One trailing newline
+ * is stripped.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int read_cgroup_prop(struct cgroup_prop *property, const char *fullpath)
+{
+	struct stat st;
+	char val[1024];
+	int fd, len;
+
+	fd = open(fullpath, O_RDONLY);
+	if (fd == -1) {
+		property->value = NULL;
+		pr_perror("Failed opening %s", fullpath);
+		return -1;
+	}
+
+	if (fstat(fd, &st) < 0) {
+		pr_perror("failed statting cgroup prop %s", fullpath);
+		close(fd);
+		return -1;
+	}
+
+	property->mode = st.st_mode;
+	property->uid = st.st_uid;
+	property->gid = st.st_gid;
+
+	/*
+	 * The contents of these two make no sense in an image, only
+	 * the permissions are of interest.
+	 */
+	if (!strcmp(property->name, "cgroup.procs") ||
+	    !strcmp(property->name, "tasks")) {
+		/*
+		 * libprotobuf segfaults on a null string pointer,
+		 * so store an empty string instead.
+		 */
+		property->value = xstrdup("");
+		close(fd);
+		return property->value ? 0 : -1;
+	}
+
+	len = read(fd, val, sizeof(val) - 1);
+	if (len == -1) {
+		pr_err("Failed scanning %s\n", fullpath);
+		close(fd);
+		return -1;
+	}
+	close(fd);
+
+	val[len] = 0;
+
+	if (strtoll(val, NULL, 10) == LLONG_MAX)
+		strcpy(val, "-1");
+
+	property->value = xstrdup(strip(val));
+
+	return property->value ? 0 : -1;
+}
+
+/*
+ * Allocate a cgroup_prop named @name (the name is copied) with no value
+ * yet. Returns NULL if an allocation failed.
+ */
+static struct cgroup_prop *create_cgroup_prop(const char *name)
+{
+	struct cgroup_prop *prop = xmalloc(sizeof(*prop));
+
+	if (prop) {
+		prop->name = xstrdup(name);
+		if (!prop->name) {
+			xfree(prop);
+			return NULL;
+		}
+
+		prop->value = NULL;
+	}
+
+	return prop;
+}
+
+/* Release @prop together with its name and value strings. */
+static void free_cgroup_prop(struct cgroup_prop *prop)
+{
+	xfree(prop->name);
+	xfree(prop->value);
+
+	xfree(prop);
+}
+
+/*
+ * Unlink and free every property of @ncd, leaving it with an empty
+ * property list and a zero count.
+ */
+static void free_all_cgroup_props(struct cgroup_dir *ncd)
+{
+	struct cgroup_prop *cur, *next;
+
+	list_for_each_entry_safe(cur, next, &ncd->properties, list) {
+		list_del(&cur->list);
+		free_cgroup_prop(cur);
+	}
+
+	ncd->n_properties = 0;
+	INIT_LIST_HEAD(&ncd->properties);
+}
+
+/*
+ * Map a controller name to its NULL-terminated table of property
+ * names. Returns NULL for a controller without such a table.
+ */
+static const char **get_known_properties(char *controller)
+{
+	static const struct {
+		const char *name;
+		const char **props;
+	} known[] = {
+		{ "cpu",	cpu_props },
+		{ "memory",	memory_props },
+		{ "cpuset",	cpuset_props },
+		{ "blkio",	blkio_props },
+		{ "freezer",	freezer_props },
+	};
+	unsigned int k;
+
+	for (k = 0; k < sizeof(known) / sizeof(known[0]); k++) {
+		if (!strcmp(controller, known[k].name))
+			return known[k].props;
+	}
+
+	return NULL;
+}
+
+/*
+ * Read every property listed in the NULL-terminated @prop_arr from the
+ * cgroup directory @fpath and append it to @ncd's property list.
+ *
+ * A NULL @prop_arr is fine and adds nothing. Properties whose file does
+ * not exist are skipped. If creating or reading a property fails, all
+ * properties of @ncd are freed.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int dump_cg_props_array(const char *fpath, struct cgroup_dir *ncd,
+			       const char **prop_arr)
+{
+	char full[PATH_MAX];
+	const char **name;
+
+	if (prop_arr == NULL)
+		return 0;
+
+	for (name = prop_arr; *name != NULL; name++) {
+		struct cgroup_prop *prop;
+		int len;
+
+		len = snprintf(full, PATH_MAX, "%s/%s", fpath, *name);
+		if (len >= PATH_MAX) {
+			pr_err("snprintf output was truncated\n");
+			return -1;
+		}
+
+		if (access(full, F_OK) < 0 && errno == ENOENT) {
+			pr_info("Couldn't open %s. This cgroup property may not exist on this kernel\n", full);
+			continue;
+		}
+
+		prop = create_cgroup_prop(*name);
+		if (!prop)
+			goto err;
+
+		if (read_cgroup_prop(prop, full) < 0) {
+			free_cgroup_prop(prop);
+			goto err;
+		}
+
+		pr_info("Dumping value %s from %s/%s\n", prop->value, fpath, prop->name);
+		list_add_tail(&prop->list, &ncd->properties);
+		ncd->n_properties++;
+	}
+
+	return 0;
+
+err:
+	free_all_cgroup_props(ncd);
+	return -1;
+}
+
+/*
+ * Collect the properties of the cgroup directory @fpath into @ncd.
+ *
+ * The known properties of each controller name in @controller are
+ * dumped first, then the global ones. The global properties are dumped
+ * once per directory, not once per co-mounted controller, so that they
+ * do not end up in the list several times.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int add_cgroup_properties(const char *fpath, struct cgroup_dir *ncd,
+				 struct cg_controller *controller)
+{
+	int i;
+
+	for (i = 0; i < controller->n_controllers; ++i) {
+		const char **prop_arr = get_known_properties(controller->controllers[i]);
+
+		if (dump_cg_props_array(fpath, ncd, prop_arr) < 0) {
+			pr_err("dumping known properties failed\n");
+			return -1;
+		}
+	}
+
+	if (dump_cg_props_array(fpath, ncd, global_props) < 0) {
+		pr_err("dumping global properties failed\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * ftw() callback used by collect_cgroups(): record the directory @fpath
+ * in the tree of current_controller.
+ *
+ * Entries that are not directories are ignored. A directory whose path
+ * is already in the tree (co-mounted controllers) is skipped as well.
+ * The new entry is linked into the tree only after its properties were
+ * collected, so a failure never leaves a freed entry on a list.
+ *
+ * Returns 0 to continue the walk, -1 to stop it on error.
+ */
+static int add_cgroup(const char *fpath, const struct stat *sb, int typeflag)
+{
+	struct cgroup_dir *ncd = NULL, *match;
+	int exit_code = -1;
+	int mtype;
+
+	if (typeflag != FTW_D)
+		return 0;
+
+	pr_info("adding cgroup %s\n", fpath);
+
+	ncd = xmalloc(sizeof(*ncd));
+	if (!ncd)
+		goto out;
+
+	ncd->mode = sb->st_mode;
+	ncd->uid = sb->st_uid;
+	ncd->gid = sb->st_gid;
+
+	/* Have the lists usable before anything can fail or link to us */
+	INIT_LIST_HEAD(&ncd->children);
+	ncd->n_children = 0;
+
+	INIT_LIST_HEAD(&ncd->properties);
+	ncd->n_properties = 0;
+
+	/* chop off the first "/proc/self/fd/N" str */
+	if (fpath[path_pref_len] == '\0')
+		ncd->path = xstrdup("/");
+	else
+		ncd->path = xstrdup(fpath + path_pref_len);
+
+	if (!ncd->path)
+		goto out;
+
+	mtype = find_dir(ncd->path, &current_controller->heads, &match);
+
+	/* ignore co-mounted cgroups */
+	if (mtype == EXACT_MATCH) {
+		exit_code = 0;
+		goto out;
+	}
+
+	if (add_cgroup_properties(fpath, ncd, current_controller) < 0)
+		goto out;
+
+	switch (mtype) {
+	case PARENT_MATCH:
+		list_add_tail(&ncd->siblings, &match->children);
+		match->n_children++;
+		break;
+	case NO_MATCH:
+		list_add_tail(&ncd->siblings, &current_controller->heads);
+		current_controller->n_heads++;
+		break;
+	default:
+		BUG();
+	}
+
+	return 0;
+
+out:
+	if (ncd) {
+		/* Drop whatever properties a failed collection left behind */
+		free_all_cgroup_props(ncd);
+		xfree(ncd->path);
+	}
+	xfree(ncd);
+	return exit_code;
+}
+
+/*
+ * Append a "freezer.state" property, holding the string returned by
+ * get_real_freezer_state(), to the single root directory of
+ * @controller.
+ *
+ * Returns 0 on success, -1 if @controller does not have exactly one
+ * root directory or an allocation failed.
+ */
+static int add_freezer_state(struct cg_controller *controller)
+{
+	struct cgroup_dir *root_dir;
+	struct cgroup_prop *prop;
+
+	/*
+	 * Here we rely on --freeze-cgroup option assumption that all tasks are in a
+	 * specified freezer cgroup hierarchy, so we need to dump only one root freezer cgroup.
+	 */
+	if (!list_is_singular(&controller->heads)) {
+		pr_err("Should be only one root freezer cgroup\n");
+		return -1;
+	}
+	root_dir = list_first_entry(&controller->heads, struct cgroup_dir, siblings);
+
+	prop = create_cgroup_prop("freezer.state");
+	if (!prop)
+		return -1;
+	prop->value = xstrdup(get_real_freezer_state());
+	if (!prop->value) {
+		free_cgroup_prop(prop);
+		return -1;
+	}
+
+	list_add_tail(&prop->list, &root_dir->properties);
+	root_dir->n_properties++;
+
+	return 0;
+}
+
+/*
+ * For every controller entry on @ctls find (or, for "name=" ones,
+ * create) the matching cg_controller. When opts.manage_cgroups is set,
+ * also mount that hierarchy on a temporary directory and walk it with
+ * add_cgroup(), starting at the entry's path.
+ *
+ * With opts.freeze_cgroup set the "freezer" controller additionally
+ * gets its freezer.state recorded.
+ *
+ * Returns 0 on success, a negative value on failure.
+ */
+static int collect_cgroups(struct list_head *ctls)
+{
+	struct cg_ctl *cc;
+	int ret = 0;
+	int fd = -1;
+
+	list_for_each_entry(cc, ctls, l) {
+		char path[PATH_MAX], mopts[1024];
+		char *name, prefix[] = ".criu.cgmounts.XXXXXX";
+		struct cg_controller *cg;
+
+		current_controller = NULL;
+
+		/* We should get all the "real" (i.e. not name=systemd type)
+		 * controller from parse_cgroups(), so find that controller if
+		 * it exists. */
+		list_for_each_entry(cg, &cgroups, l) {
+			if (cgroup_contains(cg->controllers, cg->n_controllers, cc->name)) {
+				current_controller = cg;
+				break;
+			}
+		}
+
+		if (!current_controller) {
+			struct cg_controller *nc;
+
+			/* only allow "fake" controllers to be created this way */
+			if (!strstartswith(cc->name, "name=")) {
+				pr_err("controller %s not found\n", cc->name);
+				return -1;
+			}
+
+			nc = new_controller(cc->name);
+			if (!nc)
+				return -1;
+
+			/*
+			 * The search above ran off the end of the list, so
+			 * cg is not a real entry here; append to the list
+			 * head itself.
+			 */
+			list_add_tail(&nc->l, &cgroups);
+			n_cgroups++;
+			current_controller = nc;
+		}
+
+		if (!opts.manage_cgroups)
+			continue;
+
+		if (strstartswith(cc->name, "name=")) {
+			name = cc->name + 5;
+			snprintf(mopts, sizeof(mopts), "none,%s", cc->name);
+		} else {
+			name = cc->name;
+			snprintf(mopts, sizeof(mopts), "%s", name);
+		}
+
+		if (mkdtemp(prefix) == NULL) {
+			pr_perror("can't make dir for cg mounts");
+			return -1;
+		}
+
+		if (mount("none", prefix, "cgroup", 0, mopts) < 0) {
+			pr_perror("couldn't mount %s", mopts);
+			rmdir(prefix);
+			return -1;
+		}
+
+		fd = open_detach_mount(prefix);
+		if (fd < 0)
+			return -1;
+
+		/* add_cgroup() strips this many leading bytes off each path */
+		path_pref_len = snprintf(path, PATH_MAX, "/proc/self/fd/%d", fd);
+		snprintf(path + path_pref_len, PATH_MAX - path_pref_len, "%s", cc->path);
+
+		ret = ftw(path, add_cgroup, 4);
+		if (ret < 0)
+			pr_perror("failed walking %s for empty cgroups", path);
+
+		close_safe(&fd);
+
+		if (ret < 0)
+			return ret;
+
+		if (opts.freeze_cgroup && !strcmp(cc->name, "freezer") &&
+				add_freezer_state(current_controller))
+			return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Work out which cg_set a task lives in and store the set's id in
+ * *@cg_id.
+ *
+ * A NULL @item stands for the criu process itself; its set is
+ * remembered in criu_cgset. The set of root_item is remembered in
+ * root_cgset and, unless it is the very set criu lives in, its cgroups
+ * are collected with collect_cgroups().
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int dump_task_cgroup(struct pstree_item *item, u32 *cg_id)
+{
+	LIST_HEAD(ctls);
+	unsigned int n_ctls = 0;
+	struct cg_set *cs;
+	int pid;
+
+	pid = item ? item->pid.real : getpid();
+
+	pr_info("Dumping cgroups for %d\n", pid);
+	if (parse_task_cgroup(pid, &ctls, &n_ctls))
+		return -1;
+
+	cs = get_cg_set(&ctls, n_ctls);
+	if (!cs)
+		return -1;
+
+	if (item == NULL) {
+		BUG_ON(criu_cgset);
+		criu_cgset = cs;
+		pr_info("Set %d is criu one\n", cs->id);
+	} else if (item == root_item) {
+		BUG_ON(root_cgset);
+		root_cgset = cs;
+		pr_info("Set %d is root one\n", cs->id);
+
+		/*
+		 * get_cg_set() has moved the on-stack ctls into
+		 * cs, hence cs->ctls is what gets walked here.
+		 */
+		if (cs != criu_cgset && collect_cgroups(&cs->ctls))
+			return -1;
+	}
+
+	*cg_id = cs->id;
+	return 0;
+}
+
+/*
+ * Convert a list of cgroup properties into an array of image entries.
+ *
+ * @props: list of struct cgroup_prop to encode
+ * @n_props: number of elements on @props
+ * @ents: receives one allocation holding the pointer array followed by
+ *        the entries themselves
+ *
+ * Returns 0 on success. On failure returns -1, releases everything that
+ * was allocated here and sets *@ents to NULL.
+ */
+static int dump_cg_dir_props(struct list_head *props, size_t n_props,
+			     CgroupPropEntry ***ents)
+{
+	struct cgroup_prop *prop_cur;
+	CgroupPropEntry *cpe;
+	void *m;
+	int i = 0;
+
+	m = xmalloc(n_props * (sizeof(CgroupPropEntry *) + sizeof(CgroupPropEntry)));
+	*ents = m;
+	if (!m)
+		return -1;
+
+	/* The entries live right behind the pointer array */
+	cpe = m + n_props * sizeof(CgroupPropEntry *);
+
+	list_for_each_entry(prop_cur, props, list) {
+		cgroup_prop_entry__init(cpe);
+
+		cpe->perms = xmalloc(sizeof(*cpe->perms));
+		if (!cpe->perms)
+			goto error;
+		cgroup_perms__init(cpe->perms);
+
+		cpe->name = xstrdup(prop_cur->name);
+		cpe->value = xstrdup(prop_cur->value);
+		if (!cpe->name || !cpe->value)
+			goto error;
+		cpe->perms->mode = prop_cur->mode;
+		cpe->perms->uid = prop_cur->uid;
+		cpe->perms->gid = prop_cur->gid;
+
+		(*ents)[i++] = cpe++;
+	}
+
+	return 0;
+
+error:
+	/*
+	 * cpe points at the entry that failed, i entries before it are
+	 * complete. Release the per-entry allocations (perms included) of
+	 * all of them without stepping the cursor before the first entry.
+	 */
+	for (;;) {
+		xfree(cpe->perms);
+		xfree(cpe->name);
+		xfree(cpe->value);
+		if (i-- == 0)
+			break;
+		--cpe;
+	}
+
+	xfree(*ents);
+	*ents = NULL;
+	return -1;
+}
+
+/*
+ * Recursively convert a list of cgroup directories into image entries.
+ *
+ * @dirs: list of struct cgroup_dir siblings to encode
+ * @n_dirs: number of elements on @dirs
+ * @ents: receives one allocation holding the pointer array followed by
+ *        the entries themselves
+ * @poff: length of the parent path; dir_name points into cur->path past it
+ *
+ * Returns 0 on success. On any failure the array is freed, *@ents is set
+ * to NULL and -1 is returned. Nested allocations of entries that were
+ * already filled are not released here.
+ */
+static int dump_cg_dirs(struct list_head *dirs, size_t n_dirs, CgroupDirEntry ***ents, int poff)
+{
+	struct cgroup_dir *cur;
+	CgroupDirEntry *cde;
+	void *m;
+	int i = 0;
+
+	m = xmalloc(n_dirs * (sizeof(CgroupDirEntry *) + sizeof(CgroupDirEntry)));
+	*ents = m;
+	if (!m)
+		return -1;
+
+	/* The entries live right behind the pointer array */
+	cde = m + n_dirs * sizeof(CgroupDirEntry *);
+
+	list_for_each_entry(cur, dirs, siblings) {
+		cgroup_dir_entry__init(cde);
+
+		cde->dir_perms = xmalloc(sizeof(*cde->dir_perms));
+		if (!cde->dir_perms)
+			goto err;
+		cgroup_perms__init(cde->dir_perms);
+
+		cde->dir_perms->mode = cur->mode;
+		cde->dir_perms->uid = cur->uid;
+		cde->dir_perms->gid = cur->gid;
+
+		cde->dir_name = cur->path + poff;
+		if (poff != 1) /* parent isn't "/" */
+			cde->dir_name++; /* leading / */
+		cde->n_children = cur->n_children;
+		if (cur->n_children > 0)
+			if (dump_cg_dirs(&cur->children, cur->n_children,
+					 &cde->children, strlen(cur->path)) < 0)
+				goto err;
+
+		cde->n_properties = cur->n_properties;
+		if (cde->n_properties > 0) {
+			if (dump_cg_dir_props(&cur->properties,
+					      cde->n_properties, &cde->properties) < 0)
+				goto err;
+		}
+
+		(*ents)[i++] = cde++;
+	}
+
+	return 0;
+
+err:
+	/* Single exit for every failure so the array is never left behind */
+	xfree(*ents);
+	*ents = NULL;
+	return -1;
+}
+
+/*
+ * Encode every collected controller (the global cgroups list) together
+ * with its directory tree into @cg.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int dump_controllers(CgroupEntry *cg)
+{
+	struct cg_controller *cur;
+	CgControllerEntry *ce;
+	void *m;
+	int i;
+
+	cg->n_controllers = n_cgroups;
+	m = xmalloc(n_cgroups * (sizeof(CgControllerEntry *) + sizeof(CgControllerEntry)));
+	cg->controllers = m;
+	if (!m)
+		return -1;
+
+	/* Only compute the entries' address once the allocation is known good */
+	ce = m + cg->n_controllers * sizeof(CgControllerEntry *);
+
+	i = 0;
+	list_for_each_entry(cur, &cgroups, l) {
+		cg_controller_entry__init(ce);
+
+		ce->cnames = cur->controllers;
+		ce->n_cnames = cur->n_controllers;
+		ce->n_dirs = cur->n_heads;
+		if (ce->n_dirs > 0)
+			if (dump_cg_dirs(&cur->heads, cur->n_heads, &ce->dirs, 0) < 0) {
+				xfree(cg->controllers);
+				return -1;
+			}
+		cg->controllers[i++] = ce++;
+	}
+
+	return 0;
+}
+
+
+/*
+ * Encode all cgroup sets except criu's own one into @cg.
+ *
+ * Every set must be a subset of the root task's set, otherwise the dump
+ * is refused.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int dump_sets(CgroupEntry *cg)
+{
+	struct cg_set *set;
+	struct cg_ctl *ctl;
+	int s, c;
+	void *m;
+	CgSetEntry *se;
+	CgMemberEntry *ce;
+
+	pr_info("Dumping %d sets\n", n_sets - 1);
+
+	cg->n_sets = n_sets - 1;
+	m = xmalloc(cg->n_sets * (sizeof(CgSetEntry *) + sizeof(CgSetEntry)));
+	cg->sets = m;
+	if (!m)
+		return -1;
+
+	/* Only compute the entries' address once the allocation is known good */
+	se = m + cg->n_sets * sizeof(CgSetEntry *);
+
+	s = 0;
+	list_for_each_entry(set, &cg_sets, l) {
+		if (set == criu_cgset)
+			continue;
+
+		/*
+		 * Check that all sets we've found that tasks live in are
+		 * subsets of the one root task lives in
+		 */
+
+		pr_info(" `- Dumping %d set (%d ctls)\n", set->id, set->n_ctls);
+		if (!cg_set_compare(set, &root_cgset->ctls, CGCMP_ISSUB)) {
+			pr_err("Set %d is not subset of %d\n",
+			       set->id, root_cgset->id);
+
+			list_for_each_entry(ctl, &set->ctls, l)
+				pr_info(" `- %s of %s\n", ctl->name, ctl->path);
+			return -1;
+		}
+
+		/*
+		 * Now encode them onto the image entry
+		 */
+
+		cg_set_entry__init(se);
+		se->id = set->id;
+
+		se->n_ctls = set->n_ctls;
+		m = xmalloc(se->n_ctls * (sizeof(CgMemberEntry *) + sizeof(CgMemberEntry)));
+		se->ctls = m;
+		if (!m)
+			return -1;
+
+		ce = m + se->n_ctls * sizeof(CgMemberEntry *);
+
+		c = 0;
+		list_for_each_entry(ctl, &set->ctls, l) {
+			pr_info(" `- Dumping %s of %s\n", ctl->name, ctl->path);
+			cg_member_entry__init(ce);
+			ce->name = ctl->name;
+			ce->path = ctl->path;
+			se->ctls[c++] = ce++;
+		}
+
+		cg->sets[s++] = se++;
+	}
+
+	return 0;
+}
+
+/*
+ * Write the cgroup image.
+ *
+ * When the root task shares its cgroup set with criu nothing is written,
+ * but then no other set may exist either. That case could be supported,
+ * but requires some trickery and is hardly needed at the moment.
+ *
+ * Returns 0 on success (or when there is nothing to dump), -1 on error,
+ * otherwise whatever pb_write_one() returns.
+ */
+int dump_cgroups(void)
+{
+	CgroupEntry cg = CGROUP_ENTRY__INIT;
+
+	BUG_ON(!criu_cgset || !root_cgset);
+
+	if (root_cgset != criu_cgset) {
+		if (dump_sets(&cg) || dump_controllers(&cg))
+			return -1;
+
+		pr_info("Writing CG image\n");
+		return pb_write_one(img_from_set(glob_imgset, CR_FD_CGROUP), &cg, PB_CGROUP);
+	}
+
+	if (list_is_singular(&cg_sets)) {
+		pr_info("All tasks in criu's cgroups. Nothing to dump.\n");
+		return 0;
+	}
+
+	pr_err("Non supported sub-cgroups found\n");
+	return -1;
+}
+
+/*
+ * Build the directory name, and optionally the mount option string, of a
+ * controller entry by joining its cnames with ','.
+ *
+ * @ctl: controller entry whose cnames are joined
+ * @dir: output buffer for the directory name; a leading "name=" is
+ *       stripped from each cname
+ * @ds: size of @dir
+ * @opt: output buffer for the mount options, may be NULL; cnames are
+ *       copied verbatim and "none," is emitted once, when the first
+ *       "name=" cname is met
+ * @os: size of @opt
+ *
+ * Returns the length of the string left in @dir.
+ *
+ * NOTE(review): the snprintf() results are accumulated without a
+ * truncation check, and chopping the trailing ',' assumes at least one
+ * cname was written -- confirm that callers guarantee both.
+ */
+static int ctrl_dir_and_opt(CgControllerEntry *ctl, char *dir, int ds,
+ char *opt, int os)
+{
+ int i, doff = 0, ooff = 0;
+ bool none_opt = false;
+
+ for (i = 0; i < ctl->n_cnames; i++) {
+ char *n;
+
+ n = ctl->cnames[i];
+ if (strstartswith(n, "name=")) {
+ n += 5;
+ if (opt && !none_opt) {
+ ooff += snprintf(opt + ooff, os - ooff, "none,");
+ none_opt = true;
+ }
+ }
+
+ doff += snprintf(dir + doff, ds - doff, "%s,", n);
+ if (opt)
+ ooff += snprintf(opt + ooff, os - ooff, "%s,", ctl->cnames[i]);
+ }
+
+ /* Chop the trailing ','-s */
+ dir[--doff] = '\0';
+ if (opt)
+ opt[ooff - 1] = '\0';
+
+ return doff;
+}
+
+/*
+ * NULL-terminated list of cpuset property names that
+ * restore_special_cpuset_props() looks up and restores.
+ */
+static const char *special_cpuset_props[] = {
+ "cpuset.cpus",
+ "cpuset.mems",
+ NULL,
+};
+
+/*
+ * userns_call() callback: write @pid into the file named by @arg, which
+ * is opened relative to the CGROUP_YARD service fd.
+ *
+ * @arg: path of the file to write to
+ * @fd: incoming value is ignored; reused for the opened file
+ * @pid: pid to write
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int userns_move(void *arg, int fd, pid_t pid)
+{
+	char pidbuf[32];
+	int cg, len, err;
+
+	len = snprintf(pidbuf, sizeof(pidbuf), "%d", pid);
+
+	if (len >= sizeof(pidbuf)) {
+		pr_err("pid printing failed: %d\n", pid);
+		return -1;
+	}
+
+	cg = get_service_fd(CGROUP_YARD);
+	fd = openat(cg, arg, O_WRONLY);
+	/* A failed open is reported through the same err check as a failed write */
+	err = fd;
+	if (fd >= 0) {
+		err = write(fd, pidbuf, len);
+		close(fd);
+	}
+
+	if (err >= 0)
+		return 0;
+
+	pr_perror("Can't move %s into %s (%d/%d)", pidbuf, (char *)arg, err, fd);
+	return -1;
+}
+
+/*
+ * Move the current task into every cgroup listed in @se by writing to the
+ * respective "tasks" files through userns_call().
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int move_in_cgroup(CgSetEntry *se)
+{
+	int i;
+
+	pr_info("Move into %d\n", se->id);
+	for (i = 0; i < se->n_ctls; i++) {
+		char aux[PATH_MAX];
+		int fd = -1, err, j, aux_off;
+		CgMemberEntry *ce = se->ctls[i];
+		CgControllerEntry *ctrl = NULL;
+
+		/* Find the controller entry this member belongs to */
+		for (j = 0; j < n_controllers && !ctrl; j++) {
+			CgControllerEntry *cur = controllers[j];
+
+			if (cgroup_contains(cur->cnames, cur->n_cnames, ce->name))
+				ctrl = cur;
+		}
+
+		if (!ctrl) {
+			pr_err("No cg_controller_entry found for %s/%s\n", ce->name, ce->path);
+			return -1;
+		}
+
+		/* <controller dir>/<cgroup path>/tasks, relative to the yard */
+		aux_off = ctrl_dir_and_opt(ctrl, aux, sizeof(aux), NULL, 0);
+		snprintf(aux + aux_off, sizeof(aux) - aux_off, "/%s/tasks", ce->path);
+		pr_debug(" `-> %s\n", aux);
+
+		err = userns_call(userns_move, UNS_ASYNC, aux, strlen(aux) + 1, -1);
+		if (err < 0) {
+			pr_perror("Can't move into %s (%d/%d)", aux, err, fd);
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Put task @me into its cgroup set on restore.
+ *
+ * Nothing is done when the task has no set or when its set equals the
+ * one it starts in (the parent's set, or root_cg_set if it has no parent).
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int prepare_task_cgroup(struct pstree_item *me)
+{
+	CgSetEntry *se;
+	u32 current_cgset;
+	u32 wanted = rsti(me)->cg_set;
+
+	if (!wanted)
+		return 0;
+
+	current_cgset = me->parent ? rsti(me->parent)->cg_set : root_cg_set;
+
+	if (wanted == current_cgset) {
+		pr_info("Cgroups %d inherited from parent\n", current_cgset);
+		return 0;
+	}
+
+	se = find_rst_set_by_id(wanted);
+	if (se)
+		return move_in_cgroup(se);
+
+	pr_err("No set %d found\n", rsti(me)->cg_set);
+	return -1;
+}
+
+/*
+ * Tear down the cgroup yard: close its service fd, lazily unmount and
+ * remove the directory and forget its name. Does nothing when no yard
+ * has been set up.
+ */
+void fini_cgroup(void)
+{
+	if (cg_yard) {
+		close_service_fd(CGROUP_YARD);
+		umount2(cg_yard, MNT_DETACH);
+		rmdir(cg_yard);
+		xfree(cg_yard);
+		cg_yard = NULL;
+	}
+}
+
+/*
+ * Apply saved mode and ownership to an open cgroup file or directory.
+ *
+ * @fd: descriptor of the object to adjust
+ * @path: name used in error messages only
+ * @perms: saved permissions; NULL means there is nothing to restore
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int restore_perms(int fd, const char *path, CgroupPerms *perms)
+{
+	struct stat sb;
+
+	if (!perms)
+		return 0;
+
+	if (fstat(fd, &sb) < 0) {
+		pr_perror("stat of property %s failed", path);
+		return -1;
+	}
+
+	/*
+	 * Touch mode/owner only when they really differ: chmod is not
+	 * allowed on some cgroup props (e.g. the read only ones), so it
+	 * must not even be attempted if the perms already match.
+	 */
+	if (sb.st_mode != (mode_t) perms->mode) {
+		if (fchmod(fd, perms->mode) < 0) {
+			pr_perror("chmod of %s failed", path);
+			return -1;
+		}
+	}
+
+	if (sb.st_uid != perms->uid || sb.st_gid != perms->gid) {
+		if (fchown(fd, perms->uid, perms->gid)) {
+			pr_perror("chown of %s failed", path);
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Restore one cgroup property: fix up the permissions of its file and
+ * write the saved value into it.
+ *
+ * @cg_prop_entry_p: property (name, value, perms) taken from the image
+ * @path: PATH_MAX-sized buffer with the cgroup directory path, relative
+ *        to the CGROUP_YARD service fd
+ * @off: length of the directory part of @path; "/<name>" is appended there
+ *
+ * For "cgroup.procs" and "tasks" only the permissions are restored.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int restore_cgroup_prop(const CgroupPropEntry * cg_prop_entry_p,
+			       char *path, int off)
+{
+	FILE *f;
+	int cg, fd, len;
+	CgroupPerms *perms = cg_prop_entry_p->perms;
+
+	if (!cg_prop_entry_p->value) {
+		pr_err("cg_prop_entry->value was empty when should have had a value\n");
+		return -1;
+	}
+
+	/*
+	 * Only PATH_MAX - off bytes are available behind the directory
+	 * part, so that is what the result has to be checked against.
+	 */
+	len = snprintf(path + off, PATH_MAX - off, "/%s", cg_prop_entry_p->name);
+	if (len < 0 || len >= PATH_MAX - off) {
+		pr_err("snprintf output was truncated for %s\n", cg_prop_entry_p->name);
+		return -1;
+	}
+
+	pr_info("Restoring cgroup property value [%s] to [%s]\n", cg_prop_entry_p->value, path);
+
+	cg = get_service_fd(CGROUP_YARD);
+	f = fopenat(cg, path, "w+");
+	if (!f) {
+		pr_perror("Failed opening %s for writing", path);
+		return -1;
+	}
+
+	fd = fileno(f);
+	if (fd < 0) {
+		fclose(f);
+		pr_err("bad file stream?");
+		return -1;
+	}
+
+	if (restore_perms(fd, path, perms) < 0) {
+		fclose(f);
+		return -1;
+	}
+
+	/* skip these two since restoring their values doesn't make sense */
+	if (!strcmp(cg_prop_entry_p->name, "cgroup.procs") || !strcmp(cg_prop_entry_p->name, "tasks")) {
+		fclose(f);
+		return 0;
+	}
+
+	if (fprintf(f, "%s", cg_prop_entry_p->value) < 0) {
+		fclose(f);
+		pr_err("Failed writing %s to %s\n", cg_prop_entry_p->value, path);
+		return -1;
+	}
+
+	/* The value is flushed on close, so a failure here is a write failure */
+	if (fclose(f) != 0) {
+		pr_perror("Failed closing %s", path);
+		return -1;
+	}
+
+	return 0;
+}
+
+static CgroupPropEntry *freezer_state_entry;
+static char freezer_path[PATH_MAX];
+
+/*
+ * Restore the freezer.state property that was put aside by
+ * add_freezer_state_for_restore(), if there is one.
+ *
+ * Returns 0 when nothing was put aside, otherwise the result of
+ * restore_cgroup_prop().
+ */
+int restore_freezer_state(void)
+{
+	size_t path_len;
+
+	if (freezer_state_entry == NULL)
+		return 0;
+
+	path_len = strlen(freezer_path);
+	return restore_cgroup_prop(freezer_state_entry, freezer_path, path_len);
+}
+
+/*
+ * Remember a freezer.state property and the path of its cgroup directory
+ * in freezer_state_entry / freezer_path, which restore_freezer_state()
+ * reads.
+ *
+ * @entry: the property entry to keep
+ * @path: buffer holding the directory path
+ * @path_len: number of bytes of @path that make up the directory path
+ *
+ * BUGs if an entry is already remembered or if the path doesn't fit.
+ */
+static void add_freezer_state_for_restore(CgroupPropEntry *entry, char *path, size_t path_len)
+{
+ BUG_ON(freezer_state_entry);
+ BUG_ON(path_len >= sizeof(freezer_path));
+
+ freezer_state_entry = entry;
+ /* Path is not null terminated at path_len */
+ strncpy(freezer_path, path, path_len);
+ freezer_path[path_len] = 0;
+}
+
+/*
+ * Recursively restore the properties of a tree of cgroup directories.
+ *
+ * @path: PATH_MAX-sized buffer holding the parent directory path
+ * @off: length of the parent path in @path
+ * @ents: directory entries to process
+ * @n_ents: number of elements in @ents
+ *
+ * The freezer.state property is not restored here, it is only put aside
+ * with add_freezer_state_for_restore().
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int prepare_cgroup_dir_properties(char *path, int off, CgroupDirEntry **ents,
+					 unsigned int n_ents)
+{
+	unsigned int i, j;
+
+	for (i = 0; i < n_ents; i++) {
+		CgroupDirEntry *e = ents[i];
+		size_t off2 = off;
+		int len;
+
+		if (strcmp(e->dir_name, "") == 0)
+			goto skip; /* skip root cgroups */
+
+		/* Bounded append: the buffer only has PATH_MAX - off bytes left */
+		len = snprintf(path + off, PATH_MAX - off, "/%s", e->dir_name);
+		if (len < 0 || len >= PATH_MAX - off) {
+			pr_err("cgroup dir path is too long for %s\n", e->dir_name);
+			return -1;
+		}
+		off2 += len;
+
+		if (e->n_properties > 0) {
+			for (j = 0; j < e->n_properties; ++j) {
+				if (!strcmp(e->properties[j]->name, "freezer.state")) {
+					add_freezer_state_for_restore(e->properties[j], path, off2);
+					continue; /* skip restore now */
+				}
+				if (restore_cgroup_prop(e->properties[j], path, off2) < 0)
+					return -1;
+			}
+		}
+skip:
+		if (prepare_cgroup_dir_properties(path, off2, e->children, e->n_children) < 0)
+			return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Restore the properties of all directories of every controller read
+ * from the image.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int prepare_cgroup_properties(void)
+{
+	char cname_path[PATH_MAX];
+	unsigned int idx;
+
+	for (idx = 0; idx < n_controllers; idx++) {
+		CgControllerEntry *ctrl = controllers[idx];
+		unsigned int off;
+
+		if (ctrl->n_cnames < 1) {
+			pr_err("Each CgControllerEntry should have at least 1 cname\n");
+			return -1;
+		}
+
+		/* The path starts with the controller's directory name */
+		off = ctrl_dir_and_opt(ctrl, cname_path, sizeof(cname_path), NULL, 0);
+		if (prepare_cgroup_dir_properties(cname_path, off, ctrl->dirs, ctrl->n_dirs) < 0)
+			return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Restore the properties named in special_cpuset_props[] for directory
+ * @e, in the order they appear in that table.
+ *
+ * @paux: path buffer holding the directory path
+ * @off: length of the directory path in @paux
+ * @e: directory entry whose properties are searched
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int restore_special_cpuset_props(char *paux, size_t off, CgroupDirEntry *e)
+{
+	const char **name;
+	int j;
+
+	pr_info("Restore special cpuset props\n");
+
+	for (name = special_cpuset_props; *name; name++) {
+		for (j = 0; j < e->n_properties; j++) {
+			CgroupPropEntry *prop = e->properties[j];
+
+			if (strcmp(*name, prop->name) != 0)
+				continue;
+
+			if (restore_cgroup_prop(prop, paux, off) < 0)
+				return -1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Restore mode and ownership of a cgroup directory.
+ *
+ * @cg: directory fd that @path is relative to
+ * @path: directory to adjust
+ * @perms: saved permissions
+ *
+ * Returns the result of restore_perms(), or -1 if the directory can't be
+ * opened.
+ */
+static int prepare_dir_perms(int cg, char *path, CgroupPerms *perms)
+{
+	int ret;
+	int dirfd = openat(cg, path, O_DIRECTORY);
+
+	if (dirfd < 0) {
+		pr_perror("failed to open cg dir fd (%s) for chowning", path);
+		return -1;
+	}
+
+	ret = restore_perms(dirfd, path, perms);
+	close(dirfd);
+
+	return ret;
+}
+
+/*
+ * Recursively make sure the cgroup directories of one controller exist
+ * in the yard.
+ *
+ * @controllers: names of the controllers mounted together
+ * @n_controllers: number of elements in @controllers
+ * @paux: path buffer, relative to the CGROUP_YARD service fd
+ * @off: length of the parent path in @paux
+ * @ents: directory entries to process
+ * @n_ents: number of elements in @ents
+ *
+ * What happens to a missing or an already existing directory depends on
+ * the CG_MODE_* bits in opts.manage_cgroups.
+ *
+ * Returns 0 on success, -1 on failure.
+ *
+ * NOTE(review): the directory name is appended with an unbounded
+ * sprintf(); the size of @paux is not known here -- confirm the caller's
+ * buffer is always large enough.
+ */
+static int prepare_cgroup_dirs(char **controllers, int n_controllers, char *paux, size_t off,
+ CgroupDirEntry **ents, size_t n_ents)
+{
+ size_t i, j;
+ CgroupDirEntry *e;
+ int cg = get_service_fd(CGROUP_YARD);
+
+ for (i = 0; i < n_ents; i++) {
+ size_t off2 = off;
+ e = ents[i];
+
+ off2 += sprintf(paux + off, "/%s", e->dir_name);
+
+ if (faccessat(cg, paux, F_OK, 0) < 0) {
+ if (errno != ENOENT) {
+ pr_perror("Failed accessing cgroup dir %s", paux);
+ return -1;
+ }
+
+ /* The directory is missing: these modes don't allow creating it */
+ if (opts.manage_cgroups & (CG_MODE_NONE | CG_MODE_PROPS)) {
+ pr_err("Cgroup dir %s doesn't exist\n", paux);
+ return -1;
+ }
+
+ if (mkdirpat(cg, paux)) {
+ pr_perror("Can't make cgroup dir %s", paux);
+ return -1;
+ }
+ pr_info("Created cgroup dir %s\n", paux);
+
+ if (prepare_dir_perms(cg, paux, e->dir_perms) < 0)
+ return -1;
+
+ /* A freshly created cpuset dir gets its special props right away */
+ for (j = 0; j < n_controllers; j++) {
+ if (strcmp(controllers[j], "cpuset") == 0) {
+ if (restore_special_cpuset_props(paux, off2, e) < 0) {
+ pr_err("Restoring special cpuset props failed!\n");
+ return -1;
+ }
+ }
+ }
+ } else {
+ pr_info("Determined cgroup dir %s already exist\n", paux);
+
+ if (opts.manage_cgroups & CG_MODE_STRICT) {
+ pr_err("Abort restore of existing cgroups\n");
+ return -1;
+ }
+
+ /* Drop the saved properties so that nothing restores them later */
+ if (opts.manage_cgroups & (CG_MODE_SOFT | CG_MODE_NONE)) {
+ pr_info("Skip restoring properties on cgroup dir %s\n", paux);
+ if (e->n_properties > 0) {
+ xfree(e->properties);
+ e->properties = NULL;
+ e->n_properties = 0;
+ }
+ }
+
+ if (!(opts.manage_cgroups & CG_MODE_NONE) &&
+ prepare_dir_perms(cg, paux, e->dir_perms) < 0)
+ return -1;
+ }
+
+ if (prepare_cgroup_dirs(controllers, n_controllers, paux, off2,
+ e->children, e->n_children) < 0)
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Prepare the CGROUP_YARD service descriptor. This guy is
+ * tmpfs mount with the set of ctl->name directories each
+ * one having the respective cgroup mounted.
+ *
+ * It's required for two reasons.
+ *
+ * First, if we move more than one task into cgroups it's
+ * faster to have cgroup tree visible by them all in some
+ * single place. Searching for this thing existing in the
+ * criu's space is not nice, as parsing /proc/mounts is not
+ * very fast, other than this not all cgroups may be mounted.
+ *
+ * Second, when we have user-namespaces support we will
+ * lose the ability to mount cgroups on-demand, so prepare
+ * them in advance.
+ */
+
+/*
+ * Set up the cgroup yard and install it as the CGROUP_YARD service fd,
+ * then mount every controller of @ce inside it and prepare its
+ * directories.
+ *
+ * Does nothing when opts.manage_cgroups is not set.
+ *
+ * Returns 0 on success. On failure returns -1; once the yard name has
+ * been recorded every failure goes through fini_cgroup() so that the
+ * yard is torn down again.
+ */
+static int prepare_cgroup_sfd(CgroupEntry *ce)
+{
+	int off, i, ret;
+	char paux[PATH_MAX];
+
+	if (!opts.manage_cgroups)
+		return 0;
+
+	pr_info("Preparing cgroups yard (cgroups restore mode %#x)\n",
+		opts.manage_cgroups);
+
+	off = sprintf(paux, ".criu.cgyard.XXXXXX");
+	if (mkdtemp(paux) == NULL) {
+		pr_perror("Can't make temp cgyard dir");
+		return -1;
+	}
+
+	cg_yard = xstrdup(paux);
+	if (!cg_yard) {
+		/* fini_cgroup() can't help without cg_yard, clean up by hand */
+		rmdir(paux);
+		return -1;
+	}
+
+	if (make_yard(cg_yard))
+		goto err;
+
+	pr_debug("Opening %s as cg yard\n", cg_yard);
+	i = open(cg_yard, O_DIRECTORY);
+	if (i < 0) {
+		pr_perror("Can't open cgyard");
+		goto err;
+	}
+
+	ret = install_service_fd(CGROUP_YARD, i);
+	close(i);
+	if (ret < 0)
+		goto err;
+
+	paux[off++] = '/';
+
+	for (i = 0; i < ce->n_controllers; i++) {
+		int ctl_off = off, yard_off;
+		char opt[128], *yard;
+		CgControllerEntry *ctrl = ce->controllers[i];
+
+		if (ctrl->n_cnames < 1) {
+			pr_err("Each cg_controller_entry must have at least 1 controller\n");
+			goto err;
+		}
+
+		ctl_off += ctrl_dir_and_opt(ctrl,
+				paux + ctl_off, sizeof(paux) - ctl_off,
+				opt, sizeof(opt));
+
+		/* Create controller if not yet present */
+		if (access(paux, F_OK)) {
+			pr_debug("\tMaking controller dir %s (%s)\n", paux, opt);
+			if (mkdir(paux, 0700)) {
+				pr_perror("\tCan't make controller dir %s", paux);
+				goto err;
+			}
+			if (mount("none", paux, "cgroup", 0, opt) < 0) {
+				pr_perror("\tCan't mount controller dir %s", paux);
+				goto err;
+			}
+		}
+
+		/*
+		 * Finally handle all cgroups for this controller.
+		 * The helper works with paths relative to the yard, so
+		 * strip the "<cg_yard>/" prefix from both path and offset.
+		 */
+		yard = paux + strlen(cg_yard) + 1;
+		yard_off = ctl_off - (strlen(cg_yard) + 1);
+		if (opts.manage_cgroups &&
+		    prepare_cgroup_dirs(ctrl->cnames, ctrl->n_cnames, yard, yard_off,
+					ctrl->dirs, ctrl->n_dirs))
+			goto err;
+	}
+
+	return 0;
+
+err:
+	fini_cgroup();
+	return -1;
+}
+
+/*
+ * Replace the leading @from component with @to in the path of every set
+ * member that belongs to one of @controllers.
+ *
+ * @cge: image entry whose sets are rewritten
+ * @controllers: controller names the rewrite applies to
+ * @n_controllers: number of elements in @controllers
+ * @from: path prefix (without the leading '/') to replace
+ * @to: replacement prefix
+ *
+ * Returns 0 on success, -1 if a new path can't be allocated.
+ */
+static int rewrite_cgsets(CgroupEntry *cge, char **controllers, int n_controllers,
+			  char *from, char *to)
+{
+	int si, mi;
+
+	for (si = 0; si < cge->n_sets; si++) {
+		CgSetEntry *set = cge->sets[si];
+
+		for (mi = 0; mi < set->n_ctls; mi++) {
+			CgMemberEntry *member = set->ctls[mi];
+			char *old_path;
+
+			if (!cgroup_contains(controllers, n_controllers, member->name))
+				continue;
+
+			/* +1 to get rid of leading / */
+			if (!strstartswith(member->path + 1, from))
+				continue;
+
+			old_path = member->path;
+
+			/* +1 to get rid of leading /, again */
+			member->path = xsprintf("%s%s", to, old_path + strlen(from) + 1);
+			if (!member->path)
+				return -1;
+
+			free(old_path);
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Apply the requested cgroup root overrides to the image entry.
+ *
+ * For every controller the new root is the per-controller one from
+ * opts.new_cgroup_roots if any matches, otherwise
+ * opts.new_global_cg_root. When a new root is known, each top-level
+ * directory of the controller is renamed to it and the set members are
+ * rewritten accordingly.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int rewrite_cgroup_roots(CgroupEntry *cge)
+{
+	int ci, di;
+	struct cg_root_opt *o;
+
+	for (ci = 0; ci < cge->n_controllers; ci++) {
+		CgControllerEntry *ctrl = cge->controllers[ci];
+		char *newroot = opts.new_global_cg_root;
+
+		list_for_each_entry(o, &opts.new_cgroup_roots, node) {
+			if (!cgroup_contains(ctrl->cnames, ctrl->n_cnames, o->controller))
+				continue;
+
+			newroot = o->newroot;
+			break;
+		}
+
+		if (!newroot)
+			continue;
+
+		for (di = 0; di < ctrl->n_dirs; di++) {
+			CgroupDirEntry *cgde = ctrl->dirs[di];
+			char *copy;
+
+			pr_info("rewriting %s to %s\n", cgde->dir_name, newroot);
+			if (rewrite_cgsets(cge, ctrl->cnames, ctrl->n_cnames, cgde->dir_name, newroot))
+				return -1;
+
+			copy = xstrdup(newroot);
+			if (!copy)
+				return -1;
+
+			free(cgde->dir_name);
+			cgde->dir_name = copy;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Read the cgroup image, apply root overrides, publish its sets and
+ * controllers in n_sets / rst_sets / n_controllers / controllers and,
+ * if there is at least one set, prepare the cgroup yard.
+ *
+ * Returns 0 on success (including an empty image), negative on failure.
+ */
+int prepare_cgroup(void)
+{
+	struct cr_img *img;
+	CgroupEntry *ce;
+	int ret;
+
+	img = open_image(CR_FD_CGROUP, O_RSTR);
+	if (!img)
+		return -1;
+
+	ret = pb_read_one_eof(img, &ce, PB_CGROUP);
+	close_image(img);
+	if (ret <= 0) /* Zero is OK -- no sets there. */
+		return ret;
+
+	if (rewrite_cgroup_roots(ce))
+		return -1;
+
+	n_sets = ce->n_sets;
+	rst_sets = ce->sets;
+	n_controllers = ce->n_controllers;
+	controllers = ce->controllers;
+
+	if (!n_sets)
+		return 0;
+
+	/*
+	 * We rely on the fact that all sets contain the same
+	 * set of controllers. This is checked during dump
+	 * with cg_set_compare(CGCMP_ISSUB) call.
+	 */
+	return prepare_cgroup_sfd(ce);
+}
+
+/*
+ * Register a cgroup root override.
+ *
+ * @controller: controller the override applies to; NULL sets the global
+ *              override instead
+ * @newroot: the new root; the pointer is stored as is, not copied
+ *
+ * Returns 0 on success, -1 if memory can't be allocated.
+ */
+int new_cg_root_add(char *controller, char *newroot)
+{
+	struct cg_root_opt *entry;
+
+	if (controller == NULL) {
+		opts.new_global_cg_root = newroot;
+		return 0;
+	}
+
+	entry = xmalloc(sizeof(*entry));
+	if (entry == NULL)
+		return -1;
+
+	entry->controller = controller;
+	entry->newroot = newroot;
+	list_add(&entry->node, &opts.new_cgroup_roots);
+
+	return 0;
+}
diff --git a/criu/cr-check.c b/criu/cr-check.c
new file mode 100644
index 000000000000..2bb0d9a2e69f
--- /dev/null
+++ b/criu/cr-check.c
@@ -0,0 +1,958 @@
+#include <unistd.h>
+#include <linux/netlink.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <sys/eventfd.h>
+#include <sys/epoll.h>
+#include <sys/inotify.h>
+#include <sys/signalfd.h>
+#include <sys/ptrace.h>
+#include <sys/wait.h>
+#include <sys/socket.h>
+#include <fcntl.h>
+#include <signal.h>
+#include <linux/if.h>
+#include <linux/filter.h>
+#include <linux/seccomp.h>
+#include <sys/ioctl.h>
+#include <termios.h>
+#include <sys/mman.h>
+#include <netinet/in.h>
+#include <sys/prctl.h>
+#include <sched.h>
+#include <linux/aio_abi.h>
+
+#include "proc_parse.h"
+#include "sockets.h"
+#include "crtools.h"
+#include "log.h"
+#include "util-pie.h"
+#include "prctl.h"
+#include "files.h"
+#include "sk-inet.h"
+#include "proc_parse.h"
+#include "mount.h"
+#include "tty.h"
+#include "ptrace.h"
+#include "kerndat.h"
+#include "timerfd.h"
+#include "tun.h"
+#include "namespaces.h"
+#include "pstree.h"
+#include "cr_options.h"
+
+/*
+ * Check pty support: struct termios must carry at least TERMIOS_NCC
+ * control characters, and opening the slave of a locked pty master must
+ * fail with EIO.
+ *
+ * Returns 0 if both hold, -1 otherwise.
+ */
+static int check_tty(void)
+{
+	int master = -1, slave = -1;
+	const int lock = 1;
+	struct termios t;
+	char *slavename;
+	int ret = -1;
+
+	if (ARRAY_SIZE(t.c_cc) < TERMIOS_NCC) {
+		pr_msg("struct termios has %d @c_cc while "
+		       "at least %d expected.\n",
+		       (int)ARRAY_SIZE(t.c_cc),
+		       TERMIOS_NCC);
+		goto out;
+	}
+
+	master = open("/dev/ptmx", O_RDWR);
+	if (master < 0) {
+		pr_perror("Can't open /dev/ptmx");
+		goto out;
+	}
+
+	if (ioctl(master, TIOCSPTLCK, &lock)) {
+		pr_perror("Can't lock pty master");
+		goto out;
+	}
+
+	/* ptsname() reports failure with NULL, which must not reach open() */
+	slavename = ptsname(master);
+	if (!slavename) {
+		pr_perror("Can't get pty slave name");
+		goto out;
+	}
+
+	slave = open(slavename, O_RDWR);
+	if (slave < 0) {
+		if (errno != EIO) {
+			pr_perror("Unexpected error on locked pty");
+			goto out;
+		}
+	} else {
+		pr_err("Managed to open locked pty.\n");
+		goto out;
+	}
+
+	ret = 0;
+out:
+	close_safe(&master);
+	close_safe(&slave);
+	return ret;
+}
+
+/*
+ * Check that /proc/self/map_files is readable.
+ * Returns 0 if it is, -1 otherwise.
+ */
+static int check_map_files(void)
+{
+	if (access("/proc/self/map_files", R_OK) == 0)
+		return 0;
+
+	pr_perror("/proc/<pid>/map_files is inaccessible");
+	return -1;
+}
+
+/*
+ * Check the sock diag infrastructure by collecting sockets through a
+ * freshly created NETLINK_SOCK_DIAG socket.
+ *
+ * Returns 0 if collect_sockets() succeeds, -1 otherwise.
+ */
+static int check_sock_diag(void)
+{
+	struct ns_id ns;
+
+	ns.ns_pid = 0;
+	ns.type = NS_CRIU;
+	ns.net.nlsk = socket(PF_NETLINK, SOCK_RAW, NETLINK_SOCK_DIAG);
+	if (ns.net.nlsk < 0) {
+		pr_perror("Can't make diag socket for check");
+		return -1;
+	}
+
+	if (collect_sockets(&ns) == 0)
+		return 0;
+
+	pr_msg("The sock diag infrastructure is incomplete.\n");
+	pr_msg("Make sure you have:\n");
+	pr_msg(" 1. *_DIAG kernel config options turned on;\n");
+	pr_msg(" 2. *_diag.ko modules loaded (if compiled as modules).\n");
+	return -1;
+}
+
+/*
+ * Check that the LAST_PID_PATH sysctl under /proc is writable.
+ * Returns 0 if it is, -1 otherwise.
+ */
+static int check_ns_last_pid(void)
+{
+	if (access("/proc/" LAST_PID_PATH, W_OK) == 0)
+		return 0;
+
+	pr_perror("%s sysctl is inaccessible", LAST_PID_PATH);
+	return -1;
+}
+
+/*
+ * Check that SO_PEEK_OFF can be read from a unix datagram socket and
+ * has the expected initial value of -1.
+ *
+ * Returns 0 if it works, -1 otherwise.
+ */
+static int check_sock_peek_off(void)
+{
+	int sk;
+	int ret, off;
+	/* getsockopt() takes a socklen_t, so use one instead of a cast int */
+	socklen_t sz = sizeof(off);
+
+	sk = socket(PF_UNIX, SOCK_DGRAM, 0);
+	if (sk < 0) {
+		pr_perror("Can't create unix socket for check");
+		return -1;
+	}
+
+	ret = getsockopt(sk, SOL_SOCKET, SO_PEEK_OFF, &off, &sz);
+	close(sk);
+
+	if ((ret == 0) && (off == -1) && (sz == sizeof(int)))
+		return 0;
+
+	pr_msg("SO_PEEK_OFF sockoption doesn't work.\n");
+	return -1;
+}
+
+/*
+ * Check that the kcmp system call exists.
+ *
+ * The call is made with invalid arguments on purpose: any outcome other
+ * than ENOSYS means the kernel knows the syscall.
+ *
+ * Returns 0 if kcmp is available, -1 otherwise.
+ */
+static int check_kcmp(void)
+{
+	int ret = syscall(SYS_kcmp, getpid(), -1, -1, -1, -1);
+
+	/*
+	 * syscall() reports failure as -1 with the reason in errno, it
+	 * never returns a negated error code.
+	 */
+	if (ret < 0 && errno == ENOSYS) {
+		pr_perror("System call kcmp is not supported");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Check the prctl() features: PR_GET_TID_ADDRESS and the PR_SET_MM
+ * interface.
+ *
+ * Either the new (PR_SET_MM_MAP) or the old (PR_SET_MM_BRK, _EXE_FILE,
+ * _AUXV) PR_SET_MM interface must be supported. The old one is accepted
+ * only when opts.check_ms_kernel is set.
+ *
+ * Returns 0 if the required features are present, -1 otherwise.
+ */
+static int check_prctl(void)
+{
+	unsigned long user_auxv = 0;
+	unsigned int *tid_addr;
+	unsigned int size = 0;
+	int ret;
+
+	ret = prctl(PR_GET_TID_ADDRESS, (unsigned long)&tid_addr, 0, 0, 0);
+	if (ret) {
+		pr_msg("prctl: PR_GET_TID_ADDRESS is not supported");
+		return -1;
+	}
+
+	/*
+	 * Either new or old interface must be supported in the kernel.
+	 */
+	ret = prctl(PR_SET_MM, PR_SET_MM_MAP_SIZE, (unsigned long)&size, 0, 0);
+	if (ret) {
+		if (!opts.check_ms_kernel) {
+			pr_msg("prctl: PR_SET_MM_MAP is not supported, which "
+			       "is required for restoring user namespaces\n");
+			return -1;
+		} else
+			pr_warn("Skipping unssuported PR_SET_MM_MAP\n");
+
+		/*
+		 * sbrk(0) yields the current program break; brk() only
+		 * returns a status code and is no address to test with.
+		 */
+		ret = prctl(PR_SET_MM, PR_SET_MM_BRK, (unsigned long)sbrk(0), 0, 0);
+		if (ret < 0) {
+			/* prctl() fails with -1, the reason is in errno */
+			if (errno == EPERM)
+				pr_msg("prctl: One needs CAP_SYS_RESOURCE capability to perform testing\n");
+			else
+				pr_msg("prctl: PR_SET_MM is not supported\n");
+			return -1;
+		}
+
+		/* Setting fd -1 has to fail, EBADF shows the option is known */
+		ret = prctl(PR_SET_MM, PR_SET_MM_EXE_FILE, -1, 0, 0);
+		if (ret < 0 && errno != EBADF) {
+			pr_msg("prctl: PR_SET_MM_EXE_FILE is not supported (%d)\n", errno);
+			return -1;
+		}
+
+		ret = prctl(PR_SET_MM, PR_SET_MM_AUXV, (long)&user_auxv, sizeof(user_auxv), 0);
+		if (ret) {
+			pr_msg("prctl: PR_SET_MM_AUXV is not supported\n");
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Check that fcntl(F_GETOWNER_UIDS) works, using /proc/self/comm as the
+ * file to query.
+ *
+ * Returns 0 if it works, -1 otherwise.
+ */
+static int check_fcntl(void)
+{
+	u32 v[2];
+	int fd, failed;
+
+	fd = open("/proc/self/comm", O_RDONLY);
+	if (fd < 0) {
+		pr_perror("Can't open self comm file");
+		return -1;
+	}
+
+	failed = fcntl(fd, F_GETOWNER_UIDS, (long)v);
+	if (failed)
+		pr_perror("Can'r fetch file owner UIDs");
+
+	close(fd);
+	return failed ? -1 : 0;
+}
+
+/*
+ * Check that the stat file of the current process can be parsed.
+ * Returns 0 if parse_pid_stat() succeeds, -1 otherwise.
+ */
+static int check_proc_stat(void)
+{
+	struct proc_pid_stat stat;
+
+	if (parse_pid_stat(getpid(), &stat) == 0)
+		return 0;
+
+	pr_msg("procfs: stat extension is not supported\n");
+	return -1;
+}
+
+/*
+ * parse_fdinfo() callback for eventfd: store the parsed counter into the
+ * int that @arg points to. Always returns 0.
+ */
+static int check_one_fdinfo(union fdinfo_entries *e, void *arg)
+{
+ *(int *)arg = (int)e->efd.counter;
+ return 0;
+}
+
+/*
+ * Check that fdinfo of an eventfd reports its counter: create one with a
+ * known initial value and compare it with what parse_fdinfo() finds.
+ *
+ * Returns 0 if the values match, -1 otherwise.
+ */
+static int check_fdinfo_eventfd(void)
+{
+	const int cnt = 13;
+	int proc_cnt = 0;
+	int fd, ret;
+
+	fd = eventfd(cnt, 0);
+	if (fd < 0) {
+		pr_perror("Can't make eventfd");
+		return -1;
+	}
+
+	ret = parse_fdinfo(fd, FD_TYPES__EVENTFD, check_one_fdinfo, &proc_cnt);
+	close(fd);
+
+	if (ret) {
+		pr_err("Error parsing proc fdinfo\n");
+		return -1;
+	}
+
+	if (proc_cnt == cnt) {
+		pr_info("Eventfd fdinfo works OK (%d vs %d)\n", cnt, proc_cnt);
+		return 0;
+	}
+
+	pr_err("Counter mismatch (or not met) %d want %d\n",
+	       proc_cnt, cnt);
+	return -1;
+}
+
+/*
+ * parse_fdinfo() callback for signalfd: ignores its arguments and
+ * always returns 0.
+ */
+static int check_one_sfd(union fdinfo_entries *e, void *arg)
+{
+ return 0;
+}
+
+/*
+ * Check that fdinfo carries the mnt_id field, using the log service fd
+ * as the descriptor to inspect.
+ *
+ * Returns 0 if the field is present, -1 otherwise.
+ */
+int check_mnt_id(void)
+{
+	/* -1 is the marker for "parse_fdinfo() didn't fill the field in" */
+	struct fdinfo_common fdinfo = { .mnt_id = -1 };
+
+	if (parse_fdinfo(get_service_fd(LOG_FD_OFF), FD_TYPES__UND, NULL, &fdinfo) < 0)
+		return -1;
+
+	if (fdinfo.mnt_id != -1)
+		return 0;
+
+	pr_err("fdinfo doesn't contain the mnt_id field\n");
+	return -1;
+}
+
+/*
+ * Check that fdinfo of a signalfd can be parsed.
+ * Returns 0 if parse_fdinfo() succeeds, -1 otherwise.
+ */
+static int check_fdinfo_signalfd(void)
+{
+	sigset_t mask;
+	int sfd, ret;
+
+	sigemptyset(&mask);
+	sigaddset(&mask, SIGUSR1);
+
+	sfd = signalfd(-1, &mask, 0);
+	if (sfd < 0) {
+		pr_perror("Can't make signalfd");
+		return -1;
+	}
+
+	ret = parse_fdinfo(sfd, FD_TYPES__SIGNALFD, check_one_sfd, NULL);
+	close(sfd);
+
+	if (!ret)
+		return 0;
+
+	pr_err("Error parsing proc fdinfo\n");
+	return -1;
+}
+
+/*
+ * parse_fdinfo() callback for epoll: store the target fd of the parsed
+ * entry into the int that @arg points to, then release the entry.
+ * Always returns 0.
+ */
+static int check_one_epoll(union fdinfo_entries *e, void *arg)
+{
+ *(int *)arg = e->epl.e.tfd;
+ free_event_poll_entry(e);
+ return 0;
+}
+
+/*
+ * Check that fdinfo of an epoll fd lists its targets: watch the read end
+ * of a pipe and compare that fd with the one parse_fdinfo() reports.
+ *
+ * Returns 0 if they match, non-zero otherwise.
+ */
+static int check_fdinfo_eventpoll(void)
+{
+ int efd, pfd[2], proc_fd = 0, ret = -1;
+ struct epoll_event ev;
+
+ if (pipe(pfd)) {
+ pr_perror("Can't make pipe to watch");
+ return -1;
+ }
+
+ efd = epoll_create(1);
+ if (efd < 0) {
+ pr_perror("Can't make epoll fd");
+ goto pipe_err;
+ }
+
+ memset(&ev, 0, sizeof(ev));
+ ev.events = EPOLLIN | EPOLLOUT;
+
+ if (epoll_ctl(efd, EPOLL_CTL_ADD, pfd[0], &ev)) {
+ pr_perror("Can't add epoll tfd");
+ goto epoll_err;
+ }
+
+ ret = parse_fdinfo(efd, FD_TYPES__EVENTPOLL, check_one_epoll, &proc_fd);
+ if (ret) {
+ pr_err("Error parsing proc fdinfo\n");
+ goto epoll_err;
+ }
+
+ if (pfd[0] != proc_fd) {
+ pr_err("TFD mismatch (or not met) %d want %d\n",
+ proc_fd, pfd[0]);
+ ret = -1;
+ goto epoll_err;
+ }
+
+ pr_info("Epoll fdinfo works OK (%d vs %d)\n", pfd[0], proc_fd);
+
+ /* The success path falls through the labels to close the descriptors */
+epoll_err:
+ close(efd);
+pipe_err:
+ close(pfd[0]);
+ close(pfd[1]);
+
+ return ret;
+}
+
+/*
+ * parse_fdinfo() callback for inotify: store the watch descriptor of the
+ * parsed entry into the int that @arg points to, then release the entry.
+ * Always returns 0.
+ */
+static int check_one_inotify(union fdinfo_entries *e, void *arg)
+{
+ *(int *)arg = e->ify.e.wd;
+ free_inotify_wd_entry(e);
+ return 0;
+}
+
+/*
+ * Check that fdinfo of an inotify fd lists its watches: add a watch on
+ * "." and compare its descriptor with the one parse_fdinfo() reports.
+ *
+ * Returns 0 if they match, -1 otherwise.
+ */
+static int check_fdinfo_inotify(void)
+{
+	int ifd, wd, ret;
+	int proc_wd = -1;
+
+	ifd = inotify_init1(0);
+	if (ifd < 0) {
+		pr_perror("Can't make inotify fd");
+		return -1;
+	}
+
+	wd = inotify_add_watch(ifd, ".", IN_ALL_EVENTS);
+	if (wd < 0) {
+		pr_perror("Can't add watch");
+		close(ifd);
+		return -1;
+	}
+
+	ret = parse_fdinfo(ifd, FD_TYPES__INOTIFY, check_one_inotify, &proc_wd);
+	close(ifd);
+
+	if (ret < 0) {
+		pr_err("Error parsing proc fdinfo\n");
+		return -1;
+	}
+
+	if (wd == proc_wd) {
+		pr_info("Inotify fdinfo works OK (%d vs %d)\n", wd, proc_wd);
+		return 0;
+	}
+
+	pr_err("WD mismatch (or not met) %d want %d\n", proc_wd, wd);
+	return -1;
+}
+
+/*
+ * Run all four fdinfo checks (eventfd, eventpoll, signalfd, inotify).
+ * Every check is run even if an earlier one failed; the results are
+ * OR-ed together, so the return value is 0 only if all of them passed.
+ */
+static int check_fdinfo_ext(void)
+{
+ int ret = 0;
+
+ ret |= check_fdinfo_eventfd();
+ ret |= check_fdinfo_eventpoll();
+ ret |= check_fdinfo_signalfd();
+ ret |= check_fdinfo_inotify();
+
+ return ret;
+}
+
+/*
+ * Check that vmsplice() with SPLICE_F_GIFT accepts a one-byte buffer on
+ * the stack.
+ *
+ * Returns 0 if it works, negative otherwise.
+ */
+static int check_unaligned_vmsplice(void)
+{
+	int p[2], ret;
+	char buf; /* :) */
+	struct iovec iov;
+
+	ret = pipe(p);
+	if (ret < 0) {
+		pr_perror("Can't create pipe");
+		return ret;
+	}
+
+	iov.iov_base = &buf;
+	iov.iov_len = sizeof(buf);
+
+	ret = vmsplice(p[1], &iov, 1, SPLICE_F_GIFT | SPLICE_F_NONBLOCK);
+	if (ret >= 0) {
+		pr_info("Unaligned vmsplice works OK\n");
+		ret = 0;
+	} else {
+		pr_perror("Unaligned vmsplice doesn't work");
+	}
+
+	close(p[0]);
+	close(p[1]);
+
+	return ret;
+}
+
+#ifndef SO_GET_FILTER
+#define SO_GET_FILTER SO_ATTACH_FILTER
+#endif
+
+/*
+ * Check that the socket filter (SO_GET_FILTER) and the bound device
+ * (SO_BINDTODEVICE) can be read back with getsockopt() on a UDP socket.
+ *
+ * Returns 0 if both work, -1 otherwise.
+ */
+static int check_so_gets(void)
+{
+	char name[IFNAMSIZ];
+	socklen_t len;
+	int ret = -1;
+	int sk;
+
+	sk = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP);
+	if (sk < 0) {
+		pr_perror("No socket");
+		return -1;
+	}
+
+	len = 0;
+	if (getsockopt(sk, SOL_SOCKET, SO_GET_FILTER, NULL, &len)) {
+		pr_perror("Can't get socket filter");
+	} else {
+		len = sizeof(name);
+		if (getsockopt(sk, SOL_SOCKET, SO_BINDTODEVICE, name, &len))
+			pr_perror("Can't get socket bound dev");
+		else
+			ret = 0;
+	}
+
+	close(sk);
+	return ret;
+}
+
+/*
+ * Check that /proc/sys/kernel/sem_next_id is readable and writable.
+ * Returns 0 if it is, -1 otherwise.
+ */
+static int check_ipc(void)
+{
+	if (access("/proc/sys/kernel/sem_next_id", R_OK | W_OK) == 0)
+		return 0;
+
+	pr_perror("/proc/sys/kernel/sem_next_id is inaccessible");
+	return -1;
+}
+
+/*
+ * Check that rt_sigqueueinfo accepts a siginfo with a positive si_code
+ * when a process sends it to itself. SIGUSR1 is set to be ignored first.
+ *
+ * Returns 0 if the call succeeds, -1 otherwise.
+ */
+static int check_sigqueuinfo()
+{
+	siginfo_t info = { .si_code = 1 };
+
+	signal(SIGUSR1, SIG_IGN);
+
+	if (syscall(SYS_rt_sigqueueinfo, getpid(), SIGUSR1, &info) == 0)
+		return 0;
+
+	pr_perror("Unable to send siginfo with positive si_code to itself");
+	return -1;
+}
+
+/*
+ * Fork a child, optionally run @child_setup in it, and attach to it with
+ * ptrace once it has signalled over a socketpair that it is ready.
+ *
+ * @child_setup: callback run in the child before it reports readiness,
+ *               may be NULL; a non-zero result makes the child exit
+ *
+ * Returns the pid of the attached child, or -1 on failure. The caller is
+ * the one to kill the child.
+ */
+static pid_t fork_and_ptrace_attach(int (*child_setup)(void))
+{
+	pid_t pid;
+	int sk_pair[2], sk;
+	char c = 0;
+
+	if (socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sk_pair)) {
+		pr_perror("socketpair");
+		return -1;
+	}
+
+	pid = fork();
+	if (pid < 0) {
+		pr_perror("fork");
+		/* Nobody is left to use the pair, don't leak its descriptors */
+		close(sk_pair[0]);
+		close(sk_pair[1]);
+		return -1;
+	} else if (pid == 0) {
+		sk = sk_pair[1];
+		close(sk_pair[0]);
+
+		if (child_setup && child_setup() != 0)
+			exit(1);
+
+		/* Tell the parent the setup is done */
+		if (write(sk, &c, 1) != 1) {
+			pr_perror("write");
+			exit(1);
+		}
+
+		while (1)
+			sleep(1000);
+		exit(1);
+	}
+
+	sk = sk_pair[0];
+	close(sk_pair[1]);
+
+	/* Wait until the child is ready (or gone) */
+	if (read(sk, &c, 1) != 1) {
+		close(sk);
+		kill(pid, SIGKILL);
+		pr_perror("read");
+		return -1;
+	}
+
+	close(sk);
+
+	if (ptrace(PTRACE_ATTACH, pid, NULL, NULL) == -1) {
+		pr_perror("Unable to ptrace the child");
+		kill(pid, SIGKILL);
+		return -1;
+	}
+
+	waitpid(pid, NULL, 0);
+
+	return pid;
+}
+
+/*
+ * Check that PTRACE_PEEKSIGINFO and PTRACE_GETSIGMASK work on a traced
+ * child. Both requests are always tried; the child is killed afterwards.
+ *
+ * Returns 0 if both work, -1 otherwise.
+ */
+static int check_ptrace_peeksiginfo()
+{
+	struct ptrace_peeksiginfo_args arg;
+	siginfo_t siginfo;
+	pid_t pid, ret = 0;
+	k_rtsigset_t mask;
+
+	pid = fork_and_ptrace_attach(NULL);
+	if (pid < 0)
+		return -1;
+
+	/* Ask for a single pending signal, starting from the first one */
+	arg.nr = 1;
+	arg.off = 0;
+	arg.flags = 0;
+
+	if (ptrace(PTRACE_PEEKSIGINFO, pid, &arg, &siginfo) != 0) {
+		pr_perror("Unable to dump pending signals");
+		ret = -1;
+	}
+
+	if (ptrace(PTRACE_GETSIGMASK, pid, sizeof(mask), &mask) != 0) {
+		pr_perror("Unable to dump signal blocking mask");
+		ret = -1;
+	}
+
+	kill(pid, SIGKILL);
+	return ret;
+}
+
+/*
+ * Check that PTRACE_O_SUSPEND_SECCOMP can be set on a traced child. The
+ * child is killed afterwards.
+ *
+ * Returns 0 if the option is accepted, -1 otherwise.
+ */
+static int check_ptrace_suspend_seccomp(void)
+{
+	int ret = 0;
+	pid_t child;
+
+	child = fork_and_ptrace_attach(NULL);
+	if (child < 0)
+		return -1;
+
+	if (ptrace(PTRACE_SETOPTIONS, child, NULL, PTRACE_O_SUSPEND_SECCOMP) < 0) {
+		ret = -1;
+		if (errno != EINVAL)
+			pr_perror("couldn't suspend seccomp");
+		else
+			pr_err("Kernel doesn't support PTRACE_O_SUSPEND_SECCOMP\n");
+	}
+
+	kill(child, SIGKILL);
+	return ret;
+}
+
+/*
+ * Install a seccomp filter on the calling process that kills it on
+ * ptrace and allows every other system call.
+ *
+ * Returns 0 on success, -1 if the filter can't be installed.
+ */
+static int setup_seccomp_filter(void)
+{
+	struct sock_filter filter[] = {
+		BPF_STMT(BPF_LD+BPF_W+BPF_ABS, offsetof(struct seccomp_data, nr)),
+		/* Allow all syscalls except ptrace */
+		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_ptrace, 0, 1),
+		BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL),
+		BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW),
+	};
+
+	struct sock_fprog bpf_prog = {
+		.len = (unsigned short)(sizeof(filter)/sizeof(filter[0])),
+		.filter = filter,
+	};
+
+	return prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, (long) &bpf_prog, 0, 0) < 0 ? -1 : 0;
+}
+
+/*
+ * Probe for PTRACE_SECCOMP_GET_FILTER on a helper child that has
+ * installed a seccomp filter. Returns 0 if supported, -1 otherwise.
+ */
+static int check_ptrace_dump_seccomp_filters(void)
+{
+	pid_t child;
+	int err = 0;
+
+	child = fork_and_ptrace_attach(setup_seccomp_filter);
+	if (child < 0)
+		return -1;
+
+	/* Only the sign matters here: a NULL buffer asks for the length */
+	if (ptrace(PTRACE_SECCOMP_GET_FILTER, child, 0, NULL) < 0) {
+		err = -1;
+		pr_perror("Dumping seccomp filters not supported");
+	}
+
+	kill(child, SIGKILL);
+	return err;
+}
+
+/*
+ * Query dirty-tracking support. A missing feature only produces a
+ * warning; -1 is returned only if the query itself fails.
+ */
+static int check_mem_dirty_track(void)
+{
+	int err = kerndat_get_dirty_track();
+
+	if (err < 0)
+		return -1;
+
+	if (kdat.has_dirty_track)
+		return 0;
+
+	pr_warn("Dirty tracking is OFF. Memory snapshot will not work.\n");
+	return 0;
+}
+
+/* Check that /proc/self/timers is readable; 0 if so, -1 otherwise. */
+static int check_posix_timers(void)
+{
+	if (access("/proc/self/timers", R_OK) == 0)
+		return 0;
+
+	pr_msg("/proc/<pid>/timers file is missing.\n");
+	return -1;
+}
+
+/*
+ * Find the mapping that starts at @addr in /proc/self/maps and, if it
+ * is named "/[aio] (deleted)", return its length in bytes.
+ *
+ * Returns 0 when the maps file can't be read or parsed, or when no
+ * such AIO ring mapping starts at @addr.
+ */
+static unsigned long get_ring_len(unsigned long addr)
+{
+	FILE *maps;
+	char buf[256];
+
+	maps = fopen("/proc/self/maps", "r");
+	if (!maps) {
+		pr_perror("No maps proc file");
+		return 0;
+	}
+
+	while (fgets(buf, sizeof(buf), maps)) {
+		unsigned long start, end;
+		/*
+		 * sscanf() leaves the %n target untouched when it stops
+		 * early, so give it a defined value to fall back to.
+		 */
+		int r, tail = 0;
+
+		/*
+		 * The line didn't fit into buf: drop its remainder so that
+		 * it is not parsed as a separate (and malformed) entry.
+		 */
+		if (!strchr(buf, '\n')) {
+			int ch;
+
+			while ((ch = fgetc(maps)) != EOF && ch != '\n')
+				;
+		}
+
+		r = sscanf(buf, "%lx-%lx %*s %*s %*s %*s %n\n", &start, &end, &tail);
+		if (r != 2) {
+			fclose(maps);
+			pr_err("Bad maps format %d.%d (%s)\n", r, tail, buf + tail);
+			return 0;
+		}
+
+		if (start == addr) {
+			fclose(maps);
+			if (strcmp(buf + tail, "/[aio] (deleted)\n"))
+				goto notfound;
+
+			return end - start;
+		}
+	}
+
+	fclose(maps);
+notfound:
+	pr_err("No AIO ring at expected location\n");
+	return 0;
+}
+
+/*
+ * Check that an AIO ring keeps working after being moved with mremap().
+ *
+ * An AIO context is created with io_setup, the length of its ring
+ * mapping is looked up (the context value is used as the mapping's
+ * address), the ring is moved onto a freshly mmap()-ed area and
+ * io_getevents is then called with the new address as the context.
+ *
+ * Returns 0 on success (or when the failure is tolerated because
+ * opts.check_ms_kernel is set), -1 otherwise.
+ *
+ * NOTE(review): neither the AIO context nor the new mapping is released
+ * here, including on the failure paths.
+ */
+static int check_aio_remap(void)
+{
+ aio_context_t ctx = 0;
+ unsigned long len;
+ void *naddr;
+ int r;
+
+ if (syscall(SYS_io_setup, 16, &ctx) < 0) {
+ pr_err("No AIO syscall\n");
+ return -1;
+ }
+
+ len = get_ring_len((unsigned long) ctx);
+ if (!len)
+ return -1;
+
+ /* Reserve an address range of the right size to move the ring to */
+ naddr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, 0, 0);
+ if (naddr == MAP_FAILED) {
+ pr_perror("Can't find place for new AIO ring");
+ return -1;
+ }
+
+ if (mremap((void *)ctx, len, len, MREMAP_FIXED | MREMAP_MAYMOVE, naddr) == MAP_FAILED) {
+ pr_perror("Can't remap AIO ring");
+ return -1;
+ }
+
+ /* From now on the context is addressed by the ring's new location */
+ ctx = (aio_context_t)naddr;
+ r = syscall(SYS_io_getevents, ctx, 0, 1, NULL, NULL);
+ if (r < 0) {
+ if (!opts.check_ms_kernel) {
+ pr_err("AIO remap doesn't work properly\n");
+ return -1;
+ } else
+ pr_warn("Skipping unsupported AIO remap\n");
+ }
+
+ return 0;
+}
+
+/*
+ * Check whether fdinfo exposes the lock field. A missing field is an
+ * error unless opts.check_ms_kernel is set, in which case it is only
+ * warned about.
+ */
+static int check_fdinfo_lock(void)
+{
+	if (kerndat_fdinfo_has_lock())
+		return -1;
+
+	if (kdat.has_fdinfo_lock)
+		return 0;
+
+	if (opts.check_ms_kernel) {
+		pr_warn("fdinfo doesn't contain the lock field\n");
+		return 0;
+	}
+
+	pr_err("fdinfo doesn't contain the lock field\n");
+	return -1;
+}
+
+struct clone_arg {
+ /*
+ * Reserve some space for clone() to locate arguments
+ * and retcode in this place
+ */
+ char stack[128] __stack_aligned__;
+ char stack_ptr[0];
+};
+
+static int clone_cb(void *_arg) {
+ exit(0);
+}
+
+/*
+ * Check that clone() accepts CLONE_NEWPID together with CLONE_PARENT.
+ * The child runs clone_cb() on the small stack embedded in struct
+ * clone_arg.
+ *
+ * Returns 0 if clone() succeeds, -1 otherwise.
+ */
+static int check_clone_parent_vs_pid()
+{
+ struct clone_arg ca;
+ pid_t pid;
+
+ pid = clone(clone_cb, ca.stack_ptr, CLONE_NEWPID | CLONE_PARENT, &ca);
+ if (pid < 0) {
+ pr_err("CLONE_PARENT | CLONE_NEWPID don't work together\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+static int (*chk_feature)(void);
+
+/*
+ * Entry point of the kernel feature check.
+ *
+ * Requires root, then sets up the minimal state the checks rely on: a
+ * root pstree item for the current process, its ids and the mount info
+ * of its mount namespace. If a single feature was selected through
+ * check_add_feature() only that check is run; otherwise every check in
+ * the list below is run and their results are OR-ed together.
+ *
+ * Returns 0 if everything passed, non-zero otherwise.
+ */
+int cr_check(void)
+{
+ struct ns_id ns = { .type = NS_CRIU, .ns_pid = PROC_SELF, .nd = &mnt_ns_desc };
+ int ret = 0;
+
+ if (!is_root_user())
+ return -1;
+
+ root_item = alloc_pstree_item();
+ if (root_item == NULL)
+ return -1;
+
+ root_item->pid.real = getpid();
+
+ if (collect_pstree_ids())
+ return -1;
+
+ ns.id = root_item->ids->mnt_ns_id;
+
+ mntinfo = collect_mntinfo(&ns, false);
+ if (mntinfo == NULL)
+ return -1;
+
+ /* A specific feature was requested: run only its check */
+ if (chk_feature) {
+ ret = chk_feature();
+ goto out;
+ }
+
+ /* Run all checks even if some fail, so that every problem is reported */
+ ret |= check_map_files();
+ ret |= check_sock_diag();
+ ret |= check_ns_last_pid();
+ ret |= check_sock_peek_off();
+ ret |= check_kcmp();
+ ret |= check_prctl();
+ ret |= check_fcntl();
+ ret |= check_proc_stat();
+ ret |= check_tcp();
+ ret |= check_fdinfo_ext();
+ ret |= check_unaligned_vmsplice();
+ ret |= check_tty();
+ ret |= check_so_gets();
+ ret |= check_ipc();
+ ret |= check_sigqueuinfo();
+ ret |= check_ptrace_peeksiginfo();
+ ret |= check_ptrace_suspend_seccomp();
+ ret |= check_ptrace_dump_seccomp_filters();
+ ret |= check_mem_dirty_track();
+ ret |= check_posix_timers();
+ ret |= check_tun_cr(0);
+ ret |= check_timerfd();
+ ret |= check_mnt_id();
+ ret |= check_aio_remap();
+ ret |= check_fdinfo_lock();
+ ret |= check_clone_parent_vs_pid();
+
+out:
+ if (!ret)
+ print_on_level(DEFAULT_LOGLEVEL, "Looks good.\n");
+
+ return ret;
+}
+
+/* Explicit "tun" feature check, selected via check_add_feature(). */
+static int check_tun(void)
+{
+ /*
+ * When there is no TUN support at all, this explicit
+ * check should report an error. The plain "criu check",
+ * by contrast, would still report "Looks good" in that
+ * case, since C/R effectively works, just not for TUN.
+ */
+ return check_tun_cr(-1);
+}
+
+/*
+ * Check the prerequisites for user namespace support: the
+ * /proc/self/ns/user file must exist and prctl() must understand
+ * PR_SET_MM_MAP_SIZE.
+ *
+ * Returns 0 if both are available, -1 otherwise.
+ */
+static int check_userns(void)
+{
+	int ret;
+	unsigned long size = 0;
+
+	ret = access("/proc/self/ns/user", F_OK);
+	if (ret) {
+		pr_perror("No userns proc file");
+		return -1;
+	}
+
+	/*
+	 * prctl() reports failure as -1 with the cause in errno, so
+	 * errno is already what pr_perror() needs and must not be
+	 * overwritten with the negated return value.
+	 */
+	ret = prctl(PR_SET_MM, PR_SET_MM_MAP_SIZE, (unsigned long)&size, 0, 0);
+	if (ret) {
+		pr_perror("No new prctl API");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Check loginuid support. Returns 0 when available; -1 when the probe
+ * fails or the feature is missing.
+ */
+static int check_loginuid(void)
+{
+	if (kerndat_loginuid(false) < 0)
+		return -1;
+
+	if (kdat.has_loginuid)
+		return 0;
+
+	pr_warn("Loginuid restore is OFF.\n");
+	return -1;
+}
+
+/*
+ * Select the single feature check that cr_check() should run.
+ *
+ * @feat is matched against the known feature names; on a match the
+ * corresponding check is stored in chk_feature and 0 is returned.
+ * Unknown names are reported and yield -1.
+ */
+int check_add_feature(char *feat)
+{
+	static const struct {
+		const char *name;
+		int (*check)(void);
+	} features[] = {
+		{ "mnt_id",		check_mnt_id },
+		{ "aio_remap",		check_aio_remap },
+		{ "timerfd",		check_timerfd },
+		{ "tun",		check_tun },
+		{ "userns",		check_userns },
+		{ "fdinfo_lock",	check_fdinfo_lock },
+		{ "seccomp_suspend",	check_ptrace_suspend_seccomp },
+		{ "seccomp_filters",	check_ptrace_dump_seccomp_filters },
+		{ "loginuid",		check_loginuid },
+	};
+	unsigned int i;
+
+	for (i = 0; i < sizeof(features) / sizeof(features[0]); i++) {
+		if (strcmp(feat, features[i].name))
+			continue;
+
+		chk_feature = features[i].check;
+		return 0;
+	}
+
+	pr_err("Unknown feature %s\n", feat);
+	return -1;
+}
diff --git a/criu/cr-dedup.c b/criu/cr-dedup.c
new file mode 100644
index 000000000000..b453c3e61f4e
--- /dev/null
+++ b/criu/cr-dedup.c
@@ -0,0 +1,197 @@
+#include <sys/uio.h>
+#include <fcntl.h>
+#include <linux/falloc.h>
+#include <unistd.h>
+
+#include "crtools.h"
+#include "page-read.h"
+#include "restorer.h"
+
+#define MAX_BUNCH_SIZE 256
+
+static int cr_dedup_one_pagemap(int pid);
+
+/*
+ * Deduplicate the parent snapshot.
+ *
+ * Walks the CR_PARENT_LINK directory and runs cr_dedup_one_pagemap()
+ * for every entry whose name matches "pagemap-<pid>.img".
+ *
+ * Returns 0 on success, a negative value on failure.
+ */
+int cr_dedup(void)
+{
+	int close_ret, ret = 0;
+	int pid;
+	DIR * dirp;
+	struct dirent *ent;
+
+	dirp = opendir(CR_PARENT_LINK);
+	if (dirp == NULL) {
+		pr_perror("Can't enter previous snapshot folder, error=%d", errno);
+		ret = -1;
+		goto err;
+	}
+
+	while (1) {
+		/* readdir() returns NULL for both EOF and error, errno tells them apart */
+		errno = 0;
+		ent = readdir(dirp);
+		if (ent == NULL) {
+			if (errno) {
+				pr_perror("Failed readdir, error=%d", errno);
+				ret = -1;
+				goto err;
+			}
+			break;
+		}
+
+		/*
+		 * Keep the sscanf() result away from ret: it is EOF (-1)
+		 * for names that end inside the "pagemap-" prefix, which
+		 * must not be mistaken for a dedup failure.
+		 */
+		if (sscanf(ent->d_name, "pagemap-%d.img", &pid) != 1)
+			continue;
+
+		pr_info("pid=%d\n", pid);
+		ret = cr_dedup_one_pagemap(pid);
+		if (ret < 0)
+			break;
+	}
+
+err:
+	if (dirp) {
+		close_ret = closedir(dirp);
+		if (close_ret == -1)
+			return close_ret;
+	}
+
+	if (ret < 0)
+		return ret;
+
+	pr_info("Deduplicated\n");
+	return 0;
+}
+
+/*
+ * Deduplicate the pages of one task against its parent images.
+ *
+ * Every pagemap entry of @pid that is not marked in_parent is passed
+ * to dedup_one_iovec() together with the parent page reader.
+ *
+ * Returns 0 on success (including when there is no parent), a negative
+ * value on failure.
+ */
+static int cr_dedup_one_pagemap(int pid)
+{
+	int ret;
+	struct page_read pr;
+	struct page_read * prp;
+	struct iovec iov;
+
+	/*
+	 * Nothing shows that pr (and pr.close in particular) is set up
+	 * when open_page_read() does not succeed, so bail out directly
+	 * instead of going through the exit label, which calls pr.close().
+	 */
+	ret = open_page_read(pid, &pr, PR_TASK | PR_MOD);
+	if (ret <= 0)
+		return -1;
+
+	prp = pr.parent;
+	if (!prp)
+		goto exit;
+
+	ret = pr.get_pagemap(&pr, &iov);
+	if (ret <= 0)
+		goto exit;
+
+	while (1) {
+		pr_debug("dedup iovec base=%p, len=%zu\n", iov.iov_base, iov.iov_len);
+		if (!pr.pe->in_parent) {
+			ret = dedup_one_iovec(prp, &iov);
+			if (ret)
+				goto exit;
+		}
+
+		pr.put_pagemap(&pr);
+		ret = pr.get_pagemap(&pr, &iov);
+		if (ret <= 0)
+			goto exit;
+	}
+exit:
+	pr.close(&pr);
+
+	if (ret < 0)
+		return ret;
+
+	return 0;
+}
+
+/*
+ * Tell whether the region [@off, @off + @len) can be merged into the
+ * pending @bunch instead of starting a new one.
+ */
+static inline bool can_extend_batch(struct iovec *bunch,
+		unsigned long off, unsigned long len)
+{
+	unsigned long bunch_end = (unsigned long)bunch->iov_base + bunch->iov_len;
+
+	/* Only a region that continues the existing one can be merged */
+	if (bunch_end != off)
+		return false;
+
+	/* An empty bunch takes the region regardless of its size */
+	if (bunch->iov_len == 0)
+		return true;
+
+	/* Otherwise the merged region must stay below the size cap */
+	return bunch->iov_len + len < MAX_BUNCH_SIZE * PAGE_SIZE;
+}
+
+/*
+ * Queue the region [@off, @off + @len) of @pr's raw pages image for
+ * hole punching.
+ *
+ * Adjacent regions are accumulated in pr->bunch. When the new region
+ * can't be merged (or @cleanup is set) the accumulated bunch, if not
+ * empty, is punched out with fallocate() and the new region becomes
+ * the pending bunch.
+ *
+ * Returns 0 on success, -1 if fallocate() fails.
+ */
+int punch_hole(struct page_read *pr, unsigned long off, unsigned long len,
+ bool cleanup)
+{
+ int ret;
+ struct iovec * bunch = &pr->bunch;
+
+ if (!cleanup && can_extend_batch(bunch, off, len)) {
+ pr_debug("pr%d:Extend bunch len from %zu to %lu\n", pr->id,
+ bunch->iov_len, bunch->iov_len + len);
+ bunch->iov_len += len;
+ } else {
+ if (bunch->iov_len > 0) {
+ pr_debug("Punch!/%p/%zu/\n", bunch->iov_base, bunch->iov_len);
+ /* iov_base holds a file offset here, not a memory address */
+ ret = fallocate(img_raw_fd(pr->pi), FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+ (unsigned long)bunch->iov_base, bunch->iov_len);
+ if (ret != 0) {
+ pr_perror("Error punching hole");
+ return -1;
+ }
+ }
+ bunch->iov_base = (void *)off;
+ bunch->iov_len = len;
+ pr_debug("pr%d:New bunch/%p/%zu/\n", pr->id, bunch->iov_base, bunch->iov_len);
+ }
+ return 0;
+}
+
+/*
+ * Drop from @pr's images (and, recursively, from its parents') the
+ * pages covered by @iov.
+ *
+ * The range is walked pagemap entry by pagemap entry: for each entry
+ * overlapping the range that is not marked in_parent, the overlapping
+ * part is queued for hole punching; the same sub-range is then
+ * processed in pr->parent, if there is one.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int dedup_one_iovec(struct page_read *pr, struct iovec *iov)
+{
+ unsigned long off;
+ unsigned long off_real;
+ unsigned long iov_end;
+
+ iov_end = (unsigned long)iov->iov_base + iov->iov_len;
+ off = (unsigned long)iov->iov_base;
+ while (1) {
+ int ret;
+ struct iovec piov;
+ unsigned long piov_end;
+ struct iovec tiov;
+ struct page_read * prp;
+
+ ret = seek_pagemap_page(pr, off, false);
+ if (ret == -1)
+ return -1;
+
+ if (ret == 0) {
+ /* No entry at off: continue from pr->cvaddr if that is still inside the range */
+ if (off < pr->cvaddr && pr->cvaddr < iov_end)
+ off = pr->cvaddr;
+ else
+ return 0;
+ }
+
+ if (!pr->pe)
+ return -1;
+ pagemap2iovec(pr->pe, &piov);
+ piov_end = (unsigned long)piov.iov_base + piov.iov_len;
+ /* NOTE(review): the lseek() result is used without an error check */
+ off_real = lseek(img_raw_fd(pr->pi), 0, SEEK_CUR);
+ if (!pr->pe->in_parent) {
+ ret = punch_hole(pr, off_real, min(piov_end, iov_end) - off, false);
+ if (ret == -1)
+ return ret;
+ }
+
+ prp = pr->parent;
+ if (prp) {
+ /* recursively */
+ pr_debug("Go to next parent level\n");
+ tiov.iov_base = (void*)off;
+ tiov.iov_len = min(piov_end, iov_end) - off;
+ ret = dedup_one_iovec(prp, &tiov);
+ if (ret != 0)
+ return -1;
+ }
+
+ /* The entry ended before the range did: go on with the next one */
+ if (piov_end < iov_end) {
+ off = piov_end;
+ continue;
+ } else
+ return 0;
+ }
+ return 0;
+}
diff --git a/criu/cr-dump.c b/criu/cr-dump.c
new file mode 100644
index 000000000000..385b7bb7b25a
--- /dev/null
+++ b/criu/cr-dump.c
@@ -0,0 +1,1720 @@
+#include <sys/time.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <signal.h>
+#include <limits.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+
+#include <fcntl.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/vfs.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <sys/wait.h>
+
+#include <sys/sendfile.h>
+
+#include <sched.h>
+#include <sys/resource.h>
+
+#include "protobuf.h"
+#include "protobuf/fdinfo.pb-c.h"
+#include "protobuf/fs.pb-c.h"
+#include "protobuf/mm.pb-c.h"
+#include "protobuf/creds.pb-c.h"
+#include "protobuf/core.pb-c.h"
+#include "protobuf/file-lock.pb-c.h"
+#include "protobuf/rlimit.pb-c.h"
+#include "protobuf/siginfo.pb-c.h"
+
+#include "asm/types.h"
+#include "list.h"
+#include "imgset.h"
+#include "file-ids.h"
+#include "kcmp-ids.h"
+#include "compiler.h"
+#include "crtools.h"
+#include "cr_options.h"
+#include "servicefd.h"
+#include "string.h"
+#include "ptrace.h"
+#include "util.h"
+#include "namespaces.h"
+#include "image.h"
+#include "proc_parse.h"
+#include "parasite.h"
+#include "parasite-syscall.h"
+#include "files.h"
+#include "files-reg.h"
+#include "shmem.h"
+#include "sk-inet.h"
+#include "pstree.h"
+#include "mount.h"
+#include "tty.h"
+#include "net.h"
+#include "sk-packet.h"
+#include "cpu.h"
+#include "elf.h"
+#include "cgroup.h"
+#include "file-lock.h"
+#include "page-xfer.h"
+#include "kerndat.h"
+#include "stats.h"
+#include "mem.h"
+#include "page-pipe.h"
+#include "posix-timer.h"
+#include "vdso.h"
+#include "vma.h"
+#include "cr-service.h"
+#include "plugin.h"
+#include "irmap.h"
+#include "sysfs_parse.h"
+#include "action-scripts.h"
+#include "aio.h"
+#include "lsm.h"
+#include "seccomp.h"
+#include "seize.h"
+#include "fault-injection.h"
+
+#include "asm/dump.h"
+
+static char loc_buf[PAGE_SIZE];
+
+/*
+ * Close the file descriptor backing @vma, unless there is none or it
+ * belongs to a socket, a borrowed file or an AIO ring area.
+ */
+static void close_vma_file(struct vma_area *vma)
+{
+	bool keep_open;
+
+	if (vma->vm_file_fd < 0)
+		return;
+
+	keep_open = (vma->e->status & VMA_AREA_SOCKET) ||
+		    vma->file_borrowed ||
+		    vma_area_is(vma, VMA_AREA_AIORING);
+
+	if (!keep_open)
+		close(vma->vm_file_fd);
+}
+
+/*
+ * Release every vma_area on @vma_area_list and reset the list to empty.
+ *
+ * For each area the backing file is closed when appropriate (see
+ * close_vma_file()) and its stat buffer is freed unless the file is
+ * borrowed.
+ */
+void free_mappings(struct vm_area_list *vma_area_list)
+{
+ struct vma_area *vma_area, *p;
+
+ list_for_each_entry_safe(vma_area, p, &vma_area_list->h, list) {
+ close_vma_file(vma_area);
+ if (!vma_area->file_borrowed)
+ free(vma_area->vmst);
+ free(vma_area);
+ }
+
+ INIT_LIST_HEAD(&vma_area_list->h);
+ vma_area_list->nr = 0;
+}
+
+/*
+ * Collect the memory mappings of @pid into @vma_area_list by parsing
+ * its smaps, logging the result. Returns what parse_smaps() returned.
+ */
+int collect_mappings(pid_t pid, struct vm_area_list *vma_area_list)
+{
+	int ret;
+
+	pr_info("\n");
+	pr_info("Collecting mappings (pid: %d)\n", pid);
+	pr_info("----------------------------------------\n");
+
+	ret = parse_smaps(pid, vma_area_list);
+	if (ret >= 0) {
+		pr_info("Collected, longest area occupies %lu pages\n", vma_area_list->longest);
+		pr_info_vma_list(&vma_area_list->h);
+
+		pr_info("----------------------------------------\n");
+	}
+
+	return ret;
+}
+
+/*
+ * Record the scheduling parameters of @pid in @tc: the policy, the
+ * priority (for SCHED_RR and SCHED_FIFO only) and the nice value.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int dump_sched_info(int pid, ThreadCoreEntry *tc)
+{
+ int ret;
+ struct sched_param sp;
+
+ BUILD_BUG_ON(SCHED_OTHER != 0); /* default in proto message */
+
+ ret = sched_getscheduler(pid);
+ if (ret < 0) {
+ pr_perror("Can't get sched policy for %d", pid);
+ return -1;
+ }
+
+ pr_info("%d has %d sched policy\n", pid, ret);
+ tc->has_sched_policy = true;
+ tc->sched_policy = ret;
+
+ if ((ret == SCHED_RR) || (ret == SCHED_FIFO)) {
+ ret = sched_getparam(pid, &sp);
+ if (ret < 0) {
+ pr_perror("Can't get sched param for %d", pid);
+ return -1;
+ }
+
+ pr_info("\tdumping %d prio for %d\n", sp.sched_priority, pid);
+ tc->has_sched_prio = true;
+ tc->sched_prio = sp.sched_priority;
+ }
+
+ /*
+ * The nice is ignored for RT sched policies, but is stored
+ * in kernel. Thus we have to take it with us in the image.
+ */
+
+ /* getpriority() may legitimately return -1, so errors are detected via errno */
+ errno = 0;
+ ret = getpriority(PRIO_PROCESS, pid);
+ if (errno) {
+ pr_perror("Can't get nice for %d", pid);
+ return -1;
+ }
+
+ pr_info("\tdumping %d nice for %d\n", ret, pid);
+ tc->has_sched_nice = true;
+ tc->sched_nice = ret;
+
+ return 0;
+}
+
+struct cr_imgset *glob_imgset;
+
+/*
+ * Collect the numbers of all file descriptors open in @pid by listing
+ * its proc "fd" directory, storing them in @dfds.
+ *
+ * Returns 0 on success, -1 if the directory can't be opened and
+ * -ENOMEM if the task has more than PARASITE_MAX_FDS descriptors.
+ */
+static int collect_fds(pid_t pid, struct parasite_drain_fd *dfds)
+{
+	struct dirent *de;
+	DIR *fd_dir;
+	int n, ret = 0;
+
+	pr_info("\n");
+	pr_info("Collecting fds (pid: %d)\n", pid);
+	pr_info("----------------------------------------\n");
+
+	fd_dir = opendir_proc(pid, "fd");
+	if (!fd_dir)
+		return -1;
+
+	n = 0;
+	while ((de = readdir(fd_dir))) {
+		if (dir_dots(de))
+			continue;
+
+		if (n > PARASITE_MAX_FDS - 1) {
+			/* Too many descriptors: don't leak the directory handle */
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		dfds->fds[n++] = atoi(de->d_name);
+	}
+
+	dfds->nr_fds = n;
+	pr_info("Found %d file descriptors\n", n);
+	pr_info("----------------------------------------\n");
+
+out:
+	closedir(fd_dir);
+
+	return ret;
+}
+
+/*
+ * Initialise @p for a descriptor that is not part of the task's fd
+ * table: reset it to defaults, then fill in the stat data and the
+ * mount id of @fd. Returns 0 on success, -1 on failure.
+ */
+static int fill_fd_params_special(int fd, struct fd_parms *p)
+{
+	*p = FD_PARMS_INIT;
+
+	if (fstat(fd, &p->stat) < 0) {
+		pr_perror("Can't fstat exe link");
+		return -1;
+	}
+
+	return get_fd_mntid(fd, &p->mnt_id) ? -1 : 0;
+}
+
+/*
+ * Dump the file behind the "exe" link of @pid and store its id in
+ * mm->exe_file_id. The file itself is written out only when
+ * fd_id_generate_special() asks for it.
+ *
+ * Returns 0 on success, a non-zero value on failure.
+ */
+static int dump_task_exe_link(pid_t pid, MmEntry *mm)
+{
+	struct fd_parms params;
+	int fd, ret = -1;
+
+	fd = open_proc_path(pid, "exe");
+	if (fd < 0)
+		return -1;
+
+	/* From here on every exit path has to close fd */
+	if (fill_fd_params_special(fd, &params))
+		goto out;
+
+	ret = 0;
+	if (fd_id_generate_special(&params, &mm->exe_file_id))
+		ret = dump_one_reg_file(fd, mm->exe_file_id, &params);
+
+out:
+	close(fd);
+	return ret;
+}
+
+/*
+ * Dump the filesystem state of @pid: its umask (taken from @misc) and
+ * the files behind its "cwd" and "root" proc links. The resulting
+ * FsEntry is written to the CR_FD_FS image of @imgset.
+ *
+ * Returns the result of the image write on success, a negative value
+ * on failure.
+ */
+static int dump_task_fs(pid_t pid, struct parasite_dump_misc *misc, struct cr_imgset *imgset)
+{
+	struct fd_parms p;
+	FsEntry fe = FS_ENTRY__INIT;
+	int fd, ret;
+
+	fe.has_umask = true;
+	fe.umask = misc->umask;
+
+	fd = open_proc_path(pid, "cwd");
+	if (fd < 0)
+		return -1;
+
+	ret = -1;
+	if (fill_fd_params_special(fd, &p))
+		goto err_close;
+
+	if (fd_id_generate_special(&p, &fe.cwd_id)) {
+		ret = dump_one_reg_file(fd, fe.cwd_id, &p);
+		if (ret < 0)
+			goto err_close;
+	}
+
+	close(fd);
+
+	fd = open_proc_path(pid, "root");
+	if (fd < 0)
+		return -1;
+
+	ret = -1;
+	if (fill_fd_params_special(fd, &p))
+		goto err_close;
+
+	if (fd_id_generate_special(&p, &fe.root_id)) {
+		ret = dump_one_reg_file(fd, fe.root_id, &p);
+		if (ret < 0)
+			goto err_close;
+	}
+
+	close(fd);
+
+	pr_info("Dumping task cwd id %#x root id %#x\n",
+			fe.cwd_id, fe.root_id);
+
+	return pb_write_one(img_from_set(imgset, CR_FD_FS), &fe, PB_FS);
+
+err_close:
+	/* Common failure exit: the currently open proc link must not leak */
+	close(fd);
+	return ret;
+}
+
+/*
+ * Encode an rlimit value for the image: RLIM_INFINITY is mapped to -1,
+ * any other value is kept as is.
+ *
+ * Note that both arms of the conditional are converted to a common
+ * type before the result is widened to u_int64_t, so the -1 arm
+ * becomes the all-ones value of that common type.
+ */
+static inline u_int64_t encode_rlim(unsigned long val)
+{
+ return val == RLIM_INFINITY ? -1 : val;
+}
+
+/*
+ * Read every resource limit of @pid with prlimit() and store the
+ * encoded soft/hard values in the pre-allocated entries of @rls.
+ * Returns 0 on success, -1 on failure.
+ */
+static int dump_task_rlimits(int pid, TaskRlimitsEntry *rls)
+{
+	int res = 0;
+
+	while (res < rls->n_rlimits) {
+		struct rlimit lim;
+
+		if (prlimit(pid, res, NULL, &lim) != 0) {
+			pr_perror("Can't get rlimit %d", res);
+			return -1;
+		}
+
+		rls->rlimits[res]->cur = encode_rlim(lim.rlim_cur);
+		rls->rlimits[res]->max = encode_rlim(lim.rlim_max);
+		res++;
+	}
+
+	return 0;
+}
+
+/*
+ * Fill @tc with the loginuid (when the kernel supports it) and the
+ * oom_score_adj of @pid.
+ *
+ * A loginuid read failure is fatal and is returned to the caller; an
+ * oom_score_adj read failure just leaves the field unset.
+ */
+static int dump_pid_misc(pid_t pid, TaskCoreEntry *tc)
+{
+ int ret;
+
+ if (kdat.has_loginuid) {
+ pr_info("dumping /proc/%d/loginuid\n", pid);
+
+ tc->has_loginuid = true;
+ tc->loginuid = parse_pid_loginuid(pid, &ret, false);
+ /* NOTE(review): the value is translated before ret is checked */
+ tc->loginuid = userns_uid(tc->loginuid);
+ /*
+ * loginuid dumping is critical: if it is not correctly
+ * restored, you may lose the ability to log in via SSH to the CT
+ */
+ if (ret < 0)
+ return ret;
+ } else {
+ tc->has_loginuid = false;
+ }
+
+ pr_info("dumping /proc/%d/oom_score_adj\n", pid);
+
+ tc->oom_score_adj = parse_pid_oom_score_adj(pid, &ret);
+ /*
+ * oom_score_adj dumping is not very critical: it only affects
+ * victim selection in an OOM situation, and the dumping error
+ * can be found in the log
+ */
+ if (ret < 0)
+ tc->has_oom_score_adj = false;
+ else
+ tc->has_oom_score_adj = true;
+
+ return 0;
+}
+
+/*
+ * Dump the file backing a file-mapped VMA and store the resulting file
+ * id in the VMA entry's shmid field.
+ *
+ * Returns 0 on success, a non-zero value if dumping the file fails.
+ */
+static int dump_filemap(pid_t pid, struct vma_area *vma_area,
+		const struct cr_imgset *imgset)
+{
+	struct fd_parms p = FD_PARMS_INIT;
+	/*
+	 * Lives at function scope on purpose: p.link may point at it and
+	 * is used by the calls below, after the if block has ended.
+	 */
+	struct fd_link aufs_link;
+	VmaEntry *vma = vma_area->e;
+	int ret = 0;
+	u32 id;
+
+	BUG_ON(!vma_area->vmst);
+	p.stat = *vma_area->vmst;
+	p.mnt_id = vma_area->mnt_id;
+
+	/*
+	 * AUFS support to compensate for the kernel bug
+	 * exposing branch pathnames in map_files.
+	 *
+	 * If the link found in vma_get_mapfile() pointed
+	 * inside a branch, we should use the pathname
+	 * from root that was saved in vma_area->aufs_rpath.
+	 */
+	if (vma_area->aufs_rpath) {
+		strlcpy(aufs_link.name, vma_area->aufs_rpath,
+				sizeof(aufs_link.name));
+		aufs_link.len = strlen(aufs_link.name);
+		p.link = &aufs_link;
+	}
+
+	/* Flags will be set during restore in get_filemap_fd() */
+
+	if (fd_id_generate_special(&p, &id))
+		ret = dump_one_reg_file(vma_area->vm_file_fd, id, &p);
+
+	vma->shmid = id;
+	return ret;
+}
+
+/*
+ * A SysV IPC shmem mapping can only be dumped when the IPC namespace
+ * is part of the dump. Returns 0 if so, -1 otherwise.
+ */
+static int check_sysvipc_map_dump(pid_t pid, VmaEntry *vma)
+{
+	if (!(root_ns_mask & CLONE_NEWIPC)) {
+		pr_err("Task %d with SysVIPC shmem map @%"PRIx64" doesn't live in IPC ns\n",
+				pid, vma->start);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Read the auxiliary vector of @pid from its proc "auxv" file into
+ * mm->mm_saved_auxv, setting mm->n_mm_saved_auxv to the number of
+ * entries actually read.
+ *
+ * The destination array is expected to be allocated by the caller.
+ * Returns 0 on success, -1 on failure.
+ */
+static int get_task_auxv(pid_t pid, MmEntry *mm)
+{
+ auxv_t mm_saved_auxv[AT_VECTOR_SIZE];
+ int fd, i, ret;
+
+ pr_info("Obtaining task auvx ...\n");
+
+ fd = open_proc(pid, "auxv");
+ if (fd < 0)
+ return -1;
+
+ ret = read(fd, mm_saved_auxv, sizeof(mm_saved_auxv));
+ if (ret < 0) {
+ ret = -1;
+ pr_perror("Error reading %d's auxv", pid);
+ goto err;
+ } else {
+ /* Widen each native-size entry to the 64-bit image format */
+ mm->n_mm_saved_auxv = ret / sizeof(auxv_t);
+ for (i = 0; i < mm->n_mm_saved_auxv; i++)
+ mm->mm_saved_auxv[i] = (u64)mm_saved_auxv[i];
+ }
+
+ ret = 0;
+err:
+ close_safe(&fd);
+ return ret;
+}
+
+/*
+ * Dump the memory descriptor of @pid into the CR_FD_MM image.
+ *
+ * Every VMA from @vma_area_list is added to the MmEntry (dumping the
+ * backing object where the VMA type requires it), followed by the
+ * code/data/stack/brk/arg/env boundaries from @stat, the brk and
+ * dumpable flag from @misc, the saved auxv and the exe link.
+ *
+ * Returns 0 on success, a non-zero value on failure.
+ */
+static int dump_task_mm(pid_t pid, const struct proc_pid_stat *stat,
+		const struct parasite_dump_misc *misc,
+		const struct vm_area_list *vma_area_list,
+		const struct cr_imgset *imgset)
+{
+	MmEntry mme = MM_ENTRY__INIT;
+	struct vma_area *vma_area;
+	int ret = -1, i = 0;
+
+	pr_info("\n");
+	pr_info("Dumping mm (pid: %d)\n", pid);
+	pr_info("----------------------------------------\n");
+
+	mme.n_vmas = vma_area_list->nr;
+	mme.vmas = xmalloc(mme.n_vmas * sizeof(VmaEntry *));
+	if (!mme.vmas)
+		goto err;
+
+	list_for_each_entry(vma_area, &vma_area_list->h, list) {
+		VmaEntry *vma = vma_area->e;
+
+		pr_info_vma(vma_area);
+
+		if (!vma_entry_is(vma, VMA_AREA_REGULAR))
+			ret = 0;
+		else if (vma_entry_is(vma, VMA_AREA_SYSVIPC))
+			ret = check_sysvipc_map_dump(pid, vma);
+		else if (vma_entry_is(vma, VMA_ANON_SHARED))
+			ret = add_shmem_area(pid, vma);
+		else if (vma_entry_is(vma, VMA_FILE_PRIVATE) ||
+				vma_entry_is(vma, VMA_FILE_SHARED))
+			ret = dump_filemap(pid, vma_area, imgset);
+		else if (vma_entry_is(vma, VMA_AREA_SOCKET))
+			ret = dump_socket_map(vma_area);
+		else
+			ret = 0;
+		if (ret)
+			goto err;
+
+		mme.vmas[i++] = vma;
+
+		if (vma_entry_is(vma, VMA_AREA_AIORING)) {
+			ret = dump_aio_ring(&mme, vma_area);
+			if (ret)
+				goto err;
+		}
+	}
+
+	mme.mm_start_code = stat->start_code;
+	mme.mm_end_code = stat->end_code;
+	mme.mm_start_data = stat->start_data;
+	mme.mm_end_data = stat->end_data;
+	mme.mm_start_stack = stat->start_stack;
+	mme.mm_start_brk = stat->start_brk;
+
+	mme.mm_arg_start = stat->arg_start;
+	mme.mm_arg_end = stat->arg_end;
+	mme.mm_env_start = stat->env_start;
+	mme.mm_env_end = stat->env_end;
+
+	mme.mm_brk = misc->brk;
+
+	mme.dumpable = misc->dumpable;
+	mme.has_dumpable = true;
+
+	/*
+	 * The VMA loop leaves ret at 0. Reset it so that the failures
+	 * below are not reported to the caller as success.
+	 */
+	ret = -1;
+
+	mme.n_mm_saved_auxv = AT_VECTOR_SIZE;
+	mme.mm_saved_auxv = xmalloc(pb_repeated_size(&mme, mm_saved_auxv));
+	if (!mme.mm_saved_auxv)
+		goto err;
+
+	if (get_task_auxv(pid, &mme))
+		goto err;
+
+	if (dump_task_exe_link(pid, &mme))
+		goto err;
+
+	ret = pb_write_one(img_from_set(imgset, CR_FD_MM), &mme, PB_MM);
+	free_aios(&mme);
+err:
+	/*
+	 * Both arrays are owned by this function. This relies on
+	 * MM_ENTRY__INIT leaving the pointers NULL until assigned.
+	 * NOTE(review): AIO entries added before a failure are not
+	 * released on the error paths.
+	 */
+	xfree(mme.mm_saved_auxv);
+	xfree(mme.vmas);
+	return ret;
+}
+
+/*
+ * Fetch the futex robust list head and length of thread @pid and store
+ * them in @info.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int get_task_futex_robust_list(pid_t pid, ThreadCoreEntry *info)
+{
+	struct robust_list_head *head = NULL;
+	size_t len = 0;
+	int ret;
+
+	/*
+	 * syscall() reports failure by returning -1 and setting errno,
+	 * it never returns a negated error code.
+	 */
+	ret = syscall(SYS_get_robust_list, pid, &head, &len);
+	if (ret < 0 && errno == ENOSYS) {
+		/*
+		 * If the kernel says get_robust_list is not implemented, then
+		 * check whether set_robust_list is also not implemented, in
+		 * that case we can assume it is empty, since set_robust_list
+		 * is the only way to populate it. This case is possible when
+		 * "futex_cmpxchg_enabled" is unset in the kernel.
+		 *
+		 * The following system call should always fail, even if it is
+		 * implemented, in which case it fails with EINVAL because
+		 * len should be greater than zero.
+		 */
+		if (syscall(SYS_set_robust_list, NULL, 0) < 0 && errno == ENOSYS) {
+			head = NULL;
+			len = 0;
+		} else {
+			goto err;
+		}
+	} else if (ret) {
+		goto err;
+	}
+
+	info->futex_rla = encode_pointer(head);
+	info->futex_rla_len = (u32)len;
+
+	return 0;
+
+err:
+	pr_err("Failed obtaining futex robust list on %d\n", pid);
+	return -1;
+}
+
+/*
+ * Read the personality of @pid from its proc "personality" file into
+ * @personality.
+ *
+ * Returns a negative value on failure. On success the return value is
+ * the number of bytes read, not 0, so callers must test for "< 0".
+ *
+ * NOTE(review): atoi() parses the text as decimal -- confirm this
+ * matches the format the kernel uses for this file.
+ */
+static int get_task_personality(pid_t pid, u32 *personality)
+{
+ int fd, ret = -1;
+
+ pr_info("Obtaining personality ... \n");
+
+ fd = open_proc(pid, "personality");
+ if (fd < 0)
+ goto err;
+
+ /* One byte is kept in reserve for the terminator added below */
+ ret = read(fd, loc_buf, sizeof(loc_buf) - 1);
+ close(fd);
+
+ if (ret >= 0) {
+ loc_buf[ret] = '\0';
+ *personality = atoi(loc_buf);
+ }
+err:
+ return ret;
+}
+
+static DECLARE_KCMP_TREE(vm_tree, KCMP_VM);
+static DECLARE_KCMP_TREE(fs_tree, KCMP_FS);
+static DECLARE_KCMP_TREE(files_tree, KCMP_FILES);
+static DECLARE_KCMP_TREE(sighand_tree, KCMP_SIGHAND);
+
+/*
+ * Generate the VM, FS, FILES and SIGHAND ids of the task behind @item
+ * and store them in item->ids.
+ *
+ * An id of 0 is a failure. So is an id that kid_generate_gen() does
+ * not flag as new through its last argument, except for FILES when
+ * shared_fdtable() holds for the item.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int dump_task_kobj_ids(struct pstree_item *item)
+{
+	int new;
+	struct kid_elem elem;
+	int pid = item->pid.real;
+	TaskKobjIdsEntry *ids = item->ids;
+
+	elem.pid = pid;
+	elem.idx = 0; /* really 0 for all */
+	elem.genid = 0; /* FIXME optimize */
+
+	new = 0;
+	ids->vm_id = kid_generate_gen(&vm_tree, &elem, &new);
+	if (!ids->vm_id || !new) {
+		pr_err("Can't make VM id for %d\n", pid);
+		return -1;
+	}
+
+	new = 0;
+	ids->fs_id = kid_generate_gen(&fs_tree, &elem, &new);
+	if (!ids->fs_id || !new) {
+		pr_err("Can't make FS id for %d\n", pid);
+		return -1;
+	}
+
+	new = 0;
+	ids->files_id = kid_generate_gen(&files_tree, &elem, &new);
+	if (!ids->files_id || (!new && !shared_fdtable(item))) {
+		pr_err("Can't make FILES id for %d\n", pid);
+		return -1;
+	}
+
+	new = 0;
+	ids->sighand_id = kid_generate_gen(&sighand_tree, &elem, &new);
+	if (!ids->sighand_id || !new) {
+		/* Name the object that actually failed */
+		pr_err("Can't make SIGHAND id for %d\n", pid);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Allocate and fill item->ids. Dead tasks only get an initialised,
+ * empty entry; for the rest the kernel-object and namespace ids are
+ * collected. On failure item->ids is freed and reset to NULL.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int get_task_ids(struct pstree_item *item)
+{
+	item->ids = xmalloc(sizeof(*item->ids));
+	if (!item->ids)
+		return -1;
+
+	task_kobj_ids_entry__init(item->ids);
+
+	if (item->state == TASK_DEAD)
+		return 0;
+
+	if (dump_task_kobj_ids(item) || dump_task_ns_ids(item)) {
+		xfree(item->ids);
+		item->ids = NULL;
+		return -1;
+	}
+
+	return 0;
+}
+
+static int dump_task_ids(struct pstree_item *item, const struct cr_imgset *cr_imgset)
+{
+ return pb_write_one(img_from_set(cr_imgset, CR_FD_IDS), item->ids, PB_IDS);
+}
+
+/*
+ * Fill the per-thread part of @core for thread @pid.
+ *
+ * Collects the LSM profile, the futex robust list and the scheduling
+ * info, stopping at the first failure. If all of them succeed, the
+ * data obtained from the parasite (@ti) is copied in as well: TLS,
+ * clear_tid address, signal alternate stack and, if set, the parent
+ * death signal.
+ *
+ * Returns 0 on success, a non-zero value on failure.
+ */
+int dump_thread_core(int pid, CoreEntry *core, const struct parasite_dump_thread *ti)
+{
+ int ret;
+ ThreadCoreEntry *tc = core->thread_core;
+
+ ret = collect_lsm_profile(pid, tc->creds);
+ if (!ret)
+ ret = get_task_futex_robust_list(pid, tc);
+ if (!ret)
+ ret = dump_sched_info(pid, tc);
+ if (!ret) {
+ core_put_tls(core, ti->tls);
+ CORE_THREAD_ARCH_INFO(core)->clear_tid_addr = encode_pointer(ti->tid_addr);
+ BUG_ON(!tc->sas);
+ copy_sas(tc->sas, &ti->sas);
+ if (ti->pdeath_sig) {
+ tc->has_pdeath_sig = true;
+ tc->pdeath_sig = ti->pdeath_sig;
+ }
+ }
+
+ return ret;
+}
+
+/*
+ * Fill and write the core image of a task's thread leader.
+ *
+ * Gathers personality, seccomp state, comm, flags and task state, then
+ * the leader's thread data via the parasite, misc per-pid data,
+ * rlimits and the cgroup set, and finally writes the entry to the
+ * CR_FD_CORE image of @cr_imgset.
+ *
+ * Returns 0 on success, a non-zero value on failure.
+ */
+static int dump_task_core_all(struct parasite_ctl *ctl,
+ struct pstree_item *item,
+ const struct proc_pid_stat *stat,
+ const struct cr_imgset *cr_imgset)
+{
+ struct cr_img *img;
+ CoreEntry *core = item->core[0];
+ pid_t pid = item->pid.real;
+ int ret = -1;
+ struct proc_status_creds *creds;
+
+ pr_info("\n");
+ pr_info("Dumping core (pid: %d)\n", pid);
+ pr_info("----------------------------------------\n");
+
+ /* Only a negative value is a failure here */
+ ret = get_task_personality(pid, &core->tc->personality);
+ if (ret < 0)
+ goto err;
+
+ creds = dmpi(item)->pi_creds;
+ if (creds->seccomp_mode != SECCOMP_MODE_DISABLED) {
+ pr_info("got seccomp mode %d for %d\n", creds->seccomp_mode, item->pid.virt);
+ core->tc->has_seccomp_mode = true;
+ core->tc->seccomp_mode = creds->seccomp_mode;
+
+ if (creds->seccomp_mode == SECCOMP_MODE_FILTER) {
+ core->tc->has_seccomp_filter = true;
+ core->tc->seccomp_filter = creds->last_filter;
+ }
+ }
+
+ strlcpy((char *)core->tc->comm, stat->comm, TASK_COMM_LEN);
+ core->tc->flags = stat->flags;
+ core->tc->task_state = item->state;
+ core->tc->exit_code = 0;
+
+ ret = parasite_dump_thread_leader_seized(ctl, pid, core);
+ if (ret)
+ goto err;
+
+ ret = dump_pid_misc(pid, core->tc);
+ if (ret)
+ goto err;
+
+ ret = dump_task_rlimits(pid, core->tc->rlimits);
+ if (ret)
+ goto err;
+
+ core->tc->has_cg_set = true;
+ ret = dump_task_cgroup(item, &core->tc->cg_set);
+ if (ret)
+ goto err;
+
+ img = img_from_set(cr_imgset, CR_FD_CORE);
+ ret = pb_write_one(img, core, PB_CORE);
+ if (ret < 0)
+ goto err;
+
+err:
+ pr_info("----------------------------------------\n");
+
+ return ret;
+}
+
+/*
+ * Collect namespace ids for pre-dump: first for the current process
+ * itself (through a temporary on-stack pstree item), then for every
+ * task in the tree that is not dead.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int collect_pstree_ids_predump(void)
+{
+ struct pstree_item *item;
+ /* NOTE(review): presumably a dmp_info has to follow the item in memory -- confirm */
+ struct {
+ struct pstree_item i;
+ struct dmp_info d;
+ } crt = { };
+
+ /*
+ * This thing is normally done inside
+ * write_img_inventory().
+ */
+
+ crt.i.state = TASK_ALIVE;
+ crt.i.pid.real = getpid();
+
+ if (predump_task_ns_ids(&crt.i))
+ return -1;
+
+ for_each_pstree_item(item) {
+ if (item->state == TASK_DEAD)
+ continue;
+
+ if (predump_task_ns_ids(item))
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Collect the ids of every item in the process tree.
+ * Returns 0 on success, -1 as soon as one item fails.
+ */
+int collect_pstree_ids(void)
+{
+	struct pstree_item *item;
+
+	for_each_pstree_item(item) {
+		if (get_task_ids(item) != 0)
+			return -1;
+	}
+
+	return 0;
+}
+
+static int collect_file_locks(void)
+{
+ return parse_file_locks();
+}
+
+/*
+ * Dump thread number @id of @item: collect its state through the
+ * parasite and write it to a per-thread core image named after the
+ * thread's virtual pid.
+ *
+ * Returns 0 on success, a non-zero value on failure.
+ */
+static int dump_task_thread(struct parasite_ctl *parasite_ctl,
+ const struct pstree_item *item, int id)
+{
+ struct pid *tid = &item->threads[id];
+ CoreEntry *core = item->core[id];
+ pid_t pid = tid->real;
+ int ret = -1;
+ struct cr_img *img;
+
+ pr_info("\n");
+ pr_info("Dumping core for thread (pid: %d)\n", pid);
+ pr_info("----------------------------------------\n");
+
+ ret = parasite_dump_thread_seized(parasite_ctl, id, tid, core);
+ if (ret) {
+ pr_err("Can't dump thread for pid %d\n", pid);
+ goto err;
+ }
+
+ /* NOTE(review): ret is 0 here, so a failed open_image() is returned as success */
+ img = open_image(CR_FD_CORE, O_DUMP, tid->virt);
+ if (!img)
+ goto err;
+
+ ret = pb_write_one(img, core, PB_CORE);
+
+ close_image(img);
+err:
+ pr_info("----------------------------------------\n");
+ return ret;
+}
+
+/*
+ * Write the core image of a dead task: only its comm, the TASK_DEAD
+ * state and the exit code from @pps are stored.
+ *
+ * Returns 0 on success, a non-zero value on failure.
+ */
+static int dump_one_zombie(const struct pstree_item *item,
+ const struct proc_pid_stat *pps)
+{
+ CoreEntry *core;
+ int ret = -1;
+ struct cr_img *img;
+
+ core = core_entry_alloc(0, 1);
+ if (!core)
+ return -1;
+
+ strlcpy((char *)core->tc->comm, pps->comm, TASK_COMM_LEN);
+ core->tc->task_state = TASK_DEAD;
+ core->tc->exit_code = pps->exit_code;
+
+ img = open_image(CR_FD_CORE, O_DUMP, item->pid.virt);
+ if (!img)
+ goto err;
+
+ ret = pb_write_one(img, core, PB_CORE);
+ close_image(img);
+err:
+ core_entry_free(core);
+ return ret;
+}
+
+#define SI_BATCH 32
+
+/*
+ * Collect the pending signals of @tid into a newly allocated queue
+ * stored in *@sqe. @group selects the shared (process-wide) queue
+ * instead of the thread-private one.
+ *
+ * Signals are fetched with PTRACE_PEEKSIGINFO in batches of SI_BATCH
+ * until the kernel reports that none are left. A kernel without
+ * PTRACE_PEEKSIGINFO (EIO) is not treated as an error.
+ *
+ * Returns 0 on success, a negative value on failure. *@sqe is set to
+ * the queue in both cases, unless the queue itself can't be allocated.
+ */
+static int dump_signal_queue(pid_t tid, SignalQueueEntry **sqe, bool group)
+{
+	struct ptrace_peeksiginfo_args arg;
+	int ret;
+	SignalQueueEntry *queue = NULL;
+
+	pr_debug("Dump %s signals of %d\n", group ? "shared" : "private", tid);
+
+	arg.nr = SI_BATCH;
+	arg.flags = 0;
+	if (group)
+		arg.flags |= PTRACE_PEEKSIGINFO_SHARED;
+	arg.off = 0;
+
+	queue = xmalloc(sizeof(*queue));
+	if (!queue)
+		return -1;
+
+	signal_queue_entry__init(queue);
+
+	while (1) {
+		int nr, si_pos;
+		siginfo_t *si;
+		SiginfoEntry **signals;
+
+		si = xmalloc(SI_BATCH * sizeof(*si));
+		if (!si) {
+			ret = -1;
+			break;
+		}
+
+		nr = ret = ptrace(PTRACE_PEEKSIGINFO, tid, &arg, si);
+		if (ret == 0) {
+			/* Finished; nothing references this batch */
+			xfree(si);
+			break;
+		}
+
+		if (ret < 0) {
+			if (errno == EIO) {
+				pr_warn("ptrace doesn't support PTRACE_PEEKSIGINFO\n");
+				ret = 0;
+			} else
+				pr_perror("ptrace");
+
+			/* Freed only after errno has been consumed */
+			xfree(si);
+			break;
+		}
+
+		/* Grow through a temporary so the old array survives a failure */
+		signals = xrealloc(queue->signals,
+				sizeof(*queue->signals) * (queue->n_signals + nr));
+		if (!signals) {
+			xfree(si);
+			ret = -1;
+			break;
+		}
+		queue->signals = signals;
+
+		for (si_pos = 0; si_pos < nr; si_pos++) {
+			SiginfoEntry *se;
+
+			se = xmalloc(sizeof(*se));
+			if (!se) {
+				ret = -1;
+				break;
+			}
+
+			siginfo_entry__init(se);
+			se->siginfo.len = sizeof(siginfo_t);
+			se->siginfo.data = (void *)&si[si_pos]; /* XXX we don't free cores, but when
+								 * we will, this would cause problems
+								 */
+			/* Count an entry only once its slot is filled */
+			queue->signals[queue->n_signals++] = se;
+		}
+
+		if (ret < 0)
+			break;
+
+		arg.off += nr;
+	}
+
+	*sqe = queue;
+	return ret;
+}
+
+/*
+ * Dump the pending signals of a task: the private queue of each of its
+ * threads goes into that thread's core entry, the shared queue into
+ * the core entry of the leader (index 0).
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int dump_task_signals(pid_t pid, struct pstree_item *item)
+{
+ int i, ret;
+
+ /* Dump private signals for each thread */
+ for (i = 0; i < item->nr_threads; i++) {
+ ret = dump_signal_queue(item->threads[i].real, &item->core[i]->thread_core->signals_p, false);
+ if (ret) {
+ pr_err("Can't dump private signals for thread %d\n", item->threads[i].real);
+ return -1;
+ }
+ }
+
+ /* Dump shared signals */
+ ret = dump_signal_queue(pid, &item->core[0]->tc->signals_s, true);
+ if (ret) {
+ pr_err("Can't dump shared signals (pid: %d)\n", pid);
+ return -1;
+ }
+
+ return 0;
+}
+
+static struct proc_pid_stat pps_buf;
+
+/*
+ * Dump every thread of @item except the leader, which only gets its
+ * virtual pid copied over from the item.
+ * Returns 0 on success, -1 on failure.
+ */
+static int dump_task_threads(struct parasite_ctl *parasite_ctl,
+		const struct pstree_item *item)
+{
+	int idx;
+
+	for (idx = 0; idx < item->nr_threads; idx++) {
+		struct pid *tid = &item->threads[idx];
+
+		if (tid->real == item->pid.real) {
+			/* Leader is already dumped */
+			tid->virt = item->pid.virt;
+			continue;
+		}
+
+		if (dump_task_thread(parasite_ctl, item, idx))
+			return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Read the pids of the children of @item from the proc of the item's
+ * pid namespace and hand the ones which do not belong to any known
+ * (alive) child over to the children whose virtual pid is still unset.
+ *
+ * This does *not* find out which real pid corresponds to which virtual
+ * one, but that is not required -- everything we need to dump for a
+ * zombie can be found in the same ns proc.
+ */
+
+static int fill_zombies_pids(struct pstree_item *item)
+{
+	struct pstree_item *kid;
+	pid_t *pids;
+	int nr_pids, pos;
+
+	/*
+	 * The pids read here are virtual -- the caller has set up
+	 * the proc of the target pid namespace.
+	 */
+	if (parse_children(item->pid.virt, &pids, &nr_pids) < 0)
+		return -1;
+
+	/*
+	 * Step 1 -- filter out the pids of alive tasks
+	 */
+	list_for_each_entry(kid, &item->children, sibling) {
+		if (kid->pid.virt < 0)
+			continue;
+
+		pos = 0;
+		while (pos < nr_pids && pids[pos] != kid->pid.virt)
+			pos++;
+		if (pos < nr_pids)
+			pids[pos] = -1;
+	}
+
+	/*
+	 * Step 2 -- assign the remaining pids to the children's
+	 * items in arbitrary order. The caller will then re-read
+	 * everything needed to dump zombies using the newly
+	 * obtained virtual pids.
+	 */
+	pos = 0;
+	list_for_each_entry(kid, &item->children, sibling) {
+		if (kid->pid.virt > 0)
+			continue;
+
+		/* Skip the slots which are consumed already */
+		while (pos < nr_pids && pids[pos] < 0)
+			pos++;
+
+		if (pos < nr_pids) {
+			kid->pid.virt = pids[pos];
+			pids[pos] = -1;
+		}
+		BUG_ON(pos == nr_pids);
+	}
+
+	xfree(pids);
+
+	return 0;
+}
+
+/*
+ * Dump all the dead tasks of the tree.
+ *
+ * Zombies are dumped separately because in the pid-ns case their pids
+ * have to be resolved w/o parasite, via the proc of the target
+ * namespace.
+ *
+ * Returns 0 on success and -1 on error.
+ */
+static int dump_zombies(void)
+{
+	struct pstree_item *item;
+	int pidns = root_ns_mask & CLONE_NEWPID;
+	int ret = -1;
+
+	if (pidns && set_proc_fd(get_service_fd(CR_PROC_FD_OFF)))
+		return -1;
+
+	for_each_pstree_item(item) {
+		if (item->state != TASK_DEAD)
+			continue;
+
+		if (item->pid.virt < 0) {
+			if (pidns) {
+				if (item == root_item) {
+					pr_err("A root task is dead\n");
+					goto err;
+				}
+				if (fill_zombies_pids(item->parent))
+					goto err;
+			} else
+				item->pid.virt = item->pid.real;
+		}
+
+		pr_info("Obtaining zombie stat ... \n");
+		if (parse_pid_stat(item->pid.virt, &pps_buf) < 0)
+			goto err;
+
+		item->sid = pps_buf.sid;
+		item->pgid = pps_buf.pgid;
+
+		BUG_ON(!list_empty(&item->children));
+		if (dump_one_zombie(item, &pps_buf) < 0)
+			goto err;
+	}
+
+	ret = 0;
+err:
+	if (pidns)
+		close_proc();
+
+	return ret;
+}
+
+/*
+ * Pre-dump one task: collect its mappings, infect it with the parasite
+ * and drain its memory into the page pipe of the parasite control block.
+ *
+ * On success the control block is queued on @ctls so that the pages can
+ * be written out later. Stopped tasks (with a warning) and dead tasks
+ * are skipped and reported as success.
+ *
+ * Returns 0 on success, non-zero on error.
+ */
+static int pre_dump_one_task(struct pstree_item *item, struct list_head *ctls)
+{
+	pid_t pid = item->pid.real;
+	struct vm_area_list vmas;
+	struct parasite_ctl *parasite_ctl;
+	int ret = -1;
+	struct parasite_dump_misc misc;
+
+	INIT_LIST_HEAD(&vmas.h);
+	vmas.nr = 0;
+
+	pr_info("========================================\n");
+	pr_info("Pre-dumping task (pid: %d)\n", pid);
+	pr_info("========================================\n");
+
+	if (item->state == TASK_STOPPED) {
+		pr_warn("Stopped tasks are not supported\n");
+		return 0;
+	}
+
+	if (item->state == TASK_DEAD)
+		return 0;
+
+	ret = collect_mappings(pid, &vmas);
+	if (ret) {
+		pr_err("Collect mappings (pid: %d) failed with %d\n", pid, ret);
+		goto err;
+	}
+
+	ret = -1;
+	parasite_ctl = parasite_infect_seized(pid, item, &vmas);
+	if (!parasite_ctl) {
+		pr_err("Can't infect (pid: %d) with parasite\n", pid);
+		goto err_free;
+	}
+
+	ret = parasite_fixup_vdso(parasite_ctl, pid, &vmas);
+	if (ret) {
+		pr_err("Can't fixup vdso VMAs (pid: %d)\n", pid);
+		goto err_cure;
+	}
+
+	ret = parasite_dump_misc_seized(parasite_ctl, &misc);
+	if (ret) {
+		pr_err("Can't dump misc (pid: %d)\n", pid);
+		goto err_cure;
+	}
+
+	ret = predump_task_files(pid);
+	if (ret) {
+		pr_err("Pre-dumping files failed (pid: %d)\n", pid);
+		goto err_cure;
+	}
+
+	parasite_ctl->pid.virt = item->pid.virt = misc.pid;
+
+	/* The pages stay in the pipe referenced by parasite_ctl->mem_pp */
+	ret = parasite_dump_pages_seized(parasite_ctl, &vmas, &parasite_ctl->mem_pp);
+	if (ret)
+		goto err_cure;
+
+	/* A failure to cure the remote side is only reported, not fatal */
+	if (parasite_cure_remote(parasite_ctl))
+		pr_err("Can't cure (pid: %d) from parasite\n", pid);
+	list_add_tail(&parasite_ctl->pre_list, ctls);
+err_free:
+	free_mappings(&vmas);
+err:
+	return ret;
+
+err_cure:
+	if (parasite_cure_seized(parasite_ctl))
+		pr_err("Can't cure (pid: %d) from parasite\n", pid);
+	goto err_free;
+}
+
+/*
+ * Dump one live task: its mappings, files, signals, timers, core and
+ * threads. Dead tasks are skipped here, they are handled by
+ * dump_zombies().
+ *
+ * Error handling is split into several labels:
+ *  - err:             nothing but local resources to release;
+ *  - err_cure_imgset: the parasite is in, the task image set is not
+ *                     opened yet;
+ *  - err_cure:        both the parasite and the image set are live;
+ *  - err_imgset:      the image set is opened, but the parasite must
+ *                     not be cured (again).
+ *
+ * Returns 0 on success and -1 on error.
+ */
+static int dump_one_task(struct pstree_item *item)
+{
+	pid_t pid = item->pid.real;
+	struct vm_area_list vmas;
+	struct parasite_ctl *parasite_ctl;
+	int ret, exit_code = -1;
+	struct parasite_dump_misc misc;
+	struct cr_imgset *cr_imgset = NULL;
+	struct parasite_drain_fd *dfds = NULL;
+	struct proc_posix_timers_stat proc_args;
+
+	INIT_LIST_HEAD(&vmas.h);
+	vmas.nr = 0;
+
+	pr_info("========================================\n");
+	pr_info("Dumping task (pid: %d)\n", pid);
+	pr_info("========================================\n");
+
+	if (item->state == TASK_DEAD)
+		/*
+		 * zombies are dumped separately in dump_zombies()
+		 */
+		return 0;
+
+	pr_info("Obtaining task stat ... \n");
+	ret = parse_pid_stat(pid, &pps_buf);
+	if (ret < 0)
+		goto err;
+
+	ret = collect_mappings(pid, &vmas);
+	if (ret) {
+		pr_err("Collect mappings (pid: %d) failed with %d\n", pid, ret);
+		goto err;
+	}
+
+	if (!shared_fdtable(item)) {
+		dfds = xmalloc(sizeof(*dfds));
+		if (!dfds)
+			goto err;
+
+		ret = collect_fds(pid, dfds);
+		if (ret) {
+			pr_err("Collect fds (pid: %d) failed with %d\n", pid, ret);
+			goto err;
+		}
+
+		parasite_ensure_args_size(drain_fds_size(dfds));
+	}
+
+	ret = parse_posix_timers(pid, &proc_args);
+	if (ret < 0) {
+		pr_err("Can't read posix timers file (pid: %d)\n", pid);
+		goto err;
+	}
+
+	parasite_ensure_args_size(posix_timers_dump_size(proc_args.timer_n));
+
+	ret = dump_task_signals(pid, item);
+	if (ret) {
+		pr_err("Dump %d signals failed %d\n", pid, ret);
+		goto err;
+	}
+
+	parasite_ctl = parasite_infect_seized(pid, item, &vmas);
+	if (!parasite_ctl) {
+		pr_err("Can't infect (pid: %d) with parasite\n", pid);
+		goto err;
+	}
+
+	if (fault_injected(FI_DUMP_EARLY)) {
+		pr_info("fault: CRIU sudden detach\n");
+		BUG();
+	}
+
+	if (root_ns_mask & CLONE_NEWPID && root_item == item) {
+		int pfd;
+
+		pfd = parasite_get_proc_fd_seized(parasite_ctl);
+		if (pfd < 0) {
+			pr_err("Can't get proc fd (pid: %d)\n", pid);
+			goto err_cure_imgset;
+		}
+
+		/*
+		 * The original descriptor is not needed once the service
+		 * fd is installed, and must not leak if that fails.
+		 */
+		ret = install_service_fd(CR_PROC_FD_OFF, pfd);
+		close(pfd);
+		if (ret < 0)
+			goto err_cure_imgset;
+	}
+
+	ret = parasite_fixup_vdso(parasite_ctl, pid, &vmas);
+	if (ret) {
+		pr_err("Can't fixup vdso VMAs (pid: %d)\n", pid);
+		goto err_cure_imgset;
+	}
+
+	ret = parasite_check_aios(parasite_ctl, &vmas); /* FIXME -- merge with above */
+	if (ret) {
+		pr_err("Failed to check aio rings (pid: %d)\n", pid);
+		goto err_cure_imgset;
+	}
+
+	ret = parasite_dump_misc_seized(parasite_ctl, &misc);
+	if (ret) {
+		pr_err("Can't dump misc (pid: %d)\n", pid);
+		goto err_cure_imgset;
+	}
+
+	parasite_ctl->pid.virt = item->pid.virt = misc.pid;
+	item->sid = misc.sid;
+	item->pgid = misc.pgid;
+
+	pr_info("sid=%d pgid=%d pid=%d\n",
+		item->sid, item->pgid, item->pid.virt);
+
+	if (item->sid == 0) {
+		pr_err("A session leader of %d(%d) is outside of its pid namespace\n",
+			item->pid.real, item->pid.virt);
+		goto err_cure;
+	}
+
+	cr_imgset = cr_task_imgset_open(item->pid.virt, O_DUMP);
+	if (!cr_imgset)
+		goto err_cure;
+
+	ret = dump_task_ids(item, cr_imgset);
+	if (ret) {
+		pr_err("Dump ids (pid: %d) failed with %d\n", pid, ret);
+		goto err_cure;
+	}
+
+	if (dfds) {
+		ret = dump_task_files_seized(parasite_ctl, item, dfds);
+		if (ret) {
+			pr_err("Dump files (pid: %d) failed with %d\n", pid, ret);
+			goto err_cure;
+		}
+	}
+
+	ret = parasite_dump_pages_seized(parasite_ctl, &vmas, NULL);
+	if (ret)
+		goto err_cure;
+
+	ret = parasite_dump_sigacts_seized(parasite_ctl, cr_imgset);
+	if (ret) {
+		pr_err("Can't dump sigactions (pid: %d) with parasite\n", pid);
+		goto err_cure;
+	}
+
+	ret = parasite_dump_itimers_seized(parasite_ctl, item);
+	if (ret) {
+		pr_err("Can't dump itimers (pid: %d)\n", pid);
+		goto err_cure;
+	}
+
+	ret = parasite_dump_posix_timers_seized(&proc_args, parasite_ctl, item);
+	if (ret) {
+		pr_err("Can't dump posix timers (pid: %d)\n", pid);
+		goto err_cure;
+	}
+
+	ret = dump_task_core_all(parasite_ctl, item, &pps_buf, cr_imgset);
+	if (ret) {
+		pr_err("Dump core (pid: %d) failed with %d\n", pid, ret);
+		goto err_cure;
+	}
+
+	/*
+	 * From here on a failure doesn't try to cure the parasite
+	 * (again), but the image set still has to be released.
+	 */
+	ret = parasite_stop_daemon(parasite_ctl);
+	if (ret) {
+		pr_err("Can't cure (pid: %d) from parasite\n", pid);
+		goto err_imgset;
+	}
+
+	ret = dump_task_threads(parasite_ctl, item);
+	if (ret) {
+		pr_err("Can't dump threads\n");
+		goto err_imgset;
+	}
+
+	ret = parasite_cure_seized(parasite_ctl);
+	if (ret) {
+		pr_err("Can't cure (pid: %d) from parasite\n", pid);
+		goto err_imgset;
+	}
+
+	ret = dump_task_mm(pid, &pps_buf, &misc, &vmas, cr_imgset);
+	if (ret) {
+		pr_err("Dump mappings (pid: %d) failed with %d\n", pid, ret);
+		goto err_imgset;
+	}
+
+	ret = dump_task_fs(pid, &misc, cr_imgset);
+	if (ret) {
+		pr_err("Dump fs (pid: %d) failed with %d\n", pid, ret);
+		goto err_imgset;
+	}
+
+	close_cr_imgset(&cr_imgset);
+	exit_code = 0;
+err:
+	close_pid_proc();
+	free_mappings(&vmas);
+	xfree(dfds);
+	return exit_code;
+
+err_imgset:
+	close_cr_imgset(&cr_imgset);
+	goto err;
+
+err_cure:
+	close_cr_imgset(&cr_imgset);
+err_cure_imgset:
+	parasite_cure_seized(parasite_ctl);
+	goto err;
+}
+
+typedef void (*sa_handler_t)(int);
+
+/*
+ * Install @handler as the SIGALRM handler. SIGALRM itself is added to
+ * the mask which is in effect while the handler runs.
+ *
+ * Returns 0 on success and -1 if sigaction() fails.
+ */
+static int setup_alarm_handler(sa_handler_t handler)
+{
+	struct sigaction sa = { .sa_flags = 0 };
+
+	sa.sa_handler = handler;
+	sigemptyset(&sa.sa_mask);
+	sigaddset(&sa.sa_mask, SIGALRM);
+
+	if (sigaction(SIGALRM, &sa, NULL) == 0)
+		return 0;
+
+	pr_perror("Unable to setup SIGALRM handler");
+	return -1;
+}
+
+/*
+ * Finish the pre-dump: let the tasks go, write out the memory collected
+ * in the page pipes of the parasite control blocks queued on @ctls and
+ * flush the images.
+ *
+ * @ret is the status of the preceding pre-dump steps. It is only ever
+ * turned into a failure here: successfully writing the pages which were
+ * collected before an error must not hide that error.
+ *
+ * Returns 0 on success, non-zero on error.
+ */
+static int cr_pre_dump_finish(struct list_head *ctls, int ret)
+{
+	struct parasite_ctl *ctl, *n;
+
+	pstree_switch_state(root_item,
+			ret ? TASK_ALIVE : opts.final_state);
+	free_pstree(root_item);
+
+	timing_stop(TIME_FROZEN);
+
+	pr_info("Pre-dumping tasks' memory\n");
+	list_for_each_entry_safe(ctl, n, ctls, pre_list) {
+		struct page_xfer xfer;
+		int err;
+
+		pr_info("\tPre-dumping %d\n", ctl->pid.virt);
+		timing_start(TIME_MEMWRITE);
+		err = open_page_xfer(&xfer, CR_FD_PAGEMAP, ctl->pid.virt);
+		if (err < 0) {
+			ret = err;
+			break;
+		}
+
+		err = page_xfer_dump_pages(&xfer, ctl->mem_pp, 0);
+
+		xfer.close(&xfer);
+
+		if (err) {
+			ret = err;
+			break;
+		}
+
+		timing_stop(TIME_MEMWRITE);
+
+		destroy_page_pipe(ctl->mem_pp);
+		list_del(&ctl->pre_list);
+		parasite_cure_local(ctl);
+	}
+
+	if (irmap_predump_run())
+		ret = -1;
+
+	if (disconnect_from_page_server())
+		ret = -1;
+
+	if (bfd_flush_images())
+		ret = -1;
+
+	if (ret)
+		pr_err("Pre-dumping FAILED.\n");
+	else {
+		write_stats(DUMP_STATS);
+		pr_info("Pre-dumping finished successfully\n");
+	}
+	return ret;
+}
+
+/*
+ * SIGALRM handler for pre-dump: report the timeout, run the pre-dump
+ * finalization as failed with no parasite control blocks to flush and
+ * terminate the process.
+ */
+void pre_dump_alarm_handler(int signum)
+{
+	const int timed_out = -1;
+	LIST_HEAD(no_ctls);
+
+	pr_err("Timeout reached\n");
+	cr_pre_dump_finish(&no_ctls, timed_out);
+	exit(timed_out);
+}
+
+/*
+ * Entry point of the pre-dump action for the process tree rooted at
+ * @pid. Memory tracking is forced on and the tasks are never killed
+ * afterwards. Whatever happens, the work is finalized by
+ * cr_pre_dump_finish(), whose result is returned.
+ */
+int cr_pre_dump_tasks(pid_t pid)
+{
+	LIST_HEAD(ctls);
+	struct pstree_item *item;
+	int ret = -1;
+
+	if (!opts.track_mem) {
+		pr_info("Enforcing memory tracking for pre-dump.\n");
+		opts.track_mem = true;
+	}
+
+	if (opts.final_state == TASK_DEAD) {
+		pr_info("Enforcing tasks run after pre-dump.\n");
+		opts.final_state = TASK_ALIVE;
+	}
+
+	/* Global preparations, performed in order up to the first failure */
+	if (init_stats(DUMP_STATS) ||
+	    cr_plugin_init(CR_PLUGIN_STAGE__PRE_DUMP) ||
+	    kerndat_init() ||
+	    irmap_load_cache() ||
+	    cpu_init() ||
+	    vdso_init() ||
+	    connect_to_page_server() ||
+	    setup_alarm_handler(pre_dump_alarm_handler))
+		goto err;
+
+	/* Collect the tree and what is shared between its tasks */
+	if (collect_pstree(pid) ||
+	    collect_pstree_ids_predump() ||
+	    collect_namespaces(false) < 0)
+		goto err;
+
+	for_each_pstree_item(item)
+		if (pre_dump_one_task(item, &ctls))
+			goto err;
+
+	if (irmap_predump_prep() == 0)
+		ret = 0;
+err:
+	return cr_pre_dump_finish(&ctls, ret);
+}
+
+/*
+ * Finish the dump: flush and close the images, run the post-dump
+ * scripts, put the tasks into their final state and release everything
+ * collected during the dump.
+ *
+ * Returns the exit status of the post-dump script if it is non-zero,
+ * otherwise 1 if the dump failed and 0 if it succeeded.
+ */
+static int cr_dump_finish(int ret)
+{
+	int post_dump_ret = 0;
+	int rollback;
+
+	if (disconnect_from_page_server())
+		ret = -1;
+
+	close_cr_imgset(&glob_imgset);
+
+	if (bfd_flush_images())
+		ret = -1;
+
+	cr_plugin_fini(CR_PLUGIN_STAGE__DUMP, ret);
+
+	if (ret == 0) {
+		/*
+		 * This might be a migration, where we're asked to dump
+		 * everything, then some script transfers the images to
+		 * a new node and we're supposed to kill the dumpee as
+		 * it continues running somewhere else.
+		 *
+		 * Thus ask the user via the script whether the
+		 * checkpoint is to be broken.
+		 */
+		post_dump_ret = run_scripts(ACT_POST_DUMP);
+		if (post_dump_ret != 0) {
+			post_dump_ret = WEXITSTATUS(post_dump_ret);
+			pr_info("Post dump script passed with %d\n", post_dump_ret);
+		}
+	}
+
+	/*
+	 * The dump is complete at this stage. What to do next depends
+	 * on the scenario:
+	 *
+	 *  - an error happened during the checkpoint: just clean up
+	 *    everything and continue execution of the dumpee;
+	 *
+	 *  - the dump succeeded but the post-dump script returned some
+	 *    code: same as above -- clean up and continue execution.
+	 *    The script's code is returned to the criu caller, and it
+	 *    is up to the caller what to do with the running instance
+	 *    of the dumpee -- kill it or let it run;
+	 *
+	 *  - the dump succeeded but the -R option was passed, i.e.
+	 *    we're asked to continue execution of the dumpee. It is
+	 *    assumed that the user keeps the FS and other resources
+	 *    consistent with a post-dump script, so we simply start
+	 *    the rollback procedure and clean up everything.
+	 */
+	rollback = ret || post_dump_ret;
+	if (rollback || opts.final_state == TASK_ALIVE) {
+		network_unlock();
+		delete_link_remaps();
+	}
+	pstree_switch_state(root_item,
+			    rollback ? TASK_ALIVE : opts.final_state);
+	timing_stop(TIME_FROZEN);
+	free_pstree(root_item);
+	free_file_locks();
+	free_link_remaps();
+	free_aufs_branches();
+	free_userns_maps();
+
+	close_service_fd(CR_PROC_FD_OFF);
+
+	if (ret)
+		pr_err("Dumping FAILED.\n");
+	else {
+		write_stats(DUMP_STATS);
+		pr_info("Dumping finished successfully\n");
+	}
+
+	if (post_dump_ret)
+		return post_dump_ret;
+	return ret != 0;
+}
+
+/*
+ * SIGALRM handler for dump: report the timeout, finalize the dump as
+ * failed and terminate the process.
+ */
+void dump_alarm_handler(int signum)
+{
+	const int timed_out = -1;
+
+	pr_err("Timeout reached\n");
+	cr_dump_finish(timed_out);
+	exit(timed_out);
+}
+
+/*
+ * Entry point of the dump action for the process tree rooted at @pid.
+ * The steps run strictly in order and stop at the first failure;
+ * whatever happens, the work is finalized by cr_dump_finish(), whose
+ * result is returned.
+ */
+int cr_dump_tasks(pid_t pid)
+{
+	InventoryEntry he = INVENTORY_ENTRY__INIT;
+	struct pstree_item *item;
+	int pre_dump_ret = 0;
+	int ret = -1;
+
+	pr_info("========================================\n");
+	pr_info("Dumping processes (pid: %d)\n", pid);
+	pr_info("========================================\n");
+
+	pre_dump_ret = run_scripts(ACT_PRE_DUMP);
+	if (pre_dump_ret != 0) {
+		pr_err("Pre dump script failed with %d!\n", pre_dump_ret);
+		goto err;
+	}
+
+	/* Global preparations */
+	if (init_stats(DUMP_STATS) ||
+	    cr_plugin_init(CR_PLUGIN_STAGE__DUMP) ||
+	    kerndat_init() ||
+	    irmap_load_cache() ||
+	    cpu_init() ||
+	    vdso_init() ||
+	    parse_cg_info() ||
+	    prepare_inventory(&he))
+		goto err;
+
+	if ((opts.cpu_cap & (CPU_CAP_CPU | CPU_CAP_INS)) && cpu_dump_cpuinfo())
+		goto err;
+
+	if (connect_to_page_server() ||
+	    setup_alarm_handler(dump_alarm_handler))
+		goto err;
+
+	/*
+	 * The collect_pstree will also stop (PTRACE_SEIZE) the tasks
+	 * thus ensuring that they don't modify anything we collect
+	 * afterwards.
+	 */
+	if (collect_pstree(pid) ||
+	    collect_pstree_ids() ||
+	    network_lock() ||
+	    collect_file_locks() ||
+	    collect_namespaces(true) < 0)
+		goto err;
+
+	glob_imgset = cr_glob_imgset_open(O_DUMP);
+	if (!glob_imgset)
+		goto err;
+
+	if (collect_seccomp_filters() < 0)
+		goto err;
+
+	for_each_pstree_item(item) {
+		if (dump_one_task(item))
+			goto err;
+	}
+
+	/* MNT namespaces are dumped after files to save remapped links */
+	if (dump_mnt_namespaces() < 0 ||
+	    dump_file_locks() ||
+	    dump_verify_tty_sids() ||
+	    dump_zombies() ||
+	    dump_pstree(root_item))
+		goto err;
+
+	if (root_ns_mask && dump_namespaces(root_item, root_ns_mask) < 0)
+		goto err;
+
+	/* The status of the remaining steps is what gets reported */
+	ret = dump_cgroups();
+	if (!ret)
+		ret = cr_dump_shmem();
+	if (!ret)
+		ret = fix_external_unix_sockets();
+	if (!ret)
+		ret = tty_verify_active_pairs();
+	if (!ret)
+		ret = write_img_inventory(&he);
+err:
+	return cr_dump_finish(ret);
+}
diff --git a/criu/cr-errno.c b/criu/cr-errno.c
new file mode 100644
index 000000000000..b62bb545a174
--- /dev/null
+++ b/criu/cr-errno.c
@@ -0,0 +1,12 @@
+static int cr_errno;
+
+int get_cr_errno(void)
+{
+ return cr_errno;
+}
+
+void set_cr_errno(int new_err)
+{
+ if (!cr_errno)
+ cr_errno = new_err;
+}
diff --git a/criu/cr-exec.c b/criu/cr-exec.c
new file mode 100644
index 000000000000..8beb80f88914
--- /dev/null
+++ b/criu/cr-exec.c
@@ -0,0 +1,170 @@
+#include <unistd.h>
+#include <string.h>
+#include <stdlib.h>
+#include "crtools.h"
+#include "ptrace.h"
+#include "parasite-syscall.h"
+#include "vma.h"
+#include "log.h"
+
+struct syscall_exec_desc { /* one syscall which can be executed remotely */
+ char *name; /* name find_syscall() matches the request against */
+ unsigned nr; /* number handed over to syscall_seized() */
+};
+
+static struct syscall_exec_desc sc_exec_table[] = { /* filled from the SYSCALL() lines of the included table */
+#define SYSCALL(__name, __nr) { .name = #__name, .nr = __nr, },
+#include "sys-exec-tbl.c"
+#undef SYSCALL
+ { }, /* terminator: its NULL .name stops the scan in find_syscall() */
+};
+
+/*
+ * Look @name up in sc_exec_table. Returns the matching descriptor or
+ * NULL if there is no syscall with such a name.
+ */
+static struct syscall_exec_desc *find_syscall(char *name)
+{
+	struct syscall_exec_desc *scd;
+
+	/* The table ends with an entry whose name is NULL */
+	for (scd = sc_exec_table; scd->name; scd++) {
+		if (strcmp(scd->name, name) == 0)
+			return scd;
+	}
+
+	return NULL;
+}
+
+#define MAX_ARGS 6
+
+/*
+ * Execute syscall @scd in the task controlled by @ctl.
+ *
+ * @opt is a NULL-terminated list of textual arguments, at most MAX_ARGS
+ * of them are used:
+ *   &foo    -- the string "foo" is put into the memory shared with
+ *              the task and its remote address is passed;
+ *   @<size> -- <size> bytes of that memory are reserved, their remote
+ *              address is passed and their contents are printed once
+ *              the syscall returns;
+ *   other   -- a number in any base strtoul() understands. Negative
+ *              values and values above LONG_MAX (e.g. addresses) are
+ *              both accepted.
+ *
+ * Returns 0 on success, non-zero on error.
+ */
+static int execute_syscall(struct parasite_ctl *ctl,
+		struct syscall_exec_desc *scd, char **opt)
+{
+	int i, err;
+	unsigned long args[MAX_ARGS] = {}, ret, r_mem_size = 0;
+	unsigned int ret_args[MAX_ARGS] = {};
+	void *r_mem = NULL;
+
+	for (i = 0; i < MAX_ARGS; i++) {
+		if (opt[i] == NULL)
+			break;
+
+		/*
+		 * &foo -- argument string "foo"
+		 * @<size> -- ret-arg of size <size>
+		 */
+
+		if ((opt[i][0] == '&') || (opt[i][0] == '@')) {
+			int len;
+
+			/* The exchange area is mapped on the first use only */
+			if (!r_mem) {
+				err = parasite_map_exchange(ctl, PAGE_SIZE);
+				if (err)
+					return err;
+
+				r_mem_size = PAGE_SIZE;
+				r_mem = ctl->local_map;
+			}
+
+			if (opt[i][0] == '&') {
+				/*
+				 * The length of the option including the
+				 * '&' equals the size of the string plus
+				 * its terminating NUL.
+				 */
+				len = strlen(opt[i]);
+				if (r_mem_size < len) {
+					pr_err("Arg size overflow\n");
+					return -1;
+				}
+
+				memcpy(r_mem, opt[i] + 1, len);
+			} else {
+				len = strtol(opt[i] + 1, NULL, 0);
+				if (len <= 0 || r_mem_size < (unsigned long)len) {
+					pr_err("Bad argument size %d\n", len);
+					return -1;
+				}
+
+				ret_args[i] = len;
+			}
+
+			args[i] = (unsigned long)ctl->remote_map + (r_mem - ctl->local_map);
+			pr_info("Pushing %c mem arg [%s]\n", opt[i][0], (char *)r_mem);
+			r_mem_size -= len;
+			r_mem += len;
+		} else
+			/* strtoul() keeps the full unsigned long range */
+			args[i] = strtoul(opt[i], NULL, 0);
+	}
+
+	pr_info("Calling %u with %lu %lu %lu %lu %lu %lu\n", scd->nr,
+		args[0], args[1], args[2], args[3], args[4], args[5]);
+
+	err = syscall_seized(ctl, scd->nr, &ret,
+			args[0], args[1], args[2], args[3], args[4], args[5]);
+	if (err)
+		return err;
+
+	pr_msg("Syscall returned %lx(%d)\n", ret, (int)ret);
+	for (i = 0; i < MAX_ARGS; i++) {
+		unsigned long addr;
+
+		if (!ret_args[i])
+			continue;
+
+		pr_msg("Argument %d returns:\n", i);
+		/* Translate the remote address back into our mapping */
+		addr = (unsigned long)ctl->local_map + (args[i] - (unsigned long)ctl->remote_map);
+		print_data(0, (unsigned char *)addr, ret_args[i]);
+	}
+
+	return 0;
+}
+
+/*
+ * Execute a syscall in the context of task @pid.
+ *
+ * @opt[0] is the name of the syscall, the rest of @opt are its
+ * arguments as understood by execute_syscall(). The task is seized for
+ * the duration of the call and put back into its previous state
+ * afterwards.
+ *
+ * Returns 0 on success, non-zero on error.
+ */
+int cr_exec(int pid, char **opt)
+{
+	char *sys_name = opt[0];
+	struct syscall_exec_desc *si;
+	struct parasite_ctl *ctl;
+	struct vm_area_list vmas;
+	int ret = -1, prev_state;
+	struct proc_status_creds *creds;
+
+	if (!sys_name) {
+		pr_err("Syscall name required\n");
+		goto out;
+	}
+
+	si = find_syscall(sys_name);
+	if (!si) {
+		pr_err("Unknown syscall [%s]\n", sys_name);
+		goto out;
+	}
+
+	if (seize_catch_task(pid))
+		goto out;
+
+	prev_state = ret = seize_wait_task(pid, -1, &creds);
+	if (ret < 0) {
+		pr_err("Can't seize task %d\n", pid);
+		goto out;
+	}
+
+	/*
+	 * We don't seize a task's threads here, and there is no reason to
+	 * compare threads' creds in this use case anyway, so let's just free
+	 * the creds.
+	 */
+	free(creds);
+
+	ret = collect_mappings(pid, &vmas);
+	if (ret) {
+		pr_err("Can't collect vmas for %d\n", pid);
+		goto out_unseize;
+	}
+
+	ctl = parasite_prep_ctl(pid, &vmas);
+	if (!ctl) {
+		pr_err("Can't prep ctl %d\n", pid);
+		/* ret holds 0 from collect_mappings(), don't report success */
+		ret = -1;
+		goto out_unseize;
+	}
+
+	ret = execute_syscall(ctl, si, opt + 1);
+	if (ret < 0)
+		pr_err("Can't execute syscall remotely\n");
+
+	parasite_cure_seized(ctl);
+out_unseize:
+	unseize_task(pid, prev_state, prev_state);
+out:
+	return ret;
+}
diff --git a/criu/cr-restore.c b/criu/cr-restore.c
new file mode 100644
index 000000000000..0985b8675f64
--- /dev/null
+++ b/criu/cr-restore.c
@@ -0,0 +1,3364 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <limits.h>
+#include <unistd.h>
+#include <errno.h>
+#include <dirent.h>
+#include <string.h>
+
+#include <fcntl.h>
+#include <grp.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <sys/vfs.h>
+#include <sys/wait.h>
+#include <sys/file.h>
+#include <sys/shm.h>
+#include <sys/mount.h>
+#include <sys/prctl.h>
+
+#include <sched.h>
+
+#include <sys/sendfile.h>
+
+#include "ptrace.h"
+#include "compiler.h"
+#include "asm/types.h"
+#include "asm/restorer.h"
+
+#include "cr_options.h"
+#include "servicefd.h"
+#include "image.h"
+#include "util.h"
+#include "util-pie.h"
+#include "log.h"
+#include "restorer.h"
+#include "sockets.h"
+#include "sk-packet.h"
+#include "lock.h"
+#include "files.h"
+#include "files-reg.h"
+#include "pipes.h"
+#include "fifo.h"
+#include "sk-inet.h"
+#include "eventfd.h"
+#include "eventpoll.h"
+#include "signalfd.h"
+#include "proc_parse.h"
+#include "restorer-blob.h"
+#include "crtools.h"
+#include "namespaces.h"
+#include "mem.h"
+#include "mount.h"
+#include "fsnotify.h"
+#include "pstree.h"
+#include "net.h"
+#include "tty.h"
+#include "cpu.h"
+#include "file-lock.h"
+#include "page-read.h"
+#include "vdso.h"
+#include "stats.h"
+#include "tun.h"
+#include "vma.h"
+#include "kerndat.h"
+#include "rst-malloc.h"
+#include "plugin.h"
+#include "cgroup.h"
+#include "timerfd.h"
+#include "file-lock.h"
+#include "action-scripts.h"
+#include "aio.h"
+#include "lsm.h"
+#include "seccomp.h"
+#include "bitmap.h"
+#include "fault-injection.h"
+#include "parasite-syscall.h"
+
+#include "protobuf.h"
+#include "protobuf/sa.pb-c.h"
+#include "protobuf/timer.pb-c.h"
+#include "protobuf/vma.pb-c.h"
+#include "protobuf/rlimit.pb-c.h"
+#include "protobuf/pagemap.pb-c.h"
+#include "protobuf/siginfo.pb-c.h"
+
+#include "asm/restore.h"
+#include "asm/atomic.h"
+#include "asm/bitops.h"
+
+#include "cr-errno.h"
+
+#include "pie/pie-relocs.h"
+
+#ifndef arch_export_restore_thread
+#define arch_export_restore_thread __export_restore_thread
+#endif
+
+#ifndef arch_export_restore_task
+#define arch_export_restore_task __export_restore_task
+#endif
+
+#ifndef arch_export_unmap
+#define arch_export_unmap __export_unmap
+#endif
+
+static struct pstree_item *current;
+
+static int restore_task_with_children(void *);
+static int sigreturn_restore(pid_t pid, CoreEntry *core);
+static int prepare_restorer_blob(void);
+static int prepare_rlimits(int pid, CoreEntry *core);
+static int prepare_posix_timers(int pid, CoreEntry *core);
+static int prepare_signals(int pid, CoreEntry *core);
+
+static int root_as_sibling;
+static unsigned long helpers_pos = 0;
+static int n_helpers = 0;
+static unsigned long zombies_pos = 0;
+static int n_zombies = 0;
+
+/*
+ * Run the preparations listed below one after another, stopping at the
+ * first one which fails. Returns 0 if all of them succeed, -1 otherwise.
+ */
+static int crtools_prepare_shared(void)
+{
+	int failed;
+
+	failed = prepare_shared_fdinfo() ||
+		 /* We might want to remove ghost files on failed restore */
+		 collect_remaps_and_regfiles() ||
+		 /*
+		  * dead pid remap needs to allocate task helpers which
+		  * all tasks need to see
+		  */
+		 prepare_procfs_remaps() ||
+		 /* Connections are unlocked from criu */
+		 collect_inet_sockets() ||
+		 tty_prep_fds() ||
+		 prepare_cgroup();
+
+	return failed ? -1 : 0;
+}
+
+/*
+ * Images which root_prepare_shared() passes to collect_image() one by
+ * one, in the order they are listed here.
+ *
+ * Collect order information:
+ * - reg_file should be before remap, as the latter needs
+ * to find file_desc objects
+ * - per-pid collects (mm and fd) should be after remap and
+ * reg_file since both per-pid ones need to get fdesc-s
+ * and bump counters on remaps if they exist
+ */
+
+static struct collect_image_info *cinfos[] = {
+ &nsfile_cinfo,
+ &pipe_cinfo,
+ &fifo_cinfo,
+ &unix_sk_cinfo,
+ &packet_sk_cinfo,
+ &netlink_sk_cinfo,
+ &eventfd_cinfo,
+ &epoll_tfd_cinfo,
+ &epoll_cinfo,
+ &signalfd_cinfo,
+ &inotify_cinfo,
+ &inotify_mark_cinfo,
+ &fanotify_cinfo,
+ &fanotify_mark_cinfo,
+ &tty_info_cinfo,
+ &tty_cinfo,
+ &tunfile_cinfo,
+ &ext_file_cinfo,
+ &timerfd_cinfo,
+ &file_locks_cinfo,
+};
+
+/*
+ * Prepare the information about the resources shared between the tasks
+ * to be restored: collect the images, set up per-task mm/fd/fs info and
+ * resolve the relations between the collected objects.
+ *
+ * Returns 0 on success, non-zero on error.
+ */
+static int root_prepare_shared(void)
+{
+	struct pstree_item *pi;
+	int ret = 0, idx;
+
+	pr_info("Preparing info about shared resources\n");
+
+	if (prepare_shared_tty() ||
+	    prepare_shared_reg_files() ||
+	    prepare_remaps() ||
+	    prepare_seccomp_filters())
+		return -1;
+
+	for (idx = 0; idx < ARRAY_SIZE(cinfos); idx++)
+		if (collect_image(cinfos[idx]))
+			return -1;
+
+	if (collect_pipes() ||
+	    collect_fifo() ||
+	    collect_unix_sockets() ||
+	    tty_verify_active_pairs())
+		return -1;
+
+	/* Per-task preparations, helpers have nothing to prepare */
+	for_each_pstree_item(pi) {
+		if (pi->state == TASK_HELPER)
+			continue;
+
+		ret = prepare_mm_pid(pi);
+		if (ret >= 0)
+			ret = prepare_fd_pid(pi);
+		if (ret >= 0)
+			ret = prepare_fs_pid(pi);
+		if (ret < 0)
+			break;
+	}
+
+	if (ret < 0)
+		return ret;
+
+	mark_pipe_master();
+
+	ret = tty_setup_slavery();
+	if (!ret)
+		ret = resolve_unix_peers();
+	if (!ret)
+		ret = prepare_restorer_blob();
+	if (!ret) {
+		show_saved_shmems();
+		show_saved_files();
+	}
+
+	return ret;
+}
+
+/*
+ * Map a private vma, if it is not mapped by a parent yet.
+ *
+ * The area is placed at *tgt_addr: it is either mmap()-ed anew or, when
+ * a private area with the same boundaries and compatible flags is found
+ * in pvma_list, mremap()-ed from the address the parent has premapped
+ * it at. In the latter case the page bitmap of the parent's area is
+ * remembered in vma->ppage_bitmap.
+ *
+ * On success *tgt_addr is advanced by the size of the mapping and *pvma
+ * is set to the entry of pvma_list to start the next lookup from.
+ *
+ * Returns 0 on success and -1 on error.
+ */
+static int map_private_vma(struct vma_area *vma, void **tgt_addr,
+ struct vma_area **pvma, struct list_head *pvma_list)
+{
+ int ret;
+ void *addr, *paddr = NULL;
+ unsigned long nr_pages, size;
+ struct vma_area *p = *pvma;
+
+ if (vma_area_is(vma, VMA_FILE_PRIVATE)) {
+ ret = get_filemap_fd(vma);
+ if (ret < 0) {
+ pr_err("Can't fixup VMA's fd\n");
+ return -1;
+ }
+ vma->e->fd = ret;
+ }
+
+ nr_pages = vma_entry_len(vma->e) / PAGE_SIZE;
+ vma->page_bitmap = xzalloc(BITS_TO_LONGS(nr_pages) * sizeof(long));
+ if (vma->page_bitmap == NULL)
+ return -1;
+
+ list_for_each_entry_from(p, pvma_list, list) {
+ if (p->e->start > vma->e->start)
+ break;
+
+ if (!vma_area_is_private(p, kdat.task_size))
+ continue;
+
+ if (p->e->end != vma->e->end ||
+ p->e->start != vma->e->start)
+ continue;
+
+ /* Check flags, which must be identical for both vma-s */
+ if ((vma->e->flags ^ p->e->flags) & (MAP_GROWSDOWN | MAP_ANONYMOUS))
+ break;
+
+ if (!(vma->e->flags & MAP_ANONYMOUS) &&
+ vma->e->shmid != p->e->shmid)
+ break;
+
+ pr_info("COW 0x%016"PRIx64"-0x%016"PRIx64" 0x%016"PRIx64" vma\n",
+ vma->e->start, vma->e->end, vma->e->pgoff);
+ paddr = decode_pointer(p->premmaped_addr);
+
+ break;
+ }
+
+ /*
+ * A grow-down VMA has a guard page, which protect a VMA below it.
+ * So one more page is mapped here to restore content of the first page
+ */
+ if (vma->e->flags & MAP_GROWSDOWN) {
+ vma->e->start -= PAGE_SIZE;
+ if (paddr)
+ paddr -= PAGE_SIZE;
+ }
+
+ size = vma_entry_len(vma->e);
+ if (paddr == NULL) {
+ /*
+ * The respective memory area was NOT found in the parent.
+ * Map a new one.
+ */
+ pr_info("Map 0x%016"PRIx64"-0x%016"PRIx64" 0x%016"PRIx64" vma\n",
+ vma->e->start, vma->e->end, vma->e->pgoff);
+
+ addr = mmap(*tgt_addr, size,
+ vma->e->prot | PROT_WRITE,
+ vma->e->flags | MAP_FIXED,
+ vma->e->fd, vma->e->pgoff);
+
+ if (addr == MAP_FAILED) {
+ pr_perror("Unable to map ANON_VMA");
+ return -1;
+ }
+
+ *pvma = p;
+ } else {
+ /*
+ * This region was found in parent -- remap it to inherit physical
+ * pages (if any) from it (and COW them later if required).
+ */
+ vma->ppage_bitmap = p->page_bitmap;
+
+ addr = mremap(paddr, size, size,
+ MREMAP_FIXED | MREMAP_MAYMOVE, *tgt_addr);
+ if (addr != *tgt_addr) {
+ pr_perror("Unable to remap a private vma");
+ return -1;
+ }
+
+ *pvma = list_entry(p->list.next, struct vma_area, list);
+ }
+
+ vma->premmaped_addr = (unsigned long) addr;
+ pr_debug("\tpremap 0x%016"PRIx64"-0x%016"PRIx64" -> %016lx\n",
+ vma->e->start, vma->e->end, (unsigned long)addr);
+
+ if (vma->e->flags & MAP_GROWSDOWN) { /* Skip guard page */
+ vma->e->start += PAGE_SIZE;
+ vma->premmaped_addr += PAGE_SIZE;
+ }
+
+ if (vma_area_is(vma, VMA_FILE_PRIVATE))
+ close(vma->e->fd);
+
+ *tgt_addr += size;
+ return 0;
+}
+
+/*
+ * Map all the private vma-s from @vmas one after another starting at
+ * address @at. The list must be sorted by the start address.
+ *
+ * Returns 0 on success, negative value on error.
+ */
+static int premap_priv_vmas(struct vm_area_list *vmas, void *at)
+{
+	struct list_head *parent_vmas;
+	struct vma_area *pvma, *vma;
+	unsigned long pstart = 0;
+	int ret = 0;
+	LIST_HEAD(empty);
+
+	/*
+	 * Keep parent vmas at hands to check whether we can "inherit" them.
+	 * See comments in map_private_vma. A task without a parent gets
+	 * an empty list to look through.
+	 */
+	if (current->parent)
+		parent_vmas = &rsti(current->parent)->vmas.h;
+	else
+		parent_vmas = &empty;
+
+	pvma = list_first_entry(parent_vmas, struct vma_area, list);
+
+	list_for_each_entry(vma, &vmas->h, list) {
+		if (pstart > vma->e->start) {
+			ret = -1;
+			pr_err("VMA-s are not sorted in the image file\n");
+			break;
+		}
+		pstart = vma->e->start;
+
+		if (!vma_area_is_private(vma, kdat.task_size))
+			continue;
+
+		ret = map_private_vma(vma, &at, &pvma, parent_vmas);
+		if (ret < 0)
+			break;
+	}
+
+	return ret;
+}
+
+/*
+ * Fill the premapped private vma-s of the current task with the pages
+ * from its image.
+ *
+ * Pages of a vma inherited from the parent are compared with the
+ * contents already in place and are overwritten only if they differ.
+ * The pages of such a vma which are still marked in the parent's
+ * bitmap once all the pagemaps are processed are dropped with
+ * madvise(MADV_DONTNEED).
+ *
+ * Returns 0 on success, negative value on error.
+ */
+static int restore_priv_vma_content(void)
+{
+	struct vma_area *vma;
+	int ret = 0;
+	struct list_head *vmas = &rsti(current)->vmas.h;
+
+	unsigned int nr_restored = 0;
+	unsigned int nr_shared = 0;
+	unsigned int nr_droped = 0;
+	unsigned int nr_compared = 0;
+	unsigned long va;
+	struct page_read pr;
+
+	vma = list_first_entry(vmas, struct vma_area, list);
+
+	ret = open_page_read(current->pid.virt, &pr, PR_TASK);
+	if (ret <= 0)
+		return -1;
+
+	/*
+	 * Read page contents.
+	 */
+	while (1) {
+		unsigned long off, i, nr_pages;
+		struct iovec iov;
+
+		ret = pr.get_pagemap(&pr, &iov);
+		if (ret <= 0)
+			break;
+
+		va = (unsigned long)iov.iov_base;
+		nr_pages = iov.iov_len / PAGE_SIZE;
+
+		for (i = 0; i < nr_pages; i++) {
+			unsigned char buf[PAGE_SIZE];
+			void *p;
+
+			/*
+			 * The lookup is over *all* possible VMAs
+			 * read from image file.
+			 */
+			while (va >= vma->e->end) {
+				if (vma->list.next == vmas)
+					goto err_addr;
+				vma = list_entry(vma->list.next, struct vma_area, list);
+			}
+
+			/*
+			 * Make sure the page address is inside existing VMA
+			 * and the VMA it refers to still private one, since
+			 * there is no guarantee that the data from pagemap is
+			 * valid.
+			 */
+			if (va < vma->e->start)
+				goto err_addr;
+			else if (unlikely(!vma_area_is_private(vma, kdat.task_size))) {
+				pr_err("Trying to restore page for non-private VMA\n");
+				goto err_addr;
+			}
+
+			off = (va - vma->e->start) / PAGE_SIZE;
+			p = decode_pointer((off) * PAGE_SIZE +
+					vma->premmaped_addr);
+
+			set_bit(off, vma->page_bitmap);
+			if (vma->ppage_bitmap) { /* inherited vma */
+				clear_bit(off, vma->ppage_bitmap);
+
+				ret = pr.read_pages(&pr, va, 1, buf);
+				if (ret < 0)
+					goto err_read;
+
+				va += PAGE_SIZE;
+				nr_compared++;
+
+				if (memcmp(p, buf, PAGE_SIZE) == 0) {
+					nr_shared++; /* the page is cowed */
+					continue;
+				}
+
+				nr_restored++;
+				memcpy(p, buf, PAGE_SIZE);
+			} else {
+				int nr;
+
+				/*
+				 * Try to read as many pages as possible at once.
+				 *
+				 * Within the current pagemap we still have
+				 * nr_pages - i pages (not all, as we might have
+				 * switched VMA above), within the current VMA
+				 * we have at most (vma->end - current_addr) bytes.
+				 */
+
+				nr = min_t(int, nr_pages - i, (vma->e->end - va) / PAGE_SIZE);
+
+				ret = pr.read_pages(&pr, va, nr, p);
+				if (ret < 0)
+					goto err_read;
+
+				va += nr * PAGE_SIZE;
+				nr_restored += nr;
+				i += nr - 1;
+
+				bitmap_set(vma->page_bitmap, off + 1, nr - 1);
+			}
+
+		}
+
+		if (pr.put_pagemap)
+			pr.put_pagemap(&pr);
+	}
+
+err_read:
+	pr.close(&pr);
+	if (ret < 0)
+		return ret;
+
+	/* Remove pages, which were not shared with a child */
+	list_for_each_entry(vma, vmas, list) {
+		unsigned long size, i = 0;
+		void *addr = decode_pointer(vma->premmaped_addr);
+
+		if (vma->ppage_bitmap == NULL)
+			continue;
+
+		size = vma_entry_len(vma->e) / PAGE_SIZE;
+		while (1) {
+			/* Find all pages, which are not shared with this child */
+			i = find_next_bit(vma->ppage_bitmap, size, i);
+
+			if ( i >= size)
+				break;
+
+			ret = madvise(addr + PAGE_SIZE * i,
+						PAGE_SIZE, MADV_DONTNEED);
+			if (ret < 0) {
+				pr_perror("madvise failed");
+				return -1;
+			}
+			i++;
+			nr_droped++;
+		}
+	}
+
+	cnt_add(CNT_PAGES_COMPARED, nr_compared);
+	cnt_add(CNT_PAGES_SKIPPED_COW, nr_shared);
+	cnt_add(CNT_PAGES_RESTORED, nr_restored);
+
+	pr_info("nr_restored_pages: %u\n", nr_restored);
+	pr_info("nr_shared_pages: %u\n", nr_shared);
+	pr_info("nr_droped_pages: %u\n", nr_droped);
+
+	return 0;
+
+err_addr:
+	pr_err("Page entry address %lx outside of VMA %lx-%lx\n",
+	       va, (long)vma->e->start, (long)vma->e->end);
+	/* The reader is still opened on this path, don't leak it */
+	pr.close(&pr);
+	return -1;
+}
+
+/*
+ * Reserve one PROT_NONE region big enough for all private VMAs of the
+ * current task, premap the VMAs into it and fill them with content.
+ * A previously recorded premapped region is unmapped afterwards.
+ *
+ * Returns 0 right away when the task has no VMAs, a negative value on
+ * failure, otherwise the result of the last step performed.
+ */
+static int prepare_mappings(void)
+{
+	struct vm_area_list *vma_list = &rsti(current)->vmas;
+	unsigned long prev_len;
+	void *prev_addr;
+	void *area;
+	int err;
+
+	/* Zombie */
+	if (!vma_list->nr)
+		return 0;
+
+	/* Reserve a place for mapping private vma-s one by one */
+	area = mmap(NULL, vma_list->priv_size, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
+	if (area == MAP_FAILED) {
+		pr_perror("Unable to reserve memory (%lu bytes)", vma_list->priv_size);
+		return -1;
+	}
+
+	/* Remember the old region before recording the new one */
+	prev_addr = rsti(current)->premmapped_addr;
+	prev_len = rsti(current)->premmapped_len;
+	rsti(current)->premmapped_addr = area;
+	rsti(current)->premmapped_len = vma_list->priv_size;
+
+	err = premap_priv_vmas(vma_list, area);
+	if (err < 0)
+		return err;
+
+	err = restore_priv_vma_content();
+	if (err < 0)
+		return err;
+
+	if (!prev_addr)
+		return err;
+
+	err = munmap(prev_addr, prev_len);
+	if (err < 0)
+		pr_perror("Unable to unmap %p(%lx)",
+			  prev_addr, prev_len);
+
+	return err;
+}
+
+/*
+ * A guard page must be unmapped after restoring content and
+ * forking children to restore COW memory.
+ *
+ * For every private MAP_GROWSDOWN VMA of the current task the page
+ * right below its premapped address is unmapped.
+ * Returns 0 on success, -1 if munmap() fails.
+ */
+static int unmap_guard_pages()
+{
+	struct list_head *head = &rsti(current)->vmas.h;
+	struct vma_area *area;
+
+	list_for_each_entry(area, head, list) {
+		void *start;
+
+		if (!vma_area_is_private(area, kdat.task_size))
+			continue;
+
+		if (!(area->e->flags & MAP_GROWSDOWN))
+			continue;
+
+		start = decode_pointer(area->premmaped_addr);
+		if (munmap(start - PAGE_SIZE, PAGE_SIZE) != 0) {
+			pr_perror("Can't unmap guard page");
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Walk the regular VMAs of the current task and, for those backed by
+ * SysV IPC, anonymous shared memory, a shared file or a socket, obtain
+ * the descriptor to map from and store it in the VMA entry.
+ *
+ * Returns 0 on success, -1 as soon as one descriptor can't be obtained.
+ */
+static int open_vmas(int pid)
+{
+	struct list_head *head = &rsti(current)->vmas.h;
+	struct vma_area *area;
+
+	list_for_each_entry(area, head, list) {
+		int fd;
+
+		if (!vma_area_is(area, VMA_AREA_REGULAR))
+			continue;
+
+		pr_info("Opening 0x%016"PRIx64"-0x%016"PRIx64" 0x%016"PRIx64" (%x) vma\n",
+			area->e->start, area->e->end,
+			area->e->pgoff, area->e->status);
+
+		if (vma_area_is(area, VMA_AREA_SYSVIPC)) {
+			fd = area->e->shmid;
+		} else if (vma_area_is(area, VMA_ANON_SHARED)) {
+			fd = get_shmem_fd(pid, area->e);
+		} else if (vma_area_is(area, VMA_FILE_SHARED)) {
+			fd = get_filemap_fd(area);
+		} else if (vma_area_is(area, VMA_AREA_SOCKET)) {
+			fd = get_socket_fd(pid, area->e);
+		} else {
+			/* Nothing to open for the other kinds */
+			continue;
+		}
+
+		if (fd < 0) {
+			pr_err("Can't fixup fd\n");
+			return -1;
+		}
+
+		pr_info("\t`- setting %d as mapping fd\n", fd);
+		area->e->fd = fd;
+	}
+
+	return 0;
+}
+
+static rt_sigaction_t sigchld_act;
+static rt_sigaction_t parent_act[SIGMAX];
+
+/*
+ * Tell whether @sa equals the action recorded in parent_act[] for slot
+ * @sig, i.e. whether the task already has it installed by inheritance.
+ * Handler, flags, restorer and the first word of the mask are compared.
+ * The root task never inherits.
+ */
+static bool sa_inherited(int sig, rt_sigaction_t *sa)
+{
+	rt_sigaction_t *parent;
+
+	/* XXX -- inherit from CRIU? */
+	if (current == root_item)
+		return false;
+
+	parent = &parent_act[sig];
+
+	if (parent->rt_sa_handler != sa->rt_sa_handler)
+		return false;
+	if (parent->rt_sa_flags != sa->rt_sa_flags)
+		return false;
+	if (parent->rt_sa_restorer != sa->rt_sa_restorer)
+		return false;
+
+	return parent->rt_sa_mask.sig[0] == sa->rt_sa_mask.sig[0];
+}
+
+/*
+ * Restore signal actions of the current task from its sigacts image.
+ *
+ * One entry is read per signal from 1 to SIGMAX, SIGKILL and SIGSTOP
+ * excluded. The SIGCHLD action is only saved in sigchld_act and not
+ * installed here. An action equal to the one already recorded in
+ * parent_act[] is skipped; every installed action is recorded there.
+ *
+ * Returns 0 on success (or for a task that is not alive) and a
+ * negative value on failure.
+ */
+static int prepare_sigactions(void)
+{
+	int pid = current->pid.virt;
+	rt_sigaction_t act;
+	struct cr_img *img;
+	SaEntry *e;
+	int sig, rst = 0;
+	int ret = 0;
+
+	if (!task_alive(current))
+		return 0;
+
+	pr_info("Restore sigacts for %d\n", pid);
+
+	img = open_image(CR_FD_SIGACT, O_RSTR, pid);
+	if (!img)
+		return -1;
+
+	for (sig = 1; sig <= SIGMAX; sig++) {
+		if (sig == SIGKILL || sig == SIGSTOP)
+			continue;
+
+		ret = pb_read_one_eof(img, &e, PB_SIGACT);
+		if (ret == 0) {
+			if (sig != SIGMAX_OLD + 1) { /* backward compatibility */
+				pr_err("Unexpected EOF %d\n", sig);
+				ret = -1;
+				break;
+			}
+			pr_warn("This format of sigacts-%d.img is deprecated\n", pid);
+			break;
+		}
+		if (ret < 0)
+			break;
+
+		ASSIGN_TYPED(act.rt_sa_handler, decode_pointer(e->sigaction));
+		ASSIGN_TYPED(act.rt_sa_flags, e->flags);
+		ASSIGN_TYPED(act.rt_sa_restorer, decode_pointer(e->restorer));
+		ASSIGN_TYPED(act.rt_sa_mask.sig[0], e->mask);
+
+		sa_entry__free_unpacked(e, NULL);
+
+		if (sig == SIGCHLD) {
+			sigchld_act = act;
+			continue;
+		}
+
+		if (sa_inherited(sig - 1, &act))
+			continue;
+
+		/*
+		 * A pure syscall is used, because glibc
+		 * sigaction overwrites se_restorer.
+		 */
+		ret = syscall(SYS_rt_sigaction, sig, &act, NULL, sizeof(k_rtsigset_t));
+		if (ret < 0) {
+			/*
+			 * syscall() already returns -1 with errno set, so
+			 * errno must not be rewritten from the return value.
+			 */
+			pr_perror("Can't restore sigaction");
+			goto err;
+		}
+
+		parent_act[sig - 1] = act;
+		rst++;
+	}
+
+	pr_info("Restored %d/%d sigacts\n", rst,
+			SIGMAX - 3 /* KILL, STOP and CHLD */);
+
+err:
+	close_image(img);
+	return ret;
+}
+
+/*
+ * Store the virtual pids of all children of the current task that are
+ * in @state into consecutive RM_PRIVATE rst_mem allocations and report
+ * how many were stored via @n.
+ *
+ * Returns 0 on success, -1 if an allocation fails (@n then holds the
+ * number of pids stored so far).
+ */
+static int collect_child_pids(int state, int *n)
+{
+	struct pstree_item *pi;
+
+	*n = 0;
+	list_for_each_entry(pi, &current->children, sibling) {
+		pid_t *child;
+
+		if (pi->state != state)
+			continue;
+
+		child = rst_mem_alloc(sizeof(*child), RM_PRIVATE);
+		if (!child)
+			return -1;
+
+		(*n)++;
+		*child = pi->pid.virt;
+	}
+
+	return 0;
+}
+
+/*
+ * Record the aligned RM_PRIVATE position in helpers_pos, then collect
+ * the pids of TASK_HELPER children, counting them in n_helpers.
+ */
+static int collect_helper_pids()
+{
+	int err;
+
+	helpers_pos = rst_mem_align_cpos(RM_PRIVATE);
+	err = collect_child_pids(TASK_HELPER, &n_helpers);
+
+	return err;
+}
+
+/*
+ * Record the aligned RM_PRIVATE position in zombies_pos, then collect
+ * the pids of TASK_DEAD children, counting them in n_zombies.
+ */
+static int collect_zombie_pids()
+{
+	int err;
+
+	zombies_pos = rst_mem_align_cpos(RM_PRIVATE);
+	err = collect_child_pids(TASK_DEAD, &n_zombies);
+
+	return err;
+}
+
+/*
+ * Build the per-thread array of core entries for the current task.
+ *
+ * The slot of the thread whose virtual pid equals @pid gets
+ * @leader_core; every other slot is read from that thread's core
+ * image. On success the array is stored in current->core.
+ *
+ * Returns 0 on success, -1 on failure. On failure the array and all
+ * entries read here are released; @leader_core is never freed here.
+ */
+static int open_cores(int pid, CoreEntry *leader_core)
+{
+	int i = 0, tpid;
+	CoreEntry **cores = NULL;
+
+	cores = xmalloc(sizeof(*cores)*current->nr_threads);
+	if (!cores)
+		goto err;
+
+	for (i = 0; i < current->nr_threads; i++) {
+		tpid = current->threads[i].virt;
+
+		if (tpid == pid)
+			cores[i] = leader_core;
+		else {
+			struct cr_img *img;
+
+			img = open_image(CR_FD_CORE, O_RSTR, tpid);
+			if (!img) {
+				pr_err("Can't open core data for thread %d\n", tpid);
+				goto err;
+			}
+
+			if (pb_read_one(img, &cores[i], PB_CORE) <= 0) {
+				close_image(img);
+				goto err;
+			}
+
+			close_image(img);
+		}
+	}
+
+	current->core = cores;
+
+	return 0;
+err:
+	/*
+	 * Slots [0, i) were filled before the failure. Drop the entries
+	 * unpacked here, the leader's core stays with the caller.
+	 */
+	if (cores) {
+		while (i-- > 0) {
+			if (cores[i] != leader_core)
+				core_entry__free_unpacked(cores[i], NULL);
+		}
+	}
+	xfree(cores);
+	return -1;
+}
+
+/*
+ * Write @value in decimal to the oom_score_adj proc file of the
+ * calling task.
+ *
+ * Returns 0 on success, -1 if the file can't be opened or written.
+ */
+static int prepare_oom_score_adj(int value)
+{
+	int fd, len, ret = 0;
+	char buf[12];	/* fits any int: "-2147483648" plus NUL */
+
+	fd = open_proc_rw(PROC_SELF, "oom_score_adj");
+	if (fd < 0)
+		return -1;
+
+	len = snprintf(buf, sizeof(buf), "%d", value);
+
+	/* Write only the formatted digits, not the rest of the buffer */
+	if (write(fd, buf, len) < 0) {
+		pr_perror("Write %s to /proc/self/oom_score_adj failed", buf);
+		ret = -1;
+	}
+
+	close(fd);
+	return ret;
+}
+
+/*
+ * Restore miscellaneous per-task /proc settings described by @tc:
+ * loginuid (a failure is returned to the caller) and oom_score_adj
+ * (the result is ignored).
+ */
+static int prepare_proc_misc(pid_t pid, TaskCoreEntry *tc)
+{
+	/* loginuid value is critical to restore */
+	if (kdat.has_loginuid && tc->has_loginuid) {
+		if (tc->loginuid != INVALID_UID) {
+			int err = prepare_loginuid(tc->loginuid, LOG_ERROR);
+
+			if (err < 0)
+				return err;
+		}
+	}
+
+	/* oom_score_adj is not critical: only log errors */
+	if (!tc->has_oom_score_adj || tc->oom_score_adj == 0)
+		return 0;
+
+	prepare_oom_score_adj(tc->oom_score_adj);
+	return 0;
+}
+
+/*
+ * Restore the resources of an alive task step by step and finish with
+ * sigreturn_restore(). The steps run in a fixed order and the first
+ * failing one makes the function return -1.
+ */
+static int restore_one_alive_task(int pid, CoreEntry *core)
+{
+	pr_info("Restoring resources\n");
+
+	rst_mem_switch_to_private();
+
+	/* Short-circuit evaluation keeps the steps strictly ordered */
+	if (prepare_fds(current) ||
+	    prepare_file_locks(pid) ||
+	    open_vmas(pid) ||
+	    open_cores(pid, core) ||
+	    prepare_signals(pid, core) ||
+	    prepare_posix_timers(pid, core) ||
+	    prepare_rlimits(pid, core) < 0 ||
+	    collect_helper_pids() < 0 ||
+	    collect_zombie_pids() < 0 ||
+	    inherit_fd_fini() < 0 ||
+	    prepare_proc_misc(pid, core->tc))
+		return -1;
+
+	return sigreturn_restore(pid, core);
+}
+
+/*
+ * Unblock every signal and request the default action for each signal
+ * number from 1 to SIGMAX. Return values are not checked.
+ */
+static void zombie_prepare_signals(void)
+{
+	struct sigaction dfl;
+	sigset_t all;
+	int signo = 1;
+
+	sigfillset(&all);
+	sigprocmask(SIG_UNBLOCK, &all, NULL);
+
+	memset(&dfl, 0, sizeof(dfl));
+	dfl.sa_handler = SIG_DFL;
+
+	while (signo <= SIGMAX) {
+		sigaction(signo, &dfl, NULL);
+		signo++;
+	}
+}
+
+/*
+ * Bitmask of the signals sig_fatal() reports as fatal: bit N is set
+ * for every signal number N listed below.
+ *
+ * NOTE(review): each term is an int shift. If one of the listed signal
+ * numbers is 31 the mask becomes a negative int, which sign-extends
+ * when it is combined with 1UL in sig_fatal() -- confirm this is the
+ * intended treatment of signal numbers above 31.
+ */
+#define SIG_FATAL_MASK ( \
+ (1 << SIGHUP) |\
+ (1 << SIGINT) |\
+ (1 << SIGQUIT) |\
+ (1 << SIGILL) |\
+ (1 << SIGTRAP) |\
+ (1 << SIGABRT) |\
+ (1 << SIGIOT) |\
+ (1 << SIGBUS) |\
+ (1 << SIGFPE) |\
+ (1 << SIGKILL) |\
+ (1 << SIGUSR1) |\
+ (1 << SIGSEGV) |\
+ (1 << SIGUSR2) |\
+ (1 << SIGPIPE) |\
+ (1 << SIGALRM) |\
+ (1 << SIGTERM) |\
+ (1 << SIGXCPU) |\
+ (1 << SIGXFSZ) |\
+ (1 << SIGVTALRM)|\
+ (1 << SIGPROF) |\
+ (1 << SIGPOLL) |\
+ (1 << SIGIO) |\
+ (1 << SIGSYS) |\
+ (1 << SIGUNUSED)|\
+ (1 << SIGSTKFLT)|\
+ (1 << SIGPWR) \
+ )
+
+/*
+ * Return 1 if @sig lies strictly between 0 and SIGMAX and its bit is
+ * set in SIG_FATAL_MASK, 0 otherwise.
+ */
+static inline int sig_fatal(int sig)
+{
+	if (sig <= 0 || sig >= SIGMAX)
+		return 0;
+
+	return (SIG_FATAL_MASK & (1UL << sig)) != 0;
+}
+
+struct task_entries *task_entries;
+static unsigned long task_entries_pos;
+
+/*
+ * Turn the current task into a zombie carrying the exit code stored
+ * in @core: set the task name, then either kill ourselves with the
+ * recorded signal or call exit() with the recorded status.
+ *
+ * Returns -1 if inherit_fd_fini() fails; otherwise does not return.
+ *
+ * NOTE(review): the status handed to exit() is masked with 0x7f, so
+ * recorded exit statuses above 127 lose their top bit -- confirm
+ * this is intended.
+ */
+static int restore_one_zombie(CoreEntry *core)
+{
+	int exit_code = core->tc->exit_code;
+	int signr = exit_code & 0x7f;
+
+	pr_info("Restoring zombie with %d code\n", exit_code);
+
+	if (inherit_fd_fini() < 0)
+		return -1;
+
+	prctl(PR_SET_NAME, (long)(void *)core->tc->comm, 0, 0, 0);
+
+	if (task_entries != NULL) {
+		restore_finish_stage(CR_STATE_RESTORE);
+		zombie_prepare_signals();
+	}
+
+	/* Low seven bits non-zero: the task was terminated by a signal */
+	if (signr != 0) {
+		/* prevent generating core files */
+		if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0))
+			pr_perror("Can't drop the dumpable flag");
+
+		if (!sig_fatal(signr)) {
+			pr_warn("Exit with non fatal signal ignored\n");
+			signr = SIGABRT;
+		}
+
+		if (kill(current->pid.virt, signr) < 0)
+			pr_perror("Can't kill myself, will just exit");
+
+		exit_code = 0;
+	}
+
+	exit((exit_code >> 8) & 0x7f);
+
+	/* never reached */
+	BUG_ON(1);
+	return -1;
+}
+
+/*
+ * Sanity-check a core entry read from an image: the machine type must
+ * match, the task-core part must be present and, unless the task is
+ * dead, ids (in @core or in @me) and arch info must be present too.
+ *
+ * Returns 0 if the entry is usable, -1 otherwise.
+ */
+static int check_core(CoreEntry *core, struct pstree_item *me)
+{
+	if (core->mtype != CORE_ENTRY__MARCH) {
+		pr_err("Core march mismatch %d\n", (int)core->mtype);
+		return -1;
+	}
+
+	if (!core->tc) {
+		pr_err("Core task state data missed\n");
+		return -1;
+	}
+
+	/* Nothing more is required from a zombie */
+	if (core->tc->task_state == TASK_DEAD)
+		return 0;
+
+	if (!core->ids && !me->ids) {
+		pr_err("Core IDS data missed for non-zombie\n");
+		return -1;
+	}
+
+	if (!CORE_THREAD_ARCH_INFO(core)) {
+		pr_err("Core info data missed for non-zombie\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Dispatch the final restore step by the state of the current task:
+ * alive tasks and zombies get their dedicated routine, a helper only
+ * finishes the CR_STATE_RESTORE stage, any other state is an error.
+ * A non-NULL @core is freed before returning.
+ */
+static int restore_one_task(int pid, CoreEntry *core)
+{
+	int ret = -1;
+
+	/* No more fork()-s => no more per-pid logs */
+
+	if (task_alive(current)) {
+		ret = restore_one_alive_task(pid, core);
+	} else {
+		if (current->state == TASK_DEAD) {
+			ret = restore_one_zombie(core);
+		} else if (current->state == TASK_HELPER) {
+			restore_finish_stage(CR_STATE_RESTORE);
+			ret = 0;
+		} else {
+			pr_err("Unknown state in code %d\n", (int)core->tc->task_state);
+		}
+	}
+
+	if (core != NULL)
+		core_entry__free_unpacked(core, NULL);
+
+	return ret;
+}
+
+/*
+ * Argument block handed to clone() by fork_with_pid().
+ *
+ * All arguments should be above stack, because it grows down.
+ * fork_with_pid() passes stack_ptr as the child stack and the
+ * address of the whole structure as the child's argument.
+ */
+struct cr_clone_arg {
+ /*
+ * Reserve some space for clone() to locate arguments
+ * and retcode in this place
+ */
+ char stack[128] __stack_aligned__;
+ /* Zero-sized marker for the end of stack[] */
+ char stack_ptr[0];
+ /* Task to restore in the child */
+ struct pstree_item *item;
+ /* Flags the task is to be cloned with */
+ unsigned long clone_flags;
+ /* Set by fork_with_pid(): LAST_PID_PATH descriptor, or -1 */
+ int fd;
+
+ /* Core entry of the task; NULL for helpers */
+ CoreEntry *core;
+};
+
+/*
+ * Decide whether the root task should be created as a sibling of CRIU.
+ *
+ * With opts.restore_sibling set, CLONE_PARENT is added to the clone
+ * flags of @item and root_as_sibling is raised; a warning is printed
+ * if CLONE_NEWPID is requested as well. Otherwise, with
+ * opts.restore_detach set, only a warning is printed when the root
+ * task has pdeath_sig configured.
+ */
+static void maybe_clone_parent(struct pstree_item *item,
+		struct cr_clone_arg *ca)
+{
+	/*
+	 * zdtm runs in kernel 3.11, which has the problem described below.
+	 * We avoid this by including the pdeath_sig test. Once users/zdtm
+	 * migrate off of 3.11, this condition can be simplified to just
+	 * test the options and not have the pdeath_sig test.
+	 */
+	if (!opts.restore_sibling) {
+		if (opts.restore_detach && ca->core->thread_core->pdeath_sig)
+			pr_warn("Root task has pdeath_sig configured, so it will receive one _right_"
+				"after restore on CRIU exit\n");
+		return;
+	}
+
+	/*
+	 * This means we're called from lib's criu_restore_child().
+	 * In that case create the root task as a child of the caller.
+	 * This is the only way to correctly restore the pdeath_sig of
+	 * the root task. But it also looks nice.
+	 *
+	 * There was a problem in kernel 3.11 -- CLONE_PARENT can't be
+	 * set together with CLONE_NEWPID. It has been solved in later
+	 * kernels, but we treat 3.11 as a base, so at least warn the
+	 * user about potential problems.
+	 */
+	rsti(item)->clone_flags |= CLONE_PARENT;
+	root_as_sibling = 1;
+
+	if (rsti(item)->clone_flags & CLONE_NEWPID)
+		pr_warn("Set CLONE_PARENT | CLONE_NEWPID but it might cause restore problem,"
+			"because not all kernels support such clone flags combinations!\n");
+}
+
+/*
+ * Create the task described by @item with the pid it had at dump time.
+ *
+ * For a non-helper the core image is read and checked first and the
+ * item state, cgroup set and seccomp flag are taken from it. Unless
+ * the task is cloned into a new pid namespace, the wanted pid minus
+ * one is written to LAST_PID_PATH under an exclusive flock that is
+ * held across clone(). For the root task the real pid is recorded and,
+ * if requested, the pidfile is written.
+ *
+ * Returns a negative value on failure and a non-negative one
+ * otherwise. The core entry read here is freed before returning.
+ */
+static inline int fork_with_pid(struct pstree_item *item)
+{
+	struct cr_clone_arg ca;
+	int ret = -1;
+	pid_t pid = item->pid.virt;
+
+	if (item->state != TASK_HELPER) {
+		struct cr_img *img;
+
+		img = open_image(CR_FD_CORE, O_RSTR, pid);
+		if (!img)
+			return -1;
+
+		ret = pb_read_one(img, &ca.core, PB_CORE);
+		close_image(img);
+
+		if (ret < 0)
+			return -1;
+
+		/* From here on failures must release ca.core via "err" */
+		ret = -1;
+
+		if (check_core(ca.core, item))
+			goto err;
+
+		item->state = ca.core->tc->task_state;
+		rsti(item)->cg_set = ca.core->tc->cg_set;
+
+		rsti(item)->has_seccomp = ca.core->tc->seccomp_mode != SECCOMP_MODE_DISABLED;
+
+		if (item->state == TASK_DEAD)
+			rsti(item->parent)->nr_zombies++;
+		else if (!task_alive(item)) {
+			pr_err("Unknown task state %d\n", item->state);
+			goto err;
+		}
+
+		if (unlikely(item == root_item))
+			maybe_clone_parent(item, &ca);
+	} else {
+		/*
+		 * Helper entry will not get moved around and thus
+		 * will live in the parent's cgset.
+		 */
+		rsti(item)->cg_set = rsti(item->parent)->cg_set;
+		ca.core = NULL;
+	}
+
+	ret = -1;
+
+	ca.item = item;
+	ca.clone_flags = rsti(item)->clone_flags;
+
+	BUG_ON(ca.clone_flags & CLONE_VM);
+
+	pr_info("Forking task with %d pid (flags 0x%lx)\n", pid, ca.clone_flags);
+
+	if (!(ca.clone_flags & CLONE_NEWPID)) {
+		char buf[32];
+		int len;
+
+		ca.fd = open_proc_rw(PROC_GEN, LAST_PID_PATH);
+		if (ca.fd < 0) {
+			pr_perror("%d: Can't open %s", pid, LAST_PID_PATH);
+			goto err;
+		}
+
+		if (flock(ca.fd, LOCK_EX)) {
+			/* Report first: close() may overwrite errno */
+			pr_perror("%d: Can't lock %s", pid, LAST_PID_PATH);
+			close(ca.fd);
+			goto err;
+		}
+
+		len = snprintf(buf, sizeof(buf), "%d", pid - 1);
+		if (write(ca.fd, buf, len) != len) {
+			pr_perror("%d: Write %s to %s", pid, buf, LAST_PID_PATH);
+			goto err_unlock;
+		}
+	} else {
+		ca.fd = -1;
+		BUG_ON(pid != INIT_PID);
+	}
+
+	/*
+	 * Some kernel modules, such as the network packet generator,
+	 * run a kernel thread upon net-namespace creation, taking
+	 * the @pid we've been requesting via the LAST_PID_PATH interface,
+	 * so that we can't restore a task with the pid needed.
+	 *
+	 * Here is an idea -- unshare the net namespace in the callee instead.
+	 */
+	ret = clone(restore_task_with_children, ca.stack_ptr,
+			(ca.clone_flags & ~CLONE_NEWNET) | SIGCHLD, &ca);
+
+	if (ret < 0) {
+		pr_perror("Can't fork for %d", pid);
+		goto err_unlock;
+	}
+
+
+	if (item == root_item) {
+		item->pid.real = ret;
+		pr_debug("PID: real %d virt %d\n",
+				item->pid.real, item->pid.virt);
+	}
+
+	if (opts.pidfile && root_item == item) {
+		int pid;
+
+		pid = ret;
+
+		ret = write_pidfile(pid);
+		if (ret < 0) {
+			pr_perror("Can't write pidfile");
+			kill(pid, SIGKILL);
+		}
+	}
+
+err_unlock:
+	if (ca.fd >= 0) {
+		if (flock(ca.fd, LOCK_UN))
+			pr_perror("%d: Can't unlock %s", pid, LAST_PID_PATH);
+
+		close(ca.fd);
+	}
+err:
+	if (ca.core)
+		core_entry__free_unpacked(ca.core, NULL);
+	return ret;
+}
+
+/*
+ * SIGCHLD handler used while restoring.
+ *
+ * In the CRIU process itself (current is NULL) a signal from a pid
+ * other than the root task is only acted upon if the root task has
+ * changed state as well, and a ptraced root task stopped on SIGCHLD
+ * is simply continued. Inside a task being restored, children that
+ * are helpers and exited with zero status are reaped silently.
+ * Every other case is logged and aborts the restore by waking the
+ * waiters on task_entries->nr_in_progress.
+ */
+static void sigchld_handler(int signal, siginfo_t *siginfo, void *data)
+{
+	struct pstree_item *pi;
+	pid_t pid = siginfo->si_pid;
+	int status;
+	int exit;
+
+	exit = (siginfo->si_code == CLD_EXITED);
+	status = siginfo->si_status;
+
+	/* skip scripts */
+	if (!current && root_item->pid.real != pid) {
+		pid = waitpid(root_item->pid.real, &status, WNOHANG);
+		if (pid <= 0)
+			return;
+		exit = WIFEXITED(status);
+		status = exit ? WEXITSTATUS(status) : WTERMSIG(status);
+	}
+
+	if (!current && siginfo->si_code == CLD_TRAPPED &&
+				siginfo->si_status == SIGCHLD) {
+		/* The root task is ptraced. Allow it to handle SIGCHLD */
+		ptrace(PTRACE_CONT, siginfo->si_pid, 0, SIGCHLD);
+		return;
+	}
+
+	if (!current || status)
+		goto err;
+
+	while (pid) {
+		pid = waitpid(-1, &status, WNOHANG);
+		if (pid <= 0)
+			return;
+
+		exit = WIFEXITED(status);
+		status = exit ? WEXITSTATUS(status) : WTERMSIG(status);
+		if (status)
+			break;
+
+		/* Exited (with zero code) helpers are OK */
+		list_for_each_entry(pi, &current->children, sibling)
+			if (pi->pid.virt == siginfo->si_pid)
+				break;
+
+		/* The loop ran off the list: the pid is not our child */
+		BUG_ON(&pi->sibling == &current->children);
+		if (pi->state != TASK_HELPER)
+			break;
+	}
+
+err:
+	if (exit)
+		pr_err("%d exited, status=%d\n", pid, status);
+	else
+		pr_err("%d killed by signal %d\n", pid, status);
+
+	futex_abort_and_wake(&task_entries->nr_in_progress);
+}
+
+/*
+ * Install sigchld_handler for SIGCHLD and block every signal except
+ * SIGCHLD. Returns 0 on success, -1 if any of the calls fails.
+ */
+static int criu_signals_setup(void)
+{
+	struct sigaction chld;
+	sigset_t mask;
+
+	/* Start from the current SIGCHLD action and amend it */
+	if (sigaction(SIGCHLD, NULL, &chld) < 0) {
+		pr_perror("sigaction() failed");
+		return -1;
+	}
+
+	chld.sa_sigaction = sigchld_handler;
+	chld.sa_flags |= SA_NOCLDSTOP | SA_SIGINFO | SA_RESTART;
+	sigemptyset(&chld.sa_mask);
+	sigaddset(&chld.sa_mask, SIGCHLD);
+
+	if (sigaction(SIGCHLD, &chld, NULL) < 0) {
+		pr_perror("sigaction() failed");
+		return -1;
+	}
+
+	/*
+	 * The block mask will be restored in sigreturn.
+	 *
+	 * TODO: This code should be removed, when a freezer will be added.
+	 */
+	sigfillset(&mask);
+	sigdelset(&mask, SIGCHLD);
+
+	/*
+	 * SIG_SETMASK is used rather than SIG_BLOCK: if we were forked
+	 * from a parent that had SIGCHLD blocked and a task dies (e.g.
+	 * fails to restore), our handler would never run and we would
+	 * hang. We depend on SIGCHLD being unblocked, so set the whole
+	 * mask explicitly.
+	 */
+	if (sigprocmask(SIG_SETMASK, &mask, NULL) < 0) {
+		pr_perror("Can't block signals");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Restore the session id of the current task.
+ *
+ * SID can only be reset to pid or inherited from parent, so it is
+ * restored right here to let our kids inherit one in case they
+ * need it. PGIDs are restored late, when all tasks are forked and
+ * setpgid() can be called on custom values.
+ *
+ * A session leader calls setsid(); any other task only verifies that
+ * the inherited sid matches. The process exits with status 1 on a
+ * mismatch, except for a root task that is not init.
+ */
+static void restore_sid(void)
+{
+	pid_t sid;
+
+	if (current->pid.virt != current->sid) {
+		sid = getsid(getpid());
+		if (sid == current->sid)
+			return;
+
+		/* Skip the root task if it's not init */
+		if (current == root_item && root_item->pid.virt != INIT_PID)
+			return;
+
+		pr_err("Requested sid %d doesn't match inherited %d\n",
+				current->sid, sid);
+		exit(1);
+	}
+
+	pr_info("Restoring %d to %d sid\n", current->pid.virt, current->sid);
+
+	sid = setsid();
+	if (sid != current->sid) {
+		pr_perror("Can't restore sid (%d)", sid);
+		exit(1);
+	}
+}
+
+/*
+ * Restore the process group id of the current task.
+ *
+ * Unlike sessions, process groups (a.k.a. pgids) can be joined by any
+ * task, provided the task with pid == pgid (the group leader) exists.
+ * So to restore the pgid we must make sure the group leader was born
+ * and created the group, and only then join it.
+ *
+ * This is done _before_ finishing the forking stage to make sure
+ * helpers are still with us. The process exits with status 1 if
+ * setpgid() fails.
+ */
+static void restore_pgid(void)
+{
+	pid_t wanted = current->pgid;
+	pid_t actual;
+
+	pr_info("Restoring %d to %d pgid\n", current->pid.virt, wanted);
+
+	actual = getpgrp();
+	if (actual == wanted)
+		return;
+
+	if (wanted != current->pid.virt) {
+		/*
+		 * Wait for leader to become such.
+		 * Missing leader means we're going to crtools
+		 * group (-j option).
+		 */
+		struct pstree_item *leader = rsti(current)->pgrp_leader;
+
+		if (leader) {
+			BUG_ON(wanted != leader->pid.virt);
+			futex_wait_until(&rsti(leader)->pgrp_set, 1);
+		}
+	}
+
+	pr_info("\twill call setpgid, mine pgid is %d\n", actual);
+	if (setpgid(0, wanted) != 0) {
+		pr_perror("Can't restore pgid (%d/%d->%d)", current->pid.virt, actual, current->pgid);
+		exit(1);
+	}
+
+	/* We are the group leader: let the waiters proceed */
+	if (wanted == current->pid.virt)
+		futex_set_and_wake(&rsti(current)->pgrp_set, 1);
+}
+
+/*
+ * Mount a fresh procfs instance on a temporary directory, open it via
+ * open_detach_mount() and hand the descriptor to set_proc_fd().
+ *
+ * Returns -1 if the directory can't be created or the mount fails,
+ * the negative result of open_detach_mount() if that fails, and the
+ * result of set_proc_fd() otherwise.
+ */
+static int mount_proc(void)
+{
+	char mnt_dir[] = "crtools-proc.XXXXXX";
+	int proc_fd, err;
+
+	if (!mkdtemp(mnt_dir)) {
+		pr_perror("mkdtemp failed %s", mnt_dir);
+		return -1;
+	}
+
+	pr_info("Mount procfs in %s\n", mnt_dir);
+	if (mount("proc", mnt_dir, "proc", MS_MGC_VAL | MS_NOSUID | MS_NOEXEC | MS_NODEV, NULL)) {
+		pr_perror("mount failed");
+		rmdir(mnt_dir);
+		return -1;
+	}
+
+	proc_fd = open_detach_mount(mnt_dir);
+	if (proc_fd < 0)
+		return proc_fd;
+
+	err = set_proc_fd(proc_fd);
+	close(proc_fd);
+
+	return err;
+}
+
+/*
+ * Tasks cannot change their sid (session id) arbitrarily: they can
+ * either inherit one from an ancestor or create a new one with an id
+ * equal to their pid. Thus restoring sids is tied to creating children.
+ *
+ * Children for which restore_before_setsid() is true are forked
+ * first, then the session of the current task is restored (unless it
+ * has no parent), then the remaining children are forked.
+ *
+ * Returns 0 on success or the negative result of fork_with_pid().
+ */
+
+static int create_children_and_session(void)
+{
+	int ret;
+	struct pstree_item *child;
+
+	pr_info("Restoring children in alien sessions:\n");
+	list_for_each_entry(child, &current->children, sibling) {
+		if (!restore_before_setsid(child))
+			continue;
+
+		BUG_ON(child->born_sid != -1 && getsid(getpid()) != child->born_sid);
+
+		ret = fork_with_pid(child);
+		if (ret < 0)
+			return ret;
+	}
+
+	if (current->parent)
+		restore_sid();
+
+	pr_info("Restoring children in our session:\n");
+	list_for_each_entry(child, &current->children, sibling) {
+		if (restore_before_setsid(child))
+			continue;
+
+		ret = fork_with_pid(child);
+		if (ret < 0)
+			return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * Entry point of every cloned task; fork_with_pid() passes it to
+ * clone() together with a struct cr_clone_arg.
+ *
+ * The task records itself in "current", checks that it got the
+ * expected pid, sets up logging, namespaces (the root task does the
+ * namespace-wide preparation), memory mappings, cgroups and signal
+ * actions, forks its own children and finally calls
+ * restore_one_task().
+ *
+ * Returns 0 if restore_one_task() succeeds. On any failure the task
+ * exits with status 1; the root task additionally aborts
+ * task_entries->nr_in_progress to wake the waiters.
+ */
+static int restore_task_with_children(void *_arg)
+{
+ struct cr_clone_arg *ca = _arg;
+ pid_t pid;
+ int ret;
+
+ current = ca->item;
+
+ if (current != root_item) {
+ char buf[12];
+ int fd;
+
+ /* Determine PID in CRIU's namespace */
+ fd = get_service_fd(CR_PROC_FD_OFF);
+ if (fd < 0)
+ goto err;
+
+ ret = readlinkat(fd, "self", buf, sizeof(buf) - 1);
+ if (ret < 0) {
+ pr_perror("Unable to read the /proc/self link");
+ goto err;
+ }
+ buf[ret] = '\0';
+
+ current->pid.real = atoi(buf);
+ pr_debug("PID: real %d virt %d\n",
+ current->pid.real, current->pid.virt);
+ }
+
+ /* ca->fd is the LAST_PID_PATH descriptor set up by fork_with_pid() */
+ if ( !(ca->clone_flags & CLONE_FILES))
+ close_safe(&ca->fd);
+
+ if (current->state != TASK_HELPER) {
+ ret = clone_service_fd(rsti(current)->service_fd_id);
+ if (ret)
+ goto err;
+ }
+
+ pid = getpid();
+ if (current->pid.virt != pid) {
+ pr_err("Pid %d do not match expected %d\n", pid, current->pid.virt);
+ set_task_cr_err(EEXIST);
+ goto err;
+ }
+
+ ret = log_init_by_pid();
+ if (ret < 0)
+ goto err;
+
+ /*
+ * fork_with_pid() strips CLONE_NEWNET from the flags given to
+ * clone(), so the net namespace is unshared here instead.
+ */
+ if (ca->clone_flags & CLONE_NEWNET) {
+ ret = unshare(CLONE_NEWNET);
+ if (ret) {
+ pr_perror("Can't unshare net-namespace");
+ goto err;
+ }
+ }
+
+ if (!(ca->clone_flags & CLONE_FILES)) {
+ ret = close_old_fds();
+ if (ret)
+ goto err;
+ }
+
+ /* Restore root task */
+ if (current->parent == NULL) {
+ if (restore_finish_stage(CR_STATE_RESTORE_NS) < 0)
+ goto err;
+
+ pr_info("Calling restore_sid() for init\n");
+ restore_sid();
+
+ /*
+ * We need non /proc proc mount for restoring pid and mount
+ * namespaces and do not care for the rest of the cases.
+ * Thus -- mount proc at custom location for any new namespace
+ */
+ if (mount_proc())
+ goto err;
+
+ if (prepare_namespace(current, ca->clone_flags))
+ goto err;
+
+ if (root_prepare_shared())
+ goto err;
+
+ if (restore_finish_stage(CR_STATE_RESTORE_SHARED) < 0)
+ goto err;
+ }
+
+ if (restore_task_mnt_ns(current))
+ goto err;
+
+ if (prepare_mappings())
+ goto err;
+
+ /*
+ * Call this _before_ forking to optimize cgroups
+ * restore -- if all tasks live in one set of cgroups
+ * we will only move the root one there, others will
+ * just have it inherited.
+ */
+ if (prepare_task_cgroup(current) < 0)
+ goto err;
+
+ if (prepare_sigactions() < 0)
+ goto err;
+
+ if (fault_injected(FI_RESTORE_ROOT_ONLY)) {
+ pr_info("fault: Restore root task failure!\n");
+ BUG();
+ }
+
+ if (create_children_and_session())
+ goto err;
+
+
+ /* Guard pages are dropped only after the children are forked */
+ if (unmap_guard_pages())
+ goto err;
+
+ restore_pgid();
+
+ if (restore_finish_stage(CR_STATE_FORKING) < 0)
+ goto err;
+
+ if (current->parent == NULL) {
+ if (depopulate_roots_yard())
+ goto err;
+
+ fini_restore_mntns();
+ }
+
+ if (restore_one_task(current->pid.virt, ca->core))
+ goto err;
+
+ return 0;
+
+err:
+ /* Only the root task wakes up the waiters on failure */
+ if (current->parent == NULL)
+ futex_abort_and_wake(&task_entries->nr_in_progress);
+ exit(1);
+}
+
+/*
+ * Return how many participants have to pass @next_stage, computed
+ * from the counters in task_entries. An unknown stage is a BUG().
+ */
+static inline int stage_participants(int next_stage)
+{
+	int nr;
+
+	switch (next_stage) {
+	case CR_STATE_FAIL:
+		nr = 0;
+		break;
+	case CR_STATE_RESTORE_NS:
+	case CR_STATE_RESTORE_SHARED:
+		nr = 1;
+		break;
+	case CR_STATE_FORKING:
+		nr = task_entries->nr_tasks + task_entries->nr_helpers;
+		break;
+	case CR_STATE_RESTORE:
+		nr = task_entries->nr_threads + task_entries->nr_helpers;
+		break;
+	case CR_STATE_RESTORE_SIGCHLD:
+	case CR_STATE_RESTORE_CREDS:
+		nr = task_entries->nr_threads;
+		break;
+	default:
+		BUG();
+		nr = -1;
+		break;
+	}
+
+	return nr;
+}
+
+/*
+ * Wait until task_entries->nr_in_progress is no longer above zero.
+ * Returns 0 if it did not go negative; otherwise copies the task
+ * error into the CRIU errno and returns the negative value.
+ */
+static int restore_wait_inprogress_tasks()
+{
+	futex_t *in_progress = &task_entries->nr_in_progress;
+	int left;
+
+	futex_wait_while_gt(in_progress, 0);
+
+	left = (int)futex_get(in_progress);
+	if (left >= 0)
+		return 0;
+
+	set_cr_errno(get_task_cr_err());
+	return left;
+}
+
+/*
+ * Arm nr_in_progress with the number of participants of @next_stage,
+ * then publish the stage in task_entries->start and wake the waiters.
+ */
+static void __restore_switch_stage(int next_stage)
+{
+	int participants = stage_participants(next_stage);
+
+	futex_set(&task_entries->nr_in_progress, participants);
+	futex_set_and_wake(&task_entries->start, next_stage);
+}
+
+/*
+ * Switch to @next_stage and wait for the participants to finish it.
+ * Returns the result of restore_wait_inprogress_tasks().
+ */
+static int restore_switch_stage(int next_stage)
+{
+	int err;
+
+	__restore_switch_stage(next_stage);
+	err = restore_wait_inprogress_tasks();
+
+	return err;
+}
+
+/*
+ * Seize, interrupt and then continue every thread of every alive task
+ * in the tree, suspending seccomp on the way where the task uses it.
+ * The first thread of the root task is not seized again when
+ * @root_seized is set.
+ *
+ * Returns 0 on success, -1 on the first failing step.
+ */
+static int attach_to_tasks(bool root_seized)
+{
+	struct pstree_item *item;
+
+	for_each_pstree_item(item) {
+		int status, i;
+
+		if (!task_alive(item))
+			continue;
+
+		if (parse_threads(item->pid.real, &item->threads, &item->nr_threads))
+			return -1;
+
+		for (i = 0; i < item->nr_threads; i++) {
+			pid_t tid = item->threads[i].real;
+			bool seized = (item == root_item && root_seized && i == 0);
+
+			if (!seized && ptrace(PTRACE_SEIZE, tid, 0, 0)) {
+				pr_perror("Can't attach to %d", tid);
+				return -1;
+			}
+
+			if (ptrace(PTRACE_INTERRUPT, tid, 0, 0)) {
+				pr_perror("Can't interrupt the %d task", tid);
+				return -1;
+			}
+
+			if (wait4(tid, &status, __WALL, NULL) != tid) {
+				pr_perror("waitpid(%d) failed", tid);
+				return -1;
+			}
+
+			/*
+			 * Suspend seccomp if necessary. Seccomp is restored
+			 * at the very end of the restorer blob (and the
+			 * final sigreturn is ok), but here we're doing an
+			 * munmap in the process, which may be blocked by
+			 * seccomp and cause the task to be killed.
+			 */
+			if (rsti(item)->has_seccomp && suspend_seccomp(tid) < 0)
+				pr_err("failed to suspend seccomp, restore will probably fail...\n");
+
+			if (ptrace(PTRACE_CONT, tid, NULL, NULL)) {
+				pr_perror("Unable to resume %d", tid);
+				return -1;
+			}
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Interrupt every thread of every alive task, wait for it to stop and
+ * hand it to ptrace_stop_pie() with the breakpoint recorded for the
+ * task.
+ *
+ * Returns 0 on success, -1 on the first failing step.
+ */
+static int catch_tasks(bool root_seized, enum trace_flags *flag)
+{
+	struct pstree_item *item;
+
+	for_each_pstree_item(item) {
+		int status, i = 0;
+
+		if (!task_alive(item))
+			continue;
+
+		if (parse_threads(item->pid.real, &item->threads, &item->nr_threads))
+			return -1;
+
+		while (i < item->nr_threads) {
+			pid_t tid = item->threads[i].real;
+
+			if (ptrace(PTRACE_INTERRUPT, tid, 0, 0)) {
+				pr_perror("Can't interrupt the %d task", tid);
+				return -1;
+			}
+
+			if (wait4(tid, &status, __WALL, NULL) != tid) {
+				pr_perror("waitpid(%d) failed", tid);
+				return -1;
+			}
+
+			if (ptrace_stop_pie(tid, rsti(item)->breakpoint, flag) < 0)
+				return -1;
+
+			i++;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Flush breakpoints in every thread of every alive task.
+ * Returns the bitwise OR of all ptrace_flush_breakpoints() results.
+ */
+static int clear_breakpoints()
+{
+	struct pstree_item *item;
+	int acc = 0;
+
+	for_each_pstree_item(item) {
+		int t = 0;
+
+		if (!task_alive(item))
+			continue;
+
+		while (t < item->nr_threads) {
+			acc |= ptrace_flush_breakpoints(item->threads[t].real);
+			t++;
+		}
+	}
+
+	return acc;
+}
+
+/*
+ * For every alive task: unmap the restorer blob through a parasite
+ * control block and send SIGSTOP to tasks whose state is TASK_STOPPED.
+ * Tasks for which the control block can't be prepared are skipped.
+ */
+static void finalize_restore(void)
+{
+	struct pstree_item *item;
+
+	for_each_pstree_item(item) {
+		struct parasite_ctl *ctl;
+
+		if (!task_alive(item))
+			continue;
+
+		/* Unmap the restorer blob */
+		ctl = parasite_prep_ctl(item->pid.real, NULL);
+		if (!ctl)
+			continue;
+
+		parasite_unmap(ctl, (unsigned long)rsti(item)->munmap_restorer);
+		xfree(ctl);
+
+		if (item->state != TASK_STOPPED)
+			continue;
+
+		kill(item->pid.real, SIGSTOP);
+	}
+}
+
+/*
+ * Detach ptrace from every thread of every alive task. A negative
+ * thread pid ends the walk over that task's threads and is a BUG
+ * unless @status is negative.
+ */
+static void finalize_restore_detach(int status)
+{
+	struct pstree_item *item;
+
+	for_each_pstree_item(item) {
+		int t;
+
+		if (!task_alive(item))
+			continue;
+
+		for (t = 0; t < item->nr_threads; t++) {
+			pid_t tid = item->threads[t].real;
+
+			if (tid < 0) {
+				BUG_ON(status >= 0);
+				break;
+			}
+
+			if (ptrace(PTRACE_DETACH, tid, NULL, 0) != 0)
+				pr_perror("Unable to execute %d", tid);
+		}
+	}
+}
+
+/*
+ * Reset the SIGCHLD action to SIG_DFL; a failure is only logged.
+ */
+static void ignore_kids(void)
+{
+	struct sigaction dfl = { .sa_handler = SIG_DFL };
+
+	if (sigaction(SIGCHLD, &dfl, NULL) >= 0)
+		return;
+
+	pr_perror("Restoring CHLD sigaction failed");
+}
+
+static unsigned int saved_loginuid;
+
+/*
+ * Save the loginuid of the calling process in saved_loginuid and set
+ * it to INVALID_UID. Does nothing when kdat.has_loginuid is not set.
+ *
+ * Returns 0 on success, -1 if the loginuid can't be read or set.
+ */
+static int prepare_userns_hook(void)
+{
+	int ret;
+
+	if (!kdat.has_loginuid)
+		return 0;
+	/*
+	 * Save old loginuid and set it to INVALID_UID:
+	 * this value means that loginuid is unset and it will be inherited.
+	 * After you set some value to /proc/<>/loginuid it can't be changed
+	 * inside container due to permissions.
+	 * But you still can set this value if it was unset.
+	 */
+	saved_loginuid = parse_pid_loginuid(getpid(), &ret, false);
+	if (ret < 0)
+		return -1;
+
+	if (prepare_loginuid(INVALID_UID, LOG_ERROR) < 0) {
+		pr_err("Setting loginuid for CT init task failed, CAP_AUDIT_CONTROL?\n");
+		return -1;
+	}
+	return 0;
+}
+
+/*
+ * Put back the loginuid saved in saved_loginuid. Does nothing when
+ * kdat.has_loginuid is not set; a failure is only logged.
+ */
+static void restore_origin_ns_hook(void)
+{
+	if (!kdat.has_loginuid)
+		return;
+
+	/* not critical: it does not affect CT in any way */
+	if (prepare_loginuid(saved_loginuid, LOG_ERROR) < 0)
+		pr_err("Restore original /proc/self/loginuid failed\n");
+}
+
+/*
+ * Restore the process tree rooted at @init.
+ *
+ * Runs the pre-restore action scripts, installs a /proc descriptor as a
+ * service fd, forks the root task and then walks the restore stages in
+ * order (RESTORE_NS, RESTORE_SHARED, FORKING, RESTORE, RESTORE_SIGCHLD,
+ * RESTORE_CREDS, COMPLETE), running the matching action scripts in between.
+ *
+ * Returns 0 on success and -1 on failure. Failures before the root task is
+ * forked return directly; a failed fork goes to "out"; any later failure
+ * goes to "out_kill", which kills the tasks created so far.
+ */
+static int restore_root_task(struct pstree_item *init)
+{
+ enum trace_flags flag = TRACE_ALL;
+ int ret, fd, mnt_ns_fd = -1;
+ int clean_remaps = 1;
+
+ ret = run_scripts(ACT_PRE_RESTORE);
+ if (ret != 0) {
+ pr_err("Aborting restore due to pre-restore script ret code %d\n", ret);
+ return -1;
+ }
+
+ fd = open("/proc", O_DIRECTORY | O_RDONLY);
+ if (fd < 0) {
+ pr_perror("Unable to open /proc");
+ return -1;
+ }
+
+ /* The local descriptor is closed whether or not the install worked. */
+ ret = install_service_fd(CR_PROC_FD_OFF, fd);
+ close(fd);
+ if (ret < 0)
+ return -1;
+
+ /*
+ * FIXME -- currently we assume that all the tasks live
+ * in the same set of namespaces. This is done to debug
+ * the ns contents dumping/restoring. Need to revisit
+ * this later.
+ */
+
+ /*
+ * A root task with pid INIT_PID requires CLONE_NEWPID in the mask,
+ * and CLONE_NEWPID in the mask requires such a root task.
+ */
+ if (init->pid.virt == INIT_PID) {
+ if (!(root_ns_mask & CLONE_NEWPID)) {
+ pr_err("This process tree can only be restored "
+ "in a new pid namespace.\n"
+ "criu should be re-executed with the "
+ "\"--namespace pid\" option.\n");
+ return -1;
+ }
+ } else if (root_ns_mask & CLONE_NEWPID) {
+ pr_err("Can't restore pid namespace without the process init\n");
+ return -1;
+ }
+
+ if (prepare_userns_hook())
+ return -1;
+
+ if (prepare_namespace_before_tasks())
+ return -1;
+
+ futex_set(&task_entries->nr_in_progress,
+ stage_participants(CR_STATE_RESTORE_NS));
+
+ /* A failed fork skips the kill step: it jumps to "out", not "out_kill". */
+ ret = fork_with_pid(init);
+ if (ret < 0)
+ goto out;
+
+ restore_origin_ns_hook();
+
+ if (root_as_sibling) {
+ struct sigaction act;
+ /*
+ * Root task will be our sibling. This means, that
+ * we will not notice when (if) it dies in SIGCHLD
+ * handler, but we should. To do this -- attach to
+ * the guy with ptrace (below) and (!) make the kernel
+ * deliver us the signal when it will get stopped.
+ * It will in case of e.g. segfault before handling
+ * the signal.
+ */
+ sigaction(SIGCHLD, NULL, &act);
+ act.sa_flags &= ~SA_NOCLDSTOP;
+ sigaction(SIGCHLD, &act, NULL);
+
+ if (ptrace(PTRACE_SEIZE, init->pid.real, 0, 0)) {
+ pr_perror("Can't attach to init");
+ goto out_kill;
+ }
+ }
+
+ /*
+ * uid_map and gid_map must be filled from a parent user namespace.
+ * prepare_userns_creds() must be called after filling mappings.
+ */
+ if ((root_ns_mask & CLONE_NEWUSER) && prepare_userns(init))
+ goto out_kill;
+
+ pr_info("Wait until namespaces are created\n");
+ ret = restore_wait_inprogress_tasks();
+ if (ret)
+ goto out_kill;
+
+ if (root_ns_mask & CLONE_NEWNS) {
+ mnt_ns_fd = open_proc(init->pid.real, "ns/mnt");
+ if (mnt_ns_fd < 0) {
+ pr_perror("Can't open init's mntns fd");
+ goto out_kill;
+ }
+ }
+
+ ret = run_scripts(ACT_SETUP_NS);
+ if (ret)
+ goto out_kill;
+
+ /* TIME_FORK spans the RESTORE_SHARED and FORKING stages. */
+ timing_start(TIME_FORK);
+ ret = restore_switch_stage(CR_STATE_RESTORE_SHARED);
+ if (ret < 0)
+ goto out_kill;
+
+ ret = run_scripts(ACT_POST_SETUP_NS);
+ if (ret)
+ goto out_kill;
+
+ ret = restore_switch_stage(CR_STATE_FORKING);
+ if (ret < 0)
+ goto out_kill;
+
+ timing_stop(TIME_FORK);
+
+ ret = restore_switch_stage(CR_STATE_RESTORE);
+ if (ret < 0)
+ goto out_kill;
+
+ ret = restore_switch_stage(CR_STATE_RESTORE_SIGCHLD);
+ if (ret < 0)
+ goto out_kill;
+
+ /*
+ * The task_entries->nr_zombies is updated in the
+ * CR_STATE_RESTORE_SIGCHLD in pie code.
+ */
+ task_entries->nr_threads -= atomic_read(&task_entries->nr_zombies);
+
+ /*
+ * There is no need to call try_clean_remaps() after this point,
+ * as restore went OK and all ghosts were removed by the openers.
+ */
+ clean_remaps = 0;
+ close_safe(&mnt_ns_fd);
+ cleanup_mnt_ns();
+
+ ret = stop_usernsd();
+ if (ret < 0)
+ goto out_kill;
+
+ ret = move_veth_to_bridge();
+ if (ret < 0)
+ goto out_kill;
+
+ ret = prepare_cgroup_properties();
+ if (ret < 0)
+ goto out_kill;
+
+ ret = run_scripts(ACT_POST_RESTORE);
+ if (ret != 0) {
+ pr_err("Aborting restore due to post-restore script ret code %d\n", ret);
+ timing_stop(TIME_RESTORE);
+ write_stats(RESTORE_STATS);
+ goto out_kill;
+ }
+
+ /* Unlock network before disabling repair mode on sockets */
+ network_unlock();
+
+ /*
+ * Stop getting sigchld, after we resume the tasks they
+ * may start to exit poking criu in vain.
+ */
+ ignore_kids();
+
+ /*
+ * -------------------------------------------------------------
+ * Below this line nothing should fail, because network is unlocked
+ */
+ attach_to_tasks(root_as_sibling);
+
+ ret = restore_switch_stage(CR_STATE_RESTORE_CREDS);
+ BUG_ON(ret);
+
+ timing_stop(TIME_RESTORE);
+
+ /*
+ * From here on @ret only gates the optional steps below
+ * (parasite_stop_on_syscall, finalize_restore); the function still
+ * returns 0 whatever its value.
+ */
+ ret = catch_tasks(root_as_sibling, &flag);
+
+ pr_info("Restore finished successfully. Resuming tasks.\n");
+ futex_set_and_wake(&task_entries->start, CR_STATE_COMPLETE);
+
+ if (ret == 0)
+ ret = parasite_stop_on_syscall(task_entries->nr_threads,
+ __NR_rt_sigreturn, flag);
+
+ if (clear_breakpoints())
+ pr_err("Unable to flush breakpoints\n");
+
+ if (ret == 0)
+ finalize_restore();
+
+ if (restore_freezer_state())
+ pr_err("Unable to restore freezer state\n");
+
+ fini_cgroup();
+
+ /* Detaches from processes and they continue run through sigreturn. */
+ finalize_restore_detach(ret);
+
+ write_stats(RESTORE_STATS);
+
+ if (!opts.restore_detach && !opts.exec_cmd)
+ wait(NULL);
+
+ return 0;
+
+out_kill:
+ /*
+ * The processes can be killed only when all of them have been created,
+ * otherwise external processes can be killed.
+ */
+ if (root_ns_mask & CLONE_NEWPID) {
+ int status;
+
+ /* Kill init */
+ if (root_item->pid.real > 0)
+ kill(root_item->pid.real, SIGKILL);
+
+ /*
+ * NOTE(review): waitpid() runs even when pid.real is not
+ * positive, and the warning below has no trailing "\n" --
+ * confirm whether both are intended.
+ */
+ if (waitpid(root_item->pid.real, &status, 0) < 0)
+ pr_warn("Unable to wait %d: %s",
+ root_item->pid.real, strerror(errno));
+ } else {
+ struct pstree_item *pi;
+
+ /* Without a pid namespace every item with a positive virt pid is signalled. */
+ for_each_pstree_item(pi)
+ if (pi->pid.virt > 0)
+ kill(pi->pid.virt, SIGKILL);
+ }
+
+out:
+ fini_cgroup();
+ if (clean_remaps)
+ try_clean_remaps(mnt_ns_fd);
+ cleanup_mnt_ns();
+ stop_usernsd();
+ __restore_switch_stage(CR_STATE_FAIL);
+ pr_err("Restoring FAILED.\n");
+ return -1;
+}
+
+/*
+ * Allocate the shared task_entries structure in RM_SHREMAP restore memory,
+ * remember its position in task_entries_pos and reset all of its fields.
+ *
+ * Returns 0 on success, -1 when the allocation fails.
+ */
+static int prepare_task_entries(void)
+{
+	task_entries_pos = rst_mem_align_cpos(RM_SHREMAP);
+
+	task_entries = rst_mem_alloc(sizeof(*task_entries), RM_SHREMAP);
+	if (task_entries == NULL) {
+		pr_perror("Can't map shmem");
+		return -1;
+	}
+
+	/* Plain counters. */
+	task_entries->nr_helpers = 0;
+	task_entries->nr_tasks = 0;
+	task_entries->nr_threads = 0;
+
+	/* Synchronization state; the start futex begins at RESTORE_NS. */
+	atomic_set(&task_entries->nr_zombies, 0);
+	futex_set(&task_entries->start, CR_STATE_RESTORE_NS);
+	mutex_init(&task_entries->userns_sync_lock);
+
+	return 0;
+}
+
+/*
+ * Entry point of the restore: initialize plugins, run every preparation
+ * step in order and finally hand over to restore_root_task().
+ *
+ * Returns the result of restore_root_task(), or -1 when any preparation
+ * step fails. Once plugin init succeeded, cr_plugin_fini() is always
+ * called with the final result.
+ */
+int cr_restore_tasks(void)
+{
+	int ret = -1;
+
+	if (cr_plugin_init(CR_PLUGIN_STAGE__RESTORE))
+		return -1;
+
+	/* Short-circuit evaluation keeps the original call order. */
+	if (check_img_inventory() < 0 ||
+	    init_stats(RESTORE_STATS) ||
+	    kerndat_init_rst())
+		goto err;
+
+	timing_start(TIME_RESTORE);
+
+	if (cpu_init() < 0 || vdso_init())
+		goto err;
+
+	/* cpuinfo is validated only when one of these capability bits is set. */
+	if ((opts.cpu_cap & (CPU_CAP_INS | CPU_CAP_CPU)) &&
+	    cpu_validate_cpuinfo())
+		goto err;
+
+	if (prepare_task_entries() < 0 ||
+	    prepare_pstree() < 0 ||
+	    crtools_prepare_shared() < 0 ||
+	    criu_signals_setup() < 0)
+		goto err;
+
+	ret = restore_root_task(root_item);
+err:
+	cr_plugin_fini(CR_PLUGIN_STAGE__RESTORE, ret);
+	return ret;
+}
+
+/*
+ * Find the lowest address at which a region of @vma_len bytes fits without
+ * intersecting any VMA from @tgt_vma_list or @self_vma_list.
+ *
+ * Both lists are walked in parallel with one cursor each; the code relies
+ * on list order matching address order (presumably both lists are sorted
+ * by start address -- verify against the callers). A stack-local sentinel
+ * VMA placed at kdat.task_size terminates each walk.
+ *
+ * Whenever the candidate window [prev_vma_end, prev_vma_end + vma_len)
+ * reaches into the VMA under a cursor, the candidate is moved past the end
+ * of that VMA before the cursor advances. This also holds for the last VMA
+ * of a list, so the returned hint never overlaps it.
+ *
+ * Returns the start address of the hole, or -1 when no hole below
+ * kdat.task_size is large enough.
+ */
+static long restorer_get_vma_hint(struct list_head *tgt_vma_list,
+		struct list_head *self_vma_list, long vma_len)
+{
+	struct vma_area *t_vma, *s_vma;
+	long prev_vma_end;
+	struct vma_area end_vma;
+	VmaEntry end_e;
+
+	/* Only ->e of the sentinel is valid; its list node is never read. */
+	end_vma.e = &end_e;
+	end_e.start = end_e.end = kdat.task_size;
+	prev_vma_end = PAGE_SIZE * 0x10; /* CONFIG_LSM_MMAP_MIN_ADDR=65536 */
+
+	/* An empty list has no first entry: start right at the sentinel. */
+	if (self_vma_list->next == self_vma_list)
+		s_vma = &end_vma;
+	else
+		s_vma = list_first_entry(self_vma_list, struct vma_area, list);
+
+	if (tgt_vma_list->next == tgt_vma_list)
+		t_vma = &end_vma;
+	else
+		t_vma = list_first_entry(tgt_vma_list, struct vma_area, list);
+
+	while (1) {
+		if (prev_vma_end + vma_len > s_vma->e->start) {
+			/* Ran into the end of the address space. */
+			if (s_vma == &end_vma)
+				break;
+
+			/* Skip past this VMA, then move the cursor on. */
+			if (prev_vma_end < s_vma->e->end)
+				prev_vma_end = s_vma->e->end;
+
+			if (s_vma->list.next == self_vma_list)
+				s_vma = &end_vma;
+			else
+				s_vma = list_entry(s_vma->list.next, struct vma_area, list);
+			continue;
+		}
+
+		if (prev_vma_end + vma_len > t_vma->e->start) {
+			if (t_vma == &end_vma)
+				break;
+
+			if (prev_vma_end < t_vma->e->end)
+				prev_vma_end = t_vma->e->end;
+
+			if (t_vma->list.next == tgt_vma_list)
+				t_vma = &end_vma;
+			else
+				t_vma = list_entry(t_vma->list.next, struct vma_area, list);
+			continue;
+		}
+
+		/* The window clears both cursors: this is the hole. */
+		return prev_vma_end;
+	}
+
+	return -1;
+}
+
+/*
+ * Return non-zero when @tv has a non-negative seconds part and a
+ * microseconds part in the range [0, USEC_PER_SEC).
+ */
+static inline int timeval_valid(struct timeval *tv)
+{
+	if (tv->tv_sec < 0)
+		return 0;
+
+	/* The unsigned cast makes a negative tv_usec compare as a huge value. */
+	return (unsigned long)tv->tv_usec < USEC_PER_SEC;
+}
+
+/*
+ * Convert the image entry @ie into the itimerval @val.
+ *
+ * @n is the timer name and is only used in the log message. An entry whose
+ * interval is zero yields a zeroed @val. Returns 0 on success, -1 when the
+ * decoded interval or value is not a valid timeval.
+ */
+static inline int decode_itimer(char *n, ItimerEntry *ie, struct itimerval *val)
+{
+	struct timeval *interval = &val->it_interval;
+	struct timeval *value = &val->it_value;
+	int no_remaining;
+
+	if (!ie->isec && !ie->iusec) {
+		memzero_p(val);
+		return 0;
+	}
+
+	interval->tv_sec = ie->isec;
+	interval->tv_usec = ie->iusec;
+	if (!timeval_valid(interval)) {
+		pr_err("Invalid timer interval\n");
+		return -1;
+	}
+
+	/*
+	 * When the remaining time was too short it was recorded as zero;
+	 * use the interval instead so the timer is armed and works.
+	 */
+	no_remaining = (ie->vsec == 0 && ie->vusec == 0);
+	value->tv_sec = no_remaining ? ie->isec : ie->vsec;
+	value->tv_usec = no_remaining ? ie->iusec : ie->vusec;
+	if (!timeval_valid(value)) {
+		pr_err("Invalid timer value\n");
+		return -1;
+	}
+
+	pr_info("Restored %s timer to %ld.%ld -> %ld.%ld\n", n,
+		value->tv_sec, value->tv_usec,
+		interval->tv_sec, interval->tv_usec);
+
+	return 0;
+}
+
+/*
+ * Legacy itimers restore from CR_FD_ITIMERS
+ */
+
+/*
+ * Read the three itimer entries ("real", "virt", "prof", in this order)
+ * from the CR_FD_ITIMERS image of @pid into args->itimers[0..2].
+ *
+ * Returns -1 when the image cannot be opened; otherwise the result of the
+ * last read/decode step, which is negative on failure.
+ */
+static int prepare_itimers_from_fd(int pid, struct task_restore_args *args)
+{
+	char *names[] = { "real", "virt", "prof" };
+	struct cr_img *img;
+	int idx, ret = -1;
+
+	img = open_image(CR_FD_ITIMERS, O_RSTR, pid);
+	if (!img)
+		return -1;
+
+	for (idx = 0; idx < 3; idx++) {
+		ItimerEntry *ie;
+
+		ret = pb_read_one(img, &ie, PB_ITIMER);
+		if (ret < 0)
+			break;
+
+		/* The entry is released whether or not decoding worked. */
+		ret = decode_itimer(names[idx], ie, &args->itimers[idx]);
+		itimer_entry__free_unpacked(ie, NULL);
+		if (ret < 0)
+			break;
+	}
+
+	close_image(img);
+	return ret;
+}
+
+/*
+ * Fill args->itimers[] from the timers stored in @core, falling back to
+ * the legacy per-task image when the core carries none.
+ *
+ * All three timers are always decoded; the results are OR-ed together, so
+ * the return value is 0 only when every one of them decoded cleanly.
+ */
+static int prepare_itimers(int pid, CoreEntry *core, struct task_restore_args *args)
+{
+	TaskTimersEntry *tte = core->tc->timers;
+	int real_ret, virt_ret, prof_ret;
+
+	if (tte == NULL)
+		return prepare_itimers_from_fd(pid, args);
+
+	real_ret = decode_itimer("real", tte->real, &args->itimers[0]);
+	virt_ret = decode_itimer("virt", tte->virt, &args->itimers[1]);
+	prof_ret = decode_itimer("prof", tte->prof, &args->itimers[2]);
+
+	return real_ret | virt_ret | prof_ret;
+}
+
+/*
+ * Return non-zero when @ts has a non-negative seconds part and a
+ * nanoseconds part in the range [0, NSEC_PER_SEC).
+ */
+static inline int timespec_valid(struct timespec *ts)
+{
+	if (ts->tv_sec < 0)
+		return 0;
+
+	/* The unsigned cast makes a negative tv_nsec compare as a huge value. */
+	return (unsigned long)ts->tv_nsec < NSEC_PER_SEC;
+}
+
+/*
+ * Convert the image entry @pte into the restorer representation @pt.
+ *
+ * Returns 0 on success, -1 when the decoded interval or value is not a
+ * valid timespec. The identification fields of @pt are filled only after
+ * both checks passed.
+ */
+static inline int decode_posix_timer(PosixTimerEntry *pte,
+		struct restore_posix_timer *pt)
+{
+	int no_remaining = (pte->vsec == 0 && pte->vnsec == 0);
+
+	pt->val.it_interval.tv_sec = pte->isec;
+	pt->val.it_interval.tv_nsec = pte->insec;
+	if (!timespec_valid(&pt->val.it_interval)) {
+		pr_err("Invalid timer interval(posix)\n");
+		return -1;
+	}
+
+	/*
+	 * When the remaining time was too short it was recorded as zero;
+	 * use the interval instead so the timer is armed and works.
+	 */
+	pt->val.it_value.tv_sec = no_remaining ? pte->isec : pte->vsec;
+	pt->val.it_value.tv_nsec = no_remaining ? pte->insec : pte->vnsec;
+	if (!timespec_valid(&pt->val.it_value)) {
+		pr_err("Invalid timer value(posix)\n");
+		return -1;
+	}
+
+	pt->overrun = pte->overrun;
+	pt->spt.sival_ptr = decode_pointer(pte->sival_ptr);
+	pt->spt.it_sigev_notify = pte->it_sigev_notify;
+	pt->spt.si_signo = pte->si_signo;
+	pt->spt.clock_id = pte->clock_id;
+	pt->spt.it_id = pte->it_id;
+
+	return 0;
+}
+
+/*
+ * qsort() comparator ordering restore_posix_timer records by spt.it_id.
+ *
+ * The result is built from two comparisons rather than a subtraction, so
+ * it cannot overflow for ids that are far apart. The arguments stay
+ * const-qualified, as the qsort() contract expects.
+ */
+static int cmp_posix_timer_proc_id(const void *p1, const void *p2)
+{
+	const struct restore_posix_timer *a = p1;
+	const struct restore_posix_timer *b = p2;
+
+	return (a->spt.it_id > b->spt.it_id) - (a->spt.it_id < b->spt.it_id);
+}
+
+static unsigned long posix_timers_cpos;
+static unsigned int posix_timers_nr;
+
+/*
+ * Sort the posix_timers_nr timer records that start at posix_timers_cpos
+ * in RM_PRIVATE memory by their timer id.
+ *
+ * This is required for restorer's create_posix_timers(), it will probe
+ * them one-by-one for the desired ID, since kernel doesn't provide
+ * another API for timer creation with given ID.
+ */
+static void sort_posix_timers(void)
+{
+	struct restore_posix_timer *timers;
+
+	if (posix_timers_nr == 0)
+		return;
+
+	timers = rst_mem_remap_ptr(posix_timers_cpos, RM_PRIVATE);
+	qsort(timers, posix_timers_nr, sizeof(struct restore_posix_timer),
+	      cmp_posix_timer_proc_id);
+}
+
+/*
+ * Legacy posix timers restoration from CR_FD_POSIX_TIMERS
+ */
+
+/*
+ * Read every posix timer from the legacy CR_FD_POSIX_TIMERS image of @pid
+ * into RM_PRIVATE memory, bumping posix_timers_nr for each one, and sort
+ * the result.
+ *
+ * Returns 0 on success and a negative value on failure. A failed
+ * allocation is reported as -1, and the unpacked entry is released on
+ * every path.
+ */
+static int prepare_posix_timers_from_fd(int pid)
+{
+	struct cr_img *img;
+	int ret = -1;
+	struct restore_posix_timer *t;
+
+	img = open_image(CR_FD_POSIX_TIMERS, O_RSTR, pid);
+	if (!img)
+		return -1;
+
+	while (1) {
+		PosixTimerEntry *pte;
+
+		ret = pb_read_one_eof(img, &pte, PB_POSIX_TIMER);
+		if (ret <= 0)
+			break;
+
+		t = rst_mem_alloc(sizeof(struct restore_posix_timer), RM_PRIVATE);
+		if (t)
+			ret = decode_posix_timer(pte, t);
+		else
+			ret = -1; /* don't let the positive read result leak out */
+
+		posix_timer_entry__free_unpacked(pte, NULL);
+		if (ret < 0)
+			break;
+
+		posix_timers_nr++;
+	}
+
+	close_image(img);
+	if (!ret)
+		sort_posix_timers();
+
+	return ret;
+}
+
+/*
+ * Place the posix timers of the task into RM_PRIVATE memory, starting at
+ * posix_timers_cpos, and sort them by id.
+ *
+ * Timers come from @core when present, otherwise from the legacy image of
+ * @pid. Returns 0 on success, -1 when an allocation or a decode fails.
+ */
+static int prepare_posix_timers(int pid, CoreEntry *core)
+{
+	TaskTimersEntry *tte = core->tc->timers;
+	int idx;
+
+	posix_timers_cpos = rst_mem_align_cpos(RM_PRIVATE);
+
+	if (tte == NULL)
+		return prepare_posix_timers_from_fd(pid);
+
+	posix_timers_nr = tte->n_posix;
+	for (idx = 0; idx < posix_timers_nr; idx++) {
+		struct restore_posix_timer *slot;
+
+		slot = rst_mem_alloc(sizeof(struct restore_posix_timer), RM_PRIVATE);
+		if (slot == NULL)
+			return -1;
+
+		if (decode_posix_timer(tte->posix[idx], slot))
+			return -1;
+	}
+
+	sort_posix_timers();
+	return 0;
+}
+
+/*
+ * Return 1 when all four capability arrays of @ce (inheritable, effective,
+ * permitted, bounding) hold exactly CR_CAP_SIZE elements, 0 otherwise.
+ */
+static inline int verify_cap_size(CredsEntry *ce)
+{
+	if (ce->n_cap_inh != CR_CAP_SIZE || ce->n_cap_eff != CR_CAP_SIZE)
+		return 0;
+
+	if (ce->n_cap_prm != CR_CAP_SIZE || ce->n_cap_bnd != CR_CAP_SIZE)
+		return 0;
+
+	return 1;
+}
+
+/*
+ * Copy the mm description of the current task into @args and open its
+ * executable.
+ *
+ * The saved auxv is copied element by element into args->mm_saved_auxv;
+ * the pointer/count pair inside args->mm is cleared. Returns 0 on success,
+ * -1 when the image holds more than AT_VECTOR_SIZE auxv entries or the
+ * exe file cannot be opened.
+ */
+static int prepare_mm(pid_t pid, struct task_restore_args *args)
+{
+	MmEntry *mm = rsti(current)->mm;
+	int idx, exe_fd;
+
+	args->mm = *mm;
+	args->mm.n_mm_saved_auxv = 0;
+	args->mm.mm_saved_auxv = NULL;
+
+	if (mm->n_mm_saved_auxv > AT_VECTOR_SIZE) {
+		pr_err("Image corrupted on pid %d\n", pid);
+		return -1;
+	}
+
+	args->mm_saved_auxv_size = mm->n_mm_saved_auxv*sizeof(auxv_t);
+	for (idx = 0; idx < mm->n_mm_saved_auxv; idx++)
+		args->mm_saved_auxv[idx] = (auxv_t)mm->mm_saved_auxv[idx];
+
+	exe_fd = open_reg_by_id(mm->exe_file_id);
+	if (exe_fd < 0)
+		return -1;
+
+	args->fd_exe_link = exe_fd;
+	return 0;
+}
+
+static void *restorer;
+static unsigned long restorer_len;
+
+/*
+ * Map an anonymous RWX region of pie_size(restorer_blob) bytes and copy
+ * the restorer blob into it. The mapping is remembered in @restorer and
+ * its length in @restorer_len.
+ *
+ * Returns 0 on success, -1 when the mapping fails.
+ */
+static int prepare_restorer_blob(void)
+{
+	/*
+	 * We map anonymous mapping, not mremap the restorer itself later.
+	 * Otherwise the restorer vma would be tied to criu binary which
+	 * in turn will lead to set-exe-file prctl to fail with EBUSY.
+	 */
+
+	restorer_len = pie_size(restorer_blob);
+	/*
+	 * fd is -1: it is ignored for anonymous mappings, but portable
+	 * code is expected to pass -1 rather than a real descriptor.
+	 */
+	restorer = mmap(NULL, restorer_len,
+			PROT_READ | PROT_WRITE | PROT_EXEC,
+			MAP_PRIVATE | MAP_ANON, -1, 0);
+	if (restorer == MAP_FAILED) {
+		pr_perror("Can't map restorer code");
+		return -1;
+	}
+
+	memcpy(restorer, &restorer_blob, sizeof(restorer_blob));
+	return 0;
+}
+
+/*
+ * Move the restorer mapping to the fixed address @addr and apply the ELF
+ * relocations for the new location.
+ *
+ * Returns 0 on success, -1 when mremap() did not place the blob at @addr.
+ */
+static int remap_restorer_blob(void *addr)
+{
+	void *moved;
+
+	moved = mremap(restorer, restorer_len, restorer_len,
+		       MREMAP_FIXED | MREMAP_MAYMOVE, addr);
+	if (moved == addr) {
+		ELF_RELOCS_APPLY_RESTORER(addr, addr);
+		return 0;
+	}
+
+	pr_perror("Can't remap restorer blob");
+	return -1;
+}
+
+/*
+ * Sanity-check scheduler parameters read from an image.
+ *
+ * Nice must be within [-20, 19]. SCHED_RR and SCHED_FIFO need a priority
+ * in (0, 100); SCHED_IDLE, SCHED_OTHER and SCHED_BATCH need priority 0.
+ * Any other policy is rejected. Returns non-zero when @sp is acceptable.
+ */
+static int validate_sched_parm(struct rst_sched_param *sp)
+{
+	if (sp->nice < -20 || sp->nice > 19)
+		return 0;
+
+	if (sp->policy == SCHED_RR || sp->policy == SCHED_FIFO)
+		return sp->prio > 0 && sp->prio < 100;
+
+	if (sp->policy == SCHED_IDLE || sp->policy == SCHED_OTHER ||
+	    sp->policy == SCHED_BATCH)
+		return sp->prio == 0;
+
+	return 0;
+}
+
+/*
+ * Fill @sp with the scheduler settings recorded in @tc.
+ *
+ * Without a recorded policy, SCHED_OTHER with nice 0 is used and prio is
+ * left untouched. Returns 0 on success, -1 when the recorded values fail
+ * validate_sched_parm().
+ */
+static int prep_sched_info(struct rst_sched_param *sp, ThreadCoreEntry *tc)
+{
+	if (tc->has_sched_policy) {
+		sp->policy = tc->sched_policy;
+		sp->nice = tc->sched_nice;
+		sp->prio = tc->sched_prio;
+
+		if (validate_sched_parm(sp))
+			return 0;
+
+		pr_err("Inconsistent sched params received (%d.%d.%d)\n",
+		       sp->policy, sp->nice, sp->prio);
+		return -1;
+	}
+
+	sp->policy = SCHED_OTHER;
+	sp->nice = 0;
+	return 0;
+}
+
+/*
+ * Translate a limit value stored in an image: the all-ones value (-1)
+ * stands for RLIM_INFINITY, everything else is taken as is.
+ */
+static unsigned long decode_rlim(u_int64_t ival)
+{
+	if (ival == -1)
+		return RLIM_INFINITY;
+
+	return ival;
+}
+
+static unsigned long rlims_cpos;
+static unsigned int rlims_nr;
+
+/*
+ * Legacy rlimits restore from CR_FD_RLIMIT
+ *
+ * Reads every rlimit entry of @pid from the old per-task image into
+ * RM_PRIVATE memory, bumping rlims_nr for each one. A soft limit above the
+ * hard limit is clamped to the hard limit with a warning.
+ *
+ * Returns 0 on success and -1 when the image cannot be opened, an entry
+ * cannot be read or memory runs out. The image is closed and the unpacked
+ * entry released on every path.
+ */
+
+static int prepare_rlimits_from_fd(int pid)
+{
+	struct rlimit *r;
+	int ret;
+	struct cr_img *img;
+
+	/*
+	 * Old image -- read from the file.
+	 */
+	img = open_image(CR_FD_RLIMIT, O_RSTR, pid);
+	if (!img)
+		return -1;
+
+	while (1) {
+		RlimitEntry *re;
+
+		ret = pb_read_one_eof(img, &re, PB_RLIMIT);
+		if (ret <= 0)
+			break;
+
+		r = rst_mem_alloc(sizeof(*r), RM_PRIVATE);
+		if (!r) {
+			pr_err("Can't allocate memory for resource %d\n",
+			       rlims_nr);
+			rlimit_entry__free_unpacked(re, NULL);
+			ret = -1;
+			break;
+		}
+
+		r->rlim_cur = decode_rlim(re->cur);
+		r->rlim_max = decode_rlim(re->max);
+		if (r->rlim_cur > r->rlim_max) {
+			/* Recoverable: same handling and level as prepare_rlimits(). */
+			pr_warn("Can't restore cur > max for %d.%d\n",
+				pid, rlims_nr);
+			r->rlim_cur = r->rlim_max;
+		}
+
+		rlimit_entry__free_unpacked(re, NULL);
+
+		rlims_nr++;
+	}
+
+	close_image(img);
+
+	/* A negative read result is an error, not a clean end of file. */
+	return ret < 0 ? -1 : 0;
+}
+
+/*
+ * Place the resource limits of the task into RM_PRIVATE memory, starting
+ * at rlims_cpos, and record their number in rlims_nr.
+ *
+ * Limits come from @core when present, otherwise from the legacy image of
+ * @pid. A soft limit above the hard limit is clamped with a warning.
+ * Returns 0 on success, -1 when memory runs out.
+ */
+static int prepare_rlimits(int pid, CoreEntry *core)
+{
+	TaskRlimitsEntry *rls = core->tc->rlimits;
+	int idx;
+
+	rlims_cpos = rst_mem_align_cpos(RM_PRIVATE);
+
+	if (rls == NULL)
+		return prepare_rlimits_from_fd(pid);
+
+	for (idx = 0; idx < rls->n_rlimits; idx++) {
+		struct rlimit *dst;
+
+		dst = rst_mem_alloc(sizeof(*dst), RM_PRIVATE);
+		if (dst == NULL) {
+			pr_err("Can't allocate memory for resource %d\n", idx);
+			return -1;
+		}
+
+		dst->rlim_cur = decode_rlim(rls->rlimits[idx]->cur);
+		dst->rlim_max = decode_rlim(rls->rlimits[idx]->max);
+		if (dst->rlim_cur <= dst->rlim_max)
+			continue;
+
+		pr_warn("Can't restore cur > max for %d.%d\n", pid, idx);
+		dst->rlim_cur = dst->rlim_max;
+	}
+
+	rlims_nr = rls->n_rlimits;
+	return 0;
+}
+
+/*
+ * Append the siginfo carried by @sie to RM_PRIVATE memory.
+ *
+ * The payload length is checked before copying, so a malformed entry can
+ * never cause a read past the end of its data buffer. Returns 0 on
+ * success, -1 on a size mismatch or when memory runs out.
+ */
+static int signal_to_mem(SiginfoEntry *sie)
+{
+	siginfo_t *t;
+
+	if (sie->siginfo.len != sizeof(siginfo_t)) {
+		pr_err("Unknown image format\n");
+		return -1;
+	}
+
+	t = rst_mem_alloc(sizeof(siginfo_t), RM_PRIVATE);
+	if (!t)
+		return -1;
+
+	/* Copy straight from the byte buffer; no siginfo_t-typed access to it. */
+	memcpy(t, sie->siginfo.data, sizeof(*t));
+
+	return 0;
+}
+
+/*
+ * Read all siginfo entries from the image of kind @type belonging to @pid
+ * into RM_PRIVATE memory and store their count in @nr.
+ *
+ * Returns 0 on success, a negative value on failure. The unpacked entry is
+ * released on every path, including the error ones.
+ */
+static int open_signal_image(int type, pid_t pid, unsigned int *nr)
+{
+	int ret;
+	struct cr_img *img;
+
+	img = open_image(type, O_RSTR, pid);
+	if (!img)
+		return -1;
+
+	*nr = 0;
+	while (1) {
+		SiginfoEntry *sie;
+
+		ret = pb_read_one_eof(img, &sie, PB_SIGINFO);
+		if (ret <= 0)
+			break;
+
+		if (sie->siginfo.len != sizeof(siginfo_t)) {
+			pr_err("Unknown image format\n");
+			ret = -1;
+		} else
+			ret = signal_to_mem(sie);
+
+		siginfo_entry__free_unpacked(sie, NULL);
+		if (ret)
+			break;
+
+		(*nr)++;
+	}
+
+	close_image(img);
+
+	return ret;
+}
+
+/*
+ * Append every signal of the queue @sqe to RM_PRIVATE memory and store
+ * the queue length in @nr.
+ *
+ * Returns 0 on success; on the first failing signal returns -1 and leaves
+ * @nr untouched.
+ */
+static int prepare_one_signal_queue(SignalQueueEntry *sqe, unsigned int *nr)
+{
+	int idx = 0;
+
+	while (idx < sqe->n_signals) {
+		if (signal_to_mem(sqe->signals[idx]))
+			return -1;
+		idx++;
+	}
+
+	*nr = sqe->n_signals;
+	return 0;
+}
+
+static unsigned long siginfo_cpos;
+static unsigned int siginfo_nr, *siginfo_priv_nr;
+
+/*
+ * Collect the pending signals of the current task into RM_PRIVATE memory,
+ * starting at siginfo_cpos: first the shared queue (counted in
+ * siginfo_nr), then the private queue of each thread (counted in
+ * siginfo_priv_nr[i]).
+ *
+ * Queues come from the core entries when present, otherwise from the
+ * legacy signal images. Returns a negative value on failure, otherwise
+ * the result of the last queue that was prepared.
+ */
+static int prepare_signals(int pid, CoreEntry *leader_core)
+{
+	SignalQueueEntry *queue;
+	int ret, idx;
+
+	siginfo_cpos = rst_mem_align_cpos(RM_PRIVATE);
+	siginfo_priv_nr = xmalloc(sizeof(int) * current->nr_threads);
+	if (!siginfo_priv_nr)
+		return -1;
+
+	/* Prepare shared signals */
+	queue = leader_core->tc->signals_s;
+	if (queue)
+		ret = prepare_one_signal_queue(queue, &siginfo_nr);
+	else /* backward compatibility */
+		ret = open_signal_image(CR_FD_SIGNAL, pid, &siginfo_nr);
+	if (ret < 0)
+		return ret;
+
+	/* Then the private queue of every thread. */
+	for (idx = 0; idx < current->nr_threads; idx++) {
+		queue = current->core[idx]->thread_core->signals_p;
+		if (queue)
+			ret = prepare_one_signal_queue(queue, &siginfo_priv_nr[idx]);
+		else /* backward compatibility */
+			ret = open_signal_image(CR_FD_PSIGNAL,
+					current->threads[idx].virt,
+					&siginfo_priv_nr[idx]);
+		if (ret < 0)
+			return ret;
+	}
+
+	return ret;
+}
+
+/*
+ * Weak, empty definition of __gcov_flush(), so the call in
+ * sigreturn_restore() resolves even when no other definition is linked in.
+ */
+extern void __gcov_flush(void) __attribute__((weak));
+void __gcov_flush(void)
+{
+}
+
+/*
+ * Attach the creds record found at *@creds_pos_next to @thread_args and
+ * advance *@creds_pos_next to the next record of the chain.
+ *
+ * The lsm_profile and groups pointers inside the record are recomputed
+ * from their saved positions. Does nothing when *@creds_pos_next is zero.
+ */
+static void rst_reloc_creds(struct thread_restore_args *thread_args,
+		unsigned long *creds_pos_next)
+{
+	struct thread_creds_args *creds;
+	unsigned long pos = *creds_pos_next;
+
+	if (unlikely(!pos))
+		return;
+
+	creds = rst_mem_remap_ptr(pos, RM_PRIVATE);
+
+	/* Only pointers that were set get recomputed. */
+	if (creds->lsm_profile)
+		creds->lsm_profile = rst_mem_remap_ptr(creds->mem_lsm_profile_pos, RM_PRIVATE);
+	if (creds->groups)
+		creds->groups = rst_mem_remap_ptr(creds->mem_groups_pos, RM_PRIVATE);
+
+	*creds_pos_next = creds->mem_pos_next;
+	thread_args->creds_args = creds;
+}
+
+/*
+ * Build a thread_creds_args record in RM_PRIVATE memory from the image
+ * entry @ce: capability sets, optional LSM profile and optional
+ * supplementary groups.
+ *
+ * When @prev_pos is not NULL the new record is linked after the record at
+ * *@prev_pos (if any) and *@prev_pos is updated to the new record.
+ *
+ * Returns the record, or ERR_PTR(-EINVAL) / ERR_PTR(-ENOMEM) on failure.
+ *
+ * After each further rst_mem_alloc() the @args pointer is re-fetched from
+ * its position -- presumably because the allocation may move the
+ * underlying memory (verify against rst_mem_alloc()).
+ */
+static struct thread_creds_args *
+rst_prep_creds_args(CredsEntry *ce, unsigned long *prev_pos)
+{
+	unsigned long this_pos;
+	struct thread_creds_args *args;
+
+	if (!verify_cap_size(ce)) {
+		pr_err("Caps size mismatch %d %d %d %d\n",
+		       (int)ce->n_cap_inh, (int)ce->n_cap_eff,
+		       (int)ce->n_cap_prm, (int)ce->n_cap_bnd);
+		return ERR_PTR(-EINVAL);
+	}
+
+	this_pos = rst_mem_align_cpos(RM_PRIVATE);
+
+	args = rst_mem_alloc(sizeof(*args), RM_PRIVATE);
+	if (!args)
+		return ERR_PTR(-ENOMEM);
+
+	args->cap_last_cap = kdat.last_cap;
+	memcpy(&args->creds, ce, sizeof(args->creds));
+
+	/*
+	 * No profile by default; overridden below only when a profile has
+	 * actually been rendered, so these fields are never left unset.
+	 */
+	args->lsm_profile = NULL;
+	args->mem_lsm_profile_pos = 0;
+
+	if (ce->lsm_profile || opts.lsm_supplied) {
+		/* Must start as NULL: rendering is skipped for a NULL profile. */
+		char *rendered = NULL, *profile;
+
+		profile = ce->lsm_profile;
+		if (opts.lsm_supplied)
+			profile = opts.lsm_profile;
+
+		if (validate_lsm(profile) < 0)
+			return ERR_PTR(-EINVAL);
+
+		if (profile && render_lsm_profile(profile, &rendered)) {
+			return ERR_PTR(-EINVAL);
+		}
+
+		if (rendered) {
+			size_t lsm_profile_len;
+			char *lsm_profile;
+
+			args->mem_lsm_profile_pos = rst_mem_align_cpos(RM_PRIVATE);
+			lsm_profile_len = strlen(rendered);
+			lsm_profile = rst_mem_alloc(lsm_profile_len + 1, RM_PRIVATE);
+			if (!lsm_profile) {
+				xfree(rendered);
+				return ERR_PTR(-ENOMEM);
+			}
+
+			args = rst_mem_remap_ptr(this_pos, RM_PRIVATE);
+			args->lsm_profile = lsm_profile;
+			/* Copy the terminating NUL as well. */
+			memcpy(args->lsm_profile, rendered, lsm_profile_len + 1);
+			xfree(rendered);
+		}
+	}
+
+	/*
+	 * Zap fields which we can't use.
+	 */
+	args->creds.cap_inh = NULL;
+	args->creds.cap_eff = NULL;
+	args->creds.cap_prm = NULL;
+	args->creds.cap_bnd = NULL;
+	args->creds.groups = NULL;
+	args->creds.lsm_profile = NULL;
+
+	memcpy(args->cap_inh, ce->cap_inh, sizeof(args->cap_inh));
+	memcpy(args->cap_eff, ce->cap_eff, sizeof(args->cap_eff));
+	memcpy(args->cap_prm, ce->cap_prm, sizeof(args->cap_prm));
+	memcpy(args->cap_bnd, ce->cap_bnd, sizeof(args->cap_bnd));
+
+	if (ce->n_groups) {
+		unsigned int *groups;
+
+		args->mem_groups_pos = rst_mem_align_cpos(RM_PRIVATE);
+		groups = rst_mem_alloc(ce->n_groups * sizeof(u32), RM_PRIVATE);
+		if (!groups)
+			return ERR_PTR(-ENOMEM);
+		args = rst_mem_remap_ptr(this_pos, RM_PRIVATE);
+		args->groups = groups;
+		memcpy(args->groups, ce->groups, ce->n_groups * sizeof(u32));
+	} else {
+		args->groups = NULL;
+		args->mem_groups_pos = 0;
+	}
+
+	args->mem_pos_next = 0;
+
+	if (prev_pos) {
+		if (*prev_pos) {
+			struct thread_creds_args *prev;
+
+			prev = rst_mem_remap_ptr(*prev_pos, RM_PRIVATE);
+			prev->mem_pos_next = this_pos;
+		}
+		*prev_pos = this_pos;
+	}
+	return args;
+}
+
+/*
+ * Old image format: read the single creds entry of task @pid from its own
+ * CR_FD_CREDS image and build the creds record from it.
+ *
+ * Returns 0 on success, -ENOENT when the image cannot be opened, the
+ * (non-positive) result of pb_read_one() when nothing was read, or the
+ * error from rst_prep_creds_args().
+ */
+static int rst_prep_creds_from_img(pid_t pid)
+{
+	CredsEntry *ce = NULL;
+	struct cr_img *img;
+	int ret;
+
+	img = open_image(CR_FD_CREDS, O_RSTR, pid);
+	if (!img)
+		return -ENOENT;
+
+	ret = pb_read_one(img, &ce, PB_CREDS);
+	close_image(img);
+
+	if (ret > 0) {
+		struct thread_creds_args *args;
+
+		args = rst_prep_creds_args(ce, NULL);
+		if (IS_ERR(args))
+			ret = PTR_ERR(args);
+		else
+			ret = 0;
+	}
+
+	/*
+	 * Nothing was unpacked when the read failed; don't hand a NULL
+	 * message to the generated free routine.
+	 */
+	if (ce)
+		creds_entry__free_unpacked(ce, NULL);
+	return ret;
+}
+
+/*
+ * Prepare creds records for every thread of the current task and store
+ * the position of the first one in *@creds_pos.
+ *
+ * Returns 0 on success or the error produced while building a record.
+ */
+static int rst_prep_creds(pid_t pid, CoreEntry *core, unsigned long *creds_pos)
+{
+	unsigned long chain_pos = 0;
+	size_t idx;
+
+	/*
+	 * This is _really_ very old image format where @thread_core was
+	 * not present. It means we don't have creds either, just ignore
+	 * and exit early.
+	 */
+	if (unlikely(!core->thread_core)) {
+		*creds_pos = 0;
+		return 0;
+	}
+
+	*creds_pos = rst_mem_align_cpos(RM_PRIVATE);
+
+	/*
+	 * Old format: one Creds per task carried in own image file.
+	 */
+	if (!core->thread_core->creds)
+		return rst_prep_creds_from_img(pid);
+
+	/* One record per thread, each chained to the previous one. */
+	for (idx = 0; idx < current->nr_threads; idx++) {
+		struct thread_creds_args *args;
+
+		args = rst_prep_creds_args(current->core[idx]->thread_core->creds,
+					   &chain_pos);
+		if (IS_ERR(args))
+			return PTR_ERR(args);
+	}
+
+	return 0;
+}
+
+/*
+ * Final step of restoring task @pid: build the restorer "bootstrap" area
+ * and jump into the restorer blob.
+ *
+ * The function
+ * - copies VMAs, AIO rings, tcp sockets, timerfds and creds into
+ * RM_PRIVATE restore memory;
+ * - finds an address range that collides with neither the VMAs of this
+ * process nor those of the task being restored;
+ * - places there, back to back: the restorer blob, the task/thread
+ * arguments, the restore memory and (with CONFIG_VDSO) room for the
+ * runtime vdso;
+ * - fills task_restore_args and one thread_restore_args per thread;
+ * - jumps to the restorer via JUMP_TO_RESTORER_BLOB().
+ *
+ * On any failure the process exits with status 1, so the trailing
+ * "return -1" is never reached.
+ */
+static int sigreturn_restore(pid_t pid, CoreEntry *core)
+{
+ void *mem = MAP_FAILED;
+ void *restore_thread_exec_start;
+ void *restore_task_exec_start;
+
+ long new_sp, exec_mem_hint;
+ long ret;
+
+ long restore_bootstrap_len;
+ long rst_mem_size;
+
+ struct task_restore_args *task_args;
+ struct thread_restore_args *thread_args;
+ long args_len;
+
+ struct vma_area *vma;
+ unsigned long tgt_vmas;
+
+#ifdef CONFIG_VDSO
+ unsigned long vdso_rt_size = 0;
+ unsigned long vdso_rt_delta = 0;
+#endif
+
+ unsigned long aio_rings;
+ MmEntry *mm = rsti(current)->mm;
+
+ int n_seccomp_filters = 0;
+ unsigned long seccomp_filter_pos = 0;
+
+ struct vm_area_list self_vmas;
+ struct vm_area_list *vmas = &rsti(current)->vmas;
+ int i;
+
+ unsigned long creds_pos = 0;
+ unsigned long creds_pos_next;
+
+ pr_info("Restore via sigreturn\n");
+
+ /* pr_info_vma_list(&self_vma_list); */
+
+ BUILD_BUG_ON(sizeof(struct task_restore_args) & 1);
+ BUILD_BUG_ON(sizeof(struct thread_restore_args) & 1);
+
+ /* One task_restore_args followed by one thread_restore_args per thread, page-rounded. */
+ args_len = round_up(sizeof(*task_args) + sizeof(*thread_args) * current->nr_threads, page_size());
+ pr_info("%d threads require %ldK of memory\n",
+ current->nr_threads, KBYTES(args_len));
+
+ /*
+ * Copy VMAs to private rst memory so that it's able to
+ * walk them and m(un|re)map.
+ */
+
+ tgt_vmas = rst_mem_align_cpos(RM_PRIVATE);
+ list_for_each_entry(vma, &vmas->h, list) {
+ VmaEntry *vme;
+
+ vme = rst_mem_alloc(sizeof(*vme), RM_PRIVATE);
+ if (!vme)
+ goto err_nv;
+
+ *vme = *vma->e;
+
+ if (vma_area_is_private(vma, kdat.task_size))
+ vma_premmaped_start(vme) = vma->premmaped_addr;
+ }
+
+ /*
+ * Put info about AIO rings, they will get remapped
+ */
+
+ aio_rings = rst_mem_align_cpos(RM_PRIVATE);
+ for (i = 0; i < mm->n_aios; i++) {
+ struct rst_aio_ring *raio;
+
+ raio = rst_mem_alloc(sizeof(*raio), RM_PRIVATE);
+ if (!raio)
+ goto err_nv;
+
+ raio->addr = mm->aios[i]->id;
+ raio->nr_req = mm->aios[i]->nr_req;
+ raio->len = mm->aios[i]->ring_len;
+ }
+
+ /*
+ * Get all the tcp sockets fds into rst memory -- restorer
+ * will turn repair off before going sigreturn
+ */
+ if (rst_tcp_socks_prep())
+ goto err_nv;
+
+ /*
+ * Copy timerfd params for restorer args, we need to proceed
+ * timer setting at the very late.
+ */
+ if (rst_timerfd_prep())
+ goto err_nv;
+
+ /*
+ * Read creds info for every thread and allocate memory
+ * needed so we can use this data inside restorer.
+ */
+ if (rst_prep_creds(pid, core, &creds_pos))
+ goto err_nv;
+
+ /*
+ * We're about to search for free VM area and inject the restorer blob
+ * into it. No irrelevant mmaps/mremaps beyond this point, otherwise
+ * this unwanted mapping might get overlapped by the restorer.
+ */
+
+ ret = parse_self_maps_lite(&self_vmas);
+ if (ret < 0)
+ goto err;
+
+ if (seccomp_filters_get_rst_pos(core, &n_seccomp_filters, &seccomp_filter_pos) < 0)
+ goto err;
+
+ /* Total footprint so far: restorer blob + arguments + restore memory. */
+ rst_mem_size = rst_mem_lock();
+ restore_bootstrap_len = restorer_len + args_len + rst_mem_size;
+
+#ifdef CONFIG_VDSO
+ /*
+ * Figure out how much memory runtime vdso and vvar will need.
+ */
+ vdso_rt_size = vdso_vma_size(&vdso_sym_rt);
+ if (vdso_rt_size) {
+ vdso_rt_delta = ALIGN(restore_bootstrap_len, PAGE_SIZE) - restore_bootstrap_len;
+ vdso_rt_size += vdso_rt_delta;
+ if (vvar_vma_size(&vdso_sym_rt))
+ vdso_rt_size += ALIGN(vvar_vma_size(&vdso_sym_rt), PAGE_SIZE);
+ }
+
+ restore_bootstrap_len += vdso_rt_size;
+#endif
+
+ /*
+ * Restorer is a blob (code + args) that will get mapped in some
+ * place, that should _not_ intersect with both -- current mappings
+ * and mappings of the task we're restoring here. The subsequent
+ * call finds the start address for the restorer.
+ *
+ * After the start address is found we populate it with the restorer
+ * parts one by one (some are remap-ed, some are mmap-ed and copied
+ * or inited from scratch).
+ */
+
+ exec_mem_hint = restorer_get_vma_hint(&vmas->h, &self_vmas.h,
+ restore_bootstrap_len);
+ if (exec_mem_hint == -1) {
+ /*
+ * NOTE(review): the value is printed with a "K" suffix but,
+ * unlike the pr_info() below, is not passed through KBYTES()
+ * -- confirm which one is intended.
+ */
+ pr_err("No suitable area for task_restore bootstrap (%ldK)\n",
+ restore_bootstrap_len);
+ goto err;
+ }
+
+ pr_info("Found bootstrap VMA hint at: 0x%lx (needs ~%ldK)\n", exec_mem_hint,
+ KBYTES(restore_bootstrap_len));
+
+ ret = remap_restorer_blob((void *)exec_mem_hint);
+ if (ret < 0)
+ goto err;
+
+ /*
+ * Prepare a memory map for restorer. Note a thread space
+ * might be completely unused so it's here just for convenience.
+ */
+ restore_thread_exec_start = restorer_sym(exec_mem_hint, arch_export_restore_thread);
+ restore_task_exec_start = restorer_sym(exec_mem_hint, arch_export_restore_task);
+ rsti(current)->munmap_restorer = restorer_sym(exec_mem_hint, arch_export_unmap);
+
+ /* Temporarily step over the blob to map the arguments right after it. */
+ exec_mem_hint += restorer_len;
+
+ /* VMA we need to run task_restore code */
+ mem = mmap((void *)exec_mem_hint, args_len,
+ PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANON | MAP_FIXED, 0, 0);
+ if (mem != (void *)exec_mem_hint) {
+ pr_err("Can't mmap section for restore code\n");
+ goto err;
+ }
+
+ /* Back to the start of the bootstrap area. */
+ exec_mem_hint -= restorer_len;
+
+ memzero(mem, args_len);
+ task_args = mem;
+ thread_args = (struct thread_restore_args *)(task_args + 1);
+
+ task_args->proc_fd = dup(get_service_fd(PROC_FD_OFF));
+ if (task_args->proc_fd < 0) {
+ pr_perror("can't dup proc fd");
+ goto err;
+ }
+
+ /*
+ * Get a reference to shared memory area which is
+ * used to signal if shmem restoration complete
+ * from low-level restore code.
+ *
+ * This shmem area is mapped right after the whole area of
+ * sigreturn rt code. Note we didn't allocate it before
+ * but this area is taken into account for 'hint' memory
+ * address.
+ */
+
+ mem += args_len;
+ if (rst_mem_remap(mem))
+ goto err;
+
+ task_args->breakpoint = &rsti(current)->breakpoint;
+ task_args->task_entries = rst_mem_remap_ptr(task_entries_pos, RM_SHREMAP);
+
+ task_args->rst_mem = mem;
+ task_args->rst_mem_size = rst_mem_size;
+
+ task_args->bootstrap_start = (void *)exec_mem_hint;
+ task_args->bootstrap_len = restore_bootstrap_len;
+
+ task_args->premmapped_addr = (unsigned long)rsti(current)->premmapped_addr;
+ task_args->premmapped_len = rsti(current)->premmapped_len;
+
+ task_args->task_size = kdat.task_size;
+
+ /* remap_array(x, nr, cpos) sets task_args->x_n and task_args->x in one go. */
+#define remap_array(name, nr, cpos) do { \
+ task_args->name##_n = nr; \
+ task_args->name = rst_mem_remap_ptr(cpos, RM_PRIVATE); \
+ } while (0)
+
+ remap_array(vmas, vmas->nr, tgt_vmas);
+ remap_array(posix_timers, posix_timers_nr, posix_timers_cpos);
+ remap_array(timerfd, rst_timerfd_nr, rst_timerfd_cpos);
+ remap_array(siginfo, siginfo_nr, siginfo_cpos);
+ remap_array(tcp_socks, rst_tcp_socks_nr, rst_tcp_socks_cpos);
+ remap_array(rings, mm->n_aios, aio_rings);
+ remap_array(rlims, rlims_nr, rlims_cpos);
+ remap_array(helpers, n_helpers, helpers_pos);
+ remap_array(zombies, n_zombies, zombies_pos);
+ remap_array(seccomp_filters, n_seccomp_filters, seccomp_filter_pos);
+
+#undef remap_array
+
+ if (core->tc->has_seccomp_mode)
+ task_args->seccomp_mode = core->tc->seccomp_mode;
+
+ /*
+ * Arguments for task restoration.
+ */
+
+ BUG_ON(core->mtype != CORE_ENTRY__MARCH);
+
+ task_args->logfd = log_get_fd();
+ task_args->loglevel = log_get_loglevel();
+ task_args->sigchld_act = sigchld_act;
+
+ /*
+ * NOTE(review): strncpy() leaves comm without a terminating NUL when
+ * the source fills the whole buffer -- confirm the restorer copes.
+ */
+ strncpy(task_args->comm, core->tc->comm, sizeof(task_args->comm));
+
+
+ /*
+ * Fill up per-thread data.
+ */
+ creds_pos_next = creds_pos;
+ for (i = 0; i < current->nr_threads; i++) {
+ CoreEntry *tcore;
+ struct rt_sigframe *sigframe;
+
+ /*
+ * Private siginfos are laid out after the shared ones: each
+ * thread's slice starts siginfo_nr entries in, and siginfo_nr
+ * then grows by that thread's count.
+ */
+ thread_args[i].pid = current->threads[i].virt;
+ thread_args[i].siginfo_n = siginfo_priv_nr[i];
+ thread_args[i].siginfo = rst_mem_remap_ptr(siginfo_cpos, RM_PRIVATE);
+ thread_args[i].siginfo += siginfo_nr;
+ siginfo_nr += thread_args[i].siginfo_n;
+
+ /* skip self */
+ if (thread_args[i].pid == pid) {
+ task_args->t = thread_args + i;
+ tcore = core;
+ } else
+ tcore = current->core[i];
+
+ if ((tcore->tc || tcore->ids) && thread_args[i].pid != pid) {
+ pr_err("Thread has optional fields present %d\n",
+ thread_args[i].pid);
+ ret = -1;
+ }
+
+ /*
+ * NOTE(review): within this loop @ret is set to -1 only by the
+ * check right above; otherwise it holds whatever an earlier
+ * call left in it.
+ */
+ if (ret < 0) {
+ pr_err("Can't read core data for thread %d\n",
+ thread_args[i].pid);
+ goto err;
+ }
+
+ thread_args[i].ta = task_args;
+ thread_args[i].gpregs = *CORE_THREAD_ARCH_INFO(tcore)->gpregs;
+ thread_args[i].clear_tid_addr = CORE_THREAD_ARCH_INFO(tcore)->clear_tid_addr;
+ core_get_tls(tcore, &thread_args[i].tls);
+
+ rst_reloc_creds(&thread_args[i], &creds_pos_next);
+
+ if (tcore->thread_core) {
+ thread_args[i].has_futex = true;
+ thread_args[i].futex_rla = tcore->thread_core->futex_rla;
+ thread_args[i].futex_rla_len = tcore->thread_core->futex_rla_len;
+ thread_args[i].pdeath_sig = tcore->thread_core->pdeath_sig;
+ if (tcore->thread_core->pdeath_sig > _KNSIG) {
+ pr_err("Pdeath signal is too big\n");
+ goto err;
+ }
+
+ ret = prep_sched_info(&thread_args[i].sp, tcore->thread_core);
+ if (ret)
+ goto err;
+ }
+
+ sigframe = (struct rt_sigframe *)thread_args[i].mem_zone.rt_sigframe;
+
+ if (construct_sigframe(sigframe, sigframe, tcore))
+ goto err;
+
+ /* The leader's core is still needed below and is freed later. */
+ if (thread_args[i].pid != pid)
+ core_entry__free_unpacked(tcore, NULL);
+
+ pr_info("Thread %4d stack %8p rt_sigframe %8p\n",
+ i, thread_args[i].mem_zone.stack,
+ thread_args[i].mem_zone.rt_sigframe);
+
+ }
+
+#ifdef CONFIG_VDSO
+ /*
+ * Restorer needs own copy of vdso parameters. Runtime
+ * vdso must be kept non intersecting with anything else,
+ * since we need it being accessible even when own
+ * self-vmas are unmapped.
+ */
+ mem += rst_mem_size;
+ task_args->vdso_rt_parked_at = (unsigned long)mem + vdso_rt_delta;
+ task_args->vdso_sym_rt = vdso_sym_rt;
+ task_args->vdso_rt_size = vdso_rt_size;
+#endif
+
+ new_sp = restorer_stack(task_args->t);
+
+ ret = prepare_itimers(pid, core, task_args);
+ if (ret < 0)
+ goto err;
+
+ ret = prepare_mm(pid, task_args);
+ if (ret < 0)
+ goto err;
+
+ /* No longer need it */
+ core_entry__free_unpacked(core, NULL);
+ xfree(current->core);
+
+ /*
+ * Now prepare run-time data for threads restore.
+ */
+ task_args->nr_threads = current->nr_threads;
+ task_args->clone_restore_fn = (void *)restore_thread_exec_start;
+ task_args->thread_args = thread_args;
+
+ /*
+ * Make root and cwd restore _that_ late not to break any
+ * attempts to open files by paths above (e.g. /proc).
+ */
+
+ if (restore_fs(current))
+ goto err;
+
+ close_image_dir();
+ close_proc();
+ close_service_fd(ROOT_FD_OFF);
+ close_service_fd(USERNSD_SK);
+
+ __gcov_flush();
+
+ pr_info("task_args: %p\n"
+ "task_args->pid: %d\n"
+ "task_args->nr_threads: %d\n"
+ "task_args->clone_restore_fn: %p\n"
+ "task_args->thread_args: %p\n",
+ task_args, task_args->t->pid,
+ task_args->nr_threads,
+ task_args->clone_restore_fn,
+ task_args->thread_args);
+
+ /*
+ * An indirect call to task_restore, note it never returns
+ * and restoring core is extremely destructive.
+ */
+
+ JUMP_TO_RESTORER_BLOB(new_sp, restore_task_exec_start, task_args);
+
+ /* "err" also frees the parsed self VMAs; "err_nv" is used before they exist. */
+err:
+ free_mappings(&self_vmas);
+err_nv:
+ /* Just to be sure */
+ exit(1);
+ return -1;
+}
diff --git a/criu/cr-service.c b/criu/cr-service.c
new file mode 100644
index 000000000000..a1987e713833
--- /dev/null
+++ b/criu/cr-service.c
@@ -0,0 +1,1101 @@
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <sys/wait.h>
+#include <sys/stat.h>
+#include <arpa/inet.h>
+
+#include "crtools.h"
+#include "cr_options.h"
+#include "util.h"
+#include "log.h"
+#include "cpu.h"
+#include "files.h"
+#include "pstree.h"
+#include "cr-service.h"
+#include "cr-service-const.h"
+#include "page-xfer.h"
+#include "net.h"
+#include "mount.h"
+#include "cgroup.h"
+#include "action-scripts.h"
+#include "sockets.h"
+#include "irmap.h"
+#include "kerndat.h"
+#include "proc_parse.h"
+
+#include "setproctitle.h"
+
+#include "cr-errno.h"
+
+/*
+ * Inode number of the socket an RPC request arrived on. Starts out as -1
+ * and is overwritten by setup_opts_from_req() with the st_ino of the
+ * connection socket.
+ */
+unsigned int service_sk_ino = -1;
+
+/*
+ * Receive one request packet from @socket_fd and unpack it into *@req.
+ *
+ * The size of the pending packet is queried first with a zero-length
+ * MSG_PEEK | MSG_TRUNC recv(), then a buffer of that size is allocated
+ * and the packet is read for real.
+ *
+ * Returns 0 on success, in which case the caller owns *@req and has to
+ * release it with criu_req__free_unpacked(). Returns -1 on any failure
+ * and leaves *@req set to NULL. Allocation failure is reported as -1 too:
+ * callers in this file compare the result against -1, so a distinct
+ * -ENOMEM would be mistaken for success.
+ */
+static int recv_criu_msg(int socket_fd, CriuReq **req)
+{
+	unsigned char *buf;
+	int len;
+
+	*req = NULL;
+
+	len = recv(socket_fd, NULL, 0, MSG_TRUNC | MSG_PEEK);
+	if (len == -1) {
+		pr_perror("Can't read request");
+		return -1;
+	}
+
+	buf = xmalloc(len);
+	if (!buf)
+		return -1;
+
+	len = recv(socket_fd, buf, len, MSG_TRUNC);
+	if (len == -1) {
+		pr_perror("Can't read request");
+		goto err;
+	}
+
+	if (len == 0) {
+		pr_info("Client exited unexpectedly\n");
+		errno = ECONNRESET;
+		goto err;
+	}
+
+	*req = criu_req__unpack(NULL, len, buf);
+	if (!*req) {
+		pr_perror("Failed unpacking request");
+		goto err;
+	}
+
+	xfree(buf);
+	return 0;
+err:
+	xfree(buf);
+	return -1;
+}
+
+/*
+ * Pack @msg and send it to @socket_fd with a single write().
+ *
+ * Returns 0 on success and -1 on any failure (allocation, packing or
+ * write error). Allocation failure is not reported as -ENOMEM because
+ * the response helpers built on top of this function are compared
+ * against -1 by their callers in this file.
+ */
+static int send_criu_msg(int socket_fd, CriuResp *msg)
+{
+	unsigned char *buf;
+	int len;
+
+	len = criu_resp__get_packed_size(msg);
+
+	buf = xmalloc(len);
+	if (!buf)
+		return -1;
+
+	if (criu_resp__pack(msg, buf) != len) {
+		pr_perror("Failed packing response");
+		goto err;
+	}
+
+	if (write(socket_fd, buf, len) == -1) {
+		pr_perror("Can't send response");
+		goto err;
+	}
+
+	xfree(buf);
+	return 0;
+err:
+	xfree(buf);
+	return -1;
+}
+
+/*
+ * Log @msg as an RPC error and answer the peer on @sk with an
+ * unsuccessful response of type EMPTY, attaching cr_errno when it is set.
+ * The result of sending the response is ignored.
+ */
+static void send_criu_err(int sk, char *msg)
+{
+	CriuResp err_resp = CRIU_RESP__INIT;
+
+	pr_perror("RPC error: %s", msg);
+
+	err_resp.success = false;
+	err_resp.type = CRIU_REQ_TYPE__EMPTY;
+
+	if (get_cr_errno() != 0) {
+		err_resp.cr_errno = get_cr_errno();
+		err_resp.has_cr_errno = true;
+	}
+
+	send_criu_msg(sk, &err_resp);
+}
+
+/*
+ * Send the response to a DUMP request over @socket_fd.
+ *
+ * The response carries @success, the @restored flag inside the dump
+ * sub-message and cr_errno when it is set. Returns what send_criu_msg()
+ * returns.
+ */
+int send_criu_dump_resp(int socket_fd, bool success, bool restored)
+{
+	CriuDumpResp dump_resp = CRIU_DUMP_RESP__INIT;
+	CriuResp reply = CRIU_RESP__INIT;
+
+	dump_resp.restored = restored;
+	dump_resp.has_restored = true;
+
+	reply.dump = &dump_resp;
+	reply.success = success;
+	reply.type = CRIU_REQ_TYPE__DUMP;
+
+	if (get_cr_errno() != 0) {
+		reply.cr_errno = get_cr_errno();
+		reply.has_cr_errno = true;
+	}
+
+	return send_criu_msg(socket_fd, &reply);
+}
+
+/*
+ * Send the response to a PRE_DUMP request over @socket_fd, carrying
+ * @success and cr_errno when it is set. Returns what send_criu_msg()
+ * returns.
+ */
+static int send_criu_pre_dump_resp(int socket_fd, bool success)
+{
+	CriuResp reply = CRIU_RESP__INIT;
+
+	reply.success = success;
+	reply.type = CRIU_REQ_TYPE__PRE_DUMP;
+
+	if (get_cr_errno() != 0) {
+		reply.cr_errno = get_cr_errno();
+		reply.has_cr_errno = true;
+	}
+
+	return send_criu_msg(socket_fd, &reply);
+}
+
+/*
+ * Send the response to a RESTORE request over @socket_fd.
+ *
+ * The response carries @success, @pid inside the restore sub-message and
+ * cr_errno when it is set. Returns what send_criu_msg() returns.
+ */
+int send_criu_restore_resp(int socket_fd, bool success, int pid)
+{
+	CriuRestoreResp rst_resp = CRIU_RESTORE_RESP__INIT;
+	CriuResp reply = CRIU_RESP__INIT;
+
+	rst_resp.pid = pid;
+
+	reply.restore = &rst_resp;
+	reply.success = success;
+	reply.type = CRIU_REQ_TYPE__RESTORE;
+
+	if (get_cr_errno() != 0) {
+		reply.cr_errno = get_cr_errno();
+		reply.has_cr_errno = true;
+	}
+
+	return send_criu_msg(socket_fd, &reply);
+}
+
+/*
+ * Notify the RPC client on @fd that action script stage @act (reported
+ * under @name) has been reached and wait for its acknowledgement.
+ *
+ * For ACT_SETUP_NS and ACT_POST_RESTORE the real pid of the root task is
+ * attached to the notification.
+ *
+ * Returns 0 when the client answers with a successful NOTIFY request,
+ * -1 when it reports an error and the negative result of the send or
+ * receive helper when the exchange itself fails. The client's reply is
+ * released on every path.
+ */
+int send_criu_rpc_script(enum script_actions act, char *name, int fd)
+{
+	int ret;
+	CriuResp msg = CRIU_RESP__INIT;
+	CriuReq *req;
+	CriuNotify cn = CRIU_NOTIFY__INIT;
+
+	msg.type = CRIU_REQ_TYPE__NOTIFY;
+	msg.success = true;
+	msg.notify = &cn;
+	cn.script = name;
+
+	switch (act) {
+	case ACT_SETUP_NS:
+	case ACT_POST_RESTORE:
+		/*
+		 * FIXME pid is required only once on
+		 * restore. Need some more sane way of
+		 * checking this.
+		 */
+		cn.has_pid = true;
+		cn.pid = root_item->pid.real;
+		break;
+	default:
+		break;
+	}
+
+	ret = send_criu_msg(fd, &msg);
+	if (ret < 0)
+		return ret;
+
+	ret = recv_criu_msg(fd, &req);
+	if (ret < 0)
+		return ret;
+
+	if (req->type != CRIU_REQ_TYPE__NOTIFY || !req->notify_success) {
+		pr_err("RPC client reported script error\n");
+		ret = -1;
+	} else
+		ret = 0;
+
+	/* The reply is no longer needed whatever it said */
+	criu_req__free_unpacked(req, NULL);
+	return ret;
+}
+
+/*
+ * Path of the images directory as resolved by readlink() in
+ * setup_opts_from_req(); used when composing process titles.
+ */
+static char images_dir[PATH_MAX];
+
+/*
+ * Apply the options of an RPC request to the global opts and prepare the
+ * process for serving it.
+ *
+ * In order, this: reads the peer credentials of @sk, records the socket
+ * inode in service_sk_ino, opens the images directory through the peer's
+ * /proc/<pid>/fd/<fd> link, resolves its path into images_dir, changes to
+ * the work directory, initialises logging, and then copies every option
+ * present in @req into opts or registers it with the matching subsystem.
+ *
+ * Strings and arrays are taken from @req by reference, so @req has to
+ * outlive the use of opts.
+ *
+ * Returns 0 on success. On any failure cr_errno is set to EBADRQC and -1
+ * is returned.
+ */
+static int setup_opts_from_req(int sk, CriuOpts *req)
+{
+	struct ucred ids;
+	struct stat st;
+	socklen_t ids_len = sizeof(struct ucred);
+	char images_dir_path[PATH_MAX];
+	char work_dir_path[PATH_MAX];
+	ssize_t link_len;
+	int i;
+
+	if (getsockopt(sk, SOL_SOCKET, SO_PEERCRED, &ids, &ids_len)) {
+		pr_perror("Can't get socket options");
+		goto err;
+	}
+
+	if (fstat(sk, &st)) {
+		pr_perror("Can't get socket stat");
+		goto err;
+	}
+
+	BUG_ON(st.st_ino == -1);
+	service_sk_ino = st.st_ino;
+
+	/* open images_dir */
+	sprintf(images_dir_path, "/proc/%d/fd/%d", ids.pid, req->images_dir_fd);
+
+	if (req->parent_img)
+		opts.img_parent = req->parent_img;
+
+	if (open_image_dir(images_dir_path) < 0) {
+		pr_perror("Can't open images directory");
+		goto err;
+	}
+
+	/*
+	 * Get full path to images_dir to use in process title.
+	 * readlink() does not NUL-terminate its result, so leave room
+	 * for the terminator and add it by hand.
+	 */
+	link_len = readlink(images_dir_path, images_dir, PATH_MAX - 1);
+	if (link_len == -1) {
+		pr_perror("Can't readlink %s", images_dir_path);
+		goto err;
+	}
+	images_dir[link_len] = '\0';
+
+	/* chdir to work dir */
+	if (req->has_work_dir_fd)
+		sprintf(work_dir_path, "/proc/%d/fd/%d", ids.pid, req->work_dir_fd);
+	else
+		strcpy(work_dir_path, images_dir_path);
+
+	if (chdir(work_dir_path)) {
+		pr_perror("Can't chdir to work_dir");
+		goto err;
+	}
+
+	/* initiate log file in work dir */
+	if (req->log_file) {
+		if (strchr(req->log_file, '/')) {
+			pr_perror("No subdirs are allowed in log_file name");
+			goto err;
+		}
+
+		opts.output = req->log_file;
+	} else
+		opts.output = DEFAULT_LOG_FILENAME;
+
+	log_set_loglevel(req->log_level);
+	if (log_init(opts.output) == -1) {
+		pr_perror("Can't initiate log");
+		goto err;
+	}
+
+	/* checking flags from client */
+	if (req->has_leave_running && req->leave_running)
+		opts.final_state = TASK_ALIVE;
+
+	/* No explicit target: operate on the requesting process itself */
+	if (!req->has_pid) {
+		req->has_pid = true;
+		req->pid = ids.pid;
+	}
+
+	if (req->has_ext_unix_sk) {
+		opts.ext_unix_sk = req->ext_unix_sk;
+		for (i = 0; i < req->n_unix_sk_ino; i++) {
+			if (unix_sk_id_add(req->unix_sk_ino[i]->inode) < 0)
+				goto err;
+		}
+	}
+
+	if (req->root)
+		opts.root = req->root;
+
+	if (req->has_rst_sibling) {
+		if (!opts.swrk_restore) {
+			pr_err("rst_sibling is not allowed in standalone service\n");
+			goto err;
+		}
+
+		opts.restore_sibling = req->rst_sibling;
+	}
+
+	if (req->has_tcp_established)
+		opts.tcp_established_ok = req->tcp_established;
+
+	if (req->has_evasive_devices)
+		opts.evasive_devices = req->evasive_devices;
+
+	if (req->has_shell_job)
+		opts.shell_job = req->shell_job;
+
+	if (req->has_file_locks)
+		opts.handle_file_locks = req->file_locks;
+
+	if (req->has_track_mem)
+		opts.track_mem = req->track_mem;
+
+	if (req->has_link_remap)
+		opts.link_remap_ok = req->link_remap;
+
+	if (req->has_auto_dedup)
+		opts.auto_dedup = req->auto_dedup;
+
+	if (req->has_force_irmap)
+		opts.force_irmap = req->force_irmap;
+
+	if (req->n_exec_cmd > 0) {
+		/* NULL-terminated copy of the pointer array, as execvp() wants */
+		opts.exec_cmd = xmalloc((req->n_exec_cmd + 1) * sizeof(char *));
+		if (!opts.exec_cmd)
+			goto err;
+		memcpy(opts.exec_cmd, req->exec_cmd, req->n_exec_cmd * sizeof(char *));
+		opts.exec_cmd[req->n_exec_cmd] = NULL;
+	}
+
+	if (req->ps) {
+		opts.use_page_server = true;
+		opts.addr = req->ps->address;
+		opts.port = htons((short)req->ps->port);
+
+		if (req->ps->has_fd) {
+			if (!opts.swrk_restore)
+				goto err;
+
+			opts.ps_socket = req->ps->fd;
+		}
+	}
+
+	if (req->notify_scripts &&
+	    add_script(SCRIPT_RPC_NOTIFY, sk))
+		goto err;
+
+	for (i = 0; i < req->n_veths; i++) {
+		if (veth_pair_add(req->veths[i]->if_in, req->veths[i]->if_out))
+			goto err;
+	}
+
+	for (i = 0; i < req->n_ext_mnt; i++) {
+		if (ext_mount_add(req->ext_mnt[i]->key, req->ext_mnt[i]->val))
+			goto err;
+	}
+
+	if (req->n_inherit_fd && !opts.swrk_restore) {
+		pr_err("inherit_fd is not allowed in standalone service\n");
+		goto err;
+	}
+	for (i = 0; i < req->n_inherit_fd; i++) {
+		if (inherit_fd_add(req->inherit_fd[i]->fd, req->inherit_fd[i]->key))
+			goto err;
+	}
+
+	for (i = 0; i < req->n_external; i++)
+		if (add_external(req->external[i]))
+			goto err;
+
+	for (i = 0; i < req->n_cg_root; i++) {
+		if (new_cg_root_add(req->cg_root[i]->ctrl,
+				    req->cg_root[i]->path))
+			goto err;
+	}
+
+	for (i = 0; i < req->n_enable_fs; i++) {
+		if (!add_fsname_auto(req->enable_fs[i]))
+			goto err;
+	}
+
+	for (i = 0; i < req->n_skip_mnt; i++) {
+		if (!add_skip_mount(req->skip_mnt[i]))
+			goto err;
+	}
+
+	if (req->has_cpu_cap)
+		opts.cpu_cap = req->cpu_cap;
+
+	/*
+	 * FIXME: For backward compatibility we setup
+	 * soft mode here, need to enhance to support
+	 * other modes as well via separate option
+	 * probably.
+	 */
+	if (req->has_manage_cgroups)
+		opts.manage_cgroups = req->manage_cgroups ? CG_MODE_SOFT : CG_MODE_IGNORE;
+
+	/* Override the manage_cgroup if mode is set explicitly */
+	if (req->has_manage_cgroups_mode) {
+		unsigned int mode;
+
+		switch (req->manage_cgroups_mode) {
+		case CRIU_CG_MODE__IGNORE:
+			mode = CG_MODE_IGNORE;
+			break;
+		case CRIU_CG_MODE__NONE:
+			mode = CG_MODE_NONE;
+			break;
+		case CRIU_CG_MODE__PROPS:
+			mode = CG_MODE_PROPS;
+			break;
+		case CRIU_CG_MODE__SOFT:
+			mode = CG_MODE_SOFT;
+			break;
+		case CRIU_CG_MODE__FULL:
+			mode = CG_MODE_FULL;
+			break;
+		case CRIU_CG_MODE__STRICT:
+			mode = CG_MODE_STRICT;
+			break;
+		case CRIU_CG_MODE__DEFAULT:
+			mode = CG_MODE_DEFAULT;
+			break;
+		default:
+			goto err;
+		}
+
+		opts.manage_cgroups = mode;
+	}
+
+	if (req->has_auto_ext_mnt)
+		opts.autodetect_ext_mounts = req->auto_ext_mnt;
+
+	if (req->has_ext_sharing)
+		opts.enable_external_sharing = req->ext_sharing;
+
+	if (req->has_ext_masters)
+		opts.enable_external_masters = req->ext_masters;
+
+	if (req->has_ghost_limit)
+		opts.ghost_limit = req->ghost_limit;
+
+	if (req->n_irmap_scan_paths) {
+		for (i = 0; i < req->n_irmap_scan_paths; i++) {
+			if (irmap_scan_path_add(req->irmap_scan_paths[i]))
+				goto err;
+		}
+	}
+
+	return 0;
+
+err:
+	set_cr_errno(EBADRQC);
+	return -1;
+}
+
+/*
+ * Serve a DUMP request: set up options from @req and dump the task tree
+ * rooted at req->pid.
+ *
+ * A response is sent unless the dump succeeded, the request carried no
+ * pid (a self dump) and leave_running was not asked for.
+ *
+ * Returns 0 on success and 1 on failure, including a failure to send the
+ * response.
+ */
+static int dump_using_req(int sk, CriuOpts *req)
+{
+	/* Sampled before setup_opts_from_req() fills in a missing pid */
+	bool self_dump = !req->pid;
+	bool success = false;
+	bool silent;
+
+	if (!setup_opts_from_req(sk, req)) {
+		setproctitle("dump --rpc -t %d -D %s", req->pid, images_dir);
+
+		/*
+		 * FIXME -- cr_dump_tasks() may return a code from custom
+		 * scripts, which can be positive. Scripts cannot be pushed
+		 * via RPC right now, so positive return values are not
+		 * possible here.
+		 */
+		if (!cr_dump_tasks(req->pid))
+			success = true;
+	}
+
+	silent = success && self_dump && !req->leave_running;
+	if (!silent && send_criu_dump_resp(sk, success, false) == -1) {
+		pr_perror("Can't send response");
+		success = false;
+	}
+
+	return success ? 0 : 1;
+}
+
+/*
+ * Serve a RESTORE request: set up options from @req, restore the tasks
+ * and report the outcome together with the real pid of the root task
+ * (-1 when there is no root item).
+ *
+ * When the restore succeeded and an exec command was requested, stdout
+ * and stderr are redirected to the log file and the command is exec-ed;
+ * in that case the function returns only if something fails.
+ *
+ * Returns 0 on success and 1 on failure.
+ */
+static int restore_using_req(int sk, CriuOpts *req)
+{
+	bool success = false;
+	int root_pid;
+	int logfd;
+
+	/*
+	 * Restoring under an arbitrary task is not possible yet, so the
+	 * restore is forced to be detached under the cr service task.
+	 */
+	opts.restore_detach = true;
+
+	if (!setup_opts_from_req(sk, req)) {
+		setproctitle("restore --rpc -D %s", images_dir);
+
+		if (!cr_restore_tasks())
+			success = true;
+	}
+
+	root_pid = root_item ? root_item->pid.real : -1;
+	if (send_criu_restore_resp(sk, success, root_pid) == -1) {
+		pr_perror("Can't send response");
+		success = false;
+	}
+
+	if (!success)
+		return 1;
+	if (!opts.exec_cmd)
+		return 0;
+
+	logfd = log_get_fd();
+	if (dup2(logfd, STDOUT_FILENO) == -1 || dup2(logfd, STDERR_FILENO) == -1) {
+		pr_perror("Failed to redirect stdout and stderr to the logfile");
+		return 1;
+	}
+
+	close_pid_proc();
+	close(sk);
+
+	/* Getting past execvp() means it failed */
+	execvp(opts.exec_cmd[0], opts.exec_cmd);
+	pr_perror("Failed to exec cmd %s", opts.exec_cmd[0]);
+
+	return 1;
+}
+
+/*
+ * Serve a CHECK request: run cr_check() restricted to minimal kernel
+ * support and send a CHECK response whose success flag is set when the
+ * check returns 0. Returns what send_criu_msg() returns.
+ */
+static int check(int sk)
+{
+	CriuResp reply = CRIU_RESP__INIT;
+
+	reply.type = CRIU_REQ_TYPE__CHECK;
+
+	setproctitle("check --rpc");
+
+	/* Check only minimal kernel support */
+	opts.check_ms_kernel = true;
+
+	if (cr_check() == 0)
+		reply.success = true;
+
+	return send_criu_msg(sk, &reply);
+}
+
+/*
+ * Serve one PRE_DUMP request in a forked child.
+ *
+ * The child sets up options from @req, runs cr_pre_dump_tasks() and
+ * exits with 0 on success or 1 on failure. The parent waits for it and
+ * sends a PRE_DUMP response that is successful only when the child
+ * exited normally with status 0.
+ *
+ * Returns 0 on success and -1 on failure, including a failure to send
+ * the response.
+ */
+static int pre_dump_using_req(int sk, CriuOpts *req)
+{
+	bool success = false;
+	int status;
+	int pid;
+
+	pid = fork();
+	if (pid == 0) {
+		int code = 1;
+
+		if (!setup_opts_from_req(sk, req)) {
+			setproctitle("pre-dump --rpc -t %d -D %s", req->pid, images_dir);
+
+			if (!cr_pre_dump_tasks(req->pid))
+				code = 0;
+		}
+
+		exit(code);
+	}
+
+	if (pid < 0) {
+		pr_perror("Can't fork");
+	} else {
+		wait(&status);
+		if (WIFEXITED(status) && WEXITSTATUS(status) == 0)
+			success = true;
+	}
+
+	if (send_criu_pre_dump_resp(sk, success) == -1) {
+		pr_perror("Can't send pre-dump resp");
+		success = false;
+	}
+
+	return success ? 0 : -1;
+}
+
+/*
+ * Serve a sequence of PRE_DUMP requests terminated by a DUMP request.
+ *
+ * @msg is the first PRE_DUMP request. After each successful pre-dump the
+ * current request is released (this includes the caller's @msg after the
+ * first round) and the next one is read from @sk. The loop ends at the
+ * first request that is not a PRE_DUMP; it has to be a DUMP, which is
+ * then served with dump_using_req().
+ *
+ * Requests received here are owned by this function and released before
+ * it returns.
+ *
+ * Returns the result of dump_using_req() on the normal path and a
+ * negative value when a pre-dump fails, a request cannot be received or
+ * the sequence is broken.
+ */
+static int pre_dump_loop(int sk, CriuReq *msg)
+{
+	bool owned = false;	/* true once msg was received by this loop */
+	int ret;
+
+	do {
+		ret = pre_dump_using_req(sk, msg->opts);
+		if (ret < 0)
+			goto out;
+
+		criu_req__free_unpacked(msg, NULL);
+		owned = false;
+		msg = NULL;
+
+		/* Any negative result is a failure, not only -1 */
+		if (recv_criu_msg(sk, &msg) < 0) {
+			pr_perror("Can't recv request");
+			return -1;
+		}
+		owned = true;
+	} while (msg->type == CRIU_REQ_TYPE__PRE_DUMP);
+
+	if (msg->type != CRIU_REQ_TYPE__DUMP) {
+		send_criu_err(sk, "Bad req seq");
+		ret = -1;
+		goto out;
+	}
+
+	ret = dump_using_req(sk, msg->opts);
+out:
+	if (owned)
+		criu_req__free_unpacked(msg, NULL);
+	return ret;
+}
+
+/*
+ * Record written by the forked child of start_page_server_req() into the
+ * start pipe and read back by its parent.
+ */
+struct ps_info {
+ int pid; /* value returned by cr_page_server() */
+ unsigned short port; /* copy of opts.port */
+};
+
+/*
+ * Serve a PAGE_SERVER request.
+ *
+ * A child is forked which sets up options from @req, starts the page
+ * server with cr_page_server() and reports the resulting pid and port
+ * through a pipe. The parent waits for the child, reads the report and
+ * sends a PAGE_SERVER response; it is successful only when the child
+ * exited with status 0 and a complete report was read.
+ *
+ * Both pipe ends are closed on every path and a failing fork() is
+ * reported to the client as an unsuccessful response.
+ *
+ * Returns what send_criu_msg() returns.
+ */
+static int start_page_server_req(int sk, CriuOpts *req)
+{
+	int ret = -1, pid, start_pipe[2];
+	ssize_t count;
+	bool success = false;
+	CriuResp resp = CRIU_RESP__INIT;
+	CriuPageServerInfo ps = CRIU_PAGE_SERVER_INFO__INIT;
+	struct ps_info info;
+
+	if (pipe(start_pipe)) {
+		pr_perror("No start pipe");
+		goto out;
+	}
+
+	pid = fork();
+	if (pid < 0) {
+		pr_perror("Can't fork");
+		close(start_pipe[1]);
+		goto out_close;
+	}
+
+	if (pid == 0) {
+		close(start_pipe[0]);
+
+		if (setup_opts_from_req(sk, req))
+			goto out_ch;
+
+		setproctitle("page-server --rpc --address %s --port %hu", opts.addr, opts.port);
+
+		pr_debug("Starting page server\n");
+
+		pid = cr_page_server(true, start_pipe[1]);
+		if (pid <= 0)
+			goto out_ch;
+
+		info.pid = pid;
+		info.port = opts.port;
+
+		count = write(start_pipe[1], &info, sizeof(info));
+		if (count != sizeof(info))
+			goto out_ch;
+
+		ret = 0;
+out_ch:
+		/* The server is useless if the parent never learns about it */
+		if (ret < 0 && pid > 0)
+			kill(pid, SIGKILL);
+		close(start_pipe[1]);
+		exit(ret);
+	}
+
+	close(start_pipe[1]);
+	wait(&ret);
+	if (WIFEXITED(ret)) {
+		if (WEXITSTATUS(ret)) {
+			pr_err("Child exited with an error\n");
+			goto out_close;
+		}
+	} else {
+		pr_err("Child wasn't terminated normally\n");
+		goto out_close;
+	}
+
+	count = read(start_pipe[0], &info, sizeof(info));
+	if (count != sizeof(info))
+		goto out_close;
+
+	success = true;
+	ps.has_pid = true;
+	ps.pid = info.pid;
+	ps.has_port = true;
+	ps.port = info.port;
+	resp.ps = &ps;
+
+	pr_debug("Page server started\n");
+out_close:
+	close(start_pipe[0]);
+out:
+	resp.type = CRIU_REQ_TYPE__PAGE_SERVER;
+	resp.success = success;
+	return send_criu_msg(sk, &resp);
+}
+
+/*
+ * Decide whether the keep_open flag of @msg can be honoured.
+ *
+ * Returns 0 when keep_open is not set or the request type is one of
+ * those listed below, -1 otherwise.
+ */
+static int chk_keepopen_req(CriuReq *msg)
+{
+	if (!msg->keep_open)
+		return 0;
+
+	/*
+	 * The service may (well, it will) leave some resources leaked
+	 * after processing e.g. dump or restore requests. Until the code
+	 * is audited for that, multi-request connections are enabled
+	 * only for the request types known to behave.
+	 */
+	switch (msg->type) {
+	case CRIU_REQ_TYPE__PAGE_SERVER:
+		/* This just fork()-s so no leaks */
+	case CRIU_REQ_TYPE__CPUINFO_DUMP:
+	case CRIU_REQ_TYPE__CPUINFO_CHECK:
+	case CRIU_REQ_TYPE__FEATURE_CHECK:
+		return 0;
+	default:
+		return -1;
+	}
+}
+
+/*
+ * Generic function to handle CRIU_REQ_TYPE__FEATURE_CHECK.
+ *
+ * The function will have resp.success = true for most cases
+ * and the actual result will be in resp.features.
+ *
+ * For each feature which has been requested in msg->features
+ * the corresponding parameter will be set in resp.features.
+ *
+ * A request without a features sub-message is answered like a
+ * request for an unknown feature: resp.success = false.
+ *
+ * Returns what send_criu_msg() returns.
+ */
+static int handle_feature_check(int sk, CriuReq * msg)
+{
+	CriuResp resp = CRIU_RESP__INIT;
+	CriuFeatures feat = CRIU_FEATURES__INIT;
+	bool success = false;
+	int pid, status;
+
+	/* enable setting of an optional message */
+	feat.has_mem_track = 1;
+	feat.mem_track = false;
+
+	/*
+	 * Check if the requested feature check can be answered.
+	 *
+	 * This function is right now hard-coded to memory
+	 * tracking detection and needs other/better logic to
+	 * handle multiple feature checks.
+	 *
+	 * The features sub-message is optional and is NULL when
+	 * the client did not send it.
+	 */
+	if (!msg->features || msg->features->has_mem_track != 1) {
+		pr_warn("Feature checking for unknown feature.\n");
+		goto out;
+	}
+
+	/*
+	 * From this point on the function will always
+	 * 'succeed'. Whether the requested features are
+	 * supported can be seen from the optional parameters
+	 * set in the message 'criu_features'.
+	 */
+	success = true;
+
+	pid = fork();
+	if (pid < 0) {
+		pr_perror("Can't fork");
+		goto out;
+	}
+
+	if (pid == 0) {
+		int ret = 1;
+
+		if (setup_opts_from_req(sk, msg->opts))
+			goto cout;
+
+		setproctitle("feature-check --rpc -D %s", images_dir);
+
+		kerndat_get_dirty_track();
+
+		/* Exit status 0 tells the parent that tracking is available */
+		if (kdat.has_dirty_track)
+			ret = 0;
+cout:
+		exit(ret);
+	}
+
+	wait(&status);
+	if (!WIFEXITED(status) || WEXITSTATUS(status))
+		goto out;
+
+	feat.mem_track = true;
+out:
+	resp.features = &feat;
+	resp.type = msg->type;
+	resp.success = success;
+
+	return send_criu_msg(sk, &resp);
+}
+
+/*
+ * Serve a CPUINFO_DUMP or CPUINFO_CHECK request in a forked child.
+ *
+ * The child sets up options from msg->opts, runs cpuinfo_dump() or
+ * cpuinfo_check() depending on the request type and exits with the
+ * result (1 when option setup fails). The parent maps the exit status
+ * to the response: 0 means success, the low byte of -ENOTSUP is
+ * reported as cr_errno ENOTSUP, anything else is a plain failure.
+ *
+ * Returns what send_criu_msg() returns.
+ */
+static int handle_cpuinfo(int sk, CriuReq *msg)
+{
+	CriuResp reply = CRIU_RESP__INIT;
+	bool success = false;
+	int status;
+	int pid;
+
+	pid = fork();
+	if (pid == 0) {
+		bool dumping = msg->type == CRIU_REQ_TYPE__CPUINFO_DUMP;
+		int code = 1;
+
+		if (!setup_opts_from_req(sk, msg->opts)) {
+			setproctitle("cpuinfo %s --rpc -D %s",
+				     dumping ? "dump" : "check",
+				     images_dir);
+
+			code = dumping ? cpuinfo_dump() : cpuinfo_check();
+		}
+
+		exit(code);
+	}
+
+	if (pid < 0) {
+		pr_perror("Can't fork");
+		goto out;
+	}
+
+	wait(&status);
+	if (!WIFEXITED(status))
+		goto out;
+
+	if (WEXITSTATUS(status) == (-ENOTSUP & 0xff)) {
+		/*
+		 * Report the actual error code and
+		 * not just (-ENOTSUP & 0xff)
+		 */
+		reply.has_cr_errno = 1;
+		reply.cr_errno = ENOTSUP;
+	} else if (WEXITSTATUS(status) == 0) {
+		success = true;
+	}
+
+out:
+	reply.type = msg->type;
+	reply.success = success;
+
+	return send_criu_msg(sk, &reply);
+}
+
+/*
+ * Serve the requests arriving on the connection socket @sk.
+ *
+ * One request is received and dispatched by type. When it was handled
+ * successfully and carried keep_open, the request is released and the
+ * next one is read; otherwise the function returns.
+ *
+ * The keep_open flag is sampled before dispatching because
+ * pre_dump_loop() releases the request it is given, so the request must
+ * not be touched after that call.
+ *
+ * Returns the result of the last handler, or -1 when a request cannot be
+ * received, is refused by chk_keepopen_req() or has an unknown type.
+ */
+int cr_service_work(int sk)
+{
+	int ret = -1;
+	CriuReq *msg = 0;
+	bool keep_open;
+
+more:
+	/* Any negative result is a failure, not only -1 */
+	if (recv_criu_msg(sk, &msg) < 0) {
+		pr_perror("Can't recv request");
+		goto err;
+	}
+
+	if (chk_keepopen_req(msg))
+		goto err;
+
+	keep_open = msg->keep_open;
+
+	switch (msg->type) {
+	case CRIU_REQ_TYPE__DUMP:
+		ret = dump_using_req(sk, msg->opts);
+		break;
+	case CRIU_REQ_TYPE__RESTORE:
+		ret = restore_using_req(sk, msg->opts);
+		break;
+	case CRIU_REQ_TYPE__CHECK:
+		ret = check(sk);
+		break;
+	case CRIU_REQ_TYPE__PRE_DUMP:
+		ret = pre_dump_loop(sk, msg);
+		/* The request may have been released by the loop */
+		msg = NULL;
+		break;
+	case CRIU_REQ_TYPE__PAGE_SERVER:
+		ret = start_page_server_req(sk, msg->opts);
+		break;
+	case CRIU_REQ_TYPE__CPUINFO_DUMP:
+	case CRIU_REQ_TYPE__CPUINFO_CHECK:
+		ret = handle_cpuinfo(sk, msg);
+		break;
+	case CRIU_REQ_TYPE__FEATURE_CHECK:
+		ret = handle_feature_check(sk, msg);
+		break;
+
+	default:
+		send_criu_err(sk, "Invalid req");
+		break;
+	}
+
+	if (!ret && keep_open) {
+		if (msg)
+			criu_req__free_unpacked(msg, NULL);
+		ret = -1;
+		goto more;
+	}
+
+err:
+	return ret;
+}
+
+/*
+ * SIGCHLD handler: collect every child that has already changed state
+ * and log how it ended. errno is preserved across the handler.
+ */
+static void reap_worker(int signo)
+{
+	int saved_errno = errno;
+	int status;
+	pid_t pid;
+
+	/*
+	 * SIGCHLD is blocked while the handler runs, so several children
+	 * may stand behind one signal: keep reaping until none is left.
+	 */
+	for (;;) {
+		pid = waitpid(-1, &status, WNOHANG);
+		if (pid <= 0)
+			break;
+
+		if (WIFEXITED(status))
+			pr_info("Worker(pid %d) exited with %d\n",
+				pid, WEXITSTATUS(status));
+		else if (WIFSIGNALED(status))
+			pr_info("Worker(pid %d) was killed by %d\n",
+				pid, WTERMSIG(status));
+	}
+
+	errno = saved_errno;
+}
+
+/*
+ * Install reap_worker() as the SIGCHLD handler, with SA_RESTART and
+ * SIGCHLD added to the handler's signal mask.
+ * Returns 0 on success and -1 when sigaction() fails.
+ */
+static int setup_sigchld_handler()
+{
+	struct sigaction sa;
+
+	sa.sa_flags = SA_RESTART;
+	sa.sa_handler = reap_worker;
+	sigemptyset(&sa.sa_mask);
+	sigaddset(&sa.sa_mask, SIGCHLD);
+
+	if (sigaction(SIGCHLD, &sa, NULL) == 0)
+		return 0;
+
+	pr_perror("Can't setup SIGCHLD handler");
+	return -1;
+}
+
+/*
+ * Put the default SIGCHLD disposition back, with SA_RESTART and SIGCHLD
+ * in the signal mask.
+ * Returns 0 on success and -1 when sigaction() fails.
+ */
+static int restore_sigchld_handler()
+{
+	struct sigaction sa;
+
+	sa.sa_flags = SA_RESTART;
+	sa.sa_handler = SIG_DFL;
+	sigemptyset(&sa.sa_mask);
+	sigaddset(&sa.sa_mask, SIGCHLD);
+
+	if (sigaction(SIGCHLD, &sa, NULL) == 0)
+		return 0;
+
+	pr_perror("Can't restore SIGCHLD handler");
+	return -1;
+}
+
+/*
+ * Run the RPC service.
+ *
+ * A SOCK_SEQPACKET unix socket is bound to opts.addr (or to
+ * CR_DEFAULT_SERVICE_ADDRESS when no address is given), made connectable
+ * for everyone and listened on. With @daemon_mode the process is
+ * daemonized first. Every accepted connection is handed to a forked
+ * worker running cr_service_work(); workers are reaped by the SIGCHLD
+ * handler.
+ *
+ * The accept loop only ends on error, so the function always returns 1.
+ */
+int cr_service(bool daemon_mode)
+{
+	int server_fd = -1;
+	int child_pid;
+
+	struct sockaddr_un client_addr;
+	socklen_t client_addr_len;
+
+	{
+		struct sockaddr_un server_addr;
+		socklen_t server_addr_len;
+
+		server_fd = socket(AF_LOCAL, SOCK_SEQPACKET, 0);
+		if (server_fd == -1) {
+			pr_perror("Can't initialize service socket");
+			goto err;
+		}
+
+		memset(&server_addr, 0, sizeof(server_addr));
+		memset(&client_addr, 0, sizeof(client_addr));
+		server_addr.sun_family = AF_LOCAL;
+
+		if (opts.addr == NULL) {
+			pr_warn("Binding to local dir address!\n");
+			opts.addr = CR_DEFAULT_SERVICE_ADDRESS;
+		}
+
+		/* sun_path is a small fixed-size array: refuse what can't fit */
+		if (strlen(opts.addr) >= sizeof(server_addr.sun_path)) {
+			pr_err("Service socket path %s is too long\n", opts.addr);
+			goto err;
+		}
+
+		strcpy(server_addr.sun_path, opts.addr);
+
+		server_addr_len = strlen(server_addr.sun_path)
+				+ sizeof(server_addr.sun_family);
+
+		unlink(server_addr.sun_path);
+
+		if (bind(server_fd, (struct sockaddr *) &server_addr,
+			 server_addr_len) == -1) {
+			pr_perror("Can't bind");
+			goto err;
+		}
+
+		pr_info("The service socket is bound to %s\n", server_addr.sun_path);
+
+		/* change service socket permissions, so anyone can connect to it */
+		if (chmod(server_addr.sun_path, 0666)) {
+			pr_perror("Can't change permissions of the service socket");
+			goto err;
+		}
+
+		if (listen(server_fd, 16) == -1) {
+			pr_perror("Can't listen for socket connections");
+			goto err;
+		}
+	}
+
+	if (daemon_mode) {
+		if (daemon(1, 0) == -1) {
+			pr_perror("Can't run service server in the background");
+			goto err;
+		}
+	}
+
+	if (opts.pidfile) {
+		if (write_pidfile(getpid()) == -1) {
+			pr_perror("Can't write pidfile");
+			goto err;
+		}
+	}
+
+	if (setup_sigchld_handler())
+		goto err;
+
+	while (1) {
+		int sk;
+
+		pr_info("Waiting for connection...\n");
+
+		/* accept() overwrites the length, so reset it every round */
+		client_addr_len = sizeof(client_addr);
+		sk = accept(server_fd, (struct sockaddr *) &client_addr,
+			    &client_addr_len);
+		if (sk == -1) {
+			pr_perror("Can't accept connection");
+			goto err;
+		}
+
+		pr_info("Connected.\n");
+		child_pid = fork();
+		if (child_pid == 0) {
+			int ret;
+
+			if (restore_sigchld_handler())
+				exit(1);
+
+			close(server_fd);
+			init_opts();
+			ret = cr_service_work(sk);
+			close(sk);
+			exit(ret != 0);
+		}
+
+		if (child_pid < 0)
+			pr_perror("Can't fork a child");
+
+		/* The worker has its own copy of the connection socket */
+		close(sk);
+	}
+
+err:
+	close_safe(&server_fd);
+
+	return 1;
+}
diff --git a/criu/cr-show.c b/criu/cr-show.c
new file mode 100644
index 000000000000..91d4d095a071
--- /dev/null
+++ b/criu/cr-show.c
@@ -0,0 +1,574 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <limits.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <ctype.h>
+
+#include <fcntl.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include "asm/types.h"
+#include "list.h"
+#include "imgset.h"
+#include "namespaces.h"
+#include "compiler.h"
+#include "cr_options.h"
+#include "util.h"
+#include "sockets.h"
+#include "image.h"
+#include "uts_ns.h"
+#include "ipc_ns.h"
+#include "pstree.h"
+#include "cr-show.h"
+#include "crtools.h"
+
+#include "protobuf.h"
+#include "protobuf/pstree.pb-c.h"
+#include "protobuf/pipe-data.pb-c.h"
+#include "protobuf/siginfo.pb-c.h"
+
+#define DEF_PAGES_PER_LINE 6
+
+
+static LIST_HEAD(pstree_list);
+
+/*
+ * Payload handler for pipe data entries: pass the byte count recorded in
+ * the entry to print_image_data(), which dumps or skips the payload
+ * depending on opts.show_pages_content.
+ */
+static void pipe_data_handler(struct cr_img *img, void *obj)
+{
+	PipeDataEntry *entry = obj;
+
+	print_image_data(img, entry->bytes, opts.show_pages_content);
+}
+
+/*
+ * Field width for printing @addr: 3 plus the number of hex digits
+ * needed for @addr (no digits at all for 0).
+ */
+static int nice_width_for(unsigned long addr)
+{
+	int width;
+
+	for (width = 3; addr != 0; addr >>= 4)
+		width++;
+
+	return width;
+}
+
+/*
+ * Print byte @pos of @data as two hex digits followed by a space, or
+ * filler when @pos lies at or beyond @len.
+ */
+static inline void pr_xdigi(unsigned char *data, size_t len, int pos)
+{
+	if (pos >= len) {
+		pr_msg(" ");
+		return;
+	}
+
+	pr_msg("%02x ", data[pos]);
+}
+
+/*
+ * Print byte @pos of @data as a character: the byte itself when it is
+ * printable, '.' otherwise. Positions at or beyond @len print a space.
+ *
+ * The byte is kept as unsigned char because isprint() is only defined
+ * for values representable as unsigned char (and EOF); a plain char
+ * holding a byte >= 0x80 may be negative.
+ */
+static inline void pr_xsym(unsigned char *data, size_t len, int pos)
+{
+	unsigned char sym;
+
+	if (pos < len)
+		sym = data[pos];
+	else
+		sym = ' ';
+
+	pr_msg("%c", isprint(sym) ? sym : '.');
+}
+
+/*
+ * Hex-dump @size bytes of @data, 16 bytes per line, labelling each line
+ * with @addr plus the offset of its first byte.
+ *
+ * Runs of all-zero lines are compressed: the first zero line is printed,
+ * the second one is replaced with "*" and further ones are dropped.
+ *
+ * Only bytes inside the buffer are inspected when looking for zero
+ * lines, so a partial last line does not read past @data + @size and no
+ * alignment is required from @data.
+ */
+void print_data(unsigned long addr, unsigned char *data, size_t size)
+{
+	int i, j, addr_len;
+	unsigned zero_line = 0;
+
+	addr_len = nice_width_for(addr + size);
+
+	for (i = 0; i < size; i += 16) {
+		int all_zero = 1;
+
+		for (j = 0; j < 16 && i + j < size; j++) {
+			if (data[i + j]) {
+				all_zero = 0;
+				break;
+			}
+		}
+
+		if (all_zero) {
+			if (zero_line == 0)
+				zero_line = 1;
+			else {
+				if (zero_line == 1) {
+					pr_msg("*\n");
+					zero_line = 2;
+				}
+
+				continue;
+			}
+		} else
+			zero_line = 0;
+
+		pr_msg("%#0*lx: ", addr_len, addr + i);
+		for (j = 0; j < 8; j++)
+			pr_xdigi(data, size, i + j);
+		pr_msg(" ");
+		for (j = 8; j < 16; j++)
+			pr_xdigi(data, size, i + j);
+
+		pr_msg(" |");
+		for (j = 0; j < 8; j++)
+			pr_xsym(data, size, i + j);
+		pr_msg(" ");
+		for (j = 8; j < 16; j++)
+			pr_xsym(data, size, i + j);
+
+		pr_msg("|\n");
+	}
+}
+
+/*
+ * Consume @length bytes of raw payload from @img.
+ *
+ * With @show unset the payload is skipped by seeking over it. Otherwise
+ * it is read into a temporary buffer and hex-dumped with print_data()
+ * starting at address 0; nothing is dumped when the allocation or the
+ * read fails.
+ */
+void print_image_data(struct cr_img *img, unsigned int length, int show)
+{
+	unsigned char *payload;
+
+	if (!show) {
+		lseek(img_raw_fd(img), length, SEEK_CUR);
+		return;
+	}
+
+	pr_msg("\n");
+
+	payload = xmalloc(length);
+	if (!payload)
+		return;
+
+	if (read_img_buf(img, payload, length) >= 0)
+		print_data(0, payload, length);
+
+	xfree(payload);
+}
+
+/*
+ * Payload handler used for the pagemap image: show the PB_PAGEMAP
+ * entries of @img, formatting nr_pages as unsigned. @obj is not used.
+ */
+static void show_pagemaps(struct cr_img *img, void *obj)
+{
+ pb_show_plain_pretty(img, PB_PAGEMAP, "nr_pages:%u");
+}
+
+/*
+ * Print si_signo and si_code of every siginfo entry stored in @img,
+ * framed by the CR_FD_SIGNAL image head and tail.
+ *
+ * The payload of an entry is copied into a local siginfo_t instead of
+ * being accessed through a cast pointer, and entries too short to hold a
+ * siginfo_t are reported and skipped. This avoids reading past the
+ * payload and does not depend on its alignment.
+ */
+void show_siginfo(struct cr_img *img)
+{
+	int ret;
+
+	pr_img_head(CR_FD_SIGNAL);
+	while (1) {
+		SiginfoEntry *sie;
+		siginfo_t info;
+
+		ret = pb_read_one_eof(img, &sie, PB_SIGINFO);
+		if (ret <= 0)
+			break;
+
+		if (sie->siginfo.len < sizeof(info)) {
+			pr_err("Truncated siginfo entry\n");
+			siginfo_entry__free_unpacked(sie, NULL);
+			continue;
+		}
+
+		memcpy(&info, sie->siginfo.data, sizeof(info));
+		pr_msg("signal: si_signo=%d si_code=%x\n",
+		       info.si_signo, info.si_code);
+		siginfo_entry__free_unpacked(sie, NULL);
+
+	}
+	pr_img_tail(CR_FD_SIGNAL);
+}
+
+/*
+ * Fill @item from the pstree image entry @e: the virtual pid, the number
+ * of threads and a freshly allocated array with the virtual pid of each
+ * thread.
+ *
+ * Returns 0 on success and -1 when the thread array cannot be allocated.
+ */
+static int pstree_item_from_pb(PstreeEntry *e, struct pstree_item *item)
+{
+	int t;
+
+	item->pid.virt = e->pid;
+	item->nr_threads = e->n_threads;
+
+	item->threads = xzalloc(sizeof(struct pid) * e->n_threads);
+	if (item->threads == NULL)
+		return -1;
+
+	t = 0;
+	while (t < item->nr_threads) {
+		item->threads[t].virt = e->threads[t];
+		t++;
+	}
+
+	return 0;
+}
+
+/*
+ * Payload handler for pstree entries: turn the entry into a newly
+ * allocated pstree_item and append it to pstree_list. The entry is
+ * silently dropped when an allocation fails.
+ */
+static void pstree_handler(struct cr_img *img, void *obj)
+{
+	struct pstree_item *node;
+
+	node = xzalloc(sizeof(struct pstree_item));
+	if (node == NULL)
+		return;
+
+	if (pstree_item_from_pb((PstreeEntry *)obj, node) == 0) {
+		list_add_tail(&node->sibling, &pstree_list);
+		return;
+	}
+
+	xfree(node);
+}
+
+/*
+ * Show the pstree image @img; with @collect set every entry is
+ * additionally passed to pstree_handler().
+ */
+static void show_collect_pstree(struct cr_img *img, int collect)
+{
+	if (collect)
+		pb_show_plain_payload_pretty(img, PB_PSTREE,
+					     pstree_handler, "*:%d");
+	else
+		pb_show_plain_payload_pretty(img, PB_PSTREE,
+					     NULL, "*:%d");
+}
+
+/*
+ * Human readable name of task state @state; "UNKNOWN" for anything but
+ * TASK_ALIVE and TASK_DEAD.
+ */
+static inline char *task_state_str(int state)
+{
+	if (state == TASK_ALIVE)
+		return "running/sleeping";
+
+	if (state == TASK_DEAD)
+		return "zombie";
+
+	return "UNKNOWN";
+}
+
+/*
+ * Print the general purpose registers stored in @regs, up to four
+ * name/value pairs per line.
+ *
+ * pr_regs4() and pr_regs3() stringify the member names for the labels
+ * and print each member of @s as a 64-bit hex value.
+ */
+static void show_core_regs(UserX86RegsEntry *regs)
+{
+#define pr_regs4(s, n1, n2, n3, n4) \
+ pr_msg("\t%8s: 0x%-16"PRIx64" " \
+ "%8s: 0x%-16"PRIx64" " \
+ "%8s: 0x%-16"PRIx64" " \
+ "%8s: 0x%-16"PRIx64"\n", \
+ #n1, s->n1, \
+ #n2, s->n2, \
+ #n3, s->n3, \
+ #n4, s->n4)
+
+#define pr_regs3(s, n1, n2, n3) \
+ pr_msg("\t%8s: 0x%-16"PRIx64" " \
+ "%8s: 0x%-16"PRIx64" " \
+ "%8s: 0x%-16"PRIx64"\n", \
+ #n1, s->n1, \
+ #n2, s->n2, \
+ #n3, s->n3)
+
+ pr_msg("\t---[ GP registers set ]---\n");
+
+ pr_regs4(regs, cs, ip, ds, es);
+ pr_regs4(regs, ss, sp, fs, gs);
+ pr_regs4(regs, di, si, dx, cx);
+ pr_regs4(regs, ax, r8, r9, r10);
+ pr_regs4(regs, r11, r12, r13, r14);
+ pr_regs3(regs, r15, bp, bx);
+ pr_regs4(regs, orig_ax, flags, fs_base, gs_base);
+ pr_msg("\n");
+}
+
+/*
+ * Print the thread info block: clear_tid_addr followed by the general
+ * purpose registers. Does nothing when @thread_info is NULL.
+ */
+void show_thread_info(ThreadInfoX86 *thread_info)
+{
+	if (thread_info == NULL)
+		return;
+
+	pr_msg("\t---[ Thread info ]---\n");
+	pr_msg("\tclear_tid_addr: 0x%"PRIx64"\n", thread_info->clear_tid_addr);
+	pr_msg("\n");
+
+	show_core_regs(thread_info->gpregs);
+}
+
+/*
+ * Magic numbers of some non-CRIU file formats, used by try_hint_magic()
+ * to suggest what an unrecognised file may be. A magic matches when
+ * (value & mask) == magic. The table ends with an all-zero entry, whose
+ * NULL hint stops the lookup.
+ */
+static struct {
+ u32 magic;
+ u32 mask;
+ char *hint;
+} magic_hints[] = {
+ { .magic = 0x45311224, .mask = 0xffffffff, .hint = "ip route dump", },
+ { .magic = 0x47361222, .mask = 0xffffffff, .hint = "ip ifaddr dump", },
+ { .magic = 0x00008b1f, .mask = 0x0000ffff, .hint = "gzip file", },
+ { },
+};
+
+/*
+ * Print a hint for every magic_hints entry that @magic matches after
+ * masking.
+ */
+static void try_hint_magic(u32 magic)
+{
+	int idx = 0;
+
+	while (magic_hints[idx].hint != 0) {
+		u32 masked = magic & magic_hints[idx].mask;
+
+		if (masked == magic_hints[idx].magic)
+			pr_msg("This can be %s\n", magic_hints[idx].hint);
+		idx++;
+	}
+}
+
+/*
+ * Builders for show_infos entries that need no payload handler and no
+ * format string. SHOW_PLAIN and SHOW_PLAINS pass false and SHOW_VERT
+ * passes true as the third initializer, which cr_parse_fd() hands to
+ * do_pb_show_plain() as the 'single' argument.
+ */
+#define SHOW_PLAIN(name) { name##_MAGIC, PB_##name, false, NULL, NULL, }
+/* nothing special behind this -S, just to avoid heavy patching */
+#define SHOW_PLAINS(name) { name##S_MAGIC, PB_##name, false, NULL, NULL, }
+#define SHOW_VERT(name) { name##_MAGIC, PB_##name, true, NULL, NULL, }
+
+/*
+ * Image magic -> how to show it. cr_parse_fd() scans the table for the
+ * magic of an image and passes the remaining initializers of the match
+ * to do_pb_show_plain(). The all-zero entry terminates the table.
+ */
+static struct show_image_info show_infos[] = {
+ SHOW_VERT(INVENTORY),
+ SHOW_VERT(CORE),
+ SHOW_VERT(IDS),
+ SHOW_VERT(CREDS),
+ SHOW_VERT(UTSNS),
+ SHOW_VERT(IPC_VAR),
+ SHOW_VERT(FS),
+ SHOW_VERT(GHOST_FILE),
+ SHOW_VERT(MM),
+ SHOW_VERT(CGROUP),
+
+ SHOW_PLAINS(REG_FILE),
+ SHOW_PLAINS(NS_FILE),
+ SHOW_PLAIN(EVENTFD_FILE),
+ SHOW_PLAIN(EVENTPOLL_FILE),
+ SHOW_PLAIN(EVENTPOLL_TFD),
+ SHOW_PLAIN(SIGNALFD),
+ SHOW_PLAIN(TIMERFD),
+ SHOW_PLAIN(INOTIFY_FILE),
+ SHOW_PLAIN(INOTIFY_WD),
+ SHOW_PLAIN(FANOTIFY_FILE),
+ SHOW_PLAIN(FANOTIFY_MARK),
+ SHOW_PLAINS(VMA),
+ SHOW_PLAINS(PIPE),
+ SHOW_PLAIN(FIFO),
+ SHOW_PLAIN(SIGACT),
+ SHOW_PLAIN(NETLINK_SK),
+ SHOW_PLAIN(REMAP_FPATH),
+ SHOW_PLAINS(MNT),
+ SHOW_PLAINS(TTY_FILE),
+ SHOW_PLAIN(TTY_INFO),
+ SHOW_PLAIN(RLIMIT),
+ SHOW_PLAIN(TUNFILE),
+ SHOW_PLAINS(EXT_FILE),
+ SHOW_PLAIN(IRMAP_CACHE),
+ SHOW_PLAIN(CPUINFO),
+ SHOW_PLAIN(USERNS),
+ SHOW_PLAIN(NETNS),
+
+ /* Entries with a custom format string */
+ { FILE_LOCKS_MAGIC, PB_FILE_LOCK, false, NULL, "3:%u", },
+ { TCP_STREAM_MAGIC, PB_TCP_STREAM, true, show_tcp_stream, "1:%u 2:%u 3:%u 4:%u 12:%u", },
+ { STATS_MAGIC, PB_STATS, true, NULL, "1.1:%u 1.2:%u 1.3:%u 1.4:%u 1.5:%Lu 1.6:%Lu 1.7:%Lu 1.8:%u", },
+ { FDINFO_MAGIC, PB_FDINFO, false, NULL, "flags:%#o fd:%d", },
+ { UNIXSK_MAGIC, PB_UNIX_SK, false, NULL, "1:%#x 2:%#x 3:%d 4:%d 5:%d 6:%d 7:%d 8:%#x 11:S", },
+ { INETSK_MAGIC, PB_INET_SK, false, NULL, "1:%#x 2:%#x 3:%d 4:%d 5:%d 6:%d 7:%d 8:%d 9:%2x 11:A 12:A", },
+ { PACKETSK_MAGIC, PB_PACKET_SOCK, false, NULL, "5:%d", },
+ { ITIMERS_MAGIC, PB_ITIMER, false, NULL, "*:%Lu", },
+ { POSIX_TIMERS_MAGIC, PB_POSIX_TIMER, false, NULL, "*:%d 5:%Lu 7:%Lu 8:%lu 9:%Lu 10:%Lu", },
+ { NETDEV_MAGIC, PB_NETDEV, false, NULL, "2:%d", },
+
+ /* Entries with a payload handler */
+ { PAGEMAP_MAGIC, PB_PAGEMAP_HEAD, true, show_pagemaps, NULL, },
+ { PIPES_DATA_MAGIC, PB_PIPE_DATA, false, pipe_data_handler, NULL, },
+ { FIFO_DATA_MAGIC, PB_PIPE_DATA, false, pipe_data_handler, NULL, },
+ { SK_QUEUES_MAGIC, PB_SK_QUEUES, false, sk_queue_data_handler, NULL, },
+ { IPCNS_SHM_MAGIC, PB_IPC_SHM, false, ipc_shm_handler, NULL, },
+ { IPCNS_SEM_MAGIC, PB_IPC_SEM, false, ipc_sem_handler, NULL, },
+ { IPCNS_MSG_MAGIC, PB_IPCNS_MSG_ENT, false, ipc_msg_handler, NULL, },
+
+ { }
+};
+
+/*
+ * Show the single image file named by opts.show_dump_file.
+ *
+ * The file is opened, wrapped into an image object, its first magic is
+ * read and the rest is left to cr_parse_fd().
+ *
+ * Returns the result of cr_parse_fd(), or -1 when the file cannot be
+ * opened, wrapped or read.
+ */
+static int cr_parse_file(void)
+{
+	struct cr_img *img = NULL;
+	int fd, ret = -1;
+	u32 magic;
+
+	fd = open(opts.show_dump_file, O_RDONLY);
+	if (fd < 0) {
+		pr_perror("Can't open %s", opts.show_dump_file);
+	} else {
+		img = img_from_fd(fd);
+		if (img && read_img(img, &magic) >= 0)
+			ret = cr_parse_fd(img, magic);
+	}
+
+	/* Without an image object the raw descriptor is closed directly */
+	if (img)
+		close_image(img);
+	else
+		close_safe(&fd);
+
+	return ret;
+}
+
+/*
+ * Show the contents of @img, whose first magic @magic has already been
+ * read by the caller.
+ *
+ * When @magic is one of the common/service magics the real, second magic
+ * is read from the image first. Pstree and signal images have dedicated
+ * printers; everything else is looked up in show_infos.
+ *
+ * Returns 0 when the image was recognised and shown, and -1 when the
+ * second magic cannot be read or the magic is unknown (a hint about the
+ * possible file type is printed in the latter case).
+ */
+int cr_parse_fd(struct cr_img *img, u32 magic)
+{
+	int ret = 0, i;
+
+	if (magic == IMG_COMMON_MAGIC || magic == IMG_SERVICE_MAGIC) {
+		/* Not being able to read the real magic is an error */
+		if (read_img(img, &magic) < 0) {
+			ret = -1;
+			goto out;
+		}
+	}
+
+	if (magic == PSTREE_MAGIC) {
+		show_collect_pstree(img, 0);
+		goto out;
+	}
+
+	if (magic == SIGNAL_MAGIC || magic == PSIGNAL_MAGIC) {
+		show_siginfo(img);
+		goto out;
+	}
+
+	for (i = 0; show_infos[i].magic; i++) {
+		struct show_image_info *si;
+
+		si = &show_infos[i];
+		if (si->magic != magic)
+			continue;
+
+		do_pb_show_plain(img, si->pb_type, si->single,
+				 si->payload, si->fmt);
+		goto out;
+	}
+
+	ret = -1;
+	pr_err("Unknown magic %#x in %s\n",
+	       magic, opts.show_dump_file);
+	try_hint_magic(magic);
+
+out:
+	return ret;
+}
+
+/*
+ * Show every image that belongs to the task described by @item.
+ *
+ * Printed in order: the core image of the task, the core images of its
+ * other threads (only when the task has more than one thread), the
+ * remaining per-task images of the image set except core and ids, the
+ * rlimit image when it can be opened, and the fdinfo image named by the
+ * files_id of the ids image when both can be read.
+ *
+ * Returns 0 on success and -1 when the image set or a thread core image
+ * cannot be opened.
+ */
+static int cr_show_pstree_item(struct pstree_item *item)
+{
+	struct cr_imgset *cr_imgset;
+	struct cr_img *img;
+	TaskKobjIdsEntry *ids;
+	int ret = -1, i;
+
+	cr_imgset = cr_task_imgset_open(item->pid.virt, O_SHOW);
+	if (!cr_imgset)
+		return -1;
+
+	pr_msg("Task %d:\n", item->pid.virt);
+	pr_msg("----------------------------------------\n");
+
+	cr_parse_fd(img_from_set(cr_imgset, CR_FD_CORE), CORE_MAGIC);
+
+	/* The loop is entered only for tasks with more than one thread */
+	for (i = 0; item->nr_threads > 1 && i < item->nr_threads; i++) {
+		/* The leader's core was shown above */
+		if (item->threads[i].virt == item->pid.virt)
+			continue;
+
+		img = open_image(CR_FD_CORE, O_SHOW, item->threads[i].virt);
+		if (!img)
+			goto outc;
+
+		pr_msg("Thread %d.%d:\n", item->pid.virt, item->threads[i].virt);
+		pr_msg("----------------------------------------\n");
+
+		cr_parse_fd(img, CORE_MAGIC);
+		close_image(img);
+	}
+
+	pr_msg("Resources for %d:\n", item->pid.virt);
+	pr_msg("----------------------------------------\n");
+	for (i = _CR_FD_TASK_FROM + 1; i < _CR_FD_TASK_TO; i++) {
+		if (i == CR_FD_CORE || i == CR_FD_IDS)
+			continue;
+
+		pr_msg("* ");
+		pr_msg(imgset_template[i].fmt, item->pid.virt);
+		pr_msg(":\n");
+		cr_parse_fd(img_from_set(cr_imgset, i), imgset_template[i].magic);
+	}
+
+	img = open_image(CR_FD_RLIMIT, O_SHOW, item->pid.virt);
+	if (img) {
+		pr_msg("* ");
+		pr_msg(imgset_template[CR_FD_RLIMIT].fmt, item->pid.virt);
+		pr_msg(":\n");
+
+		cr_parse_fd(img, RLIMIT_MAGIC);
+		close_image(img);
+	}
+
+	if (pb_read_one(img_from_set(cr_imgset, CR_FD_IDS), &ids, PB_IDS) > 0) {
+		img = open_image(CR_FD_FDINFO, O_SHOW, ids->files_id);
+		if (img) {
+			pr_msg("* ");
+			pr_msg(imgset_template[CR_FD_FDINFO].fmt, ids->files_id);
+			pr_msg(":\n");
+
+			cr_parse_fd(img, FDINFO_MAGIC);
+			close_image(img);
+		}
+
+		task_kobj_ids_entry__free_unpacked(ids, NULL);
+	}
+
+	pr_msg("---[ end of task %d ]---\n", item->pid.virt);
+
+	ret = 0;
+outc:
+	close_cr_imgset(&cr_imgset);
+	return ret;
+}
+
+/*
+ * Show the images of the single task with virtual pid @pid.
+ *
+ * The pstree image is scanned for the entry of @pid, which is converted
+ * into a temporary pstree_item and handed to cr_show_pstree_item().
+ *
+ * Returns the result of cr_show_pstree_item() when the task is found,
+ * -1 when the pstree image cannot be opened or the item cannot be
+ * built, and the result of pb_read_one_eof() (0 at end of file, negative
+ * on error) when no entry matches.
+ */
+static int cr_show_pid(int pid)
+{
+	int ret;
+	struct cr_img *img;
+	struct pstree_item item;
+
+	img = open_image(CR_FD_PSTREE, O_SHOW);
+	if (!img)
+		return -1;
+
+	/* Start from a known state: only some fields get filled in below */
+	memset(&item, 0, sizeof(item));
+
+	while (1) {
+		PstreeEntry *pe;
+
+		ret = pb_read_one_eof(img, &pe, PB_PSTREE);
+		if (ret <= 0) {
+			close_image(img);
+			return ret;
+		}
+
+		if (pe->pid == pid) {
+			ret = pstree_item_from_pb(pe, &item);
+			pstree_entry__free_unpacked(pe, NULL);
+			break;
+		}
+
+		pstree_entry__free_unpacked(pe, NULL);
+	}
+
+	close_image(img);
+
+	/* Without the thread array the item must not be walked */
+	if (ret)
+		return -1;
+
+	ret = cr_show_pstree_item(&item);
+	xfree(item.threads);
+
+	return ret;
+}
+
+/*
+ * Show the whole dump: the pstree image, the namespaces of the first
+ * task and then the images of every task in pstree order.
+ *
+ * The items collected into pstree_list are released before returning.
+ *
+ * Returns 0 on success. Returns -1 when the pstree image cannot be
+ * opened, when no task was collected from it or when showing a task
+ * fails, and the result of try_show_namespaces() when that fails.
+ */
+static int cr_show_all(void)
+{
+	struct pstree_item *item = NULL, *tmp;
+	int ret = -1, pid;
+	struct cr_img *img;
+
+	img = open_image(CR_FD_PSTREE, O_SHOW);
+	if (!img)
+		goto out;
+	show_collect_pstree(img, 1);
+	close_image(img);
+
+	/* list_first_entry() is meaningless on an empty list */
+	if (list_empty(&pstree_list)) {
+		pr_err("No tasks found in the pstree image\n");
+		goto out;
+	}
+
+	pid = list_first_entry(&pstree_list, struct pstree_item, sibling)->pid.virt;
+	ret = try_show_namespaces(pid);
+	if (ret)
+		goto out;
+
+	list_for_each_entry(item, &pstree_list, sibling)
+		if (cr_show_pstree_item(item)) {
+			ret = -1;
+			break;
+		}
+
+out:
+	list_for_each_entry_safe(item, tmp, &pstree_list, sibling) {
+		list_del(&item->sibling);
+		xfree(item->threads);
+		xfree(item);
+	}
+	return ret;
+}
+
+/*
+ * Entry point of the "show" action.
+ *
+ * When stdout is a terminal only a deprecation notice pointing to CRIT
+ * is printed and -1 is returned. Otherwise the single file named by
+ * opts.show_dump_file is shown if set, else the task @pid if non-zero,
+ * else the whole dump.
+ */
+int cr_show(int pid)
+{
+	if (isatty(STDOUT_FILENO)) {
+		pr_msg("The \"show\" action is deprecated by the CRIT utility.\n");
+		pr_msg("To view an image use the \"crit decode -i $name --pretty\" command.\n");
+		return -1;
+	}
+
+	if (opts.show_dump_file)
+		return cr_parse_file();
+
+	return pid ? cr_show_pid(pid) : cr_show_all();
+}
diff --git a/criu/crtools.c b/criu/crtools.c
new file mode 100644
index 000000000000..44060293e730
--- /dev/null
+++ b/criu/crtools.c
@@ -0,0 +1,836 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <limits.h>
+#include <unistd.h>
+#include <errno.h>
+#include <getopt.h>
+#include <string.h>
+#include <ctype.h>
+#include <sched.h>
+
+#include <fcntl.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+
+#include <dlfcn.h>
+
+#include "asm/types.h"
+
+#include "compiler.h"
+#include "crtools.h"
+#include "cr_options.h"
+#include "sockets.h"
+#include "files.h"
+#include "sk-inet.h"
+#include "net.h"
+#include "version.h"
+#include "page-xfer.h"
+#include "tty.h"
+#include "file-lock.h"
+#include "cr-service.h"
+#include "plugin.h"
+#include "mount.h"
+#include "cgroup.h"
+#include "cpu.h"
+#include "action-scripts.h"
+#include "irmap.h"
+#include "fault-injection.h"
+#include "lsm.h"
+#include "proc_parse.h"
+
+#include "setproctitle.h"
+
+/* Global option block: reset by init_opts() and filled in by main() while the command line is parsed. */
+struct cr_options opts;
+
+/*
+ * Reset the global opts to its defaults: everything is zeroed first, then
+ * the fields whose default is not zero are set and every option list is
+ * initialised as empty.
+ */
+void init_opts(void)
+{
+	memset(&opts, 0, sizeof(opts));
+
+	/* Scalar defaults that differ from the zero fill above */
+	opts.final_state = TASK_DEAD;
+	opts.cpu_cap = CPU_CAP_DEFAULT;
+	opts.manage_cgroups = CG_MODE_DEFAULT;
+	opts.ps_socket = -1;
+	opts.ghost_limit = DEFAULT_GHOST_LIMIT;
+	opts.timeout = DEFAULT_TIMEOUT;
+
+	/* List heads must be self-linked before anything is added to them */
+	INIT_LIST_HEAD(&opts.ext_unixsk_ids);
+	INIT_LIST_HEAD(&opts.veth_pairs);
+	INIT_LIST_HEAD(&opts.scripts);
+	INIT_LIST_HEAD(&opts.ext_mounts);
+	INIT_LIST_HEAD(&opts.inherit_fds);
+	INIT_LIST_HEAD(&opts.external);
+	INIT_LIST_HEAD(&opts.new_cgroup_roots);
+	INIT_LIST_HEAD(&opts.irmap_scan_paths);
+}
+
+/*
+ * Parse a comma separated list of namespace names ("uts", "ipc", "mnt",
+ * "pid", "net") and OR the matching CLONE_NEW* flags into
+ * opts.rst_namespaces_flags.
+ *
+ * Returns 0 on success. On an unknown, too short or badly delimited name
+ * the offending remainder is reported and -1 is returned; flags set by
+ * the names parsed before it are kept.
+ */
+static int parse_ns_string(const char *ptr)
+{
+	const char *end = ptr + strlen(ptr);
+
+	do {
+		/*
+		 * Every name is exactly three characters long. Make sure
+		 * that many are left before looking at ptr[3], otherwise
+		 * an empty or short argument is read past its terminator.
+		 */
+		if (end - ptr < 3)
+			goto bad_ns;
+		if (ptr[3] != ',' && ptr[3] != '\0')
+			goto bad_ns;
+		if (!strncmp(ptr, "uts", 3))
+			opts.rst_namespaces_flags |= CLONE_NEWUTS;
+		else if (!strncmp(ptr, "ipc", 3))
+			opts.rst_namespaces_flags |= CLONE_NEWIPC;
+		else if (!strncmp(ptr, "mnt", 3))
+			opts.rst_namespaces_flags |= CLONE_NEWNS;
+		else if (!strncmp(ptr, "pid", 3))
+			opts.rst_namespaces_flags |= CLONE_NEWPID;
+		else if (!strncmp(ptr, "net", 3))
+			opts.rst_namespaces_flags |= CLONE_NEWNET;
+		else
+			goto bad_ns;
+		/* Skip the name and the delimiter that follows it */
+		ptr += 4;
+	} while (ptr < end);
+	return 0;
+
+bad_ns:
+	pr_msg("Error: unknown namespace: %s\n", ptr);
+	return -1;
+}
+
+/*
+ * Parse the argument of --cpu-cap into @opts->cpu_cap.
+ *
+ * A NULL @optarg turns on CPU_CAP_ALL. Otherwise the string is scanned for
+ * the keywords "fpu", "all", "none", "cpu" and "ins". A '^' toggles the
+ * inversion state (clear the bits instead of setting them) and a ','
+ * resets it. Keywords are matched by prefix only, no separator is required
+ * between them.
+ *
+ * Returns 0 on success, -1 after logging the unparsed remainder when an
+ * unknown keyword is met.
+ */
+static int parse_cpu_cap(struct cr_options *opts, const char *optarg)
+{
+ bool inverse = false;
+
+ /* Set the given capability bits, or clear them when __inverse is true */
+#define ____cpu_set_cap(__opts, __cap, __inverse) \
+ do { \
+ if ((__inverse)) \
+ (__opts)->cpu_cap &= ~(__cap); \
+ else \
+ (__opts)->cpu_cap |= (__cap); \
+ } while (0)
+
+ if (!optarg) {
+ ____cpu_set_cap(opts, CPU_CAP_ALL, false);
+ return 0;
+ }
+
+ while (*optarg) {
+ if (optarg[0] == '^') {
+ inverse = !inverse;
+ optarg++;
+ continue;
+ } else if (optarg[0] == ',') {
+ inverse = false;
+ optarg++;
+ continue;
+ }
+
+ if (!strncmp(optarg, "fpu", 3)) {
+ ____cpu_set_cap(opts, CPU_CAP_FPU, inverse);
+ optarg += 3;
+ } else if (!strncmp(optarg, "all", 3)) {
+ ____cpu_set_cap(opts, CPU_CAP_ALL, inverse);
+ optarg += 3;
+ } else if (!strncmp(optarg, "none", 4)) {
+ /* "none" overwrites the whole mask rather than editing bits; inverted it means "all" */
+ if (inverse)
+ opts->cpu_cap = CPU_CAP_ALL;
+ else
+ opts->cpu_cap = CPU_CAP_NONE;
+ optarg += 4;
+ } else if (!strncmp(optarg, "cpu", 3)) {
+ ____cpu_set_cap(opts, CPU_CAP_CPU, inverse);
+ optarg += 3;
+ } else if (!strncmp(optarg, "ins", 3)) {
+ ____cpu_set_cap(opts, CPU_CAP_INS, inverse);
+ optarg += 3;
+ } else
+ goto Esyntax;
+ }
+#undef ____cpu_set_cap
+
+ return 0;
+
+Esyntax:
+ pr_err("Unknown FPU mode `%s' selected\n", optarg);
+ return -1;
+}
+
+/*
+ * Translate the optional argument of --manage-cgroups into
+ * @opts->manage_cgroups.
+ *
+ * A missing argument selects the soft mode; "none", "props", "soft",
+ * "full" and "strict" select the mode of the same name. Any other value is
+ * logged and rejected with -1, leaving @opts untouched. Returns 0 on
+ * success.
+ */
+static int parse_manage_cgroups(struct cr_options *opts, const char *optarg)
+{
+	if (!optarg || !strcmp(optarg, "soft")) {
+		opts->manage_cgroups = CG_MODE_SOFT;
+	} else if (!strcmp(optarg, "none")) {
+		opts->manage_cgroups = CG_MODE_NONE;
+	} else if (!strcmp(optarg, "props")) {
+		opts->manage_cgroups = CG_MODE_PROPS;
+	} else if (!strcmp(optarg, "full")) {
+		opts->manage_cgroups = CG_MODE_FULL;
+	} else if (!strcmp(optarg, "strict")) {
+		opts->manage_cgroups = CG_MODE_STRICT;
+	} else {
+		pr_err("Unknown cgroups mode `%s' selected\n", optarg);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Convert a size argument to a byte count.
+ *
+ * The leading number is read with atol(). If the string contains a 'K',
+ * 'M' or 'G' anywhere (checked in that order) the number is scaled with
+ * KILO(), MEGA() or GIGA() respectively, otherwise it is used as is. No
+ * validation or overflow checking is done.
+ */
+static size_t parse_size(char *optarg)
+{
+	long val = atol(optarg);
+
+	if (strchr(optarg, 'K'))
+		return (size_t)KILO(val);
+	if (strchr(optarg, 'M'))
+		return (size_t)MEGA(val);
+	if (strchr(optarg, 'G'))
+		return (size_t)GIGA(val);
+
+	return (size_t)val;
+}
+
+/*
+ * Record @key as an external resource id on opts.external.
+ *
+ * The string is not copied: the new entry keeps the @key pointer, so the
+ * caller must keep it valid for as long as the list is used.
+ *
+ * Returns 0 on success, -1 if the entry cannot be allocated.
+ */
+int add_external(char *key)
+{
+	struct external *entry = xmalloc(sizeof(*entry));
+
+	if (entry == NULL)
+		return -1;
+
+	entry->id = key;
+	list_add(&entry->node, &opts.external);
+	return 0;
+}
+
+/*
+ * criu entry point.
+ *
+ * Parses the command line into the global opts, opens the images
+ * directory (except for "service"), changes to the work directory, sets up
+ * logging and finally dispatches on the command word argv[optind].
+ *
+ * "criu swrk <fd>" is handled before option parsing and is meant for
+ * libcriu only.
+ *
+ * Returns 0 on success. Option parsing and setup failures return 1; the
+ * swrk, dump, service and cpuinfo commands return the value of their
+ * handler unchanged, the other commands map a handler failure to 1.
+ */
+int main(int argc, char *argv[], char *envp[])
+{
+	pid_t pid = 0, tree_id = 0;
+	int ret = -1;
+	bool usage_error = true;
+	bool has_exec_cmd = false;
+	int opt, idx;
+	int log_level = LOG_UNSET;
+	char *imgs_dir = ".";
+	char *work_dir = NULL;
+	static const char short_opts[] = "dSsRf:F:t:p:hcD:o:n:v::x::Vr:jlW:L:M:";
+	static struct option long_opts[] = {
+		{ "tree", required_argument, 0, 't' },
+		{ "pid", required_argument, 0, 'p' },
+		{ "leave-stopped", no_argument, 0, 's' },
+		{ "leave-running", no_argument, 0, 'R' },
+		{ "restore-detached", no_argument, 0, 'd' },
+		{ "restore-sibling", no_argument, 0, 'S' },
+		{ "daemon", no_argument, 0, 'd' },
+		{ "contents", no_argument, 0, 'c' },
+		{ "file", required_argument, 0, 'f' },
+		{ "fields", required_argument, 0, 'F' },
+		{ "images-dir", required_argument, 0, 'D' },
+		{ "work-dir", required_argument, 0, 'W' },
+		{ "log-file", required_argument, 0, 'o' },
+		{ "namespaces", required_argument, 0, 'n' },
+		{ "root", required_argument, 0, 'r' },
+		{ USK_EXT_PARAM, optional_argument, 0, 'x' },
+		{ "help", no_argument, 0, 'h' },
+		{ SK_EST_PARAM, no_argument, 0, 1042 },
+		{ "close", required_argument, 0, 1043 },
+		{ "log-pid", no_argument, 0, 1044 },
+		{ "version", no_argument, 0, 'V' },
+		{ "evasive-devices", no_argument, 0, 1045 },
+		{ "pidfile", required_argument, 0, 1046 },
+		{ "veth-pair", required_argument, 0, 1047 },
+		{ "action-script", required_argument, 0, 1049 },
+		{ LREMAP_PARAM, no_argument, 0, 1041 },
+		{ OPT_SHELL_JOB, no_argument, 0, 'j' },
+		{ OPT_FILE_LOCKS, no_argument, 0, 'l' },
+		{ "page-server", no_argument, 0, 1050 },
+		{ "address", required_argument, 0, 1051 },
+		{ "port", required_argument, 0, 1052 },
+		{ "prev-images-dir", required_argument, 0, 1053 },
+		{ "ms", no_argument, 0, 1054 },
+		{ "track-mem", no_argument, 0, 1055 },
+		{ "auto-dedup", no_argument, 0, 1056 },
+		{ "libdir", required_argument, 0, 'L' },
+		{ "cpu-cap", optional_argument, 0, 1057 },
+		{ "force-irmap", no_argument, 0, 1058 },
+		{ "ext-mount-map", required_argument, 0, 'M' },
+		{ "exec-cmd", no_argument, 0, 1059 },
+		{ "manage-cgroups", optional_argument, 0, 1060 },
+		{ "cgroup-root", required_argument, 0, 1061 },
+		{ "inherit-fd", required_argument, 0, 1062 },
+		{ "feature", required_argument, 0, 1063 },
+		{ "skip-mnt", required_argument, 0, 1064 },
+		{ "enable-fs", required_argument, 0, 1065 },
+		{ "enable-external-sharing", no_argument, 0, 1066 },
+		{ "enable-external-masters", no_argument, 0, 1067 },
+		{ "freeze-cgroup", required_argument, 0, 1068 },
+		{ "ghost-limit", required_argument, 0, 1069 },
+		{ "irmap-scan-path", required_argument, 0, 1070 },
+		{ "lsm-profile", required_argument, 0, 1071 },
+		{ "timeout", required_argument, 0, 1072 },
+		{ "external", required_argument, 0, 1073 },
+		{ },
+	};
+
+	BUILD_BUG_ON(PAGE_SIZE != PAGE_IMAGE_SIZE);
+
+	if (fault_injection_init())
+		return 1;
+
+	cr_pb_init();
+	setproctitle_init(argc, argv, envp);
+
+	if (argc < 2)
+		goto usage;
+
+	init_opts();
+
+	if (init_service_fd())
+		return 1;
+
+	if (!strcmp(argv[1], "swrk")) {
+		if (argc < 3)
+			goto usage;
+		/*
+		 * This is to start criu service worker from libcriu calls.
+		 * The usage is "criu swrk <fd>" and is not for CLI/scripts.
+		 * The arguments semantics can change at any time with the
+		 * corresponding lib call change.
+		 */
+		opts.swrk_restore = true;
+		return cr_service_work(atoi(argv[2]));
+	}
+
+	while (1) {
+		/* idx stays negative when a short option was matched, see bad_arg */
+		idx = -1;
+		opt = getopt_long(argc, argv, short_opts, long_opts, &idx);
+		if (opt == -1)
+			break;
+
+		switch (opt) {
+		case 's':
+			opts.final_state = TASK_STOPPED;
+			break;
+		case 'R':
+			opts.final_state = TASK_ALIVE;
+			break;
+		case 'x':
+			if (optarg && unix_sk_ids_parse(optarg) < 0)
+				return 1;
+			opts.ext_unix_sk = true;
+			break;
+		case 'p':
+			pid = atoi(optarg);
+			if (pid <= 0)
+				goto bad_arg;
+			break;
+		case 't':
+			tree_id = atoi(optarg);
+			if (tree_id <= 0)
+				goto bad_arg;
+			break;
+		case 'c':
+			opts.show_pages_content = true;
+			break;
+		case 'f':
+			opts.show_dump_file = optarg;
+			break;
+		case 'F':
+			opts.show_fmt = optarg;
+			break;
+		case 'r':
+			opts.root = optarg;
+			break;
+		case 'd':
+			opts.restore_detach = true;
+			break;
+		case 'S':
+			opts.restore_sibling = true;
+			break;
+		case 'D':
+			imgs_dir = optarg;
+			break;
+		case 'W':
+			work_dir = optarg;
+			break;
+		case 'o':
+			opts.output = optarg;
+			break;
+		case 'n':
+			if (parse_ns_string(optarg))
+				goto bad_arg;
+			break;
+		case 'v':
+			if (log_level == LOG_UNSET)
+				log_level = 0;
+			if (optarg) {
+				if (optarg[0] == 'v')
+					/* handle -vvvvv */
+					log_level += strlen(optarg) + 1;
+				else
+					log_level = atoi(optarg);
+			} else
+				log_level++;
+			break;
+		case 1041:
+			pr_info("Will allow link remaps on FS\n");
+			opts.link_remap_ok = true;
+			break;
+		case 1042:
+			pr_info("Will dump TCP connections\n");
+			opts.tcp_established_ok = true;
+			break;
+		case 1043: {
+			int fd;
+
+			fd = atoi(optarg);
+			pr_info("Closing fd %d\n", fd);
+			close(fd);
+			break;
+		}
+		case 1044:
+			opts.log_file_per_pid = 1;
+			break;
+		case 1045:
+			opts.evasive_devices = true;
+			break;
+		case 1046:
+			opts.pidfile = optarg;
+			break;
+		case 1047:
+			{
+				char *aux;
+
+				aux = strchr(optarg, '=');
+				if (aux == NULL)
+					goto bad_arg;
+
+				/* Split "IN=OUT" in place */
+				*aux = '\0';
+				if (veth_pair_add(optarg, aux + 1))
+					return 1;
+			}
+			break;
+		case 1049:
+			if (add_script(optarg, 0))
+				return 1;
+
+			break;
+		case 1050:
+			opts.use_page_server = true;
+			break;
+		case 1051:
+			opts.addr = optarg;
+			break;
+		case 1052:
+			opts.port = htons(atoi(optarg));
+			if (!opts.port)
+				goto bad_arg;
+			break;
+		case 'j':
+			opts.shell_job = true;
+			break;
+		case 'l':
+			opts.handle_file_locks = true;
+			break;
+		case 1053:
+			opts.img_parent = optarg;
+			break;
+		case 1055:
+			opts.track_mem = true;
+			break;
+		case 1056:
+			opts.auto_dedup = true;
+			break;
+		case 1057:
+			if (parse_cpu_cap(&opts, optarg))
+				goto usage;
+			break;
+		case 1058:
+			opts.force_irmap = true;
+			break;
+		case 1054:
+			opts.check_ms_kernel = true;
+			break;
+		case 'L':
+			opts.libdir = optarg;
+			break;
+		case 1059:
+			has_exec_cmd = true;
+			break;
+		case 1060:
+			if (parse_manage_cgroups(&opts, optarg))
+				goto usage;
+			break;
+		case 1061:
+			{
+				char *path, *ctl;
+
+				/* "[controller:]/newroot", split in place */
+				path = strchr(optarg, ':');
+				if (path) {
+					*path = '\0';
+					path++;
+					ctl = optarg;
+				} else {
+					path = optarg;
+					ctl = NULL;
+				}
+
+				if (new_cg_root_add(ctl, path))
+					return 1;
+			}
+			break;
+		case 1062:
+			if (inherit_fd_parse(optarg) < 0)
+				return 1;
+			break;
+		case 1063:
+			if (check_add_feature(optarg) < 0)
+				return 1;
+			break;
+		case 1064:
+			if (!add_skip_mount(optarg))
+				return 1;
+			break;
+		case 1065:
+			if (!add_fsname_auto(optarg))
+				return 1;
+			break;
+		case 1066:
+			opts.enable_external_sharing = true;
+			break;
+		case 1067:
+			opts.enable_external_masters = true;
+			break;
+		case 1068:
+			opts.freeze_cgroup = optarg;
+			break;
+		case 1069:
+			opts.ghost_limit = parse_size(optarg);
+			break;
+		case 1070:
+			if (irmap_scan_path_add(optarg))
+				return 1;
+			break;
+		case 1071:
+			if (parse_lsm_arg(optarg) < 0)
+				return 1;
+			break;
+		case 1072:
+			opts.timeout = atoi(optarg);
+			break;
+		case 'M':
+			{
+				char *aux;
+
+				if (strcmp(optarg, "auto") == 0) {
+					opts.autodetect_ext_mounts = true;
+					break;
+				}
+
+				aux = strchr(optarg, ':');
+				if (aux == NULL)
+					goto bad_arg;
+
+				/* Split "KEY:VALUE" in place */
+				*aux = '\0';
+				if (ext_mount_add(optarg, aux + 1))
+					return 1;
+			}
+			break;
+		case 1073:
+			if (add_external(optarg))
+				return 1;
+			break;
+		case 'V':
+			pr_msg("Version: %s\n", CRIU_VERSION);
+			if (strcmp(CRIU_GITID, "0"))
+				pr_msg("GitID: %s\n", CRIU_GITID);
+			return 0;
+		case 'h':
+			usage_error = false;
+			goto usage;
+		default:
+			goto usage;
+		}
+	}
+
+	if (!opts.restore_detach && opts.restore_sibling) {
+		pr_msg("--restore-sibling only makes sense with --restore-detached\n");
+		return 1;
+	}
+
+	if (!opts.autodetect_ext_mounts && (opts.enable_external_masters || opts.enable_external_sharing)) {
+		pr_msg("must specify --ext-mount-map auto with --enable-external-{sharing|masters}\n");
+		return 1;
+	}
+
+	if (work_dir == NULL)
+		work_dir = imgs_dir;
+
+	if (optind >= argc) {
+		pr_msg("Error: command is required\n");
+		goto usage;
+	}
+
+	if (has_exec_cmd) {
+		if (argc - optind <= 1) {
+			pr_msg("Error: --exec-cmd requires a command\n");
+			goto usage;
+		}
+
+		if (strcmp(argv[optind], "restore")) {
+			pr_msg("Error: --exec-cmd is available for the restore command only\n");
+			goto usage;
+		}
+
+		if (opts.restore_detach) {
+			pr_msg("Error: --restore-detached and --exec-cmd cannot be used together\n");
+			goto usage;
+		}
+
+		/* Everything after the command word, plus a terminating NULL */
+		opts.exec_cmd = xmalloc((argc - optind) * sizeof(char *));
+		if (!opts.exec_cmd)
+			return 1;
+		memcpy(opts.exec_cmd, &argv[optind + 1], (argc - optind - 1) * sizeof(char *));
+		opts.exec_cmd[argc - optind - 1] = NULL;
+	}
+
+	/* We must not open imgs dir, if service is called */
+	if (strcmp(argv[optind], "service")) {
+		ret = open_image_dir(imgs_dir);
+		if (ret < 0)
+			return 1;
+	}
+
+	if (chdir(work_dir)) {
+		pr_perror("Can't change directory to %s", work_dir);
+		return 1;
+	}
+
+	log_set_loglevel(log_level);
+
+	if (log_init(opts.output))
+		return 1;
+
+	if (!list_empty(&opts.external) && strcmp(argv[optind], "dump")) {
+		pr_err("--external is dump-only option\n");
+		return 1;
+	}
+
+	if (!list_empty(&opts.inherit_fds)) {
+		if (strcmp(argv[optind], "restore")) {
+			pr_err("--inherit-fd is restore-only option\n");
+			return 1;
+		}
+		/* now that log file is set up, print inherit fd list */
+		inherit_fd_log();
+	}
+
+	if (opts.img_parent)
+		pr_info("Will do snapshot from %s\n", opts.img_parent);
+
+	if (!strcmp(argv[optind], "dump")) {
+		preload_socket_modules();
+
+		if (!tree_id)
+			goto opt_pid_missing;
+		return cr_dump_tasks(tree_id);
+	}
+
+	if (!strcmp(argv[optind], "pre-dump")) {
+		if (!tree_id)
+			goto opt_pid_missing;
+
+		return cr_pre_dump_tasks(tree_id) != 0;
+	}
+
+	if (!strcmp(argv[optind], "restore")) {
+		if (tree_id)
+			pr_warn("Using -t with criu restore is obsoleted\n");
+
+		ret = cr_restore_tasks();
+		if (ret == 0 && opts.exec_cmd) {
+			close_pid_proc();
+			execvp(opts.exec_cmd[0], opts.exec_cmd);
+			/* Only reached when execvp() itself failed */
+			pr_perror("Failed to exec command %s", opts.exec_cmd[0]);
+			ret = 1;
+		}
+
+		return ret != 0;
+	}
+
+	if (!strcmp(argv[optind], "show"))
+		return cr_show(pid) != 0;
+
+	if (!strcmp(argv[optind], "check"))
+		return cr_check() != 0;
+
+	if (!strcmp(argv[optind], "exec")) {
+		if (!pid)
+			pid = tree_id; /* old usage */
+		if (!pid)
+			goto opt_pid_missing;
+		return cr_exec(pid, argv + optind + 1) != 0;
+	}
+
+	if (!strcmp(argv[optind], "page-server"))
+		return cr_page_server(opts.daemon_mode, -1) > 0 ? 0 : 1;
+
+	if (!strcmp(argv[optind], "service"))
+		return cr_service(opts.daemon_mode);
+
+	if (!strcmp(argv[optind], "dedup"))
+		return cr_dedup() != 0;
+
+	if (!strcmp(argv[optind], "cpuinfo")) {
+		if (!argv[optind + 1])
+			goto usage;
+		if (!strcmp(argv[optind + 1], "dump"))
+			return cpuinfo_dump();
+		else if (!strcmp(argv[optind + 1], "check"))
+			return cpuinfo_check();
+	}
+
+	pr_msg("Error: unknown command: %s\n", argv[optind]);
+usage:
+	pr_msg("\n"
+"Usage:\n"
+" criu dump|pre-dump -t PID [<options>]\n"
+" criu restore [<options>]\n"
+" criu check [--ms]\n"
+" criu exec -p PID <syscall-string>\n"
+" criu page-server\n"
+" criu service [<options>]\n"
+" criu dedup\n"
+"\n"
+"Commands:\n"
+" dump checkpoint a process/tree identified by pid\n"
+" pre-dump pre-dump task(s) minimizing their frozen time\n"
+" restore restore a process/tree\n"
+" check checks whether the kernel support is up-to-date\n"
+" exec execute a system call by other task\n"
+" page-server launch page server\n"
+" service launch service\n"
+" dedup remove duplicates in memory dump\n"
+" cpuinfo dump writes cpu information into image file\n"
+" cpuinfo check validates cpu information read from image file\n"
+	);
+
+	if (usage_error) {
+		pr_msg("\nTry -h|--help for more info\n");
+		return 1;
+	}
+
+	pr_msg("\n"
+"Dump/Restore options:\n"
+"\n"
+"* Generic:\n"
+" -t|--tree PID checkpoint a process tree identified by PID\n"
+" -d|--restore-detached detach after restore\n"
+" -S|--restore-sibling restore root task as sibling\n"
+" -s|--leave-stopped leave tasks in stopped state after checkpoint\n"
+" -R|--leave-running leave tasks in running state after checkpoint\n"
+" -D|--images-dir DIR directory for image files\n"
+" --pidfile FILE write root task, service or page-server pid to FILE\n"
+" -W|--work-dir DIR directory to cd and write logs/pidfiles/stats to\n"
+" (if not specified, value of --images-dir is used)\n"
+" --cpu-cap [CAP] require certain cpu capability. CAP: may be one of:\n"
+" 'cpu','fpu','all','ins','none'. To disable capability, prefix it with '^'.\n"
+" --exec-cmd execute the command specified after '--' on successful\n"
+" restore making it the parent of the restored process\n"
+" --freeze-cgroup\n"
+" use cgroup freezer to collect processes\n"
+"\n"
+"* Special resources support:\n"
+" -x|--" USK_EXT_PARAM "inode,.." " allow external unix connections (optionally can be assign socket's inode that allows one-sided dump)\n"
+" --" SK_EST_PARAM " checkpoint/restore established TCP connections\n"
+" -r|--root PATH change the root filesystem (when run in mount namespace)\n"
+" --evasive-devices use any path to a device file if the original one\n"
+" is inaccessible\n"
+" --veth-pair IN=OUT map inside veth device name to outside one\n"
+" can optionally append @<bridge-name> to OUT for moving\n"
+" the outside veth to the named bridge\n"
+" --link-remap allow one to link unlinked files back when possible\n"
+" --ghost-limit size specify maximum size of deleted file contents to be carried inside an image file\n"
+" --action-script FILE add an external action script\n"
+" -j|--" OPT_SHELL_JOB " allow one to dump and restore shell jobs\n"
+" -l|--" OPT_FILE_LOCKS " handle file locks, for safety, only used for container\n"
+" -L|--libdir path to a plugin directory (by default " CR_PLUGIN_DEFAULT ")\n"
+" --force-irmap force resolving names for inotify/fsnotify watches\n"
+" --irmap-scan-path FILE\n"
+" add a path the irmap hints to scan\n"
+" -M|--ext-mount-map KEY:VALUE\n"
+" add external mount mapping\n"
+" -M|--ext-mount-map auto\n"
+" attempt to autodetect external mount mapings\n"
+" --enable-external-sharing\n"
+" allow autoresolving mounts with external sharing\n"
+" --enable-external-masters\n"
+" allow autoresolving mounts with external masters\n"
+" --manage-cgroups [m] dump or restore cgroups the process is in usig mode:\n"
+" 'none', 'props', 'soft' (default), 'full' and 'strict'.\n"
+" --cgroup-root [controller:]/newroot\n"
+" change the root cgroup the controller will be\n"
+" installed into. No controller means that root is the\n"
+" default for all controllers not specified.\n"
+" --skip-mnt PATH ignore this mountpoint when dumping the mount namespace.\n"
+" --enable-fs FSNAMES a comma separated list of filesystem names or \"all\".\n"
+" force criu to (try to) dump/restore these filesystem's\n"
+" mountpoints even if fs is not supported.\n"
+" --external RES dump objects from this list as external resources:\n"
+" Formats of RES:\n"
+" tty[rdev:dev]\n"
+" --inherit-fd fd[<num>]:<existing>\n"
+" Inherit file descriptors. This allows to treat file descriptor\n"
+" <num> as being already opened via <existing> one and instead of\n"
+" trying to open we inherit it:\n"
+" tty[rdev:dev]\n"
+" pipe[inode]\n"
+" socket[inode]\n"
+"\n"
+"* Logging:\n"
+" -o|--log-file FILE log file name\n"
+" --log-pid enable per-process logging to separate FILE.pid files\n"
+" -v[NUM] set logging level (higher level means more output):\n"
+" -v1|-v - only errors and messages\n"
+" -v2|-vv - also warnings (default level)\n"
+" -v3|-vvv - also information messages and timestamps\n"
+" -v4|-vvvv - lots of debug\n"
+"\n"
+"* Memory dumping options:\n"
+" --track-mem turn on memory changes tracker in kernel\n"
+" --prev-images-dir DIR path to images from previous dump (relative to -D)\n"
+" --page-server send pages to page server (see options below as well)\n"
+" --auto-dedup when used on dump it will deduplicate \"old\" data in\n"
+" pages images of previous dump\n"
+" when used on restore, as soon as page is restored, it\n"
+" will be punched from the image.\n"
+"\n"
+"Page/Service server options:\n"
+" --address ADDR address of server or service\n"
+" --port PORT port of page server\n"
+" -d|--daemon run in the background after creating socket\n"
+"\n"
+"Other options:\n"
+" -h|--help show this text\n"
+" -V|--version show version\n"
+" --ms don't check not yet merged kernel features\n"
+	);
+
+	return 0;
+
+opt_pid_missing:
+	pr_msg("Error: pid not specified\n");
+	return 1;
+
+bad_arg:
+	if (idx < 0) /* short option */
+		pr_msg("Error: invalid argument for -%c: %s\n",
+				opt, optarg);
+	else /* long option */
+		pr_msg("Error: invalid argument for --%s: %s\n",
+				long_opts[idx].name, optarg);
+	return 1;
+}
diff --git a/criu/eventfd.c b/criu/eventfd.c
new file mode 100644
index 000000000000..21b5c9d7b6d8
--- /dev/null
+++ b/criu/eventfd.c
@@ -0,0 +1,129 @@
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include <sys/stat.h>
+#include <sys/statfs.h>
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <sys/eventfd.h>
+
+#include "compiler.h"
+#include "asm/types.h"
+#include "imgset.h"
+#include "eventfd.h"
+#include "proc_parse.h"
+#include "image.h"
+#include "util.h"
+#include "log.h"
+
+#include "protobuf.h"
+#include "protobuf/eventfd.pb-c.h"
+
+#undef LOG_PREFIX
+#define LOG_PREFIX "eventfd: "
+
+/* Restore-side record of one eventfd: the decoded image entry plus the file descriptor object registered for it. */
+struct eventfd_file_info {
+ EventfdFileEntry *efe;
+ struct file_desc d;
+};
+
+/* Checks whether @link is an anonymous link of the "[eventfd]" type, as decided by is_anon_link_type() */
+int is_eventfd_link(char *link)
+{
+ return is_anon_link_type(link, "[eventfd]");
+}
+
+/* Log the id, flags and counter of @efe at info level, prefixed with @action. */
+static void pr_info_eventfd(char *action, EventfdFileEntry *efe)
+{
+ pr_info("%s: id %#08x flags %#04x counter %#016"PRIx64"\n",
+ action, efe->id, efe->flags, efe->counter);
+}
+
+/*
+ * Context handed to dump_eventfd_entry(): the id and fd parameters to
+ * store in the entry, and a flag set once an entry has been written.
+ */
+struct eventfd_dump_arg {
+ u32 id;
+ const struct fd_parms *p;
+ bool dumped;
+};
+
+/*
+ * parse_fdinfo() callback for eventfd descriptors.
+ *
+ * Completes the parsed entry @e with the id, flags and owner taken from
+ * the struct eventfd_dump_arg in @arg and writes it to the eventfd image.
+ * Only one entry per file is accepted: a second call with the same @arg
+ * fails with -1. Otherwise the result of pb_write_one() is returned.
+ */
+static int dump_eventfd_entry(union fdinfo_entries *e, void *arg)
+{
+	struct eventfd_dump_arg *da = arg;
+	EventfdFileEntry *efe = &e->efd;
+
+	if (da->dumped) {
+		pr_err("Several counters in a file?\n");
+		return -1;
+	}
+	da->dumped = true;
+
+	efe->id = da->id;
+	efe->flags = da->p->flags;
+	/* The const qualifier of the fd parameters is cast away here */
+	efe->fown = (FownEntry *)&da->p->fown;
+
+	pr_info_eventfd("Dumping ", efe);
+
+	return pb_write_one(img_from_set(glob_imgset, CR_FD_EVENTFD_FILE),
+			    efe, PB_EVENTFD_FILE);
+}
+
+/*
+ * Dump the eventfd open at @lfd: parse its fdinfo as FD_TYPES__EVENTFD and
+ * let dump_eventfd_entry() write the entry using @id and @p.
+ * Returns the result of parse_fdinfo().
+ */
+static int dump_one_eventfd(int lfd, u32 id, const struct fd_parms *p)
+{
+ struct eventfd_dump_arg da = { .id = id, .p = p, };
+ return parse_fdinfo(lfd, FD_TYPES__EVENTFD, dump_eventfd_entry, &da);
+}
+
+/* Dump operations for FD_TYPES__EVENTFD descriptors; .dump is dump_one_eventfd(). */
+const struct fdtype_ops eventfd_dump_ops = {
+ .type = FD_TYPES__EVENTFD,
+ .dump = dump_one_eventfd,
+};
+
+/*
+ * Open callback for eventfd descriptors.
+ *
+ * Creates an eventfd whose initial counter is the one saved in the image
+ * entry, then applies the saved owner and flags with rst_file_params().
+ *
+ * Returns the new descriptor, or -1 on failure (the descriptor is closed
+ * if it had already been created).
+ */
+static int eventfd_open(struct file_desc *d)
+{
+	struct eventfd_file_info *info = container_of(d, struct eventfd_file_info, d);
+	int fd;
+
+	fd = eventfd(info->efe->counter, 0);
+	if (fd < 0) {
+		pr_perror("Can't create eventfd %#08x",
+			  info->efe->id);
+		return -1;
+	}
+
+	if (!rst_file_params(fd, info->efe->fown, info->efe->flags))
+		return fd;
+
+	pr_perror("Can't restore params on eventfd %#08x",
+		  info->efe->id);
+	close(fd);
+	return -1;
+}
+
+/* Restore operations for FD_TYPES__EVENTFD descriptors; .open is eventfd_open(). */
+static struct file_desc_ops eventfd_desc_ops = {
+ .type = FD_TYPES__EVENTFD,
+ .open = eventfd_open,
+};
+
+/*
+ * Collect callback for eventfd image entries: remember the decoded @msg in
+ * the struct eventfd_file_info at @obj and register its descriptor under
+ * the entry id. Returns the result of file_desc_add().
+ */
+static int collect_one_efd(void *obj, ProtobufCMessage *msg)
+{
+ struct eventfd_file_info *info = obj;
+
+ info->efe = pb_msg(msg, EventfdFileEntry);
+ pr_info_eventfd("Collected ", info->efe);
+ return file_desc_add(&info->d, info->efe->id, &eventfd_desc_ops);
+}
+
+/*
+ * Collection descriptor for the CR_FD_EVENTFD_FILE image: entries are
+ * PB_EVENTFD_FILE messages handled by collect_one_efd().
+ * NOTE(review): priv_size is presumably the size of the per-entry object
+ * passed to .collect as its first argument - confirm against the collector.
+ */
+struct collect_image_info eventfd_cinfo = {
+ .fd_type = CR_FD_EVENTFD_FILE,
+ .pb_type = PB_EVENTFD_FILE,
+ .priv_size = sizeof(struct eventfd_file_info),
+ .collect = collect_one_efd,
+};
diff --git a/criu/eventpoll.c b/criu/eventpoll.c
new file mode 100644
index 000000000000..c414c35b9923
--- /dev/null
+++ b/criu/eventpoll.c
@@ -0,0 +1,229 @@
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include <sys/stat.h>
+#include <sys/statfs.h>
+#include <sys/types.h>
+#include <sys/ioctl.h>
+#include <sys/epoll.h>
+
+#include "compiler.h"
+#include "asm/types.h"
+#include "imgset.h"
+#include "rst_info.h"
+#include "eventpoll.h"
+#include "proc_parse.h"
+#include "image.h"
+#include "util.h"
+#include "log.h"
+
+#include "protobuf.h"
+#include "protobuf/eventpoll.pb-c.h"
+
+#undef LOG_PREFIX
+#define LOG_PREFIX "epoll: "
+
+/* Restore-side record of one epoll file: the decoded image entry plus the file descriptor object registered for it. */
+struct eventpoll_file_info {
+ EventpollFileEntry *efe;
+ struct file_desc d;
+};
+
+/* One target-fd entry collected from the CR_FD_EVENTPOLL_TFD image, linked on eventpoll_tfds. */
+struct eventpoll_tfd_file_info {
+ EventpollTfdEntry *tdefe;
+ struct list_head list;
+};
+
+/* All entries added by collect_one_epoll_tfd(); searched by epoll id in eventpoll_post_open(). */
+static LIST_HEAD(eventpoll_tfds);
+
+/* Checks whether @link is an anonymous link of the "[eventpoll]" type, as decided by is_anon_link_type() */
+int is_eventpoll_link(char *link)
+{
+ return is_anon_link_type(link, "[eventpoll]");
+}
+
+/* Log the id, target fd, events and data of @e at info level, prefixed with @action. */
+static void pr_info_eventpoll_tfd(char *action, EventpollTfdEntry *e)
+{
+ pr_info("%seventpoll-tfd: id %#08x tfd %#08x events %#08x data %#016"PRIx64"\n",
+ action, e->id, e->tfd, e->events, e->data);
+}
+
+/* Log the id and flags of @e at info level, prefixed with @action. */
+static void pr_info_eventpoll(char *action, EventpollFileEntry *e)
+{
+ pr_info("%seventpoll: id %#08x flags %#04x\n", action, e->id, e->flags);
+}
+
+/* Accumulator filled by dump_eventpoll_entry(): the parsed entries and how many of them there are. */
+struct eventpoll_list {
+ struct list_head list;
+ int n;
+};
+
+/*
+ * parse_fdinfo() callback for epoll descriptors: log the parsed target
+ * entry and append @e to the struct eventpoll_list passed in @arg,
+ * bumping its counter. Always returns 0.
+ */
+static int dump_eventpoll_entry(union fdinfo_entries *e, void *arg)
+{
+	struct eventpoll_list *ep_list = arg;
+
+	pr_info_eventpoll_tfd("Dumping: ", &e->epl.e);
+
+	list_add_tail(&e->epl.node, &ep_list->list);
+	ep_list->n += 1;
+
+	return 0;
+}
+
+/*
+ * Dump the epoll file open at @lfd.
+ *
+ * The target descriptors are gathered from fdinfo into a temporary list,
+ * referenced from the tfd array of the file entry and written out together
+ * with the entry id, flags and owner in a single PB_EVENTPOLL_FILE message.
+ *
+ * Returns the result of pb_write_one(), or -1 if fdinfo parsing or the
+ * array allocation fails. The temporary array and all parsed entries are
+ * released on every path.
+ */
+static int dump_one_eventpoll(int lfd, u32 id, const struct fd_parms *p)
+{
+	EventpollFileEntry e = EVENTPOLL_FILE_ENTRY__INIT;
+	struct eventpoll_list ep_list = {LIST_HEAD_INIT(ep_list.list), 0};
+	union fdinfo_entries *te, *tmp;
+	int i, ret = -1;
+
+	e.id = id;
+	e.flags = p->flags;
+	e.fown = (FownEntry *)&p->fown;
+
+	if (parse_fdinfo(lfd, FD_TYPES__EVENTPOLL, dump_eventpoll_entry, &ep_list))
+		goto out;
+
+	e.tfd = xmalloc(sizeof(*e.tfd) * ep_list.n);
+	if (!e.tfd)
+		goto out;
+
+	/* The array only points at the list entries, it does not own them */
+	i = 0;
+	list_for_each_entry(te, &ep_list.list, epl.node)
+		e.tfd[i++] = &te->epl.e;
+	e.n_tfd = ep_list.n;
+
+	pr_info_eventpoll("Dumping ", &e);
+	ret = pb_write_one(img_from_set(glob_imgset, CR_FD_EVENTPOLL_FILE),
+			   &e, PB_EVENTPOLL_FILE);
+out:
+	/* e.tfd is still NULL from the initializer if nothing was allocated */
+	xfree(e.tfd);
+
+	list_for_each_entry_safe(te, tmp, &ep_list.list, epl.node)
+		free_event_poll_entry(te);
+
+	return ret;
+}
+
+/* Dump operations for FD_TYPES__EVENTPOLL descriptors; .dump is dump_one_eventpoll(). */
+const struct fdtype_ops eventpoll_dump_ops = {
+ .type = FD_TYPES__EVENTPOLL,
+ .dump = dump_one_eventpoll,
+};
+
+/*
+ * Open callback for epoll descriptors.
+ *
+ * Creates an empty epoll instance and applies the saved owner and flags
+ * with rst_file_params(). Target descriptors are not added here, see
+ * eventpoll_post_open().
+ *
+ * Returns the new descriptor, or -1 on failure (the descriptor is closed
+ * if it had already been created).
+ */
+static int eventpoll_open(struct file_desc *d)
+{
+	struct eventpoll_file_info *info = container_of(d, struct eventpoll_file_info, d);
+	int epfd;
+
+	pr_info_eventpoll("Restore ", info->efe);
+
+	epfd = epoll_create(1);
+	if (epfd < 0) {
+		pr_perror("Can't create epoll %#08x",
+			  info->efe->id);
+		return -1;
+	}
+
+	if (!rst_file_params(epfd, info->efe->fown, info->efe->flags))
+		return epfd;
+
+	pr_perror("Can't restore file params on epoll %#08x",
+		  info->efe->id);
+	close(epfd);
+	return -1;
+}
+/*
+ * Add the target descriptor described by @tdefe to the epoll instance @fd
+ * with the saved event mask and user data. @id is only used in the error
+ * message. Returns 0 on success, -1 if epoll_ctl() fails.
+ */
+static int eventpoll_retore_tfd(int fd, int id, EventpollTfdEntry *tdefe)
+{
+	struct epoll_event ev;
+
+	pr_info_eventpoll_tfd("Restore ", tdefe);
+
+	ev.events = tdefe->events;
+	ev.data.u64 = tdefe->data;
+
+	if (epoll_ctl(fd, EPOLL_CTL_ADD, tdefe->tfd, &ev) == 0)
+		return 0;
+
+	pr_perror("Can't add event on %#08x", id);
+	return -1;
+}
+
+/*
+ * Post-open callback for epoll descriptors: populate the epoll instance
+ * @fd with its target descriptors.
+ *
+ * Targets come from two places: the tfd array embedded in the file entry,
+ * and the separately collected eventpoll_tfds entries whose id matches
+ * this epoll. Returns 0 on success, -1 as soon as one target cannot be
+ * added.
+ */
+static int eventpoll_post_open(struct file_desc *d, int fd)
+{
+	struct eventpoll_file_info *info = container_of(d, struct eventpoll_file_info, d);
+	struct eventpoll_tfd_file_info *td_info;
+	int i;
+
+	for (i = 0; i < info->efe->n_tfd; i++)
+		if (eventpoll_retore_tfd(fd, info->efe->id, info->efe->tfd[i]))
+			return -1;
+
+	list_for_each_entry(td_info, &eventpoll_tfds, list) {
+		if (td_info->tdefe->id == info->efe->id &&
+		    eventpoll_retore_tfd(fd, info->efe->id, td_info->tdefe))
+			return -1;
+	}
+
+	return 0;
+}
+
+/* collect_fd callback: queue @fle at the tail of the eventpoll list of @ri. @d is unused. */
+static void eventpoll_collect_fd(struct file_desc *d,
+ struct fdinfo_list_entry *fle, struct rst_info *ri)
+{
+ list_add_tail(&fle->ps_list, &ri->eventpoll);
+}
+
+/* Restore operations for FD_TYPES__EVENTPOLL descriptors: open, post-open and fd collection callbacks. */
+static struct file_desc_ops desc_ops = {
+ .type = FD_TYPES__EVENTPOLL,
+ .open = eventpoll_open,
+ .post_open = eventpoll_post_open,
+ .collect_fd = eventpoll_collect_fd,
+};
+
+/*
+ * Collect callback for standalone epoll target entries: remember the
+ * decoded @msg in the struct eventpoll_tfd_file_info at @o and put it at
+ * the head of eventpoll_tfds. Always returns 0.
+ */
+static int collect_one_epoll_tfd(void *o, ProtobufCMessage *msg)
+{
+ struct eventpoll_tfd_file_info *info = o;
+
+ info->tdefe = pb_msg(msg, EventpollTfdEntry);
+ list_add(&info->list, &eventpoll_tfds);
+ pr_info_eventpoll_tfd("Collected ", info->tdefe);
+
+ return 0;
+}
+
+/*
+ * Collection descriptor for the CR_FD_EVENTPOLL_TFD image: entries are
+ * PB_EVENTPOLL_TFD messages handled by collect_one_epoll_tfd().
+ * NOTE(review): priv_size is presumably the size of the per-entry object
+ * passed to .collect as its first argument - confirm against the collector.
+ */
+struct collect_image_info epoll_tfd_cinfo = {
+ .fd_type = CR_FD_EVENTPOLL_TFD,
+ .pb_type = PB_EVENTPOLL_TFD,
+ .priv_size = sizeof(struct eventpoll_tfd_file_info),
+ .collect = collect_one_epoll_tfd,
+};
+
+/*
+ * Collect callback for epoll file entries: remember the decoded @msg in
+ * the struct eventpoll_file_info at @o and register its descriptor under
+ * the entry id. Returns the result of file_desc_add().
+ */
+static int collect_one_epoll(void *o, ProtobufCMessage *msg)
+{
+ struct eventpoll_file_info *info = o;
+
+ info->efe = pb_msg(msg, EventpollFileEntry);
+ pr_info_eventpoll("Collected ", info->efe);
+ return file_desc_add(&info->d, info->efe->id, &desc_ops);
+}
+
+/*
+ * Collection descriptor for the CR_FD_EVENTPOLL_FILE image: entries are
+ * PB_EVENTPOLL_FILE messages handled by collect_one_epoll().
+ * NOTE(review): priv_size is presumably the size of the per-entry object
+ * passed to .collect as its first argument - confirm against the collector.
+ */
+struct collect_image_info epoll_cinfo = {
+ .fd_type = CR_FD_EVENTPOLL_FILE,
+ .pb_type = PB_EVENTPOLL_FILE,
+ .priv_size = sizeof(struct eventpoll_file_info),
+ .collect = collect_one_epoll,
+};
diff --git a/criu/fault-injection.c b/criu/fault-injection.c
new file mode 100644
index 000000000000..f239fd9db649
--- /dev/null
+++ b/criu/fault-injection.c
@@ -0,0 +1,22 @@
+#include <stdlib.h>
+#include "fault-injection.h"
+
+/* Selected fault-injection strategy; assigned by fault_injection_init() from the CRIU_FAULT environment variable. */
+enum faults fi_strategy;
+
+/*
+ * Pick the fault-injection strategy from the CRIU_FAULT environment
+ * variable.
+ *
+ * Returns 0 when the variable is unset (fi_strategy is left untouched) or
+ * when atoi() of its value lies strictly between 0 and FI_MAX, in which
+ * case the value is stored in fi_strategy. Returns -1 for any other value.
+ */
+int fault_injection_init(void)
+{
+	char *val;
+	int strat;
+
+	val = getenv("CRIU_FAULT");
+	if (val == NULL)
+		return 0;
+
+	strat = atoi(val);
+
+	if (strat <= 0 || strat >= FI_MAX)
+		return -1;
+
+	fi_strategy = strat;
+	return 0;
+}
diff --git a/criu/fifo.c b/criu/fifo.c
new file mode 100644
index 000000000000..bd06da9c16e7
--- /dev/null
+++ b/criu/fifo.c
@@ -0,0 +1,168 @@
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdlib.h>
+
+#include "imgset.h"
+#include "image.h"
+#include "files.h"
+#include "files-reg.h"
+#include "pipes.h"
+
+#include "fifo.h"
+
+#include "protobuf.h"
+#include "protobuf/regfile.pb-c.h"
+#include "protobuf/fifo.pb-c.h"
+
+/*
+ * FIFO checkpoint and restore is done in a bit unusual manner.
+ * We use files-reg.c engine to save fifo path and flags,
+ * thus regular files image will contain fifo descriptors which
+ * are useless for reg-files engine itself but needed for our fifo
+ * engine.
+ *
+ * In particular we dump fifo-entry automatically and appropriate
+ * reg-file entry manually, thus on restore we need to ask reg-file
+ * engine to restore fifo path and flags via direct call.
+ */
+
+/* Restore-time state for one fifo entry read from the fifo image. */
+struct fifo_info {
+ struct list_head list; /* link in fifo_head; only linked when restore_data is set */
+ struct file_desc d; /* generic descriptor registered via file_desc_add() */
+ FifoEntry *fe; /* decoded image entry (id, pipe_id) */
+ bool restore_data; /* true for the first entry collected with a given pipe_id */
+ struct file_desc *reg_d; /* matching reg-file descriptor, set in collect_fifo_fd() */
+};
+
+/* Entries that restore fifo data, one per pipe_id (filled by collect_one_fifo()). */
+static LIST_HEAD(fifo_head);
+/* Dump-side pipe data state, written to the CR_FD_FIFO_DATA image. */
+static struct pipe_data_dump pd_fifo = { .img_type = CR_FD_FIFO_DATA, };
+
+/*
+ * Dump callback for fifo descriptors (see fifo_dump_ops).
+ *
+ * Dumps the descriptor through dump_one_reg_file() first, then writes a
+ * FifoEntry carrying @id and the pipe id to the CR_FD_FIFO image, and
+ * finally hands over to dump_one_pipe_data().
+ *
+ * Returns -1 if either of the first two steps fails, otherwise the
+ * result of dump_one_pipe_data().
+ */
+static int dump_one_fifo(int lfd, u32 id, const struct fd_parms *p)
+{
+ struct cr_img *img = img_from_set(glob_imgset, CR_FD_FIFO);
+ FifoEntry e = FIFO_ENTRY__INIT;
+
+ /*
+ * It's a trick here, we use regular files dumping
+ * code to save path to a fifo, then we reuse it
+ * on restore.
+ */
+ if (dump_one_reg_file(lfd, id, p))
+ return -1;
+
+ pr_info("Dumping fifo %d with id %#x pipe_id %#x\n",
+ lfd, id, pipe_id(p));
+
+ e.id = id;
+ e.pipe_id = pipe_id(p);
+
+ if (pb_write_one(img, &e, PB_FIFO))
+ return -1;
+
+ return dump_one_pipe_data(&pd_fifo, lfd, p);
+}
+
+/* Dump-side operations for FD_TYPES__FIFO descriptors. */
+const struct fdtype_ops fifo_dump_ops = {
+ .type = FD_TYPES__FIFO,
+ .dump = dump_one_fifo,
+};
+
+/* Restore-side table of fifo data, filled by collect_fifo() and read in do_open_fifo(). */
+static struct pipe_data_rst *pd_hash_fifo[PIPE_DATA_HASH_SIZE];
+
+/*
+ * open_path() callback that opens one fifo during restore.
+ *
+ * @arg is the struct fifo_info being restored. Returns the new
+ * descriptor, or a negative value on failure.
+ */
+static int do_open_fifo(int ns_root_fd, struct reg_file_info *rfi, void *arg)
+{
+	struct fifo_info *info = arg;
+	int keeper, fd;
+
+	/*
+	 * Opening a fifo (other than read-write) blocks until the peer
+	 * end shows up. A temporary O_RDWR descriptor is opened first so
+	 * that the real open below does not stall the restore.
+	 */
+	keeper = openat(ns_root_fd, rfi->path, O_RDWR);
+	if (keeper < 0) {
+		pr_perror("Can't open fake fifo %#x [%s]", info->fe->id, rfi->path);
+		return -1;
+	}
+
+	fd = openat(ns_root_fd, rfi->path, rfi->rfe->flags);
+	if (fd < 0) {
+		pr_perror("Can't open fifo %#x [%s]", info->fe->id, rfi->path);
+	} else if (info->restore_data &&
+		   restore_pipe_data(CR_FD_FIFO_DATA, keeper,
+				     info->fe->pipe_id, pd_hash_fifo)) {
+		/* Data is pushed through the temporary descriptor. */
+		close(fd);
+		fd = -1;
+	}
+
+	close(keeper);
+	return fd;
+}
+
+/*
+ * .open callback for fifo descriptors: opens the fifo through the
+ * reg-file descriptor saved in reg_d, with do_open_fifo() as the opener.
+ */
+static int open_fifo_fd(struct file_desc *d)
+{
+ struct fifo_info *info = container_of(d, struct fifo_info, d);
+
+ return open_path(info->reg_d, do_open_fifo, info);
+}
+
+/*
+ * .collect_fd callback for fifo descriptors: looks up the special file
+ * with the same id as the fifo entry (it must exist, otherwise BUG) and
+ * remembers it in reg_d, then collects the fd the generic way.
+ */
+static void collect_fifo_fd(struct file_desc *d,
+ struct fdinfo_list_entry *fle, struct rst_info *ri)
+{
+ struct fifo_info *info;
+
+ info = container_of(d, struct fifo_info, d);
+ info->reg_d = collect_special_file(info->fe->id);
+ BUG_ON(info->reg_d == NULL);
+ collect_gen_fd(fle, ri);
+}
+
+/* Restore-side callbacks registered for FD_TYPES__FIFO descriptors. */
+static struct file_desc_ops fifo_desc_ops = {
+ .type = FD_TYPES__FIFO,
+ .open = open_fifo_fd,
+ .collect_fd = collect_fifo_fd,
+};
+
+/*
+ * Collect callback for one FifoEntry image record (see fifo_cinfo).
+ *
+ * The first entry seen for a given pipe_id is put on fifo_head and
+ * marked as the one restoring the fifo data; later entries with the same
+ * pipe_id stay off the list. The descriptor is then registered with
+ * fifo_desc_ops. Returns whatever file_desc_add() returns.
+ */
+static int collect_one_fifo(void *o, ProtobufCMessage *base)
+{
+	struct fifo_info *info = o, *cur;
+	bool seen = false;
+
+	info->fe = pb_msg(base, FifoEntry);
+	pr_info("Collected fifo entry ID %#x PIPE ID %#x\n",
+		info->fe->id, info->fe->pipe_id);
+
+	/* Has somebody already claimed the data of this pipe? */
+	list_for_each_entry(cur, &fifo_head, list) {
+		if (cur->fe->pipe_id == info->fe->pipe_id) {
+			seen = true;
+			break;
+		}
+	}
+
+	if (seen)
+		INIT_LIST_HEAD(&info->list);
+	else
+		list_add(&info->list, &fifo_head);
+	info->restore_data = !seen;
+
+	return file_desc_add(&info->d, info->fe->id, &fifo_desc_ops);
+}
+
+/*
+ * Collect descriptor for the CR_FD_FIFO image: each record gets a
+ * struct fifo_info and is passed to collect_one_fifo().
+ */
+struct collect_image_info fifo_cinfo = {
+ .fd_type = CR_FD_FIFO,
+ .pb_type = PB_FIFO,
+ .priv_size = sizeof(struct fifo_info),
+ .collect = collect_one_fifo,
+};
+
+/*
+ * Collect the CR_FD_FIFO_DATA image into pd_hash_fifo.
+ * Returns whatever collect_pipe_data() returns.
+ */
+int collect_fifo(void)
+{
+ return collect_pipe_data(CR_FD_FIFO_DATA, pd_hash_fifo);
+}
diff --git a/criu/file-ids.c b/criu/file-ids.c
new file mode 100644
index 000000000000..f23924a0516b
--- /dev/null
+++ b/criu/file-ids.c
@@ -0,0 +1,113 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <signal.h>
+#include <limits.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include "asm/types.h"
+#include "file-ids.h"
+#include "rbtree.h"
+#include "kcmp-ids.h"
+#include "compiler.h"
+#include "image.h"
+#include "util.h"
+#include "irmap.h"
+#include "files.h"
+
+/* kcmp-based id tree for files; also the source of "special" ids (subid). */
+static DECLARE_KCMP_TREE(fd_tree, KCMP_FILE);
+
+/* The id cache below is a chained hash table with 2^FDID_BITS (32) buckets. */
+#define FDID_BITS 5
+#define FDID_SIZE (1 << FDID_BITS)
+#define FDID_MASK (FDID_SIZE - 1)
+
+/* Bucket index for a (device, inode) pair: low FDID_BITS bits of their sum. */
+static inline int fdid_hashfn(unsigned int s_dev, unsigned long i_ino)
+{
+ return (s_dev + i_ino) & FDID_MASK;
+}
+
+/* One cached file id, keyed by (mnt_id, dev, ino). */
+struct fd_id {
+ int mnt_id;
+ unsigned int dev;
+ unsigned long ino;
+ u32 id; /* id handed out for this file */
+ struct fd_id *n; /* next entry in the same hash bucket */
+};
+
+/* Hash buckets, indexed by fdid_hashfn(); entries are never freed in this file. */
+static struct fd_id *fd_id_cache[FDID_SIZE];
+
+/*
+ * Remember that the file described by @p got id @id.
+ *
+ * Best effort: if the allocation fails the id is simply not cached and
+ * nothing is reported to the caller.
+ */
+static void fd_id_cache_one(u32 id, struct fd_parms *p)
+{
+	struct fd_id *ent;
+	unsigned bucket;
+
+	ent = xmalloc(sizeof(*ent));
+	if (!ent)
+		return;
+
+	ent->dev = p->stat.st_dev;
+	ent->ino = p->stat.st_ino;
+	ent->mnt_id = p->mnt_id;
+	ent->id = id;
+
+	/* Push onto the head of the bucket's chain. */
+	bucket = fdid_hashfn(p->stat.st_dev, p->stat.st_ino);
+	ent->n = fd_id_cache[bucket];
+	fd_id_cache[bucket] = ent;
+}
+
+/*
+ * Find the cached entry whose device, inode and mount id all match @p.
+ * Returns NULL when there is none.
+ */
+static struct fd_id *fd_id_cache_lookup(struct fd_parms *p)
+{
+	struct stat *st = &p->stat;
+	struct fd_id *cur;
+
+	cur = fd_id_cache[fdid_hashfn(st->st_dev, st->st_ino)];
+	while (cur) {
+		if (cur->dev == st->st_dev &&
+		    cur->ino == st->st_ino &&
+		    cur->mnt_id == p->mnt_id)
+			return cur;
+		cur = cur->n;
+	}
+
+	return NULL;
+}
+
+/*
+ * Produce an id for a file without going through kcmp.
+ *
+ * With @p given, a previously cached id for the same file is reused and
+ * 0 is returned. Otherwise a fresh id is taken from fd_tree.subid,
+ * cached when @p is given, and 1 is returned. @p may be NULL, in which
+ * case a fresh id is always generated and nothing is cached.
+ */
+int fd_id_generate_special(struct fd_parms *p, u32 *id)
+{
+	struct fd_id *hit;
+
+	if (!p) {
+		*id = fd_tree.subid++;
+		return 1;
+	}
+
+	hit = fd_id_cache_lookup(p);
+	if (hit) {
+		*id = hit->id;
+		return 0;
+	}
+
+	*id = fd_tree.subid++;
+	fd_id_cache_one(*id, p);
+	return 1;
+}
+
+/*
+ * Generate an id for the descriptor @fe of task @pid via the kcmp tree.
+ *
+ * On entry fe->id is used as the generation id and fe->fd as the index of
+ * the lookup key; on success fe->id is overwritten with the resulting id.
+ *
+ * Returns -ENOMEM when kid_generate_gen() yields 0, otherwise the new_id
+ * flag set by kid_generate_gen() (non-zero also makes the id get cached).
+ */
+int fd_id_generate(pid_t pid, FdinfoEntry *fe, struct fd_parms *p)
+{
+ u32 id;
+ struct kid_elem e;
+ int new_id = 0;
+
+ e.pid = pid;
+ e.genid = fe->id;
+ e.idx = fe->fd;
+
+ id = kid_generate_gen(&fd_tree, &e, &new_id);
+ if (!id)
+ return -ENOMEM;
+
+ if (new_id)
+ fd_id_cache_one(id, p);
+
+ fe->id = id;
+ return new_id;
+}
diff --git a/criu/file-lock.c b/criu/file-lock.c
new file mode 100644
index 000000000000..8e4e48192ecf
--- /dev/null
+++ b/criu/file-lock.c
@@ -0,0 +1,377 @@
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/file.h>
+#include <fcntl.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include "cr_options.h"
+#include "imgset.h"
+#include "files.h"
+#include "fs-magic.h"
+#include "kerndat.h"
+#include "image.h"
+#include "mount.h"
+#include "proc_parse.h"
+#include "servicefd.h"
+#include "file-lock.h"
+#include "parasite.h"
+#include "parasite-syscall.h"
+
+/* Restore-side wrapper around one FileLockEntry read from the image. */
+struct file_lock_rst {
+ FileLockEntry *fle;
+ struct list_head l; /* link in file_lock_list */
+};
+
+/*
+ * Holds struct file_lock entries (linked via ->list) in the dump code
+ * below and struct file_lock_rst entries (linked via ->l) in the restore
+ * code below.
+ */
+struct list_head file_lock_list = LIST_HEAD_INIT(file_lock_list);
+
+/*
+ * Collect callback for one FileLockEntry image record (see
+ * file_locks_cinfo): stores the decoded message in @o and appends it to
+ * file_lock_list. Always returns 0.
+ */
+static int collect_one_file_lock(void *o, ProtobufCMessage *m)
+{
+ struct file_lock_rst *lr = o;
+
+ lr->fle = pb_msg(m, FileLockEntry);
+ list_add_tail(&lr->l, &file_lock_list);
+
+ return 0;
+}
+
+/*
+ * Collect descriptor for the CR_FD_FILE_LOCKS image: each record gets a
+ * struct file_lock_rst and is passed to collect_one_file_lock().
+ * prepare_file_locks() tests its COLLECT_HAPPENED flag.
+ */
+struct collect_image_info file_locks_cinfo = {
+ .fd_type = CR_FD_FILE_LOCKS,
+ .pb_type = PB_FILE_LOCK,
+ .priv_size = sizeof(struct file_lock_rst),
+ .collect = collect_one_file_lock,
+};
+
+/*
+ * Allocate a zeroed struct file_lock with an empty list link and with
+ * real_owner and owners_fd set to -1 ("not resolved yet").
+ * Returns NULL if the allocation fails.
+ */
+struct file_lock *alloc_file_lock(void)
+{
+ struct file_lock *flock;
+
+ flock = xzalloc(sizeof(*flock));
+ if (!flock)
+ return NULL;
+
+ INIT_LIST_HEAD(&flock->list);
+ flock->real_owner = -1;
+ flock->owners_fd = -1;
+
+ return flock;
+}
+
+/*
+ * Free every struct file_lock on file_lock_list and reset the list head.
+ * Entries are freed without being unlinked one by one; the head is simply
+ * re-initialized afterwards.
+ */
+void free_file_locks(void)
+{
+ struct file_lock *flock, *tmp;
+
+ list_for_each_entry_safe(flock, tmp, &file_lock_list, list) {
+ xfree(flock);
+ }
+
+ INIT_LIST_HEAD(&file_lock_list);
+}
+
+/*
+ * Log one lock entry and write it to the CR_FD_FILE_LOCKS image.
+ * Returns whatever pb_write_one() returns.
+ */
+static int dump_one_file_lock(FileLockEntry *fle)
+{
+ pr_info("LOCK flag: %d,type: %d,pid: %d,fd: %d,start: %8"PRIx64",len: %8"PRIx64"\n",
+ fle->flag, fle->type, fle->pid, fle->fd, fle->start, fle->len);
+
+ return pb_write_one(img_from_set(glob_imgset, CR_FD_FILE_LOCKS),
+ fle, PB_FILE_LOCK);
+}
+
+/* OR the lock kind into fle->flag and set fle->type to the lock type. */
+static void fill_flock_entry(FileLockEntry *fle, int fl_kind, int fl_ltype)
+{
+ fle->flag |= fl_kind;
+ fle->type = fl_ltype;
+}
+
+/*
+ * Write every resolved lock from file_lock_list to the image.
+ *
+ * Locks without a resolved owner (real_owner == -1) are skipped, except
+ * POSIX ones, which make the dump fail. An end of "EOF" is stored as
+ * length 0; otherwise the length is computed from the start and end.
+ *
+ * Returns 0 on success, -1 on an unresolved POSIX lock, or the error
+ * returned by dump_one_file_lock().
+ */
+int dump_file_locks(void)
+{
+	struct file_lock *fl;
+	FileLockEntry fle;
+	int ret = 0;
+
+	pr_info("Dumping file-locks\n");
+
+	list_for_each_entry(fl, &file_lock_list, list) {
+		if (fl->real_owner == -1) {
+			if (fl->fl_kind != FL_POSIX)
+				continue;
+
+			pr_err("Unresolved lock found pid %d ino %ld\n",
+			       fl->fl_owner, fl->i_no);
+			return -1;
+		}
+
+		file_lock_entry__init(&fle);
+		fle.pid = fl->real_owner;
+		fle.fd = fl->owners_fd;
+		fill_flock_entry(&fle, fl->fl_kind, fl->fl_ltype);
+		fle.start = fl->start;
+		if (strncmp(fl->end, "EOF", 3) != 0)
+			fle.len = (atoll(fl->end) + 1) - fl->start;
+		else
+			fle.len = 0;
+
+		ret = dump_one_file_lock(&fle);
+		if (ret) {
+			pr_err("Dump file lock failed!\n");
+			break;
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * BTRFS fallback used when the descriptor's mount id is not known: resolve
+ * the path behind /proc/<pid>/fd/<fd> and let phys_stat_dev_match() compare
+ * the device from the lock record with the file's st_dev.
+ *
+ * Returns -1 if the link cannot be read or does not fit into the buffer,
+ * otherwise the result of phys_stat_dev_match().
+ */
+static int lock_btrfs_file_match(pid_t pid, int fd, struct file_lock *fl, struct fd_parms *p)
+{
+	int phys_dev = MKKDEV(fl->maj, fl->min);
+	char link[PATH_MAX], t[32];
+	struct ns_id *ns;
+	ssize_t len;
+
+	snprintf(t, sizeof(t), "/proc/%d/fd/%d", pid, fd);
+	len = readlink(t, link, sizeof(link));
+	if (len < 0) {
+		pr_perror("Can't read link of fd %d", fd);
+		return -1;
+	} else if ((size_t)len >= sizeof(link)) {
+		/*
+		 * readlink() does not NUL-terminate and truncates silently:
+		 * a completely filled buffer means the target did not fit
+		 * (and leaves no room for the terminator either).
+		 */
+		pr_err("Buffer for read link of fd %d is too small\n", fd);
+		return -1;
+	}
+	/* Terminate right after the last byte, keeping the whole path. */
+	link[len] = 0;
+
+	ns = lookup_nsid_by_mnt_id(p->mnt_id);
+	return phys_stat_dev_match(p->stat.st_dev, phys_dev, ns, link);
+}
+
+/*
+ * Does the lock record @fl refer to the file behind descriptor @fd?
+ *
+ * Inode numbers must be equal. The device is taken from the stat data,
+ * except on BTRFS, where it comes from the mount info (or, without a
+ * mount id, from lock_btrfs_file_match()). See phys_stat_resolve_dev()
+ * for the BTRFS details.
+ *
+ * Returns non-zero on match, 0 on mismatch, and may return a negative
+ * value from lock_btrfs_file_match().
+ */
+static inline int lock_file_match(pid_t pid, int fd, struct file_lock *fl, struct fd_parms *p)
+{
+	dev_t dev;
+
+	if (fl->i_no != p->stat.st_ino)
+		return 0;
+
+	dev = p->stat.st_dev;
+	if (p->fs_type == BTRFS_SUPER_MAGIC) {
+		struct mount_info *m;
+
+		if (p->mnt_id == -1) /* old kernel */
+			return lock_btrfs_file_match(pid, fd, fl, p);
+
+		m = lookup_mnt_id(p->mnt_id);
+		BUG_ON(m == NULL);
+		dev = kdev_to_odev(m->s_dev);
+	}
+
+	return makedev(fl->maj, fl->min) == dev;
+}
+
+/*
+ * Probe whether the flock described by @fl is held through @lfd by trying
+ * to take a conflicting lock on it.
+ *
+ * Returns 1 if the probe lock was granted (the lock sits on this file
+ * description), 0 if the probe failed with EAGAIN, -1 on any other
+ * failure.
+ */
+static int lock_check_fd(int lfd, struct file_lock *fl)
+{
+	int mand = fl->fl_ltype & LOCK_MAND;
+	int ret;
+
+	ret = flock(lfd, mand ? (LOCK_MAND | LOCK_RW) : (LOCK_EX | LOCK_NB));
+	pr_debug(" `- %d/%d\n", ret, errno);
+
+	if (ret != 0) {
+		if (errno == EAGAIN)
+			return 0;
+
+		pr_err("Bogus lock test result %d\n", ret);
+		return -1;
+	}
+
+	/*
+	 * The probe succeeded, so it did not conflict with any other
+	 * lock. As the file is known to be locked (it was found in
+	 * /proc/locks), the existing lock must be the one on this
+	 * descriptor. Put the original lock type back.
+	 */
+	pr_debug(" `- downgrading lock back\n");
+	if (mand)
+		flock(lfd, fl->fl_ltype);
+	else if (fl->fl_ltype == F_RDLCK)
+		flock(lfd, LOCK_SH);
+
+	return 1;
+}
+
+/*
+ * Try to attribute the locks on file_lock_list to descriptor @fd of task
+ * @pid (@lfd is the local descriptor used for probing flocks).
+ *
+ * Does nothing when kdat.has_fdinfo_lock is set. For every lock record
+ * matching the file, POSIX locks are attributed only to the task recorded
+ * as owner; flocks are probed with lock_check_fd(). On attribution the
+ * record's real_owner/owners_fd are set to pid->virt and @fd.
+ *
+ * Returns 0 on success, a negative value on error or when a matching lock
+ * exists but file lock handling is not enabled.
+ */
+int note_file_lock(struct pid *pid, int fd, int lfd, struct fd_parms *p)
+{
+	struct file_lock *fl;
+	int ret;
+
+	if (kdat.has_fdinfo_lock)
+		return 0;
+
+	list_for_each_entry(fl, &file_lock_list, list) {
+		ret = lock_file_match(pid->real, fd, fl, p);
+		if (ret < 0)
+			return -1;
+		if (ret == 0)
+			continue;
+
+		if (!opts.handle_file_locks) {
+			/* The literals are concatenated: keep the separating space. */
+			pr_err("Some file locks are held by dumping tasks! "
+			       "You can try --" OPT_FILE_LOCKS " to dump them.\n");
+			return -1;
+		}
+
+		if (fl->fl_kind == FL_POSIX) {
+			/*
+			 * POSIX locks cannot belong to anyone
+			 * but creator.
+			 */
+			if (fl->fl_owner != pid->real)
+				continue;
+		} else /* fl->fl_kind == FL_FLOCK */ {
+			/*
+			 * FLOCKs can be inherited across fork,
+			 * thus we can have any task as lock
+			 * owner. But the creator is preferred
+			 * anyway.
+			 */
+
+			if (fl->fl_owner != pid->real &&
+			    fl->real_owner != -1)
+				continue;
+
+			pr_debug("Checking lock holder %d:%d\n", pid->real, fd);
+			ret = lock_check_fd(lfd, fl);
+			if (ret < 0)
+				return ret;
+			if (ret == 0)
+				continue;
+		}
+
+		fl->real_owner = pid->virt;
+		fl->owners_fd = fd;
+
+		pr_info("Found lock entry %d.%d %d vs %d\n",
+			pid->real, pid->virt, fd,
+			fl->fl_owner);
+	}
+
+	return 0;
+}
+
+/*
+ * Re-take one lock described by @fle on descriptor fle->fd.
+ *
+ * FL_FLOCK entries are restored with flock(), FL_POSIX entries with
+ * fcntl(F_SETLKW). Returns 0 on success and a negative value on an
+ * unknown lock kind/type or when the lock call fails.
+ */
+static int restore_file_lock(FileLockEntry *fle)
+{
+	unsigned int cmd;
+	int ret;
+
+	if (fle->flag & FL_FLOCK) {
+		if (fle->type & LOCK_MAND) {
+			cmd = fle->type;
+		} else {
+			switch (fle->type) {
+			case F_RDLCK:
+				cmd = LOCK_SH;
+				break;
+			case F_WRLCK:
+				cmd = LOCK_EX;
+				break;
+			case F_UNLCK:
+				cmd = LOCK_UN;
+				break;
+			default:
+				pr_err("Unknown flock type!\n");
+				return -1;
+			}
+		}
+
+		pr_info("(flock)flag: %d, type: %d, cmd: %d, pid: %d, fd: %d\n",
+			fle->flag, fle->type, cmd, fle->pid, fle->fd);
+
+		ret = flock(fle->fd, cmd);
+		if (ret < 0) {
+			pr_err("Can not set flock!\n");
+			return ret;
+		}
+
+		return 0;
+	}
+
+	if (fle->flag & FL_POSIX) {
+		struct flock flk;
+
+		memset(&flk, 0, sizeof(flk));
+		flk.l_whence = SEEK_SET;
+		flk.l_start = fle->start;
+		flk.l_len = fle->len;
+		flk.l_pid = fle->pid;
+		flk.l_type = fle->type;
+
+		pr_info("(posix)flag: %d, type: %d, pid: %d, fd: %d, "
+			"start: %8"PRIx64", len: %8"PRIx64"\n",
+			fle->flag, fle->type, fle->pid, fle->fd,
+			fle->start, fle->len);
+
+		ret = fcntl(fle->fd, F_SETLKW, &flk);
+		if (ret < 0) {
+			pr_err("Can not set posix lock!\n");
+			return ret;
+		}
+
+		return 0;
+	}
+
+	pr_err("Unknown file lock style!\n");
+	return -1;
+}
+
+/*
+ * Restore all collected locks whose entry belongs to @pid.
+ * Stops at and returns the first non-zero restore_file_lock() result;
+ * returns 0 if every matching lock was restored (or none matched).
+ */
+static int restore_file_locks(int pid)
+{
+	struct file_lock_rst *lr;
+
+	list_for_each_entry(lr, &file_lock_list, l) {
+		int ret;
+
+		if (lr->fle->pid != pid)
+			continue;
+
+		ret = restore_file_lock(lr->fle);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * Restore locks from the per-pid CR_FD_FILE_LOCKS_PID image.
+ *
+ * Entries are read one by one until pb_read_one_eof() returns a value
+ * <= 0 or a lock fails to restore. Returns -1 if the image cannot be
+ * opened, otherwise the last value obtained from reading or restoring.
+ */
+static int restore_file_locks_legacy(int pid)
+{
+	struct cr_img *img;
+	FileLockEntry *fle;
+	int ret;
+
+	img = open_image(CR_FD_FILE_LOCKS_PID, O_RSTR, pid);
+	if (!img)
+		return -1;
+
+	for (;;) {
+		ret = pb_read_one_eof(img, &fle, PB_FILE_LOCK);
+		if (ret <= 0)
+			break;
+
+		ret = restore_file_lock(fle);
+		file_lock_entry__free_unpacked(fle, NULL);
+		if (ret != 0)
+			break;
+	}
+
+	close_image(img);
+	return ret;
+}
+
+/*
+ * Restore the file locks of task @pid.
+ *
+ * A no-op returning 0 unless file lock handling is enabled. Locks come
+ * from the collected list when the file locks image was collected
+ * (COLLECT_HAPPENED set), otherwise from the legacy per-pid image.
+ */
+int prepare_file_locks(int pid)
+{
+	if (!opts.handle_file_locks)
+		return 0;
+
+	pr_info("Restore file locks.\n");
+	if (!(file_locks_cinfo.flags & COLLECT_HAPPENED))
+		return restore_file_locks_legacy(pid);
+
+	return restore_file_locks(pid);
+}
diff --git a/criu/files-ext.c b/criu/files-ext.c
new file mode 100644
index 000000000000..b196b259006b
--- /dev/null
+++ b/criu/files-ext.c
@@ -0,0 +1,93 @@
+/* An external file is a file that is dumped with the help of a plugin */
+
+#include <unistd.h>
+
+#include "imgset.h"
+#include "files.h"
+#include "plugin.h"
+
+#include "protobuf.h"
+#include "protobuf/ext-file.pb-c.h"
+
+/*
+ * Dump callback for external files (see ext_dump_ops).
+ *
+ * Runs the DUMP_EXT_FILE plugin hook on @lfd first; a negative result is
+ * returned to the caller as is. Otherwise an ExtFileEntry with @id and
+ * the descriptor's fown is written to the CR_FD_EXT_FILES image and the
+ * result of pb_write_one() is returned.
+ */
+static int dump_one_ext_file(int lfd, u32 id, const struct fd_parms *p)
+{
+ int ret;
+ struct cr_img *rimg;
+
+ ExtFileEntry xfe = EXT_FILE_ENTRY__INIT;
+
+ ret = run_plugins(DUMP_EXT_FILE, lfd, id);
+ if (ret < 0)
+ return ret;
+
+ xfe.id = id;
+ /* cast drops the const qualifier inherited from @p */
+ xfe.fown = (FownEntry *)&p->fown;
+
+ rimg = img_from_set(glob_imgset, CR_FD_EXT_FILES);
+ return pb_write_one(rimg, &xfe, PB_EXT_FILE);
+}
+
+/* Dump-side operations for FD_TYPES__EXT descriptors. */
+const struct fdtype_ops ext_dump_ops = {
+ .type = FD_TYPES__EXT,
+ .dump = dump_one_ext_file,
+};
+
+/* Restore-time state for one external file entry. */
+struct ext_file_info {
+ struct file_desc d; /* generic descriptor registered via file_desc_add() */
+ ExtFileEntry *xfe; /* decoded image entry */
+};
+
+/*
+ * .open callback for external files: asks the plugins to restore the file
+ * with the entry's id, then restores the descriptor's owner settings.
+ *
+ * Returns the descriptor produced by the plugin, or -1 on failure.
+ */
+static int open_fd(struct file_desc *d)
+{
+	struct ext_file_info *xfi;
+	int fd;
+
+	xfi = container_of(d, struct ext_file_info, d);
+
+	fd = run_plugins(RESTORE_EXT_FILE, xfi->xfe->id);
+	if (fd < 0) {
+		pr_err("Unable to restore %#x\n", xfi->xfe->id);
+		return -1;
+	}
+
+	if (restore_fown(fd, xfi->xfe->fown)) {
+		/* Don't leak the descriptor the plugin handed to us. */
+		close(fd);
+		return -1;
+	}
+
+	return fd;
+}
+
+/* Restore-side callbacks registered for FD_TYPES__EXT descriptors. */
+static struct file_desc_ops ext_desc_ops = {
+ .type = FD_TYPES__EXT,
+ .open = open_fd,
+};
+
+/*
+ * Collect callback for one ExtFileEntry image record (see ext_file_cinfo):
+ * stores the decoded message in @o and registers the descriptor under the
+ * entry's id with ext_desc_ops. Returns whatever file_desc_add() returns.
+ */
+static int collect_one_ext(void *o, ProtobufCMessage *base)
+{
+ struct ext_file_info *xfi = o;
+
+ xfi->xfe = pb_msg(base, ExtFileEntry);
+
+ pr_info("Collected external file with ID %#x\n", xfi->xfe->id);
+ return file_desc_add(&xfi->d, xfi->xfe->id, &ext_desc_ops);
+}
+
+/*
+ * Collect descriptor for the CR_FD_EXT_FILES image: each record gets a
+ * struct ext_file_info and is passed to collect_one_ext().
+ */
+struct collect_image_info ext_file_cinfo = {
+ .fd_type = CR_FD_EXT_FILES,
+ .pb_type = PB_EXT_FILE,
+ .priv_size = sizeof(struct ext_file_info),
+ .collect = collect_one_ext,
+};
+
+/*
+ * Last-resort dump of a descriptor of an otherwise unsupported type: try
+ * to dump it as an external file.
+ *
+ * Returns 0 if that worked and -1 otherwise. Only the -ENOTSUP case is
+ * reported here, with @more and @info appended to the message; other
+ * failures return -1 without a message from this function.
+ */
+int dump_unsupp_fd(struct fd_parms *p, int lfd,
+		   struct cr_img *img, char *more, char *info)
+{
+	switch (do_dump_gen_file(p, lfd, &ext_dump_ops, img)) {
+	case 0:
+		return 0;
+	case -ENOTSUP:
+		pr_err("Can't dump file %d of that type [%o] (%s %s)\n",
+		       p->fd, p->stat.st_mode, more, info);
+		/* fallthrough */
+	default:
+		return -1;
+	}
+}
diff --git a/criu/files-reg.c b/criu/files-reg.c
new file mode 100644
index 000000000000..7911d667351b
--- /dev/null
+++ b/criu/files-reg.c
@@ -0,0 +1,1643 @@
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/vfs.h>
+#include <sys/prctl.h>
+#include <ctype.h>
+#include <sched.h>
+
+/* Stolen from kernel/fs/nfs/unlink.c */
+#define SILLYNAME_PREF ".nfs"
+#define SILLYNAME_SUFF_LEN (((unsigned)sizeof(u64) << 1) + ((unsigned)sizeof(unsigned int) << 1))
+
+#include "cr_options.h"
+#include "imgset.h"
+#include "file-ids.h"
+#include "mount.h"
+#include "files.h"
+#include "image.h"
+#include "list.h"
+#include "util.h"
+#include "fs-magic.h"
+#include "asm/atomic.h"
+#include "namespaces.h"
+#include "proc_parse.h"
+#include "pstree.h"
+
+#include "protobuf.h"
+#include "protobuf/regfile.pb-c.h"
+#include "protobuf/remap-file-path.pb-c.h"
+
+#include "files-reg.h"
+#include "plugin.h"
+
+int setfsuid(uid_t fsuid);
+
+/*
+ * Ghost files are those not visible from the FS. Dumping them is
+ * nasty and the only way we have -- just carry its contents with
+ * us. Any brave soul to implement link unlinked file back?
+ */
+struct ghost_file {
+ struct list_head list; /* link in ghost_files */
+ u32 id; /* ghost id, used as the remap id */
+
+ u32 dev; /* device and inode of the original file */
+ u32 ino;
+
+ struct file_remap remap; /* path, mount and user count of the ghost (used on restore) */
+};
+
+/* Next ghost id to hand out on dump (see dump_ghost_remap()). */
+static u32 ghost_file_ids = 1;
+static LIST_HEAD(ghost_files);
+
+/* Taken around ghost user-count updates in remap_put() and lookup_ghost_remap(). */
+static mutex_t *ghost_file_mutex;
+
+/*
+ * Holds struct link_remap_rlb entries in the link-remap rollback code and
+ * struct remap_info entries in the remap collect/prepare code below.
+ */
+static LIST_HEAD(remaps);
+
+/*
+ * Remember the name to delete it if needed on error or
+ * rollback action. Note we don't expect that there will
+ * be a HUGE number of link remaps, so in a sake of speed
+ * we keep all data in memory.
+ */
+struct link_remap_rlb {
+ struct list_head list; /* link in remaps */
+ struct ns_id *mnt_ns; /* mount namespace the path is relative to */
+ char *path; /* heap copy of the path, freed on rollback */
+};
+
+/*
+ * Record a link remap created at @path in mount namespace @nsid so that
+ * it can be removed or forgotten later (see __rollback_link_remaps()).
+ * The path is copied. Returns 0 on success, -1 if memory runs out.
+ */
+static int note_link_remap(char *path, struct ns_id *nsid)
+{
+	struct link_remap_rlb *entry;
+
+	entry = xmalloc(sizeof(*entry));
+	if (entry) {
+		entry->path = strdup(path);
+		if (entry->path) {
+			entry->mnt_ns = nsid;
+			list_add(&entry->list, &remaps);
+			return 0;
+		}
+
+		xfree(entry);
+	}
+
+	pr_err("Can't note link remap for %s\n", path);
+	return -1;
+}
+
+/*
+ * Trim "a/b/c/d" to "a/b/d", i.e. drop the last parent directory from
+ * @path in place.
+ *
+ * Returns 0 on success, -1 when @path has no '/', ends with '/', or has
+ * nothing but slashes in front of the file name (no parent to drop).
+ */
+static int trim_last_parent(char *path)
+{
+	char *slash, *fname;
+	size_t end;
+
+	/* Check for NULL before doing any arithmetic on the result. */
+	slash = strrchr(path, '/');
+	if (!slash || slash[1] == '\0')
+		return -1;
+	fname = slash + 1;
+
+	/*
+	 * 'end' is an offset into path, so the scans below never form a
+	 * pointer in front of the buffer. First skip the slash(es) that
+	 * precede the file name ...
+	 */
+	end = (size_t)(slash - path) + 1;
+	while (end > 0 && path[end - 1] == '/')
+		end--;
+
+	if (end == 0)
+		return -1;
+
+	/* ... then the parent's name; 'end' lands on its first char. */
+	while (end > 0 && path[end - 1] != '/')
+		end--;
+
+	/* Source and destination overlap, hence memmove(). */
+	memmove(path + end, fname, strlen(fname) + 1);
+
+	return 0;
+}
+
+/*
+ * Create a regular ghost file at @path (it must not exist yet) with @mode
+ * and fill it with the data read from the image's raw fd.
+ * Returns -1 if the file cannot be created, else the copy_file() result.
+ * @gf is not used.
+ */
+static int mkreg_ghost(char *path, u32 mode, struct ghost_file *gf, struct cr_img *img)
+{
+ int gfd, ret;
+
+ gfd = open(path, O_WRONLY | O_CREAT | O_EXCL, mode);
+ if (gfd < 0)
+ return -1;
+
+ ret = copy_file(img_raw_fd(img), gfd, 0);
+ close(gfd);
+
+ return ret;
+}
+
+/*
+ * Apply owner, permissions and (when the image carries them) access and
+ * modification times from @gfe to the ghost at @path.
+ * Returns 0 on success, -1 on the first failing step.
+ */
+static int ghost_apply_metadata(const char *path, GhostFileEntry *gfe)
+{
+	struct timeval tv[2];
+	int ret = -1;
+
+	if (chown(path, gfe->uid, gfe->gid) < 0) {
+		pr_perror("Can't reset user/group on ghost %s", path);
+		goto err;
+	}
+
+	if (chmod(path, gfe->mode)) {
+		pr_perror("Can't set perms %o on ghost %s", gfe->mode, path);
+		goto err;
+	}
+
+	/*
+	 * Both timestamps are dereferenced below, so require both; an
+	 * entry carrying only one of them gets no timestamps restored
+	 * rather than a NULL dereference.
+	 */
+	if (gfe->atim && gfe->mtim) {
+		tv[0].tv_sec = gfe->atim->tv_sec;
+		tv[0].tv_usec = gfe->atim->tv_usec;
+		tv[1].tv_sec = gfe->mtim->tv_sec;
+		tv[1].tv_usec = gfe->mtim->tv_usec;
+		if (lutimes(path, tv)) {
+			pr_perror("Can't set access and modification times on ghost %s", path);
+			goto err;
+		}
+	}
+
+	ret = 0;
+err:
+	return ret;
+}
+
+/*
+ * Create the ghost described by @gfe on disk and apply its metadata.
+ *
+ * The target is "<mount root>/<gf->remap.rpath>". If creation fails with
+ * ENOENT the last parent directory is dropped from the path and creation
+ * is retried. On success gf->remap.rpath is updated to the path actually
+ * used (relative to the mount root).
+ *
+ * Returns 0 on success and a negative value on failure.
+ */
+static int create_ghost(struct ghost_file *gf, GhostFileEntry *gfe, struct cr_img *img)
+{
+	char path[PATH_MAX];
+	int ret, root_len;
+	const char *msg = NULL;
+
+	root_len = ret = rst_get_mnt_root(gf->remap.rmnt_id, path, sizeof(path));
+	if (ret < 0) {
+		pr_err("The %d mount is not found for ghost\n", gf->remap.rmnt_id);
+		goto err;
+	}
+
+	snprintf(path + ret, sizeof(path) - ret, "/%s", gf->remap.rpath);
+	ret = -1;
+again:
+	if (S_ISFIFO(gfe->mode)) {
+		if ((ret = mknod(path, gfe->mode, 0)) < 0)
+			msg = "Can't create node for ghost file";
+	} else if (S_ISCHR(gfe->mode) || S_ISBLK(gfe->mode)) {
+		if (!gfe->has_rdev) {
+			pr_err("No rdev for ghost device\n");
+			goto err;
+		}
+		if ((ret = mknod(path, gfe->mode, gfe->rdev)) < 0)
+			msg = "Can't create node for ghost dev";
+	} else if (S_ISDIR(gfe->mode)) {
+		if ((ret = mkdir(path, gfe->mode)) < 0) {
+			pr_perror("Can't make ghost dir");
+			goto err;
+		}
+	} else {
+		/* No trailing newline: the message is printed via pr_perror. */
+		if ((ret = mkreg_ghost(path, gfe->mode, gf, img)) < 0)
+			msg = "Can't create ghost regfile";
+	}
+
+	if (ret < 0) {
+		/* Use grand parent, if parent directory does not exist */
+		if (errno == ENOENT) {
+			if (trim_last_parent(path) < 0) {
+				pr_err("trim failed: @%s@\n", path);
+				goto err;
+			}
+			goto again;
+		}
+
+		pr_perror("%s", msg);
+		goto err;
+	}
+
+	strcpy(gf->remap.rpath, path + root_len + 1);
+	pr_debug("Remap rpath is %s\n", gf->remap.rpath);
+
+	ret = -1;
+	if (ghost_apply_metadata(path, gfe))
+		goto err;
+
+	ret = 0;
+err:
+	return ret;
+}
+
+/*
+ * Format the name of a non-directory ghost into @path (at most @plen
+ * bytes): "<original path>.cr.<remap id in hex>.ghost".
+ */
+static inline void ghost_path(char *path, int plen,
+ struct reg_file_info *rfi, RemapFilePathEntry *rfe)
+{
+ snprintf(path, plen, "%s.cr.%x.ghost", rfi->path, rfe->remap_id);
+}
+
+/*
+ * Attach the ghost remap with id rfe->remap_id to @rfi.
+ *
+ * An already known ghost is reused. Otherwise the ghost image is read,
+ * the ghost is created on disk next to @rfi's path and put on the
+ * ghost_files list with a user count of zero.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int open_remap_ghost(struct reg_file_info *rfi,
+		RemapFilePathEntry *rfe)
+{
+	struct ghost_file *gf;
+	GhostFileEntry *gfe = NULL;
+	struct cr_img *img;
+
+	list_for_each_entry(gf, &ghost_files, list)
+		if (gf->id == rfe->remap_id)
+			goto gf_found;
+
+	/*
+	 * Ghost not found. We will create one in the same dir
+	 * as the very first client of it thus resolving any
+	 * issues with cross-device links.
+	 */
+
+	pr_info("Opening ghost file %#x for %s\n", rfe->remap_id, rfi->path);
+
+	gf = shmalloc(sizeof(*gf));
+	if (!gf)
+		return -1;
+
+	gf->remap.rpath = xmalloc(PATH_MAX);
+	if (!gf->remap.rpath)
+		goto err;
+
+	img = open_image(CR_FD_GHOST_FILE, O_RSTR, rfe->remap_id);
+	if (!img)
+		goto err;
+
+	if (pb_read_one(img, &gfe, PB_GHOST_FILE) < 0)
+		goto close_ifd;
+
+	/*
+	 * For old formats where optional has_[dev|ino] is
+	 * not present we will have zeros here which is quite
+	 * a sign for "absent" fields.
+	 */
+	gf->dev = gfe->dev;
+	gf->ino = gfe->ino;
+	gf->remap.rmnt_id = rfi->rfe->mnt_id;
+
+	/* snprintf() always terminates, unlike strncpy() on a full buffer. */
+	if (S_ISDIR(gfe->mode))
+		snprintf(gf->remap.rpath, PATH_MAX, "%s", rfi->path);
+	else
+		ghost_path(gf->remap.rpath, PATH_MAX, rfi, rfe);
+
+	if (create_ghost(gf, gfe, img))
+		goto close_ifd;
+
+	/* Take everything needed out of gfe before it is released below. */
+	gf->id = rfe->remap_id;
+	gf->remap.users = 0;
+	gf->remap.is_dir = S_ISDIR(gfe->mode);
+	gf->remap.owner = gfe->uid;
+	list_add_tail(&gf->list, &ghost_files);
+
+	ghost_file_entry__free_unpacked(gfe, NULL);
+	close_image(img);
+gf_found:
+	rfi->remap = &gf->remap;
+	return 0;
+
+close_ifd:
+	close_image(img);
+err:
+	if (gfe)
+		ghost_file_entry__free_unpacked(gfe, NULL);
+	xfree(gf->remap.rpath);
+	shfree_last(gf);
+	return -1;
+}
+
+/*
+ * Attach a "linked" remap to @rfi: the file is reopened through the path
+ * of another regular file whose id is rfe->remap_id.
+ *
+ * The remap's owner is looked up with fstatat() only when the root
+ * namespace mask contains CLONE_NEWUSER; otherwise it is stored as
+ * (uid_t)-1.
+ *
+ * Returns 0 on success, -1 if the target file is unknown, memory runs out
+ * or the owner lookup fails.
+ */
+static int open_remap_linked(struct reg_file_info *rfi,
+ RemapFilePathEntry *rfe)
+{
+ struct file_remap *rm;
+ struct file_desc *rdesc;
+ struct reg_file_info *rrfi;
+ uid_t owner = -1;
+
+ rdesc = find_file_desc_raw(FD_TYPES__REG, rfe->remap_id);
+ if (!rdesc) {
+ pr_err("Can't find target file %x\n", rfe->remap_id);
+ return -1;
+ }
+
+ rm = xmalloc(sizeof(*rm));
+ if (!rm)
+ return -1;
+
+ rrfi = container_of(rdesc, struct reg_file_info, d);
+ pr_info("Remapped %s -> %s\n", rfi->path, rrfi->path);
+
+ if (root_ns_mask & CLONE_NEWUSER) {
+ int rfd;
+ struct stat st;
+
+ /* NOTE(review): rfd is not checked for errors before use */
+ rfd = mntns_get_root_by_mnt_id(rfi->rfe->mnt_id);
+ if (fstatat(rfd, rrfi->path, &st, AT_SYMLINK_NOFOLLOW)) {
+ pr_perror("Can't get owner of link remap %s", rrfi->path);
+ xfree(rm);
+ return -1;
+ }
+
+ owner = st.st_uid;
+ }
+
+ /* rpath points into the target's reg_file_info, it is not copied */
+ rm->rpath = rrfi->path;
+ rm->users = 0;
+ rm->is_dir = false;
+ rm->owner = owner;
+ rm->rmnt_id = rfi->rfe->mnt_id;
+ rfi->remap = rm;
+ return 0;
+}
+
+/*
+ * Make sure the process tree contains an item whose virtual pid equals
+ * rfe->remap_id.
+ *
+ * If one already exists nothing is done. Otherwise a helper item is
+ * allocated, given the root item's sid and pgid, and added as a child of
+ * the root item. @rfi is not used.
+ *
+ * Returns 0 on success, -1 if the helper cannot be allocated.
+ */
+static int open_remap_dead_process(struct reg_file_info *rfi,
+ RemapFilePathEntry *rfe)
+{
+ struct pstree_item *helper;
+
+ for_each_pstree_item(helper) {
+ /* don't need to add multiple tasks */
+ if (helper->pid.virt == rfe->remap_id) {
+ pr_info("Skipping helper for restoring /proc/%d; pid exists\n", rfe->remap_id);
+ return 0;
+ }
+ }
+
+ helper = alloc_pstree_helper();
+ if (!helper)
+ return -1;
+
+ helper->sid = root_item->sid;
+ helper->pgid = root_item->pgid;
+ helper->pid.virt = rfe->remap_id;
+ helper->parent = root_item;
+ list_add_tail(&helper->sibling, &root_item->children);
+
+ pr_info("Added a helper for restoring /proc/%d\n", helper->pid.virt);
+
+ return 0;
+}
+
+/* One collected remap record together with the file it applies to. */
+struct remap_info {
+ struct list_head list; /* link in remaps */
+ RemapFilePathEntry *rfe; /* decoded image entry */
+ struct reg_file_info *rfi; /* regular file with id rfe->orig_id */
+};
+
+/*
+ * Collect callback for one RemapFilePathEntry image record (see
+ * remap_cinfo).
+ *
+ * Entries without an explicit remap type get one derived from the
+ * REMAP_GHOST bit of remap_id (which is then cleared). The regular file
+ * with id rfe->orig_id is looked up and the record is appended to the
+ * remaps list.
+ *
+ * Returns 0 on success, -1 if the original file is not known.
+ */
+static int collect_one_remap(void *obj, ProtobufCMessage *msg)
+{
+ struct remap_info *ri = obj;
+ RemapFilePathEntry *rfe;
+ struct file_desc *fdesc;
+
+ ri->rfe = rfe = pb_msg(msg, RemapFilePathEntry);
+
+ if (!rfe->has_remap_type) {
+ rfe->has_remap_type = true;
+ /* backward compatibility with images */
+ if (rfe->remap_id & REMAP_GHOST) {
+ rfe->remap_id &= ~REMAP_GHOST;
+ rfe->remap_type = REMAP_TYPE__GHOST;
+ } else
+ rfe->remap_type = REMAP_TYPE__LINKED;
+ }
+
+ fdesc = find_file_desc_raw(FD_TYPES__REG, rfe->orig_id);
+ if (fdesc == NULL) {
+ pr_err("Remap for non existing file %#x\n", rfe->orig_id);
+ return -1;
+ }
+
+ ri->rfi = container_of(fdesc, struct reg_file_info, d);
+
+ list_add_tail(&ri->list, &remaps);
+
+ return 0;
+}
+
+/*
+ * Set up one collected remap according to its type.
+ * Procfs remaps need no work here (prepare_procfs_remaps() deals with
+ * them). Returns 0 on success, a non-zero value on failure or for an
+ * unknown remap type.
+ */
+static int prepare_one_remap(struct remap_info *ri)
+{
+	struct reg_file_info *rfi = ri->rfi;
+	RemapFilePathEntry *rfe = ri->rfe;
+
+	pr_info("Configuring remap %#x -> %#x\n", rfi->rfe->id, rfe->remap_id);
+
+	switch (rfe->remap_type) {
+	case REMAP_TYPE__LINKED:
+		return open_remap_linked(rfi, rfe);
+	case REMAP_TYPE__GHOST:
+		return open_remap_ghost(rfi, rfe);
+	case REMAP_TYPE__PROCFS:
+		/* handled earlier by prepare_procfs_remaps */
+		return 0;
+	default:
+		pr_err("unknown remap type %u\n", rfe->remap_type);
+		return -1;
+	}
+}
+
+/*
+ * The preparation of PROCFS remaps is kept separate because they allocate
+ * pstree items, which need to be seen by the root task. Not all remaps
+ * can be done here, because the files haven't been loaded yet.
+ *
+ * Returns 0 on success, -1 as soon as one procfs remap fails.
+ */
+int prepare_procfs_remaps(void)
+{
+	struct remap_info *ri;
+
+	list_for_each_entry(ri, &remaps, list) {
+		if (ri->rfe->remap_type != REMAP_TYPE__PROCFS)
+			continue;
+
+		if (open_remap_dead_process(ri->rfi, ri->rfe) < 0)
+			return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Set up every collected remap in list order.
+ * Returns 0 if all succeeded, otherwise the first non-zero result of
+ * prepare_one_remap() (later remaps are then left untouched).
+ */
+int prepare_remaps(void)
+{
+	struct remap_info *ri;
+
+	list_for_each_entry(ri, &remaps, list) {
+		int ret = prepare_one_remap(ri);
+
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * Best-effort removal of the on-disk ghost that belongs to remap @ri.
+ *
+ * The ghost file name is tried first; if unlinking it fails with EISDIR
+ * the original path is removed as a directory instead. The outcome is
+ * only logged.
+ */
+static void try_clean_ghost(struct remap_info *ri)
+{
+	char path[PATH_MAX];
+	int mnt_id, ret;
+
+	mnt_id = ri->rfi->rfe->mnt_id;
+	ret = rst_get_mnt_root(mnt_id, path, sizeof(path));
+	if (ret < 0 || (size_t)ret >= sizeof(path))
+		return;
+
+	/*
+	 * Writing starts 'ret' bytes into the buffer, so only
+	 * sizeof(path) - ret bytes are left for the name.
+	 */
+	ghost_path(path + ret, sizeof(path) - ret, ri->rfi, ri->rfe);
+	if (!unlink(path)) {
+		pr_info(" `- X [%s] ghost\n", path);
+		return;
+	}
+
+	/*
+	 * We can also find out the ghost type by stat()-ing
+	 * it or by reading the ghost image, but this way
+	 * is the fastest one.
+	 */
+
+	if (errno == EISDIR) {
+		/* Bounded and always NUL-terminated, unlike strncpy(). */
+		snprintf(path + ret, sizeof(path) - ret, "%s", ri->rfi->path);
+		if (!rmdir(path)) {
+			pr_info(" `- Xd [%s] ghost\n", path);
+			return;
+		}
+	}
+
+	pr_perror(" `- XFail [%s] ghost", path);
+}
+
+/*
+ * Remove the ghosts of all collected ghost remaps.
+ *
+ * When @ns_fd is a valid descriptor the cleanup runs inside that mount
+ * namespace; the previous namespace and working directory are restored
+ * afterwards. @ns_fd is consumed: it is closed on every path, including
+ * the error ones, along with the descriptors opened here.
+ */
+void try_clean_remaps(int ns_fd)
+{
+	struct remap_info *ri;
+	int old_ns = -1;
+	int cwd_fd = -1;
+
+	if (list_empty(&remaps))
+		goto out;
+
+	if (ns_fd >= 0) {
+		pr_info("Switching to new ns to clean ghosts\n");
+
+		old_ns = open_proc(PROC_SELF, "ns/mnt");
+		if (old_ns < 0) {
+			pr_perror("`- Can't keep old ns");
+			goto out;
+		}
+
+		cwd_fd = open(".", O_DIRECTORY);
+		if (cwd_fd < 0) {
+			pr_perror("Unable to open cwd");
+			goto out;
+		}
+
+		if (setns(ns_fd, CLONE_NEWNS) < 0) {
+			/* Report before any close() can clobber errno. */
+			pr_perror("`- Can't switch");
+			goto out;
+		}
+	}
+
+	list_for_each_entry(ri, &remaps, list)
+		if (ri->rfe->remap_type == REMAP_TYPE__GHOST)
+			try_clean_ghost(ri);
+
+	if (old_ns >= 0 && setns(old_ns, CLONE_NEWNS) < 0)
+		pr_perror("Fail to switch back!");
+
+	if (cwd_fd >= 0 && fchdir(cwd_fd))
+		pr_perror("Unable to restore cwd");
+
+out:
+	/* Single exit point so that no descriptor is leaked. */
+	if (cwd_fd >= 0)
+		close(cwd_fd);
+	if (old_ns >= 0)
+		close(old_ns);
+	if (ns_fd >= 0)
+		close(ns_fd);
+}
+
+/*
+ * Collect descriptor for the CR_FD_REMAP_FPATH image: each record gets a
+ * struct remap_info and is passed to collect_one_remap().
+ */
+static struct collect_image_info remap_cinfo = {
+ .fd_type = CR_FD_REMAP_FPATH,
+ .pb_type = PB_REMAP_FPATH,
+ .priv_size = sizeof(struct remap_info),
+ .collect = collect_one_remap,
+};
+
+/*
+ * Write the ghost image with id @id for the file open at @_fd.
+ *
+ * The image gets a GhostFileEntry built from @st (owner, mode, times,
+ * inode, rdev for device nodes) and @phys_dev, followed by the file
+ * contents when the file is a regular one.
+ *
+ * Returns 0 on success, -1 on failure. The image is closed either way.
+ */
+static int dump_ghost_file(int _fd, u32 id, const struct stat *st, dev_t phys_dev)
+{
+	struct cr_img *img;
+	GhostFileEntry gfe = GHOST_FILE_ENTRY__INIT;
+	Timeval atim = TIMEVAL__INIT, mtim = TIMEVAL__INIT;
+	int ret = -1;
+
+	pr_info("Dumping ghost file contents (id %#x)\n", id);
+
+	img = open_image(CR_FD_GHOST_FILE, O_DUMP, id);
+	if (!img)
+		return -1;
+
+	gfe.uid = userns_uid(st->st_uid);
+	gfe.gid = userns_gid(st->st_gid);
+	gfe.mode = st->st_mode;
+
+	gfe.atim = &atim;
+	gfe.mtim = &mtim;
+	gfe.atim->tv_sec = st->st_atim.tv_sec;
+	gfe.atim->tv_usec = st->st_atim.tv_nsec / 1000;
+	gfe.mtim->tv_sec = st->st_mtim.tv_sec;
+	gfe.mtim->tv_usec = st->st_mtim.tv_nsec / 1000;
+
+	gfe.has_dev = gfe.has_ino = true;
+	gfe.dev = phys_dev;
+	gfe.ino = st->st_ino;
+
+	if (S_ISCHR(st->st_mode) || S_ISBLK(st->st_mode)) {
+		gfe.has_rdev = true;
+		gfe.rdev = st->st_rdev;
+	}
+
+	if (pb_write_one(img, &gfe, PB_GHOST_FILE))
+		goto out;
+
+	if (S_ISREG(st->st_mode)) {
+		int fd, copied;
+		char lpath[PSFDS];
+
+		/*
+		 * Reopen file locally since it may have no read
+		 * permissions when drained
+		 */
+		snprintf(lpath, sizeof(lpath), "/proc/self/fd/%d", _fd);
+		fd = open(lpath, O_RDONLY);
+		if (fd < 0) {
+			pr_perror("Can't open ghost original file");
+			goto out;
+		}
+		copied = copy_file(fd, img_raw_fd(img), st->st_size);
+		close(fd);
+		if (copied)
+			goto out;
+	}
+
+	ret = 0;
+out:
+	/* Release the image on the error paths as well. */
+	close_image(img);
+	return ret;
+}
+
+/*
+ * Drop one user of @remap under ghost_file_mutex; when the count reaches
+ * zero the remap's path is unlinked relative to the root of its mount.
+ * The unlinkat() result is not checked.
+ */
+void remap_put(struct file_remap *remap)
+{
+ mutex_lock(ghost_file_mutex);
+ if (--remap->users == 0) {
+ int mntns_root;
+
+ pr_info("Unlink the ghost %s\n", remap->rpath);
+
+ mntns_root = mntns_get_root_by_mnt_id(remap->rmnt_id);
+ unlinkat(mntns_root, remap->rpath, 0);
+ }
+ mutex_unlock(ghost_file_mutex);
+}
+
+/*
+ * Find the ghost whose original file had device @dev and inode @ino.
+ * On a hit the ghost's user count is incremented and its remap is
+ * returned; NULL is returned otherwise. The whole lookup runs under
+ * ghost_file_mutex.
+ */
+struct file_remap *lookup_ghost_remap(u32 dev, u32 ino)
+{
+	struct file_remap *found = NULL;
+	struct ghost_file *gf;
+
+	mutex_lock(ghost_file_mutex);
+	list_for_each_entry(gf, &ghost_files, list) {
+		if (gf->ino != ino || gf->dev != dev)
+			continue;
+
+		gf->remap.users++;
+		found = &gf->remap;
+		break;
+	}
+	mutex_unlock(ghost_file_mutex);
+
+	return found;
+}
+
+/*
+ * Dump a ghost remap for the file with id @id open at @lfd.
+ *
+ * Files larger than opts.ghost_limit are refused. If a ghost with the same
+ * physical device and inode was dumped before, only a remap entry pointing
+ * at it is written; otherwise a new ghost id is allocated and the ghost
+ * image is dumped first.
+ *
+ * Returns -1 on failure, otherwise the result of writing the remap entry.
+ */
+static int dump_ghost_remap(char *path, const struct stat *st,
+ int lfd, u32 id, struct ns_id *nsid)
+{
+ struct ghost_file *gf;
+ RemapFilePathEntry rpe = REMAP_FILE_PATH_ENTRY__INIT;
+ dev_t phys_dev;
+
+ pr_info("Dumping ghost file for fd %d id %#x\n", lfd, id);
+
+ if (st->st_size > opts.ghost_limit) {
+ pr_err("Can't dump ghost file %s of %"PRIu64" size, increase limit\n",
+ path, st->st_size);
+ return -1;
+ }
+
+ phys_dev = phys_stat_resolve_dev(nsid, st->st_dev, path);
+ list_for_each_entry(gf, &ghost_files, list)
+ if ((gf->dev == phys_dev) && (gf->ino == st->st_ino))
+ goto dump_entry;
+
+ gf = xmalloc(sizeof(*gf));
+ if (gf == NULL)
+ return -1;
+
+ gf->dev = phys_dev;
+ gf->ino = st->st_ino;
+ gf->id = ghost_file_ids++;
+ list_add_tail(&gf->list, &ghost_files);
+
+ /* on failure the new entry stays on ghost_files */
+ if (dump_ghost_file(lfd, gf->id, st, phys_dev))
+ return -1;
+
+dump_entry:
+ rpe.orig_id = id;
+ rpe.remap_id = gf->id;
+ rpe.has_remap_type = true;
+ rpe.remap_type = REMAP_TYPE__GHOST;
+
+ return pb_write_one(img_from_set(glob_imgset, CR_FD_REMAP_FPATH),
+ &rpe, PB_REMAP_FPATH);
+}
+
+/*
+ * Walk the list of noted link remaps and free every entry. When
+ * @do_unlink is set, each remap path is also removed with unlinkat()
+ * relative to the fd obtained from mntns_get_root_fd().
+ */
+static void __rollback_link_remaps(bool do_unlink)
+{
+	struct link_remap_rlb *cur, *next;
+
+	list_for_each_entry_safe(cur, next, &remaps, list) {
+		if (do_unlink) {
+			int root_fd = mntns_get_root_fd(cur->mnt_ns);
+
+			if (root_fd < 0)
+				pr_err("Failed to clenaup %s link remap\n", cur->path);
+			else
+				unlinkat(root_fd, cur->path, 0);
+		}
+
+		list_del(&cur->list);
+		xfree(cur->path);
+		xfree(cur);
+	}
+}
+
+/* Forget all noted link remaps and unlink them from the filesystem. */
+void delete_link_remaps(void)
+{
+	__rollback_link_remaps(true);
+}
+/* Forget all noted link remaps, leaving the links themselves in place. */
+void free_link_remaps(void)
+{
+	__rollback_link_remaps(false);
+}
+
+/*
+ * Create a hard link to the file behind @lfd next to its original
+ * location and write a reg-file entry describing that link.
+ *
+ * @path: path of the file (without the leading dot), @len bytes long
+ * @lfd:  descriptor of the file to link
+ * @idp:  receives the id generated for the link's reg-file entry
+ * @nsid: namespace used to obtain the root fd the link is created under
+ *
+ * Returns 0 on success (result of pb_write_one()), -1 on error.
+ */
+static int create_link_remap(char *path, int len, int lfd,
+				u32 *idp, struct ns_id *nsid)
+{
+	char link_name[PATH_MAX], *tmp;
+	RegFileEntry rfe = REG_FILE_ENTRY__INIT;
+	FownEntry fwn = FOWN_ENTRY__INIT;
+	int mntns_root;
+
+	if (!opts.link_remap_ok) {
+		pr_err("Can't create link remap for %s. "
+				"Use " LREMAP_PARAM " option.\n", path);
+		return -1;
+	}
+
+	/*
+	 * Linked remapping -- we create a hard link on a removed file
+	 * in the directory original file used to sit.
+	 *
+	 * Bad news is that we can't easily open lfd's parent dir. Thus
+	 * we have to just generate an absolute path and use it. The linkat
+	 * will fail if we chose the bad one.
+	 */
+
+	link_name[0] = '.';
+	memcpy(link_name + 1, path, len);
+
+	/* Step back to the last '/' -- the new name goes right after it */
+	tmp = link_name + len;
+	while (*tmp != '/') {
+		BUG_ON(tmp == link_name);
+		tmp--;
+	}
+
+	fd_id_generate_special(NULL, idp);
+	rfe.id		= *idp;
+	rfe.flags	= 0;
+	rfe.pos		= 0;
+	rfe.fown	= &fwn;
+	rfe.name	= link_name + 1;
+
+	/*
+	 * Any 'unique' name works here actually. Remap works by reg-file ids.
+	 *
+	 * The output starts at tmp + 1, so the room left in link_name is
+	 * measured from that very position.
+	 */
+	snprintf(tmp + 1, sizeof(link_name) - (size_t)(tmp + 1 - link_name),
+			"link_remap.%d", rfe.id);
+
+	mntns_root = mntns_get_root_fd(nsid);
+	if (mntns_root < 0)
+		return -1;
+
+	if (linkat(lfd, "", mntns_root, link_name, AT_EMPTY_PATH) < 0) {
+		pr_perror("Can't link remap to %s", path);
+		return -1;
+	}
+
+	if (note_link_remap(link_name, nsid))
+		return -1;
+
+	return pb_write_one(img_from_set(glob_imgset, CR_FD_REG_FILES), &rfe, PB_REG_FILE);
+}
+
+/*
+ * Create a link remap for @path and write the remap entry that ties
+ * the original file @id to the id generated for the new link.
+ * Returns -1 if the link could not be created, otherwise the result
+ * of pb_write_one().
+ */
+static int dump_linked_remap(char *path, int len, const struct stat *ost,
+				int lfd, u32 id, struct ns_id *nsid)
+{
+	RemapFilePathEntry entry = REMAP_FILE_PATH_ENTRY__INIT;
+	u32 link_id;
+	int err;
+
+	err = create_link_remap(path, len, lfd, &link_id, nsid);
+	if (err)
+		return -1;
+
+	entry.orig_id = id;
+	entry.remap_id = link_id;
+
+	return pb_write_one(img_from_set(glob_imgset, CR_FD_REMAP_FPATH),
+			&entry, PB_REMAP_FPATH);
+}
+
+/*
+ * Remember dead pids across calls in a function-local array.
+ *
+ * Returns 1 if @pid was recorded by an earlier call, 0 if it has just
+ * been recorded, and -1 if growing the array failed.
+ */
+static int have_seen_dead_pid(pid_t pid)
+{
+	static pid_t *dead_pids = NULL;
+	static int n_dead_pids = 0;
+	size_t idx = 0;
+
+	while (idx < n_dead_pids) {
+		if (dead_pids[idx] == pid)
+			return 1;
+		idx++;
+	}
+
+	if (xrealloc_safe(&dead_pids, sizeof(*dead_pids) * (n_dead_pids + 1)))
+		return -1;
+
+	dead_pids[n_dead_pids] = pid;
+	n_dead_pids++;
+
+	return 0;
+}
+
+/*
+ * Write a REMAP_TYPE__PROCFS remap entry mapping file @id to the dead
+ * process @pid. Nothing is written when @pid was already recorded by
+ * have_seen_dead_pid(). Returns 0 on success, -1 on error.
+ */
+static int dump_dead_process_remap(pid_t pid, u32 id)
+{
+	RemapFilePathEntry entry = REMAP_FILE_PATH_ENTRY__INIT;
+	int seen = have_seen_dead_pid(pid);
+
+	if (seen < 0)
+		return -1;
+
+	if (seen != 0) {
+		pr_info("Found dead pid %d already, skipping remap\n", pid);
+		return 0;
+	}
+
+	entry.orig_id = id;
+	entry.remap_id = pid;
+	entry.has_remap_type = true;
+	entry.remap_type = REMAP_TYPE__PROCFS;
+
+	return pb_write_one(img_from_set(glob_imgset, CR_FD_REMAP_FPATH),
+			&entry, PB_REMAP_FPATH);
+}
+
+/*
+ * Tell whether the last component of @name looks like an NFS
+ * silly-rename name: SILLYNAME_PREF followed by SILLYNAME_SUFF_LEN
+ * hexadecimal digits.
+ *
+ * @name must contain at least one '/' (enforced with BUG_ON).
+ */
+static bool is_sillyrename_name(char *name)
+{
+	int i;
+
+	name = strrchr(name, '/');
+	BUG_ON(name == NULL); /* see check in dump_one_reg_file */
+	name++;
+
+	/*
+	 * Strictly speaking this check is not bullet-proof. User
+	 * can create file with this name by hands and we have no
+	 * API to distinguish really-silly-renamed files from those
+	 * fake names :(
+	 *
+	 * But since NFS people expect .nfsXXX files to be unstable,
+	 * we treat them as such too.
+	 */
+
+	if (strncmp(name, SILLYNAME_PREF, sizeof(SILLYNAME_PREF) - 1))
+		return false;
+
+	name += sizeof(SILLYNAME_PREF) - 1;
+	for (i = 0; i < SILLYNAME_SUFF_LEN; i++) {
+		/*
+		 * The <ctype.h> classifiers are only defined for values
+		 * representable as unsigned char (or EOF), so a plain
+		 * char holding a byte >= 0x80 must be converted first.
+		 */
+		if (!isxdigit((unsigned char)name[i]))
+			return false;
+	}
+
+	return true;
+}
+
+/*
+ * True when the file lives on NFS (per parms->fs_type) and @rpath has
+ * a silly-rename style name.
+ */
+static inline bool nfs_silly_rename(char *rpath, const struct fd_parms *parms)
+{
+	if (parms->fs_type != NFS_SUPER_MAGIC)
+		return false;
+
+	return is_sillyrename_name(rpath);
+}
+
+/*
+ * Cut a trailing " (deleted)" or "//deleted" tag off link->name.
+ *
+ * A tag is only removed when the name is strictly longer than the tag.
+ * On removal link->len is reduced accordingly and 1 is returned;
+ * 0 is returned when no tag was found.
+ */
+int strip_deleted(struct fd_link *link)
+{
+	static const struct dcache_prepends {
+		const char	*str;
+		size_t		len;
+	} prepends[] = {
+		{ .str = " (deleted)",	.len = 10, },
+		{ .str = "//deleted",	.len = 9, },
+	};
+	const struct dcache_prepends *tag;
+
+	for (tag = prepends; tag < prepends + ARRAY_SIZE(prepends); tag++) {
+		size_t cut;
+
+		if (link->len <= tag->len)
+			continue;
+
+		cut = link->len - tag->len;
+		if (strcmp(&link->name[cut], tag->str) != 0)
+			continue;
+
+		pr_debug("Strip '%s' tag from '%s'\n",
+				tag->str, link->name);
+		link->name[cut] = '\0';
+		link->len -= tag->len;
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Decide whether the file described by @link / @parms needs a remap
+ * entry and, if so, dump it.
+ *
+ *  - procfs files under /proc/<pid> of a dead process get a dead
+ *    process remap;
+ *  - devpts files only get the deleted tag stripped;
+ *  - files with st_nlink == 0 get a ghost remap;
+ *  - NFS silly-renamed files and linked files whose path no longer
+ *    exists (ENOENT) get a linked remap;
+ *  - a path that resolves to a different inode/device is an error,
+ *    unless opts.evasive_devices is set and it is the same char/block
+ *    device.
+ *
+ * Returns 0 when the file can be dumped by its name (or the remap was
+ * written successfully), -1 on error.
+ */
+static int check_path_remap(struct fd_link *link, const struct fd_parms *parms,
+				int lfd, u32 id, struct ns_id *nsid)
+{
+	char *rpath = link->name;
+	int plen = link->len;
+	int ret, mntns_root;
+	struct stat pst;
+	const struct stat *ost = &parms->stat;
+
+	if (parms->fs_type == PROC_SUPER_MAGIC) {
+		/* The file points to /proc/pid/<foo> where pid is a dead
+		 * process. We remap this file by adding this pid to be
+		 * fork()ed into a TASK_HELPER state so that we can point to it
+		 * on restore.
+		 */
+		pid_t pid;
+		char *start, *end;
+
+		/* skip "./proc/" */
+		start = strstr(rpath, "/");
+		if (!start)
+			return -1;
+		start = strstr(start + 1, "/");
+		if (!start)
+			return -1;
+		pid = strtol(start + 1, &end, 10);
+
+		/* If strtol didn't convert anything, then we are looking at
+		 * something like /proc/kmsg, which we shouldn't mess with.
+		 * Anything under /proc/<pid> (including that directory itself)
+		 * can be c/r'd with a dead pid remap, so let's allow all such
+		 * cases.
+		 */
+		if (pid != 0) {
+			bool is_dead = strip_deleted(link);
+
+			/* /proc/<pid> will be "/proc/1 (deleted)" when it is
+			 * dead, but a path like /proc/1/mountinfo won't have
+			 * the suffix, since it isn't actually deleted (still
+			 * exists, but the parent dir is deleted). So, if we
+			 * have a path like /proc/1/mountinfo, test if /proc/1
+			 * exists instead, since this is what CRIU will need to
+			 * open on restore.
+			 */
+			if (!is_dead) {
+				/*
+				 * Cut the path right after the pid for the
+				 * check and put back exactly what was there.
+				 * For a bare "/proc/<pid>" link that is the
+				 * terminating NUL, which must not be turned
+				 * into a '/'.
+				 */
+				char saved = *end;
+
+				*end = 0;
+				is_dead = access(rpath, F_OK);
+				*end = saved;
+			}
+
+			if (is_dead) {
+				pr_info("Dumping dead process remap of %d\n", pid);
+				return dump_dead_process_remap(pid, id);
+			}
+		}
+
+		return 0;
+	} else if (parms->fs_type == DEVPTS_SUPER_MAGIC) {
+		/*
+		 * It's safe to call stripping here because
+		 * file paths are having predefined format for
+		 * this FS and can't have a valid " (deleted)"
+		 * postfix as a part of not deleted filename.
+		 */
+		strip_deleted(link);
+		/*
+		 * Devpts devices/files are generated by the
+		 * kernel itself so we should not try to generate
+		 * any kind of ghost files here even if file is
+		 * no longer exist.
+		 */
+		return 0;
+	}
+
+	if (ost->st_nlink == 0) {
+		/*
+		 * Unpleasant, but easy case. File is completely invisible
+		 * from the FS. Just dump its contents and that's it. But
+		 * be careful whether anybody still has any of its hardlinks
+		 * also open.
+		 */
+		strip_deleted(link);
+		return dump_ghost_remap(rpath + 1, ost, lfd, id, nsid);
+	}
+
+	if (nfs_silly_rename(rpath, parms)) {
+		/*
+		 * If this is NFS silly-rename file the path we have at hands
+		 * will be accessible by fstat(), but once we kill the dumping
+		 * tasks it will disappear. So we just go ahead an dump it as
+		 * linked-remap file (NFS will allow us to create more hard
+		 * links on it) to have some persistent name at hands.
+		 */
+		pr_debug("Dump silly-rename linked remap for %x\n", id);
+		return dump_linked_remap(rpath + 1, plen - 1, ost, lfd, id, nsid);
+	}
+
+	mntns_root = mntns_get_root_fd(nsid);
+	if (mntns_root < 0)
+		return -1;
+
+	ret = fstatat(mntns_root, rpath, &pst, 0);
+	if (ret < 0) {
+		/*
+		 * Linked file, but path is not accessible (unless any
+		 * other error occurred). We can create a temporary link to it
+		 * using linkat with AT_EMPTY_PATH flag and remap it to this
+		 * name.
+		 */
+
+		if (errno == ENOENT)
+			return dump_linked_remap(rpath + 1, plen - 1,
+							ost, lfd, id, nsid);
+
+		pr_perror("Can't stat path");
+		return -1;
+	}
+
+	if ((pst.st_ino != ost->st_ino) || (pst.st_dev != ost->st_dev)) {
+		if (opts.evasive_devices &&
+		    (S_ISCHR(ost->st_mode) || S_ISBLK(ost->st_mode)) &&
+		    pst.st_rdev == ost->st_rdev)
+			return 0;
+		/*
+		 * FIXME linked file, but the name we see it by is reused
+		 * by somebody else. We can dump it with linked remaps, but
+		 * we'll have difficulties on restore -- we will have to
+		 * move the existing file aside, then restore this one,
+		 * unlink, then move the original file back. It's fairly
+		 * easy to do, but we don't do it now, since unlinked files
+		 * have the "(deleted)" suffix in proc and name conflict
+		 * is unlikely :)
+		 */
+		pr_err("Unaccessible path opened %u:%u, need %u:%u\n",
+				(int)pst.st_dev, (int)pst.st_ino,
+				(int)ost->st_dev, (int)ost->st_ino);
+		return -1;
+	}
+
+	/*
+	 * File is linked and visible by the name it is opened by
+	 * this task. Go ahead and dump it.
+	 */
+	return 0;
+}
+
+/*
+ * The size is not checked for files opened with both O_WRONLY access
+ * mode and O_APPEND (e.g. a log file); it is for everything else.
+ */
+static bool should_check_size(int flags)
+{
+	bool write_only = (flags & O_ACCMODE) == O_WRONLY;
+	bool appending = (flags & O_APPEND) != 0;
+
+	return !(write_only && appending);
+}
+
+/*
+ * Dump the regular file @id opened via @lfd: resolve its link (unless
+ * the caller supplied one in p->link), check whether a remap is needed
+ * and write the resulting RegFileEntry to the CR_FD_REG_FILES image.
+ *
+ * Returns the result of pb_write_one() or -1 on error.
+ */
+int dump_one_reg_file(int lfd, u32 id, const struct fd_parms *p)
+{
+	RegFileEntry rfe = REG_FILE_ENTRY__INIT;
+	struct fd_link own_link;
+	struct fd_link *link = p->link;
+	struct ns_id *nsid;
+
+	if (link == NULL) {
+		if (fill_fdlink(lfd, p, &own_link))
+			return -1;
+		link = &own_link;
+	}
+
+	nsid = lookup_nsid_by_mnt_id(p->mnt_id);
+	if (!nsid) {
+		pr_err("Can't lookup mount=%d for fd=%d path=%s\n",
+			p->mnt_id, p->fd, link->name + 1);
+		return -1;
+	}
+
+	if ((root_ns_mask & CLONE_NEWNS) && p->mnt_id >= 0) {
+		rfe.has_mnt_id = true;
+		rfe.mnt_id = p->mnt_id;
+	}
+
+	pr_info("Dumping path for %d fd via self %d [%s]\n",
+			p->fd, lfd, &link->name[1]);
+
+	/* Only paths starting with a slash can be handled. */
+	if (link->name[1] != '/') {
+		pr_err("The path [%s] is not supported\n", &link->name[1]);
+		return -1;
+	}
+
+	if (check_path_remap(link, p, lfd, id, nsid))
+		return -1;
+
+	rfe.id		= id;
+	rfe.flags	= p->flags;
+	rfe.pos		= p->pos;
+	rfe.fown	= (FownEntry *)&p->fown;
+	rfe.name	= &link->name[1];
+
+	if (S_ISREG(p->stat.st_mode) && should_check_size(rfe.flags)) {
+		rfe.size = p->stat.st_size;
+		rfe.has_size = true;
+	}
+
+	return pb_write_one(img_from_set(glob_imgset, CR_FD_REG_FILES),
+			&rfe, PB_REG_FILE);
+}
+
+/* Dump operations for FD_TYPES__REG files; dumping is done by dump_one_reg_file(). */
+const struct fdtype_ops regfile_dump_ops = {
+ .type = FD_TYPES__REG,
+ .dump = dump_one_reg_file,
+};
+
+/*
+ * Rewrite @src, a path seen through mount @smi, as a path seen through
+ * mount @dmi and store at most @dlen bytes of it in @dst.
+ *
+ * smi->root must be at least as long as dmi->root (BUG_ON otherwise).
+ */
+static void convert_path_from_another_mp(char *src, char *dst, int dlen,
+					struct mount_info *smi,
+					struct mount_info *dmi)
+{
+	const char *dst_mnt, *root_diff, *in_mnt;
+	int skip;
+
+	/*
+	 * mi->mountpoint	./foo/bar
+	 * mi->ns_mountpoint	/foo/bar
+	 * rfi->path		foo/bar/baz
+	 */
+	skip = strlen(smi->ns_mountpoint + 1);
+	BUG_ON(strlen(smi->root) < strlen(dmi->root));
+
+	/*
+	 * The result is made of three pieces: the destination mount
+	 * point, the difference between source and destination roots
+	 * and the path relative to the source mount point.
+	 */
+	dst_mnt = dmi->ns_mountpoint + 1;
+	root_diff = smi->root + strlen(dmi->root);
+	in_mnt = src + skip;
+
+	snprintf(dst, dlen, "%s/%s/%s", dst_mnt, root_diff, in_mnt);
+}
+
+/*
+ * linkat() wrapper that temporarily switches fsuid to @owner when
+ * running with a user namespace (CLONE_NEWUSER in root_ns_mask).
+ *
+ * Returns what linkat() returned; errno is left as linkat() set it.
+ */
+static int linkat_hard(int odir, char *opath, int ndir, char *npath, uid_t owner)
+{
+	int saved_errno, prev_fsuid = -1;
+	int rc;
+
+	if (root_ns_mask & CLONE_NEWUSER) {
+		/*
+		 * Kernel has strange security restrictions about
+		 * linkat. If the fsuid of the caller doesn't equal
+		 * the uid of the file and the file is not "safe"
+		 * one, then only global CAP_CHOWN will be allowed
+		 * to link().
+		 *
+		 * Next, when we're in user namespace we're ns root,
+		 * but not global CAP_CHOWN. Thus, even though we
+		 * ARE ns root, we will not be allowed to link() at
+		 * files that belong to regular users %)
+		 *
+		 * Fortunately, the setfsuid() requires ns-level
+		 * CAP_SETUID which we have.
+		 */
+		prev_fsuid = setfsuid(owner);
+	}
+
+	rc = linkat(odir, opath, ndir, npath, 0);
+	saved_errno = errno;
+	if (rc < 0)
+		pr_perror("Can't link %s -> %s", opath, npath);
+
+	if (root_ns_mask & CLONE_NEWUSER) {
+		setfsuid(prev_fsuid);
+
+		/*
+		 * Don't fail if the fsuid didn't come back. We still
+		 * have chances to run till the pie/restorer, and if
+		 * _this_ guy fails to set the proper fsuid, then we'll
+		 * abort the restore.
+		 */
+		if (setfsuid(-1) != prev_fsuid)
+			pr_warn("Failed to restore old fsuid!\n");
+
+		/*
+		 * Restoring PR_SET_DUMPABLE flag is required after setfsuid,
+		 * as if it not set, proc inode will be created with root cred
+		 * (see proc_pid_make_inode), which will result in permission
+		 * check fail when trying to access files in /proc/self/
+		 */
+		prctl(PR_SET_DUMPABLE, 1, 0);
+	}
+
+	errno = saved_errno;
+	return rc;
+}
+
+/*
+ * Remove up to @count trailing parent directories of @path, deepest
+ * first, with unlinkat(AT_REMOVEDIR) relative to @mntns_root.
+ *
+ * @path is cut at its last '/' for each step and the previously cut
+ * separator is put back, so the string is intact again on return.
+ */
+static void rm_parent_dirs(int mntns_root, char *path, int count)
+{
+	char *cut, *restore = NULL;
+
+	if (count == 0)
+		return;
+
+	for (; count != 0; count--) {
+		cut = strrchr(path, '/');
+		if (cut != NULL)
+			*cut = '\0';
+		if (restore != NULL)
+			*restore = '/';
+
+		if (unlinkat(mntns_root, path, AT_REMOVEDIR) == 0)
+			pr_debug("Unlinked parent dir: %s AT %d\n", path, mntns_root);
+		else
+			pr_perror("Can't remove %s AT %d", path, mntns_root);
+
+		restore = cut;
+	}
+
+	if (restore != NULL)
+		*restore = '/';
+}
+
+/*
+ * Make sure the parent directory of @path exists relative to
+ * @mntns_root, creating it and any missing ancestors with mkdirat().
+ *
+ * @path is modified temporarily while walking its components and is
+ * restored before returning.
+ *
+ * Returns the number of directories that were actually created (0 if
+ * the parent already existed) or -1 on error. Directories created
+ * before a failing mkdirat() are removed again via rm_parent_dirs().
+ */
+static int make_parent_dirs_if_need(int mntns_root, char *path)
+{
+	char *p, *last_delim;
+	int err, count = 0;
+	struct stat st;
+
+	p = last_delim = strrchr(path, '/');
+	if (!p) {
+		pr_err("Path %s has no parent dir\n", path);
+		return -1;
+	}
+	/* Cut the file name off, leaving just the parent directory */
+	*p = '\0';
+
+	if (fstatat(mntns_root, path, &st, AT_EMPTY_PATH) == 0)
+		goto out;
+	if (errno != ENOENT) {
+		pr_perror("Can't stat %s", path);
+		count = -1;
+		goto out;
+	}
+
+	/* Create the parent one component at a time, left to right */
+	p = path;
+	do {
+		p = strchr(p, '/');
+		if (p)
+			*p = '\0';
+
+		err = mkdirat(mntns_root, path, 0777);
+		if (err && errno != EEXIST) {
+			pr_perror("Can't create dir: %s AT %d", path, mntns_root);
+			rm_parent_dirs(mntns_root, path, count);
+			count = -1;
+			goto out;
+		} else if (!err) {
+			pr_debug("Created parent dir: %s AT %d\n", path, mntns_root);
+			count++;
+		}
+
+		if (p)
+			*p++ = '/';
+	} while (p);
+out:
+	*last_delim = '/';
+	return count;
+}
+
+/*
+ * This routine properly resolves d's path handling ghost/link-remaps.
+ * The open_cb is a routine that does actual open, it differs for
+ * files, directories, fifos, etc.
+ */
+
+/*
+ * Hard-link the remap file of @rfi (rfi->remap->rpath) to rfi->path,
+ * creating missing parent directories of the target first.
+ *
+ * @level receives the value returned by make_parent_dirs_if_need().
+ *
+ * Returns 0 on success and -1 if the parent directories could not be
+ * made or linkat_hard() failed; in the latter case the directories
+ * created here are removed again.
+ */
+static int rfi_remap(struct reg_file_info *rfi, int *level)
+{
+ struct mount_info *mi, *rmi, *tmi;
+ char _path[PATH_MAX], *path = _path;
+ char _rpath[PATH_MAX], *rpath = _rpath;
+ int mntns_root;
+
+ if (rfi->rfe->mnt_id == -1) {
+ /* Know nothing about mountpoints */
+ mntns_root = mntns_get_root_by_mnt_id(-1);
+ path = rfi->path;
+ rpath = rfi->remap->rpath;
+ goto out_root;
+ }
+
+ mi = lookup_mnt_id(rfi->rfe->mnt_id);
+ if (rfi->rfe->mnt_id == rfi->remap->rmnt_id) {
+ /* Both links on the same mount point */
+ tmi = mi;
+ path = rfi->path;
+ rpath = rfi->remap->rpath;
+ goto out;
+ }
+
+ rmi = lookup_mnt_id(rfi->remap->rmnt_id);
+
+ /*
+ * Find the common bind-mount. We know that one mount point was
+ * really mounted and all other were bind-mounted from it, so the
+ * lowest mount must contains all bind-mounts.
+ */
+ for (tmi = mi; tmi->bind; tmi = tmi->bind)
+ ;
+
+ BUG_ON(tmi->s_dev != rmi->s_dev);
+ BUG_ON(tmi->s_dev != mi->s_dev);
+
+ /* Calculate paths on the device (root mount) */
+ convert_path_from_another_mp(rfi->path, path, sizeof(_path), mi, tmi);
+ convert_path_from_another_mp(rfi->remap->rpath, rpath, sizeof(_rpath), rmi, tmi);
+
+out:
+ pr_debug("%d: Link %s -> %s\n", tmi->mnt_id, rpath, path);
+ mntns_root = mntns_get_root_fd(tmi->nsid);
+
+out_root:
+ /* Both names are resolved relative to the same mntns_root fd */
+ *level = make_parent_dirs_if_need(mntns_root, path);
+ if (*level < 0)
+ return -1;
+
+ if (linkat_hard(mntns_root, rpath, mntns_root, path, rfi->remap->owner) < 0) {
+ rm_parent_dirs(mntns_root, path, *level);
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Open the file described by @d, handling ghost/link remaps.
+ *
+ * @open_cb does the actual open and is called with the mount namespace
+ * root fd, the reg_file_info and @arg.
+ *
+ * When the file has a remap, ghost_file_mutex is held from the moment
+ * the temporary link is created until it is removed again; the remap
+ * file itself is unlinked once its last user is gone.
+ *
+ * Returns the opened fd (or the inherited one), -1 on error. On error
+ * the mutex is released and a descriptor opened here is closed.
+ */
+int open_path(struct file_desc *d,
+		int(*open_cb)(int mntns_root, struct reg_file_info *, void *), void *arg)
+{
+	int tmp, mntns_root, level = 0;
+	struct reg_file_info *rfi;
+	char *orig_path = NULL;
+
+	if (inherited_fd(d, &tmp))
+		return tmp;
+
+	rfi = container_of(d, struct reg_file_info, d);
+	if (rfi->remap) {
+		mutex_lock(ghost_file_mutex);
+		if (rfi->remap->is_dir) {
+			/*
+			 * FIXME Can't make directory under new name.
+			 * Will have to open it under the ghost one :(
+			 */
+			orig_path = rfi->path;
+			rfi->path = rfi->remap->rpath;
+		} else if (rfi_remap(rfi, &level) < 0) {
+			static char tmp_path[PATH_MAX];
+
+			if (errno != EEXIST) {
+				pr_err("Can't link %s -> %s\n", rfi->path,
+						rfi->remap->rpath);
+				goto err_unlock;
+			}
+
+			/*
+			 * The file whose name we're trying to create
+			 * exists. Need to pick some other one, we're
+			 * going to remove it anyway.
+			 *
+			 * Strictly speaking, this is cheating, file
+			 * name shouldn't change. But since NFS with
+			 * its silly-rename doesn't care, why should we?
+			 */
+
+			orig_path = rfi->path;
+			rfi->path = tmp_path;
+			snprintf(tmp_path, sizeof(tmp_path), "%s.cr_link", orig_path);
+			pr_debug("Fake %s -> %s link\n", rfi->path, rfi->remap->rpath);
+
+			if (rfi_remap(rfi, &level) < 0) {
+				pr_perror("Can't create even fake link!");
+				goto err_unlock;
+			}
+		}
+	}
+
+	mntns_root = mntns_get_root_by_mnt_id(rfi->rfe->mnt_id);
+	tmp = open_cb(mntns_root, rfi, arg);
+	if (tmp < 0) {
+		pr_perror("Can't open file %s", rfi->path);
+		goto err_unlock;
+	}
+
+	if (rfi->rfe->has_size && !rfi->size_checked) {
+		struct stat st;
+
+		if (fstat(tmp, &st) < 0) {
+			pr_perror("Can't fstat opened file");
+			goto err_close;
+		}
+
+		if (st.st_size != rfi->rfe->size) {
+			pr_err("File %s has bad size %"PRIu64" (expect %"PRIu64")\n",
+					rfi->path, st.st_size,
+					rfi->rfe->size);
+			goto err_close;
+		}
+
+		/*
+		 * This is only visible in the current process, so
+		 * change w/o locks. Other tasks sharing the same
+		 * file will get one via unix sockets.
+		 */
+		rfi->size_checked = true;
+	}
+
+	if (rfi->remap) {
+		if (!rfi->remap->is_dir) {
+			unlinkat(mntns_root, rfi->path, 0);
+			rm_parent_dirs(mntns_root, rfi->path, level);
+		}
+
+		BUG_ON(!rfi->remap->users);
+		if (--rfi->remap->users == 0) {
+			pr_info("Unlink the ghost %s\n", rfi->remap->rpath);
+			mntns_root = mntns_get_root_by_mnt_id(rfi->remap->rmnt_id);
+			unlinkat(mntns_root, rfi->remap->rpath, rfi->remap->is_dir ? AT_REMOVEDIR : 0);
+		}
+
+		if (orig_path)
+			rfi->path = orig_path;
+		mutex_unlock(ghost_file_mutex);
+	}
+
+	if (restore_fown(tmp, rfi->rfe->fown)) {
+		/* The mutex is already released here, only the fd is ours */
+		close(tmp);
+		return -1;
+	}
+
+	return tmp;
+
+err_close:
+	close(tmp);
+err_unlock:
+	/* The mutex was taken only for files that have a remap */
+	if (rfi->remap)
+		mutex_unlock(ghost_file_mutex);
+	return -1;
+}
+
+/*
+ * Open rfi->path relative to @ns_root_fd with the open flags passed
+ * through @arg (a pointer to u32). Returns what openat() returned;
+ * a failure is logged.
+ */
+int do_open_reg_noseek_flags(int ns_root_fd, struct reg_file_info *rfi, void *arg)
+{
+	u32 flags = *(u32 *)arg;
+	int fd = openat(ns_root_fd, rfi->path, flags);
+
+	if (fd < 0)
+		pr_perror("Can't open file %s on restore", rfi->path);
+
+	return fd;
+}
+
+/* Open the file with the flags saved in its image entry; @arg is not used. */
+static int do_open_reg_noseek(int ns_root_fd, struct reg_file_info *rfi, void *arg)
+{
+	void *saved_flags = &rfi->rfe->flags;
+
+	return do_open_reg_noseek_flags(ns_root_fd, rfi, saved_flags);
+}
+
+/*
+ * Open the file and restore its position, unless the saved position
+ * is -1ULL. On lseek() failure the fd is closed and -1 is returned.
+ */
+static int do_open_reg(int ns_root_fd, struct reg_file_info *rfi, void *arg)
+{
+	int fd = do_open_reg_noseek(ns_root_fd, rfi, arg);
+
+	if (fd < 0)
+		return fd;
+
+	if (rfi->rfe->pos == -1ULL)
+		return fd;
+
+	if (lseek(fd, rfi->rfe->pos, SEEK_SET) >= 0)
+		return fd;
+
+	pr_perror("Can't restore file pos");
+	close(fd);
+	return -1;
+}
+
+/* Open a regular file descriptor without restoring its position. */
+int open_reg_fd(struct file_desc *fd)
+{
+	int opened = open_path(fd, do_open_reg_noseek, NULL);
+
+	return opened;
+}
+
+/*
+ * Find the regular file with the given @id and open it without
+ * seeking. Returns the fd or -1 when the id is unknown or the open
+ * fails.
+ */
+int open_reg_by_id(u32 id)
+{
+	/*
+	 * This one gets called by exe link, chroot and cwd
+	 * restoring code. No need in calling lseek on either
+	 * of them.
+	 */
+	struct file_desc *desc = find_file_desc_raw(FD_TYPES__REG, id);
+
+	if (desc != NULL)
+		return open_reg_fd(desc);
+
+	pr_err("Can't find regfile for %#x\n", id);
+	return -1;
+}
+
+/*
+ * Open the file backing @vma. The open flags are taken from the vma
+ * entry when present; otherwise O_RDWR is used for writable shared
+ * file mappings and O_RDONLY for everything else.
+ */
+int get_filemap_fd(struct vma_area *vma)
+{
+	u32 flags = O_RDONLY;
+
+	/*
+	 * The vma->vmfd should have been assigned in collect_filemap
+	 *
+	 * We open file w/o lseek, as mappings don't care about it
+	 */
+	BUG_ON(vma->vmfd == NULL);
+
+	if (vma->e->has_fdflags) {
+		flags = vma->e->fdflags;
+	} else if ((vma->e->prot & PROT_WRITE) &&
+		   vma_area_is(vma, VMA_FILE_SHARED)) {
+		flags = O_RDWR;
+	}
+
+	return open_path(vma->vmfd, do_open_reg_noseek_flags, &flags);
+}
+
+/*
+ * Account one more user of the remap attached to @fdesc, if there is
+ * one. @typ is a one-letter tag that only shows up in the debug log.
+ */
+static void remap_get(struct file_desc *fdesc, char typ)
+{
+	struct reg_file_info *rfi = container_of(fdesc, struct reg_file_info, d);
+
+	if (!rfi->remap)
+		return;
+
+	pr_debug("One more remap user (%c) for %s\n",
+			typ, rfi->remap->rpath);
+
+	/* No lock, we're still single-process here */
+	rfi->remap->users++;
+}
+
+/*
+ * collect_fd callback for regular files: the remap user is counted
+ * only while the descriptor's fd_info_head list is still empty, then
+ * the generic collection is done.
+ */
+static void collect_reg_fd(struct file_desc *fdesc,
+		struct fdinfo_list_entry *fle, struct rst_info *ri)
+{
+	bool first_user = list_empty(&fdesc->fd_info_head);
+
+	if (first_user)
+		remap_get(fdesc, 'f');
+
+	collect_gen_fd(fle, ri);
+}
+
+/* open callback for regular files: open and restore the file position. */
+static int open_fe_fd(struct file_desc *fd)
+{
+	int opened = open_path(fd, do_open_reg, NULL);
+
+	return opened;
+}
+
+/* name callback: returns the stored path; @buf and @s are not used. */
+static char *reg_file_path(struct file_desc *d, char *buf, size_t s)
+{
+	return container_of(d, struct reg_file_info, d)->path;
+}
+
+/* Restore-side callbacks for FD_TYPES__REG file descriptors. */
+static struct file_desc_ops reg_desc_ops = {
+ .type = FD_TYPES__REG,
+ .open = open_fe_fd,
+ .collect_fd = collect_reg_fd,
+ .name = reg_file_path,
+};
+
+/*
+ * Look up the regular file @id and account one more remap user for it.
+ * Returns NULL when there is no such file; the miss is logged as an
+ * error unless @optional is set.
+ */
+struct file_desc *try_collect_special_file(u32 id, int optional)
+{
+	struct file_desc *fdesc = find_file_desc_raw(FD_TYPES__REG, id);
+
+	if (fdesc != NULL) {
+		/*
+		 * Files dumped for vmas/exe links can have remaps
+		 * configured. Need to bump-up users for them, otherwise
+		 * the open_path() would unlink the remap file after
+		 * the very first open.
+		 */
+		remap_get(fdesc, 's');
+		return fdesc;
+	}
+
+	if (!optional)
+		pr_err("No entry for reg-file-ID %#x\n", id);
+
+	return NULL;
+}
+
+/*
+ * collect callback for CR_FD_REG_FILES entries: set up the
+ * reg_file_info in @o from the RegFileEntry in @base and register it
+ * as a file descriptor with reg_desc_ops.
+ */
+static int collect_one_regfile(void *o, ProtobufCMessage *base)
+{
+	struct reg_file_info *rfi = o;
+	static char dot[] = ".";
+
+	rfi->rfe = pb_msg(base, RegFileEntry);
+	/* change "/foo" into "foo" and "/" into "." */
+	if (rfi->rfe->name[1] == '\0')
+		rfi->path = dot;
+	else
+		rfi->path = rfi->rfe->name + 1;
+	rfi->remap = NULL;
+	rfi->size_checked = false;
+
+	pr_info("Collected [%s] ID %#x\n", rfi->path, rfi->rfe->id);
+	return file_desc_add(&rfi->d, rfi->rfe->id, &reg_desc_ops);
+}
+
+/* Describes how CR_FD_REG_FILES image entries are collected. */
+static struct collect_image_info reg_file_cinfo = {
+ .fd_type = CR_FD_REG_FILES,
+ .pb_type = PB_REG_FILE,
+ .priv_size = sizeof(struct reg_file_info),
+ .collect = collect_one_regfile,
+};
+
+/*
+ * Allocate ghost_file_mutex with shmalloc() and initialize it.
+ * Returns 0 on success, -1 if the allocation failed.
+ */
+int prepare_shared_reg_files(void)
+{
+	ghost_file_mutex = shmalloc(sizeof(*ghost_file_mutex));
+	if (ghost_file_mutex) {
+		mutex_init(ghost_file_mutex);
+		return 0;
+	}
+
+	return -1;
+}
+
+/*
+ * Collect the regular files image first and the remaps image second.
+ * Returns 0 on success, -1 as soon as either collection fails.
+ */
+int collect_remaps_and_regfiles(void)
+{
+	if (collect_image(&reg_file_cinfo))
+		return -1;
+
+	if (collect_image(&remap_cinfo))
+		return -1;
+
+	return 0;
+}
diff --git a/criu/files.c b/criu/files.c
new file mode 100644
index 000000000000..db15527e9ed9
--- /dev/null
+++ b/criu/files.c
@@ -0,0 +1,1587 @@
+#include <unistd.h>
+#include <fcntl.h>
+#include <errno.h>
+
+#include <linux/limits.h>
+#include <linux/major.h>
+
+#include <sys/types.h>
+#include <sys/prctl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <stdlib.h>
+
+#include "files.h"
+#include "file-ids.h"
+#include "files-reg.h"
+#include "file-lock.h"
+#include "image.h"
+#include "list.h"
+#include "util.h"
+#include "util-pie.h"
+#include "lock.h"
+#include "sockets.h"
+#include "pstree.h"
+#include "tty.h"
+#include "pipes.h"
+#include "fifo.h"
+#include "eventfd.h"
+#include "eventpoll.h"
+#include "fsnotify.h"
+#include "mount.h"
+#include "signalfd.h"
+#include "namespaces.h"
+#include "tun.h"
+#include "timerfd.h"
+#include "imgset.h"
+#include "fs-magic.h"
+#include "proc_parse.h"
+#include "cr_options.h"
+
+#include "parasite.h"
+#include "parasite-syscall.h"
+
+#include "protobuf.h"
+#include "protobuf/fs.pb-c.h"
+#include "protobuf/ext-file.pb-c.h"
+
+#include "plugin.h"
+
+/* Hash table of file descriptors; the bucket index is id % FDESC_HASH_SIZE. */
+#define FDESC_HASH_SIZE 64
+static struct hlist_head file_desc_hash[FDESC_HASH_SIZE];
+
+/* Initialize every bucket of file_desc_hash. Always returns 0. */
+int prepare_shared_fdinfo(void)
+{
+	int bucket = 0;
+
+	while (bucket < FDESC_HASH_SIZE) {
+		INIT_HLIST_HEAD(&file_desc_hash[bucket]);
+		bucket++;
+	}
+
+	return 0;
+}
+
+/* Set up @d with the given @id and @ops and empty list/hash linkage. */
+void file_desc_init(struct file_desc *d, u32 id, struct file_desc_ops *ops)
+{
+	d->id = id;
+	d->ops = ops;
+
+	INIT_LIST_HEAD(&d->fd_info_head);
+	INIT_HLIST_NODE(&d->hash);
+}
+
+/*
+ * Initialize @d and put it into file_desc_hash. Always returns 0 so
+ * that collect_one_foo callbacks can tail-call it.
+ */
+int file_desc_add(struct file_desc *d, u32 id, struct file_desc_ops *ops)
+{
+	struct hlist_head *bucket = &file_desc_hash[id % FDESC_HASH_SIZE];
+
+	file_desc_init(d, id, ops);
+	hlist_add_head(&d->hash, bucket);
+
+	return 0;
+}
+
+/*
+ * Find the file descriptor with the given @type and @id in
+ * file_desc_hash. Returns NULL when there is none.
+ */
+struct file_desc *find_file_desc_raw(int type, u32 id)
+{
+	struct file_desc *cur;
+
+	hlist_for_each_entry(cur, &file_desc_hash[id % FDESC_HASH_SIZE], hash) {
+		if (cur->ops->type != type)
+			continue;
+		if (cur->id != id)
+			continue;
+
+		return cur;
+	}
+
+	return NULL;
+}
+
+/* Find the file descriptor an fdinfo entry refers to by its type and id. */
+static inline struct file_desc *find_file_desc(FdinfoEntry *fe)
+{
+	struct file_desc *desc = find_file_desc_raw(fe->type, fe->id);
+
+	return desc;
+}
+
+/*
+ * A file may be shared between several file descriptors. E.g
+ * when doing a fork() every fd of a forker and respective fds
+ * of the child have such. Another way of getting shared files
+ * is by dup()-ing them or sending them via unix sockets in
+ * SCM_RIGHTS message.
+ *
+ * We restore this type of things in 3 steps (states[] below)
+ *
+ * 1. Prepare step.
+ * Select which task will create the file (open() one, or
+ * call any other syscall for that: socket, pipe, etc.). All
+ * the others, that share one, create unix sockets under the
+ * respective file descriptor (transport socket).
+ * 2. Open step.
+ * The one who creates the file (the 'master') creates one,
+ * then creates one more unix socket (transport) and sends the
+ * created file over this socket to the other recipients.
+ * 3. Receive step.
+ * Those, who wait for the file to appear, receive one via
+ * the transport socket, then close the socket and dup() the
+ * received file descriptor into its place.
+ *
+ * There's the 4th step in the states[] array -- the post_open
+ * one. This one is not about file-sharing resolving, but about
+ * doing something with a file using its 'desired' fd. The
+ * thing is that while going through the 3-step process above, the
+ * file may appear in various places in the task's fd table, and if
+ * we want to do something with its _final_ descriptor value,
+ * we should wait for it to appear there. So the post_open is
+ * called when the file is finally set into its place.
+ */
+
+/*
+ * Return the first fdinfo entry on @d's fd_info_head list. An empty
+ * list is reported and treated as a BUG().
+ */
+struct fdinfo_list_entry *file_master(struct file_desc *d)
+{
+	struct list_head *head = &d->fd_info_head;
+
+	if (list_empty(head)) {
+		pr_err("Empty list on file desc id %#x(%d)\n", d->id,
+				d->ops ? d->ops->type : -1);
+		BUG();
+	}
+
+	return list_first_entry(head, struct fdinfo_list_entry, desc_list);
+}
+
+/* Log every file descriptor in file_desc_hash with the fds attached to it. */
+void show_saved_files(void)
+{
+	struct fdinfo_list_entry *fle;
+	struct file_desc *desc;
+	int bucket;
+
+	pr_info("File descs:\n");
+	for (bucket = 0; bucket < FDESC_HASH_SIZE; bucket++) {
+		hlist_for_each_entry(desc, &file_desc_hash[bucket], hash) {
+			pr_info(" `- type %d ID %#x\n", desc->ops->type, desc->id);
+
+			list_for_each_entry(fle, &desc->fd_info_head, desc_list)
+				pr_info(" `- FD %d pid %d\n", fle->fe->fd, fle->pid);
+		}
+	}
+}
+
+/*
+ * Workaround for the OverlayFS bug present before Kernel 4.2
+ *
+ * This is here only to support the Linux Kernel between versions
+ * 3.18 and 4.2. After that, this workaround is not needed anymore,
+ * but it will work properly on both a kernel with and without the bug.
+ *
+ * When a process has a file open in an OverlayFS directory,
+ * the information in /proc/<pid>/fd/<fd> and /proc/<pid>/fdinfo/<fd>
+ * is wrong. We can't even rely on stat()-ing /proc/<pid>/fd/<fd> since
+ * this will show us the wrong filesystem type.
+ *
+ * So we grab that information from the mountinfo table instead. This is done
+ * every time fill_fdlink is called. See lookup_overlayfs for more details.
+ *
+ */
+/*
+ * Fix up the mount id in @p and the path in @link using what
+ * lookup_overlayfs() reports for this file.
+ *
+ * Returns 0 when nothing had to be done or the fixup succeeded, -1 if
+ * the lookup failed or the corrected path does not fit into PATH_MAX.
+ */
+static int fixup_overlayfs(struct fd_parms *p, struct fd_link *link)
+{
+	struct mount_info *m;
+
+	if (!link)
+		return 0;
+
+	m = lookup_overlayfs(link->name, p->stat.st_dev, p->stat.st_ino, p->mnt_id);
+	if (IS_ERR(m))
+		return -1;
+
+	if (!m)
+		return 0;
+
+	p->mnt_id = m->mnt_id;
+
+	/*
+	 * If the bug is present, the file path from /proc/<pid>/fd
+	 * does not include the mountpoint, so we prepend it ourselves.
+	 */
+	if (strcmp("./", m->mountpoint) != 0) {
+		char buf[PATH_MAX];
+		int n;
+
+		/*
+		 * strncpy() doesn't terminate the copy when the source
+		 * fills the whole limit, so do it by hand.
+		 */
+		strncpy(buf, link->name, PATH_MAX - 1);
+		buf[PATH_MAX - 1] = '\0';
+
+		/* buf + 2 skips the leading "./" of the saved name */
+		n = snprintf(link->name, PATH_MAX, "%s/%s", m->mountpoint, buf + 2);
+		if (n >= PATH_MAX) {
+			pr_err("Not enough space to replace %s\n", buf);
+			return -1;
+		}
+	}
+	return 0;
+}
+
+/*
+ * The gen_id thing is used to optimize the comparison of shared files.
+ * If two files have different gen_ids, then they are different for sure.
+ * If it matches, we don't know it and have to call sys_kcmp().
+ *
+ * The kcmp-ids.c engine does this trick, see comments in it for more info.
+ */
+
+/* XOR of the low 32 bits of the file's st_dev, st_ino and position. */
+static u32 make_gen_id(const struct fd_parms *p)
+{
+	u32 dev = (u32)p->stat.st_dev;
+	u32 ino = (u32)p->stat.st_ino;
+	u32 pos = (u32)p->pos;
+
+	return dev ^ ino ^ pos;
+}
+
+/*
+ * Generate an id for the file behind @p, call ops->dump() when
+ * fd_id_generate() reports a newly generated id, and write the
+ * FdinfoEntry to @img.
+ *
+ * Returns a negative value if id generation or the dump failed,
+ * otherwise the result of pb_write_one().
+ */
+int do_dump_gen_file(struct fd_parms *p, int lfd,
+		const struct fdtype_ops *ops, struct cr_img *img)
+{
+	FdinfoEntry fe = FDINFO_ENTRY__INIT;
+	int rc;
+
+	fe.type	= ops->type;
+	fe.id	= make_gen_id(p);
+	fe.fd	= p->fd;
+	fe.flags = p->fd_flags;
+
+	rc = fd_id_generate(p->pid, &fe, p);
+	if (rc == 1) {
+		/* new ID generated */
+		rc = ops->dump(lfd, fe.id, p);
+	}
+
+	if (rc < 0)
+		return rc;
+
+	pr_info("fdinfo: type: 0x%2x flags: %#o/%#o pos: 0x%8"PRIx64" fd: %d\n",
+		ops->type, p->flags, (int)p->fd_flags, p->pos, p->fd);
+
+	return pb_write_one(img, &fe, PB_FDINFO);
+}
+
+/*
+ * Fill @link with the target of @lfd prefixed with a dot; link->len
+ * counts that dot too. With opts.overlayfs set, the overlayfs fixup is
+ * applied as well. Returns 0 on success, -1 on error.
+ */
+int fill_fdlink(int lfd, const struct fd_parms *p, struct fd_link *link)
+{
+	int nread;
+
+	link->name[0] = '.';
+
+	nread = read_fd_link(lfd, &link->name[1], sizeof(link->name) - 1);
+	if (nread < 0) {
+		pr_err("Can't read link for pid %d fd %d\n", p->pid, p->fd);
+		return -1;
+	}
+
+	link->len = nread + 1;
+
+	if (opts.overlayfs && fixup_overlayfs((struct fd_parms *)p, link) < 0)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Gather the parameters of file @fd of the task behind @ctl into @p,
+ * using the local descriptor @lfd: stat, filesystem type, fdinfo
+ * (position, flags, mount id), fd flags and the owner settings.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+static int fill_fd_params(struct parasite_ctl *ctl, int fd, int lfd,
+				struct fd_opts *opts, struct fd_parms *p)
+{
+	struct fdinfo_common fdinfo = { .mnt_id = -1, .owner = ctl->pid.virt };
+	struct statfs fs;
+	int signum;
+
+	if (fstat(lfd, &p->stat) < 0) {
+		pr_perror("Can't stat fd %d", lfd);
+		return -1;
+	}
+
+	if (fstatfs(lfd, &fs) < 0) {
+		pr_perror("Can't statfs fd %d", lfd);
+		return -1;
+	}
+
+	if (parse_fdinfo_pid(ctl->pid.real, fd, FD_TYPES__UND, NULL, &fdinfo))
+		return -1;
+
+	p->ctl		= ctl;
+	p->fd		= fd;
+	p->pid		= ctl->pid.real;
+	p->fs_type	= fs.f_type;
+	p->pos		= fdinfo.pos;
+	p->flags	= fdinfo.flags;
+	p->mnt_id	= fdinfo.mnt_id;
+	p->fd_flags	= opts->flags;
+
+	fown_entry__init(&p->fown);
+
+	pr_info("%d fdinfo %d: pos: 0x%16"PRIx64" flags: %16o/%#x\n",
+		ctl->pid.real, fd, p->pos, p->flags, (int)p->fd_flags);
+
+	signum = fcntl(lfd, F_GETSIG, 0);
+	if (signum < 0) {
+		pr_perror("Can't get owner signum on %d", lfd);
+		return -1;
+	}
+	p->fown.signum = signum;
+
+	/* The remaining owner fields are filled only when an owner pid is set */
+	if (opts->fown.pid != 0) {
+		p->fown.pid	 = opts->fown.pid;
+		p->fown.pid_type = opts->fown.pid_type;
+		p->fown.uid	 = opts->fown.uid;
+		p->fown.euid	 = opts->fown.euid;
+	}
+
+	return 0;
+}
+
+/*
+ * Pick dump ops for a misc device by its @minor number. Only
+ * TUN_MINOR is known; NULL is returned for anything else.
+ */
+static const struct fdtype_ops *get_misc_dev_ops(int minor)
+{
+	if (minor == TUN_MINOR)
+		return &tunfile_dump_ops;
+
+	return NULL;
+}
+
+/*
+ * Pick dump ops for a MEM_MAJOR device by its @minor number. Every
+ * minor is dumped as a regular file; minor 11 additionally gets its
+ * position adjusted, see below.
+ */
+static const struct fdtype_ops *get_mem_dev_ops(struct fd_parms *p, int minor)
+{
+	const struct fdtype_ops *ops = NULL;
+
+	switch (minor) {
+	case 11:
+		/*
+		 * If /dev/kmsg is opened in write-only mode the file position
+		 * should not be set up upon restore, kernel doesn't allow that.
+		 */
+		if ((p->flags & O_ACCMODE) == O_WRONLY && p->pos == 0)
+			p->pos = -1ULL;
+		/*
+		 * Fallthrough.
+		 */
+	default:
+		ops = &regfile_dump_ops;
+		break;
+	};
+
+	return ops;
+}
+
+/*
+ * Dump a character device descriptor.
+ *
+ * The ops are selected by the device major: MEM_MAJOR and MISC_MAJOR have
+ * dedicated lookups, anything else (or an unrecognised misc minor) is
+ * dumped as a tty when is_tty() says so and reported as unsupported
+ * otherwise.
+ *
+ * Returns what do_dump_gen_file() / dump_unsupp_fd() return, or -1 when
+ * the tty link cannot be read.
+ */
+static int dump_chrdev(struct fd_parms *p, int lfd, struct cr_img *img)
+{
+	struct fd_link *saved_link = p->link;
+	int maj = major(p->stat.st_rdev);
+	const struct fdtype_ops *ops;
+	/*
+	 * Function scope on purpose: p->link may point here and is
+	 * dereferenced by do_dump_gen_file() after the switch is left.
+	 */
+	struct fd_link link;
+	int ret;
+
+	switch (maj) {
+	case MEM_MAJOR:
+		ops = get_mem_dev_ops(p, minor(p->stat.st_rdev));
+		break;
+	case MISC_MAJOR:
+		ops = get_misc_dev_ops(minor(p->stat.st_rdev));
+		if (ops)
+			break;
+		/* fallthrough */
+	default: {
+		char more[32];
+
+		if (is_tty(p->stat.st_rdev, p->stat.st_dev)) {
+			if (fill_fdlink(lfd, p, &link))
+				return -1;
+			p->link = &link;
+			ops = &tty_dump_ops;
+			break;
+		}
+
+		snprintf(more, sizeof(more), "%d:%d", maj, minor(p->stat.st_rdev));
+		return dump_unsupp_fd(p, lfd, img, "chr", more);
+	}
+	}
+
+	ret = do_dump_gen_file(p, lfd, ops, img);
+
+	/* Don't leave a pointer to our stack behind in the caller's params. */
+	p->link = saved_link;
+	return ret;
+}
+
+/*
+ * Dump one descriptor of the dumpee.
+ *
+ * @fd is the descriptor number inside the task, @lfd is our local copy
+ * of it. The file type is worked out from stat/statfs data (and, for
+ * anon inodes and regular files, from the link target) and the matching
+ * dump ops are invoked. Unrecognised files go to dump_unsupp_fd().
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+static int dump_one_file(struct parasite_ctl *ctl, int fd, int lfd, struct fd_opts *opts,
+		struct cr_img *img)
+{
+	struct fd_parms p = FD_PARMS_INIT;
+	const struct fdtype_ops *ops;
+
+	if (fill_fd_params(ctl, fd, lfd, opts, &p) < 0) {
+		pr_perror("Can't get stat on %d", fd);
+		return -1;
+	}
+
+	if (note_file_lock(&ctl->pid, fd, lfd, &p))
+		return -1;
+
+	if (S_ISSOCK(p.stat.st_mode))
+		return dump_socket(&p, lfd, img);
+
+	if (S_ISCHR(p.stat.st_mode))
+		return dump_chrdev(&p, lfd, img);
+
+	if (p.fs_type == ANON_INODE_FS_MAGIC) {
+		char link[32];
+
+		if (read_fd_link(lfd, link, sizeof(link)) < 0)
+			return -1;
+
+		if (is_eventfd_link(link))
+			ops = &eventfd_dump_ops;
+		else if (is_eventpoll_link(link))
+			ops = &eventpoll_dump_ops;
+		else if (is_inotify_link(link))
+			ops = &inotify_dump_ops;
+		else if (is_fanotify_link(link))
+			ops = &fanotify_dump_ops;
+		else if (is_signalfd_link(link))
+			ops = &signalfd_dump_ops;
+		else if (is_timerfd_link(link))
+			ops = &timerfd_dump_ops;
+		else
+			return dump_unsupp_fd(&p, lfd, img, "anon", link);
+
+		return do_dump_gen_file(&p, lfd, ops, img);
+	}
+
+	if (S_ISREG(p.stat.st_mode) || S_ISDIR(p.stat.st_mode)) {
+		struct fd_link link;
+
+		if (fill_fdlink(lfd, &p, &link))
+			return -1;
+
+		p.link = &link;
+		/* name[0] is the '.' put there by fill_fdlink() */
+		if (link.name[1] == '/')
+			return do_dump_gen_file(&p, lfd, &regfile_dump_ops, img);
+
+		if (check_ns_proc(&link))
+			return do_dump_gen_file(&p, lfd, &nsfile_dump_ops, img);
+
+		return dump_unsupp_fd(&p, lfd, img, "reg", link.name + 1);
+	}
+
+	if (S_ISFIFO(p.stat.st_mode)) {
+		if (p.fs_type == PIPEFS_MAGIC)
+			ops = &pipe_dump_ops;
+		else
+			ops = &fifo_dump_ops;
+
+		return do_dump_gen_file(&p, lfd, ops, img);
+	}
+
+	return dump_unsupp_fd(&p, lfd, img, "unknown", NULL);
+}
+
+/*
+ * Dump all descriptors listed in @dfds for the seized task behind @ctl.
+ *
+ * The descriptors are fetched from the task with
+ * parasite_drain_fds_seized() and then written one by one into the
+ * CR_FD_FDINFO image identified by item->ids->files_id.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int dump_task_files_seized(struct parasite_ctl *ctl, struct pstree_item *item,
+		struct parasite_drain_fd *dfds)
+{
+	int *lfds;
+	struct cr_img *img;
+	struct fd_opts *opts;
+	int i, ret = -1;
+
+	pr_info("\n");
+	pr_info("Dumping opened files (pid: %d)\n", ctl->pid.real);
+	pr_info("----------------------------------------\n");
+
+	lfds = xmalloc(dfds->nr_fds * sizeof(int));
+	if (!lfds)
+		goto err;
+
+	opts = xmalloc(dfds->nr_fds * sizeof(struct fd_opts));
+	if (!opts)
+		goto err1;
+
+	ret = parasite_drain_fds_seized(ctl, dfds, lfds, opts);
+	if (ret)
+		goto err2;
+
+	/*
+	 * From here on lfds[i..nr_fds) are descriptors we still have to
+	 * close (assumes a successful drain filled all nr_fds slots, which
+	 * the dump loop below relies on as well).
+	 */
+	i = 0;
+
+	img = open_image(CR_FD_FDINFO, O_DUMP, item->ids->files_id);
+	if (!img) {
+		/* ret is 0 after the successful drain -- report the failure */
+		ret = -1;
+		goto err_close;
+	}
+
+	while (i < dfds->nr_fds) {
+		ret = dump_one_file(ctl, dfds->fds[i], lfds[i], opts + i, img);
+		close(lfds[i]);
+		i++;
+		if (ret)
+			break;
+	}
+
+	close_image(img);
+
+	pr_info("----------------------------------------\n");
+err_close:
+	/* Release whatever was never handed to dump_one_file(). */
+	for (; i < dfds->nr_fds; i++)
+		close(lfds[i]);
+err2:
+	xfree(opts);
+err1:
+	xfree(lfds);
+err:
+	return ret;
+}
+
+/*
+ * Pre-dump descriptor @fd of task @pid.
+ *
+ * The /proc/<pid>/fd/<fd> link is read to learn the file type; only
+ * inotify and fanotify files have a pre-dump step, everything else is
+ * skipped with 0.
+ *
+ * Returns 0 on success (or nothing to do), -1 when the link cannot be
+ * read, otherwise the result of the ops' pre_dump callback.
+ */
+static int predump_one_fd(int pid, int fd)
+{
+	const struct fdtype_ops *ops = NULL;
+	char proc_path[32], target[PATH_MAX];
+	int len;
+
+	snprintf(proc_path, sizeof(proc_path), "/proc/%d/fd/%d", pid, fd);
+
+	len = readlink(proc_path, target, sizeof(target));
+	if (len < 0) {
+		pr_perror("Can't read link of fd %d", fd);
+		return -1;
+	}
+	if ((size_t)len == sizeof(target)) {
+		/* a completely filled buffer leaves no room for the terminator */
+		pr_err("Buffer for read link of fd %d is too small\n", fd);
+		return -1;
+	}
+	target[len] = 0;
+
+	if (is_inotify_link(target))
+		ops = &inotify_dump_ops;
+	else if (is_fanotify_link(target))
+		ops = &fanotify_dump_ops;
+
+	if (!ops)
+		return 0;
+
+	pr_debug("Pre-dumping %d's %d fd\n", pid, fd);
+	return ops->pre_dump(pid, fd);
+}
+
+/*
+ * Run predump_one_fd() over every entry of /proc/<pid>/fd.
+ *
+ * Stops at the first failing descriptor. Returns 0 when all of them
+ * were handled, -1 otherwise.
+ */
+int predump_task_files(int pid)
+{
+	struct dirent *ent;
+	DIR *dir;
+	int err = 0;
+
+	pr_info("Pre-dump fds for %d)\n", pid);
+
+	dir = opendir_proc(pid, "fd");
+	if (!dir)
+		return -1;
+
+	while (!err && (ent = readdir(dir)) != NULL) {
+		if (dir_dots(ent))
+			continue;
+
+		if (predump_one_fd(pid, atoi(ent->d_name)))
+			err = -1;
+	}
+
+	closedir(dir);
+	return err;
+}
+
+/*
+ * Restore the owner settings of @fd from @fown.
+ *
+ * The signal number is set with F_SETSIG when non-zero. When an owner
+ * pid is recorded, the real/effective UIDs are temporarily switched to
+ * the recorded ones around F_SETOWN_EX and reverted afterwards.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int restore_fown(int fd, FownEntry *fown)
+{
+	struct f_owner_ex owner;
+	uid_t uids[3];
+	pid_t pid = getpid();
+
+	if (fown->signum) {
+		if (fcntl(fd, F_SETSIG, fown->signum)) {
+			pr_perror("%d: Can't set signal", pid);
+			return -1;
+		}
+	}
+
+	/* May be untouched */
+	if (!fown->pid)
+		return 0;
+
+	if (getresuid(&uids[0], &uids[1], &uids[2])) {
+		pr_perror("%d: Can't get current UIDs", pid);
+		return -1;
+	}
+
+	/* The saved UID is kept, so the switch can be undone below. */
+	if (setresuid(fown->uid, fown->euid, uids[2])) {
+		pr_perror("%d: Can't set UIDs", pid);
+		return -1;
+	}
+
+	owner.type = fown->pid_type;
+	owner.pid = fown->pid;
+
+	if (fcntl(fd, F_SETOWN_EX, &owner)) {
+		pr_perror("%d: Can't setup %d file owner pid",
+			  pid, fd);
+		/* Don't return with the temporary credentials still in place. */
+		if (setresuid(uids[0], uids[1], uids[2]))
+			pr_perror("%d: Can't revert UIDs back", pid);
+		return -1;
+	}
+
+	if (setresuid(uids[0], uids[1], uids[2])) {
+		pr_perror("%d: Can't revert UIDs back", pid);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Restore the status flags of @fd and then its owner settings.
+ * Returns 0 on success, -1 as soon as either step fails.
+ */
+int rst_file_params(int fd, FownEntry *fown, int flags)
+{
+	if (set_fd_flags(fd, flags) < 0 || restore_fown(fd, fown) < 0)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Create an fdinfo list entry for descriptor @e of task @pid and link it
+ * to the file description that find_file_desc() returns for @e.
+ *
+ * The entry is allocated with shmalloc() and is additionally handed to
+ * the description's collect_fd callback, or to collect_gen_fd() when the
+ * ops provide none.
+ *
+ * Returns 0 on success, -1 if the allocation fails or no file
+ * description matches @e.
+ */
+static int collect_fd(int pid, FdinfoEntry *e, struct rst_info *rst_info)
+{
+ struct fdinfo_list_entry *le, *new_le;
+ struct file_desc *fdesc;
+
+ pr_info("Collect fdinfo pid=%d fd=%d id=%#x\n",
+ pid, e->fd, e->id);
+
+ new_le = shmalloc(sizeof(*new_le));
+ if (!new_le)
+ return -1;
+
+ futex_init(&new_le->real_pid);
+ new_le->pid = pid;
+ new_le->fe = e;
+
+ fdesc = find_file_desc(e);
+ if (fdesc == NULL) {
+ pr_err("No file for fd %d id %#x\n", e->fd, e->id);
+ return -1;
+ }
+
+ /*
+ * Stop at the first entry for which pid_rst_prio(new, existing) holds;
+ * 'le' is then used as the insertion point below.
+ * NOTE(review): presumably, with kernel-style list macros, a loop that
+ * never breaks leaves 'le' aliasing the list head, so the insert
+ * appends at the tail -- confirm against list.h.
+ */
+ list_for_each_entry(le, &fdesc->fd_info_head, desc_list)
+ if (pid_rst_prio(new_le->pid, le->pid))
+ break;
+
+ if (fdesc->ops->collect_fd)
+ fdesc->ops->collect_fd(fdesc, new_le, rst_info);
+ else
+ collect_gen_fd(new_le, rst_info);
+
+ /* Insert the new entry in front of 'le'. */
+ list_add_tail(&new_le->desc_list, &le->desc_list);
+ new_le->desc = fdesc;
+
+ return 0;
+}
+
+/*
+ * Queue the controlling tty identified by @ctl_tty_id for restore.
+ *
+ * A synthetic FD_TYPES__TTY fdinfo entry is built whose descriptor is
+ * the CTL_TTY_OFF service fd, and is collected like any other entry.
+ * A zero @ctl_tty_id means there is nothing to do.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int prepare_ctl_tty(int pid, struct rst_info *rst_info, u32 ctl_tty_id)
+{
+	FdinfoEntry *entry;
+
+	if (!ctl_tty_id)
+		return 0;
+
+	pr_info("Requesting for ctl tty %#x into service fd\n", ctl_tty_id);
+
+	entry = xmalloc(sizeof(*entry));
+	if (!entry)
+		return -1;
+
+	fdinfo_entry__init(entry);
+
+	entry->id	= ctl_tty_id;
+	entry->fd	= reserve_service_fd(CTL_TTY_OFF);
+	entry->type	= FD_TYPES__TTY;
+
+	if (!collect_fd(pid, entry, rst_info))
+		return 0;
+
+	xfree(entry);
+	return -1;
+}
+
+/*
+ * Read the fdinfo image of @item and collect every entry found in it.
+ *
+ * Without fdinfo_per_id the image is looked up by the task's virtual
+ * pid. Otherwise it is looked up by item->ids->files_id; items without
+ * ids (zombies) and items whose fd table belongs to another pid are
+ * skipped with 0.
+ *
+ * Returns 0 on success, a negative value on failure.
+ */
+int prepare_fd_pid(struct pstree_item *item)
+{
+	struct rst_info *rst_info = rsti(item);
+	pid_t pid = item->pid.virt;
+	struct cr_img *img;
+	int ret = 0;
+
+	INIT_LIST_HEAD(&rst_info->fds);
+	INIT_LIST_HEAD(&rst_info->eventpoll);
+	INIT_LIST_HEAD(&rst_info->tty_slaves);
+	INIT_LIST_HEAD(&rst_info->tty_ctty);
+
+	if (fdinfo_per_id) {
+		if (item->ids == NULL) /* zombie */
+			return 0;
+
+		/* Shared table owned by somebody else -- the owner reads it. */
+		if (rsti(item)->fdt && rsti(item)->fdt->pid != item->pid.virt)
+			return 0;
+
+		img = open_image(CR_FD_FDINFO, O_RSTR, item->ids->files_id);
+	} else {
+		img = open_image(CR_FD_FDINFO, O_RSTR, pid);
+	}
+
+	if (!img)
+		return -1;
+
+	for (;;) {
+		FdinfoEntry *e;
+
+		ret = pb_read_one_eof(img, &e, PB_FDINFO);
+		if (ret <= 0)
+			break;
+
+		ret = collect_fd(pid, e, rst_info);
+		if (ret < 0) {
+			fdinfo_entry__free_unpacked(e, NULL);
+			break;
+		}
+	}
+
+	close_image(img);
+	return ret;
+}
+
+/* Status flags that set_fd_flags() takes from the caller's value. */
+#define SETFL_MASK (O_APPEND | O_ASYNC | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME)
+
+/*
+ * Apply the SETFL_MASK bits of @flags to @fd, keeping all other status
+ * flags as they currently are, and verify that the kernel really
+ * accepted the resulting value.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int set_fd_flags(int fd, int flags)
+{
+	int cur, now;
+
+	cur = fcntl(fd, F_GETFL, 0);
+	if (cur < 0)
+		goto err;
+
+	flags = (SETFL_MASK & flags) | (cur & ~SETFL_MASK);
+
+	if (fcntl(fd, F_SETFL, flags) < 0)
+		goto err;
+
+	/* Let's check, that now actual flags contains those we need */
+	now = fcntl(fd, F_GETFL, 0);
+	if (now < 0)
+		goto err;
+
+	if (now == flags)
+		return 0;
+
+	pr_err("fcntl call on fd %d (flags %#o) succeeded, "
+	       "but some flags were dropped: %#o\n", fd, flags, now);
+	return -1;
+
+err:
+	pr_perror("fcntl call on fd %d (flags %x) failed", fd, flags);
+	return -1;
+}
+
+/* One stage of the per-task fd restore sequence (see states[]). */
+struct fd_open_state {
+ char *name; /* stage name, used in log messages */
+ int (*cb)(int, struct fdinfo_list_entry *); /* handler run for every fdinfo entry */
+
+ /*
+ * Two last stages -- receive fds and post-open them -- are
+ * not required always. E.g. if no fd sharing takes place
+ * or task doesn't have any files that need to be post-opened.
+ *
+ * Thus, in order not to scan through fdinfo-s lists in vain
+ * and speed things up a little bit, we may want to skip these.
+ */
+ bool required;
+};
+
+/* Stage handlers, defined below. */
+static int open_transport_fd(int pid, struct fdinfo_list_entry *fle);
+static int open_fd(int pid, struct fdinfo_list_entry *fle);
+static int receive_fd(int pid, struct fdinfo_list_entry *fle);
+static int post_open_fd(int pid, struct fdinfo_list_entry *fle);
+
+/*
+ * Restore stages in the order they are run. The last two start out as
+ * not required and are switched on by the macros below.
+ */
+static struct fd_open_state states[] = {
+ { "prepare", open_transport_fd, true,},
+ { "create", open_fd, true,},
+ { "receive", receive_fd, false,},
+ { "post_create", post_open_fd, false,},
+};
+
+/* The indices below must match the positions of "receive" and "post_create" in states[]. */
+#define want_recv_stage() do { states[2].required = true; } while (0)
+#define want_post_open_stage() do { states[3].required = true; } while (0)
+
+/*
+ * Fill @addr and @len with the address of the transport socket used to
+ * pass descriptor @fd to task @pid.
+ *
+ * The name is formatted with a placeholder first character so that
+ * SUN_LEN() counts the whole string; that character is then replaced
+ * with a NUL byte.
+ */
+static void transport_name_gen(struct sockaddr_un *addr, int *len,
+			       int pid, int fd)
+{
+	char *path = addr->sun_path;
+
+	addr->sun_family = AF_UNIX;
+	snprintf(path, UNIX_PATH_MAX, "x/crtools-fd-%d-%d", pid, fd);
+	*len = SUN_LEN(addr);
+	path[0] = '\0';
+}
+
+/*
+ * Ask the file's ops whether descriptor @fe needs a transport socket.
+ * Files whose ops have no want_transport callback never do.
+ */
+static int should_open_transport(FdinfoEntry *fe, struct file_desc *fd)
+{
+	if (!fd->ops->want_transport)
+		return 0;
+
+	return fd->ops->want_transport(fe, fd);
+}
+
+/*
+ * "prepare" stage: put a transport socket in place of descriptor @fle in
+ * task @pid when that descriptor is going to be received from elsewhere.
+ *
+ * Nothing is done for descriptors of the master task unless the file's
+ * ops ask for a transport. Otherwise a datagram unix socket is bound to
+ * the name produced by transport_name_gen(), moved onto the target fd
+ * number, and our pid is published through fle->real_pid.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int open_transport_fd(int pid, struct fdinfo_list_entry *fle)
+{
+ struct fdinfo_list_entry *flem;
+ struct sockaddr_un saddr;
+ int sock;
+ int ret, sun_len;
+
+ flem = file_master(fle->desc);
+
+ if (flem->pid == pid) {
+ if (flem->fe->fd != fle->fe->fd)
+ /* dup-ed file. Will be opened in the open_fd */
+ return 0;
+
+ if (!should_open_transport(fle->fe, fle->desc))
+ /* pure master file */
+ return 0;
+
+ /*
+ * some master file, that wants a transport, e.g.
+ * a pipe or unix socket pair 'slave' end
+ */
+ }
+
+ transport_name_gen(&saddr, &sun_len, getpid(), fle->fe->fd);
+
+ /* sun_path[0] was zeroed by transport_name_gen(), hence the + 1 */
+ pr_info("\t\tCreate transport fd %s\n", saddr.sun_path + 1);
+
+
+ sock = socket(PF_UNIX, SOCK_DGRAM, 0);
+ if (sock < 0) {
+ pr_perror("Can't create socket");
+ return -1;
+ }
+ ret = bind(sock, &saddr, sun_len);
+ if (ret < 0) {
+ pr_perror("Can't bind unix socket %s", saddr.sun_path + 1);
+ goto err;
+ }
+
+ ret = reopen_fd_as(fle->fe->fd, sock);
+ if (ret < 0)
+ goto err;
+
+ /* send_fd_to_peer() waits for real_pid to become non-zero */
+ pr_info("\t\tWake up fdinfo pid=%d fd=%d\n", fle->pid, fle->fe->fd);
+ futex_set_and_wake(&fle->real_pid, getpid());
+ want_recv_stage();
+
+ return 0;
+err:
+ close(sock);
+ return -1;
+}
+
+/*
+ * Send @fd over @sock to the task that owns @fle.
+ *
+ * Blocks until the peer has published its pid in fle->real_pid, then
+ * addresses the transport socket derived from that pid and the target
+ * descriptor number.
+ *
+ * Returns the result of send_fd().
+ */
+int send_fd_to_peer(int fd, struct fdinfo_list_entry *fle, int sock)
+{
+	struct sockaddr_un peer_addr;
+	int addr_len;
+
+	pr_info("\t\tWait fdinfo pid=%d fd=%d\n", fle->pid, fle->fe->fd);
+	futex_wait_while(&fle->real_pid, 0);
+
+	transport_name_gen(&peer_addr, &addr_len,
+			   futex_get(&fle->real_pid), fle->fe->fd);
+
+	pr_info("\t\tSend fd %d to %s\n", fd, peer_addr.sun_path + 1);
+
+	return send_fd(sock, &peer_addr, addr_len, fd);
+}
+
+/*
+ * Make @fd also available under the descriptor number recorded in @fle
+ * within the current task.
+ *
+ * Before dup2() overwrites the target number, an inherit fd sitting
+ * there is moved away and @sock is passed to move_img_fd() so it can be
+ * relocated if it occupies that number. The fd flags from the image are
+ * applied to the new descriptor.
+ *
+ * Returns 0 on success (or when @fd already is the target), -1 on failure.
+ */
+static int send_fd_to_self(int fd, struct fdinfo_list_entry *fle, int *sock)
+{
+ int dfd = fle->fe->fd;
+
+ if (fd == dfd)
+ return 0;
+
+ /* make sure we won't clash with an inherit fd */
+ if (inherit_fd_resolve_clash(dfd) < 0)
+ return -1;
+
+ pr_info("\t\t\tGoing to dup %d into %d\n", fd, dfd);
+ if (move_img_fd(sock, dfd))
+ return -1;
+
+ if (dup2(fd, dfd) != dfd) {
+ pr_perror("Can't dup local fd %d -> %d", fd, dfd);
+ return -1;
+ }
+
+ if (fcntl(dfd, F_SETFD, fle->fe->flags) == -1) {
+ pr_perror("Unable to set file descriptor flags");
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * "post_create" stage: run the file's post_open callback, if it has one.
+ *
+ * The callback is invoked for the CTL_TTY_OFF service descriptor and for
+ * the master entry of the file; every other entry is skipped with 0.
+ */
+static int post_open_fd(int pid, struct fdinfo_list_entry *fle)
+{
+	struct file_desc *d = fle->desc;
+
+	if (!d->ops->post_open)
+		return 0;
+
+	if (!is_service_fd(fle->fe->fd, CTL_TTY_OFF) && fle != file_master(d))
+		return 0;
+
+	return d->ops->post_open(d, fle->fe->fd);
+}
+
+
+/*
+ * Hand the freshly opened @fd out to every fdinfo entry attached to @d:
+ * entries of task @pid get it via send_fd_to_self(), entries of other
+ * tasks via send_fd_to_peer() over a temporary datagram socket.
+ *
+ * Returns 0 on success, non-zero on the first failure.
+ */
+static int serve_out_fd(int pid, int fd, struct file_desc *d)
+{
+	struct fdinfo_list_entry *fle;
+	int sock, ret = 0;
+
+	sock = socket(PF_UNIX, SOCK_DGRAM, 0);
+	if (sock < 0) {
+		pr_perror("Can't create socket");
+		return -1;
+	}
+
+	pr_info("\t\tCreate fd for %d\n", fd);
+
+	list_for_each_entry(fle, &d->fd_info_head, desc_list) {
+		ret = (pid == fle->pid) ? send_fd_to_self(fd, fle, &sock)
+					: send_fd_to_peer(fd, fle, sock);
+		if (ret) {
+			pr_err("Can't sent fd %d to %d\n", fd, fle->pid);
+			break;
+		}
+	}
+
+	close(sock);
+	return ret;
+}
+
+/*
+ * "create" stage: open the file behind @fle and distribute it.
+ *
+ * Only the master entry of a file really opens it; the new descriptor
+ * is moved onto the number recorded in the image, gets its fd flags and
+ * is then served out to all other users. Any file with a post_open
+ * callback enables the post-open stage, master or not.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+static int open_fd(int pid, struct fdinfo_list_entry *fle)
+{
+	struct file_desc *desc = fle->desc;
+	int opened;
+
+	if (desc->ops->post_open)
+		want_post_open_stage();
+
+	if (file_master(desc) != fle)
+		return 0;
+
+	opened = desc->ops->open(desc);
+	if (opened < 0)
+		return -1;
+
+	if (reopen_fd_as(fle->fe->fd, opened))
+		return -1;
+
+	if (fcntl(fle->fe->fd, F_SETFD, fle->fe->flags) == -1) {
+		pr_perror("Unable to set file descriptor flags");
+		return -1;
+	}
+
+	return serve_out_fd(pid, fle->fe->fd, desc);
+}
+
+/*
+ * "receive" stage: replace the transport socket sitting at the
+ * descriptor number of @fle with the real file sent by its master.
+ *
+ * Entries whose master lives in task @pid are skipped, they were
+ * handled in open_fd().
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int receive_fd(int pid, struct fdinfo_list_entry *fle)
+{
+	int tmp;
+	struct fdinfo_list_entry *flem;
+
+	flem = file_master(fle->desc);
+	if (flem->pid == pid)
+		return 0;
+
+	pr_info("\tReceive fd for %d\n", fle->fe->fd);
+
+	tmp = recv_fd(fle->fe->fd);
+	if (tmp < 0) {
+		/* report the descriptor we failed on, not the error value */
+		pr_err("Can't get fd %d\n", fle->fe->fd);
+		return -1;
+	}
+	/* the transport socket has done its job */
+	close(fle->fe->fd);
+
+	if (reopen_fd_as(fle->fe->fd, tmp) < 0)
+		return -1;
+
+	if (fcntl(fle->fe->fd, F_SETFD, fle->fe->flags) == -1) {
+		pr_perror("Unable to set file descriptor flags");
+		return -1;
+	}
+
+	return 0;
+}
+
+/* Run the handler of stage @state on a single fdinfo entry. */
+static int open_fdinfo(int pid, struct fdinfo_list_entry *fle, int state)
+{
+	struct fd_open_state *stage = &states[state];
+
+	pr_info("\tRestoring fd %d (state -> %s)\n",
+		fle->fe->fd, stage->name);
+
+	return stage->cb(pid, fle);
+}
+
+/*
+ * Run stage @state over every entry of @list (linked via ps_list).
+ * Returns 0 when all entries succeed, otherwise the first non-zero
+ * result.
+ */
+static int open_fdinfos(int pid, struct list_head *list, int state)
+{
+	struct fdinfo_list_entry *fle;
+
+	list_for_each_entry(fle, list, ps_list) {
+		int err = open_fdinfo(pid, fle, state);
+
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+static struct inherit_fd *inherit_fd_lookup_fd(int fd, const char *caller);
+
+/*
+ * Close every descriptor listed in /proc/self/fd except service fds,
+ * the directory stream used for the scan itself and descriptors that
+ * are on the inherit fd list.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int close_old_fds(void)
+{
+	DIR *dir;
+	struct dirent *de;
+	int fd, ret;
+
+	dir = opendir_proc(PROC_SELF, "fd");
+	if (dir == NULL)
+		return -1;
+
+	while ((de = readdir(dir))) {
+		if (dir_dots(de))
+			continue;
+
+		ret = sscanf(de->d_name, "%d", &fd);
+		if (ret != 1) {
+			pr_err("Can't parse %s\n", de->d_name);
+			/* don't leak the directory stream on the error path */
+			closedir(dir);
+			return -1;
+		}
+
+		if ((!is_any_service_fd(fd)) && (dirfd(dir) != fd) &&
+		    !inherit_fd_lookup_fd(fd, __FUNCTION__))
+			close_safe(&fd);
+	}
+
+	closedir(dir);
+	close_pid_proc();
+
+	return 0;
+}
+
+/*
+ * Restore all file descriptors of task @me.
+ *
+ * Tasks sharing an fd table synchronise on fdt->fdt_lock: everybody
+ * checks in, only the task recorded in fdt->pid runs the restore
+ * stages, the others wait for it to finish. The stages from states[]
+ * are run over the regular fds, the tty slaves and the eventpolls, and
+ * then once more over the controlling ttys.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int prepare_fds(struct pstree_item *me)
+{
+	/* signed: carries the negative error codes of open_fdinfos() */
+	int ret = 0;
+	int state;
+
+	pr_info("Opening fdinfo-s\n");
+
+	/*
+	 * This must be done after forking to allow child
+	 * to get the cgroup fd so it can move into the
+	 * correct /tasks file if it is in a different cgroup
+	 * set than its parent
+	 */
+	close_service_fd(CGROUP_YARD);
+	close_pid_proc(); /* flush any proc cached fds we may have */
+
+	if (rsti(me)->fdt) {
+		struct fdt *fdt = rsti(me)->fdt;
+
+		/*
+		 * Wait all tasks, who share a current fd table.
+		 * We should be sure, that nobody use any file
+		 * descriptor while fdtable is being restored.
+		 */
+		futex_inc_and_wake(&fdt->fdt_lock);
+		futex_wait_while_lt(&fdt->fdt_lock, fdt->nr);
+
+		if (fdt->pid != me->pid.virt) {
+			pr_info("File descriptor table is shared with %d\n", fdt->pid);
+			/* the restoring task bumps the lock once more at out_w */
+			futex_wait_until(&fdt->fdt_lock, fdt->nr + 1);
+			goto out;
+		}
+	}
+
+	for (state = 0; state < ARRAY_SIZE(states); state++) {
+		if (!states[state].required) {
+			pr_debug("Skipping %s fd stage\n", states[state].name);
+			continue;
+		}
+
+		ret = open_fdinfos(me->pid.virt, &rsti(me)->fds, state);
+		if (ret)
+			break;
+
+		/*
+		 * Now handle TTYs. Slaves are delayed to be sure masters
+		 * are already opened.
+		 */
+		ret = open_fdinfos(me->pid.virt, &rsti(me)->tty_slaves, state);
+		if (ret)
+			break;
+
+		/*
+		 * The eventpoll descriptors require all the other ones
+		 * to be already restored, thus we store them in a separate
+		 * list and restore at the very end.
+		 */
+		ret = open_fdinfos(me->pid.virt, &rsti(me)->eventpoll, state);
+		if (ret)
+			break;
+	}
+
+	if (ret)
+		goto out_w;
+
+	for (state = 0; state < ARRAY_SIZE(states); state++) {
+		if (!states[state].required) {
+			pr_debug("Skipping %s fd stage\n", states[state].name);
+			continue;
+		}
+
+		/*
+		 * Opening current TTYs require session to be already set up,
+		 * thus slave peers already handled now it's time for cttys,
+		 */
+		ret = open_fdinfos(me->pid.virt, &rsti(me)->tty_ctty, state);
+		if (ret)
+			break;
+	}
+out_w:
+	if (rsti(me)->fdt)
+		futex_inc_and_wake(&rsti(me)->fdt->fdt_lock);
+out:
+	close_service_fd(CR_PROC_FD_OFF);
+	tty_fini_fds();
+	return ret;
+}
+
+/*
+ * chroot() into the directory referred to by the open descriptor @fd.
+ *
+ * Changes the current working directory to the proc service descriptor
+ * as a side effect. Returns 0 on success, -1 on failure.
+ */
+static int fchroot(int fd)
+{
+	char fd_path[PSFDS];
+	int proc;
+
+	/*
+	 * There's no such thing in syscalls. We can emulate
+	 * it using the /proc/self/fd/ :)
+	 *
+	 * But since there might be no /proc mount in our mount
+	 * namespace, we will have to ... workaround it.
+	 */
+
+	proc = get_service_fd(PROC_FD_OFF);
+	if (fchdir(proc) < 0) {
+		pr_perror("Can't chdir to proc");
+		return -1;
+	}
+
+	/* bounded by the buffer size rather than trusting PSFDS to be large enough */
+	snprintf(fd_path, sizeof(fd_path), "./self/fd/%d", fd);
+	pr_debug("Going to chroot into %s\n", fd_path);
+	return chroot(fd_path);
+}
+
+/*
+ * Restore the root directory, the current working directory and, when
+ * recorded, the umask of task @me from its rst_info.
+ *
+ * Both directories are opened before any of them is applied; the
+ * descriptors are closed again on every path.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int restore_fs(struct pstree_item *me)
+{
+ int dd_root = -1, dd_cwd = -1, ret, err = -1;
+ struct rst_info *ri = rsti(me);
+
+ /*
+ * First -- open both descriptors. We will not
+ * be able to open the cwd one after we chroot.
+ */
+
+ dd_root = open_reg_fd(ri->root);
+ if (dd_root < 0) {
+ pr_err("Can't open root\n");
+ goto out;
+ }
+
+ dd_cwd = open_reg_fd(ri->cwd);
+ if (dd_cwd < 0) {
+ pr_err("Can't open cwd\n");
+ goto out;
+ }
+
+ /*
+ * Now do chroot/chdir. Chroot goes first as it
+ * calls chdir into proc service descriptor so
+ * we'd need to fix chdir after it anyway.
+ */
+
+ ret = fchroot(dd_root);
+ if (ret < 0) {
+ pr_perror("Can't change root");
+ goto out;
+ }
+
+ ret = fchdir(dd_cwd);
+ if (ret < 0) {
+ pr_perror("Can't change cwd");
+ goto out;
+ }
+
+ /* umask is only touched when the image carried one */
+ if (ri->has_umask) {
+ pr_info("Restoring umask to %o\n", ri->umask);
+ umask(ri->umask);
+ }
+
+ err = 0;
+out:
+ /* -1 means "never opened", see the initialisers above */
+ if (dd_cwd >= 0)
+ close(dd_cwd);
+ if (dd_root >= 0)
+ close(dd_root);
+
+ return err;
+}
+
+/*
+ * Read the fs image of @item and resolve the cwd and root files it
+ * refers to, storing them (and the umask, if any) in the item's
+ * rst_info.
+ *
+ * Returns 0 on success or when the image holds no entry, a negative
+ * value on failure.
+ */
+int prepare_fs_pid(struct pstree_item *item)
+{
+	pid_t pid = item->pid.virt;
+	struct rst_info *ri = rsti(item);
+	struct cr_img *img;
+	FsEntry *fe;
+	int ret = -1;
+
+	img = open_image(CR_FD_FS, O_RSTR, pid);
+	if (!img)
+		goto out;
+
+	ret = pb_read_one_eof(img, &fe, PB_FS);
+	close_image(img);
+	if (ret <= 0)
+		goto out;
+
+	/*
+	 * ret is positive after a successful read; reset it so the
+	 * failure paths below report an error and not that value.
+	 */
+	ret = -1;
+
+	ri->cwd = collect_special_file(fe->cwd_id);
+	if (!ri->cwd) {
+		pr_err("Can't find task cwd file\n");
+		goto out_f;
+	}
+
+	ri->root = collect_special_file(fe->root_id);
+	if (!ri->root) {
+		pr_err("Can't find task root file\n");
+		goto out_f;
+	}
+
+	ri->has_umask = fe->has_umask;
+	ri->umask = fe->umask;
+
+	ret = 0;
+out_f:
+	fs_entry__free_unpacked(fe, NULL);
+out:
+	return ret;
+}
+
+/*
+ * Attach @item to the fd table shared with its parent.
+ *
+ * The shared descriptor is created on the parent the first time a child
+ * asks for it. Each joining item takes the current user count as its
+ * service_fd_id and bumps the count; the table's pid is switched to the
+ * item's when pid_rst_prio() says so.
+ *
+ * Returns 0 on success, -1 if the shared allocation fails.
+ */
+int shared_fdt_prepare(struct pstree_item *item)
+{
+	struct pstree_item *parent = item->parent;
+	struct fdt *fdt = rsti(parent)->fdt;
+
+	if (!fdt) {
+		fdt = shmalloc(sizeof(*fdt));
+		if (fdt == NULL)
+			return -1;
+
+		rsti(parent)->fdt = fdt;
+
+		futex_init(&fdt->fdt_lock);
+		fdt->nr = 1;
+		fdt->pid = parent->pid.virt;
+	}
+
+	rsti(item)->fdt = fdt;
+	rsti(item)->service_fd_id = fdt->nr;
+	fdt->nr++;
+
+	if (pid_rst_prio(item->pid.virt, fdt->pid))
+		fdt->pid = item->pid.virt;
+
+	return 0;
+}
+
+/*
+ * Inherit fd support.
+ *
+ * There are cases where a process's file descriptor cannot be restored
+ * from the checkpointed image. For example, a pipe file descriptor with
+ * one end in the checkpointed process and the other end in a separate
+ * process (that was not part of the checkpointed process tree) cannot be
+ * restored because after checkpoint the pipe would be broken and removed.
+ *
+ * There are also cases where the user wants to use a new file during
+ * restore instead of the original file in the checkpointed image. For
+ * example, the user wants to change the log file of a process from
+ * /path/to/oldlog to /path/to/newlog.
+ *
+ * In these cases, criu's caller should set up a new file descriptor to be
+ * inherited by the restored process and specify it with the --inherit-fd
+ * command line option. The argument of --inherit-fd has the format
+ * fd[%d]:%s, where %d tells criu which of its own file descriptors to use
+ * for restoring the file identified by %s.
+ *
+ * As a debugging aid, if the argument has the format debug[%d]:%s, it tells
+ * criu to write out the string after colon to the file descriptor %d. This
+ * can be used to leave a "restore marker" in the output stream of the process.
+ *
+ * It's important to note that inherit fd support breaks applications
+ * that depend on the state of the file descriptor being inherited. So,
+ * consider inherit fd only for specific use cases that you know for sure
+ * won't break the application.
+ *
+ * For examples please visit http://criu.org/Category:HOWTO.
+ */
+
+/*
+ * One entry of the opts.inherit_fds list. The inh_dev/inh_ino/inh_mode/
+ * inh_rdev fields are the fstat() snapshot taken when the entry was
+ * added; inherit_fd_reused() compares them against the current state.
+ */
+struct inherit_fd {
+ struct list_head inh_list;
+ char *inh_id; /* file identifier */
+ int inh_fd; /* criu's descriptor to inherit */
+ dev_t inh_dev;
+ ino_t inh_ino;
+ mode_t inh_mode;
+ dev_t inh_rdev;
+};
+
+/*
+ * Return 1 if inherit fd has been closed or reused, 0 otherwise.
+ * Return -1 if fstat() on the descriptor fails for a reason other
+ * than EBADF; note that callers testing the result with '!' treat
+ * that case the same as "reused".
+ *
+ * Some parts of the file restore engine can close an inherit fd
+ * explicitly by close() or implicitly by dup2() to reuse that descriptor.
+ * In some specific functions (for example, send_fd_to_self()), we
+ * check for clashes at the beginning of the function and, therefore,
+ * these specific functions will not reuse an inherit fd. However, to
+ * avoid adding a ton of clash detect and resolve code everywhere we close()
+ * and/or dup2(), we just make sure that when we're dup()ing or close()ing
+ * our inherit fd we're still dealing with the same fd that we inherited.
+ */
+static int inherit_fd_reused(struct inherit_fd *inh)
+{
+ struct stat sbuf;
+
+ if (fstat(inh->inh_fd, &sbuf) == -1) {
+ if (errno == EBADF) {
+ pr_debug("Inherit fd %s -> %d has been closed\n",
+ inh->inh_id, inh->inh_fd);
+ return 1;
+ }
+ pr_perror("Can't fstat inherit fd %d", inh->inh_fd);
+ return -1;
+ }
+
+ /* Any difference from the snapshot means the number now refers to another file. */
+ if (inh->inh_dev != sbuf.st_dev || inh->inh_ino != sbuf.st_ino ||
+ inh->inh_mode != sbuf.st_mode || inh->inh_rdev != sbuf.st_rdev) {
+ pr_info("Inherit fd %s -> %d has been reused\n",
+ inh->inh_id, inh->inh_fd);
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * Parse one --inherit-fd argument of the form fd[N]:id or debug[N]:text.
+ *
+ * For the "fd" form the pair is added to the inherit fd list; for the
+ * "debug" form the text after the colon is written to descriptor N.
+ * Returns 0 on success, -1 on a malformed argument or a failed
+ * write/add.
+ *
+ * We can't print diagnostics messages in this function because the
+ * log file isn't initialized yet.
+ * NOTE(review): the body nevertheless calls pr_err() on the error
+ * paths -- confirm where those messages end up this early.
+ */
+int inherit_fd_parse(char *optarg)
+{
+ char *cp = NULL;
+ int n = -1;
+ int fd = -1;
+ int dbg = 0;
+
+ /*
+ * Parse the argument.
+ */
+ if (!strncmp(optarg, "fd", 2))
+ cp = &optarg[2];
+ else if (!strncmp(optarg, "debug", 5)) {
+ cp = &optarg[5];
+ dbg = 1;
+ }
+ if (cp) {
+ n = sscanf(cp, "[%d]:", &fd);
+ /* the payload starts after the first ':' of the whole argument */
+ cp = strchr(optarg, ':');
+ }
+ /* reject: no number parsed, negative fd, no colon, or nothing after the colon */
+ if (n != 1 || fd < 0 || !cp || !cp[1]) {
+ pr_err("Invalid inherit fd argument: %s\n", optarg);
+ return -1;
+ }
+
+ /*
+ * If the argument is a debug string, write it to fd.
+ * Otherwise, add it to the inherit fd list.
+ */
+ cp++;
+ if (dbg) {
+ n = strlen(cp);
+ if (write(fd, cp, n) != n) {
+ pr_err("Can't write debug message %s to inherit fd %d\n",
+ cp, fd);
+ return -1;
+ }
+ return 0;
+ }
+
+ return inherit_fd_add(fd, cp);
+}
+
+/*
+ * Append descriptor @fd with identifier @key to the inherit fd list.
+ *
+ * The current fstat() identity of @fd is recorded alongside. @key is
+ * stored by pointer, not copied.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int inherit_fd_add(int fd, char *key)
+{
+	struct inherit_fd *entry;
+	struct stat st;
+
+	if (fstat(fd, &st) == -1) {
+		pr_perror("Can't fstat inherit fd %d", fd);
+		return -1;
+	}
+
+	entry = xmalloc(sizeof(*entry));
+	if (!entry)
+		return -1;
+
+	entry->inh_id	= key;
+	entry->inh_fd	= fd;
+	entry->inh_dev	= st.st_dev;
+	entry->inh_ino	= st.st_ino;
+	entry->inh_mode	= st.st_mode;
+	entry->inh_rdev	= st.st_rdev;
+
+	list_add_tail(&entry->inh_list, &opts.inherit_fds);
+	return 0;
+}
+
+/*
+ * Dump the inherit fd list into the log, one line per entry.
+ * Meant to be called for diagnostics once the log file is set up.
+ */
+void inherit_fd_log(void)
+{
+	struct inherit_fd *entry;
+
+	list_for_each_entry(entry, &opts.inherit_fds, inh_list)
+		pr_info("File %s will be restored from inherit fd %d\n",
+			entry->inh_id, entry->inh_fd);
+}
+
+/*
+ * Find the inherit fd registered for file identifier @id.
+ *
+ * Only the first entry with a matching identifier is considered.
+ * Returns its descriptor, or -1 when there is no such entry or the
+ * descriptor is no longer the one that was inherited.
+ */
+int inherit_fd_lookup_id(char *id)
+{
+	struct inherit_fd *inh;
+
+	list_for_each_entry(inh, &opts.inherit_fds, inh_list) {
+		if (strcmp(inh->inh_id, id))
+			continue;
+
+		if (inherit_fd_reused(inh))
+			return -1;
+
+		pr_debug("Found id %s (fd %d) in inherit fd list\n",
+			 id, inh->inh_fd);
+		return inh->inh_fd;
+	}
+
+	return -1;
+}
+
+/*
+ * Tell whether file @d is to be restored from an inherit fd.
+ *
+ * The file's identifier is produced by its ops' name callback and
+ * looked up in the inherit fd list. With a non-NULL @fd_p a dup() of
+ * the inherit fd is stored there.
+ *
+ * Returns false when the ops have no name callback or nothing matches.
+ * Returns true otherwise -- including when dup() failed, in which case
+ * *fd_p is negative, so callers have to check it.
+ */
+bool inherited_fd(struct file_desc *d, int *fd_p)
+{
+ char buf[32], *id_str;
+ int i_fd;
+
+ if (!d->ops->name)
+ return false;
+
+ id_str = d->ops->name(d, buf, sizeof(buf));
+ i_fd = inherit_fd_lookup_id(id_str);
+ if (i_fd < 0)
+ return false;
+
+ /* pure query: the caller only wants to know whether a match exists */
+ if (fd_p == NULL)
+ return true;
+
+ *fd_p = dup(i_fd);
+ if (*fd_p < 0)
+ pr_perror("Inherit fd DUP failed");
+ else
+ pr_info("File %s will be restored from fd %d dumped "
+ "from inherit fd %d\n", id_str, *fd_p, i_fd);
+ return true;
+}
+
+/*
+ * Find the inherit fd list entry whose descriptor is @fd.
+ *
+ * Only the first entry with that descriptor number is considered.
+ * Returns it, or NULL when there is none or the descriptor is no longer
+ * the one that was inherited. @caller is used for the debug message only.
+ */
+static struct inherit_fd *inherit_fd_lookup_fd(int fd, const char *caller)
+{
+	struct inherit_fd *inh;
+
+	list_for_each_entry(inh, &opts.inherit_fds, inh_list) {
+		if (inh->inh_fd != fd)
+			continue;
+
+		if (inherit_fd_reused(inh))
+			return NULL;
+
+		pr_debug("Found fd %d (id %s) in inherit fd list (caller %s)\n",
+			 fd, inh->inh_id, caller);
+		return inh;
+	}
+
+	return NULL;
+}
+
+/*
+ * If the specified fd clashes with an inherit fd,
+ * move the inherit fd.
+ *
+ * The inherit fd is duplicated onto a new number, the old number is
+ * closed and the list entry is updated to the new one.
+ *
+ * Returns 0 on success or when there is no clash, -1 on failure.
+ */
+int inherit_fd_resolve_clash(int fd)
+{
+	int newfd;
+	struct inherit_fd *inh;
+
+	inh = inherit_fd_lookup_fd(fd, __FUNCTION__);
+	if (inh == NULL)
+		return 0;
+
+	newfd = dup(fd);
+	if (newfd == -1) {
+		pr_perror("Can't dup inherit fd %d", fd);
+		return -1;
+	}
+
+	if (close(fd) == -1) {
+		/* report first: the cleanup close() may overwrite errno */
+		pr_perror("Can't close inherit fd %d", fd);
+		close(newfd);
+		return -1;
+	}
+
+	inh->inh_fd = newfd;
+	pr_debug("Inherit fd %d moved to %d to resolve clash\n", fd, inh->inh_fd);
+	return 0;
+}
+
+/*
+ * Close all inherit fds.
+ *
+ * Entries whose descriptor has meanwhile been closed or reused are left
+ * alone. Returns 0 on success, -1 on an entry with a negative
+ * descriptor, an fstat() failure or a failed close.
+ */
+int inherit_fd_fini(void)
+{
+	int reused;
+	struct inherit_fd *inh;
+
+	list_for_each_entry(inh, &opts.inherit_fds, inh_list) {
+		if (inh->inh_fd < 0) {
+			pr_err("File %s in inherit fd list has invalid fd %d\n",
+			       inh->inh_id, inh->inh_fd);
+			return -1;
+		}
+
+		reused = inherit_fd_reused(inh);
+		if (reused < 0)
+			return -1;
+
+		if (!reused) {
+			pr_debug("Closing inherit fd %d -> %s\n", inh->inh_fd,
+				 inh->inh_id);
+			if (close_safe(&inh->inh_fd) < 0)
+				return -1;
+		}
+	}
+	return 0;
+}
+
+/* Tell whether @id is present in the opts.external list. */
+bool external_lookup_id(char *id)
+{
+	struct external *ext;
+	bool found = false;
+
+	list_for_each_entry(ext, &opts.external, node) {
+		if (strcmp(ext->id, id) == 0) {
+			found = true;
+			break;
+		}
+	}
+
+	return found;
+}
diff --git a/criu/fsnotify.c b/criu/fsnotify.c
new file mode 100644
index 000000000000..59259c13e2c3
--- /dev/null
+++ b/criu/fsnotify.c
@@ -0,0 +1,940 @@
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <string.h>
+#include <utime.h>
+#include <dirent.h>
+#include <limits.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/inotify.h>
+#include <sys/vfs.h>
+#include <linux/magic.h>
+#include <sys/wait.h>
+#include <sys/poll.h>
+#include <sys/mman.h>
+#include <sys/mount.h>
+#include <aio.h>
+
+#include <sys/fanotify.h>
+
+#include "compiler.h"
+#include "asm/types.h"
+#include "imgset.h"
+#include "fsnotify.h"
+#include "proc_parse.h"
+#include "mount.h"
+#include "image.h"
+#include "util.h"
+#include "files.h"
+#include "files-reg.h"
+#include "file-ids.h"
+#include "log.h"
+#include "list.h"
+#include "lock.h"
+#include "irmap.h"
+#include "cr_options.h"
+#include "namespaces.h"
+#include "pstree.h"
+
+#include "protobuf.h"
+#include "protobuf/fsnotify.pb-c.h"
+#include "protobuf/mnt.pb-c.h"
+
+#undef LOG_PREFIX
+#define LOG_PREFIX "fsnotify: "
+
+/*
+ * One watch/mark to restore: an inotify watch descriptor (iwe) or a
+ * fanotify mark (fme), depending on the file it is attached to.
+ */
+struct fsnotify_mark_info {
+ struct list_head list; /* link in fsnotify_file_info::marks */
+ union {
+ InotifyWdEntry *iwe;
+ FanotifyMarkEntry *fme;
+ };
+ struct file_remap *remap; /* result of lookup_ghost_remap(), NULL until looked up */
+};
+
+/*
+ * One inotify or fanotify file collected from the image.
+ */
+struct fsnotify_file_info {
+ struct list_head list; /* link in inotify_info_head or fanotify_info_head */
+ union {
+ InotifyFileEntry *ife;
+ FanotifyFileEntry *ffe;
+ };
+ struct list_head marks; /* fsnotify_mark_info entries of this file */
+ struct file_desc d; /* registered with file_desc_add() */
+};
+
+/*
+ * File handle in the form handed to open_by_handle_at() (see
+ * open_by_handle()); it is filled from the image by decode_handle().
+ */
+typedef struct {
+ u32 bytes;
+ u32 type;
+ u64 __handle[16];
+} fh_t;
+
+static LIST_HEAD(inotify_info_head);
+static LIST_HEAD(fanotify_info_head);
+
+/* Checks if @link is an anon link of the "inotify" type */
+int is_inotify_link(char *link)
+{
+ return is_anon_link_type(link, "inotify");
+}
+
+/* Checks if @link is an anon link of the "[fanotify]" type */
+int is_fanotify_link(char *link)
+{
+ return is_anon_link_type(link, "[fanotify]");
+}
+
+/*
+ * Fill @handle from the image entry @img.
+ *
+ * type and bytes are taken over as they are. At most
+ * sizeof(handle->__handle) bytes of payload are copied, the rest of
+ * @handle stays zeroed.
+ */
+static void decode_handle(fh_t *handle, FhEntry *img)
+{
+ memzero(handle, sizeof(*handle));
+
+ handle->type = img->type;
+ handle->bytes = img->bytes;
+
+ memcpy(handle->__handle, img->handle,
+ min(pb_repeated_size(img, handle),
+ sizeof(handle->__handle)));
+}
+
+/*
+ * Callback for userns_call(): open the handle pointed to by @arg
+ * relative to the mount fd @fd with O_PATH. @pid is not used.
+ */
+static int open_by_handle(void *arg, int fd, int pid)
+{
+ return open_by_handle_at(fd, arg, O_PATH);
+}
+
+/*
+ * Find a path under which the object described by @f_handle can be
+ * opened on restore.
+ *
+ * Every mount from mntinfo whose s_dev equals @s_dev is tried: the
+ * handle is opened via that mount, the link of the resulting fd is read
+ * back and re-opened relative to the mount namespace root. The path is
+ * accepted only if the inode found there equals @i_ino.
+ *
+ * On success returns a newly allocated (xstrdup) copy of the link path
+ * and records the mount id in @f_handle. On failure returns an
+ * ERR_PTR(): -ENOENT when no mount produced a matching path, -errno
+ * when fstat() failed and -1 on any other error.
+ */
+static char *alloc_openable(unsigned int s_dev, unsigned long i_ino, FhEntry *f_handle)
+{
+	struct mount_info *m;
+	fh_t handle;
+	int fd = -1;
+	char *path;
+
+	decode_handle(&handle, f_handle);
+
+	/*
+	 * We gonna try to open the handle and then
+	 * depending on command line options and type
+	 * of the filesystem (tmpfs/devtmpfs do not
+	 * preserve their inodes between mounts) we
+	 * might need to find out an openable path
+	 * get used on restore as a watch destination.
+	 */
+	for (m = mntinfo; m; m = m->next) {
+		char buf[PATH_MAX], *__path;
+		int mntfd, openable_fd;
+		struct stat st;
+
+		if (m->s_dev != s_dev)
+			continue;
+
+		mntfd = __open_mountpoint(m, -1);
+		pr_debug("\t\tTrying via mntid %d root %s ns_mountpoint @%s (%d)\n",
+			 m->mnt_id, m->root, m->ns_mountpoint, mntfd);
+		if (mntfd < 0)
+			continue;
+
+		fd = userns_call(open_by_handle, UNS_FDOUT, &handle,
+				 sizeof(handle), mntfd);
+		close(mntfd);
+		if (fd < 0)
+			continue;
+
+		if (read_fd_link(fd, buf, sizeof(buf)) < 0) {
+			close(fd);
+			goto err;
+		}
+		close(fd);
+
+		/*
+		 * Convert into a relative path.
+		 */
+		__path = (buf[1] != '\0') ? buf + 1 : ".";
+		pr_debug("\t\t\tlink as %s\n", __path);
+
+		mntfd = mntns_get_root_fd(m->nsid);
+		if (mntfd < 0)
+			goto err;
+
+		openable_fd = openat(mntfd, __path, O_PATH);
+		if (openable_fd >= 0) {
+			if (fstat(openable_fd, &st)) {
+				/*
+				 * Logging and close() may change errno,
+				 * so keep the fstat() error aside.
+				 */
+				int saved_errno = errno;
+
+				pr_perror("Can't stat on %s", __path);
+				close(openable_fd);
+				return ERR_PTR(-saved_errno);
+			}
+			close(openable_fd);
+
+			pr_debug("\t\t\topenable (inode %s) as %s\n",
+				 st.st_ino == i_ino ?
+				 "match" : "don't match", __path);
+
+			if (st.st_ino == i_ino) {
+				path = xstrdup(buf);
+				if (path == NULL)
+					goto err;
+
+				f_handle->has_mnt_id = true;
+				f_handle->mnt_id = m->mnt_id;
+				return path;
+			}
+		} else
+			pr_debug("\t\t\tnot openable as %s (%m)\n", __path);
+	}
+
+	return ERR_PTR(-ENOENT);
+err:
+	return ERR_PTR(-1);
+}
+
+/*
+ * Open the file handle @f_handle via the mount found for device @s_dev.
+ *
+ * Returns the opened fd, or a negative value when the mount cannot be
+ * opened or the handle cannot be opened. @i_ino is only used in the
+ * error message.
+ */
+static int open_handle(unsigned int s_dev, unsigned long i_ino,
+ FhEntry *f_handle)
+{
+ int mntfd, fd = -1;
+ fh_t handle;
+
+ decode_handle(&handle, f_handle);
+
+ pr_debug("Opening fhandle %x:%Lx...\n",
+ s_dev, (unsigned long long)handle.__handle[0]);
+
+ mntfd = open_mount(s_dev);
+ if (mntfd < 0) {
+ pr_err("Mount root for 0x%08x not found\n", s_dev);
+ goto out;
+ }
+
+ fd = userns_call(open_by_handle, UNS_FDOUT, &handle, sizeof(handle), mntfd);
+ if (fd < 0) {
+ /* the negative result is used as the error code for the report */
+ errno = -fd;
+ pr_perror("Can't open file handle for 0x%08x:0x%016lx",
+ s_dev, i_ino);
+ }
+
+ close(mntfd);
+out:
+ return fd;
+}
+
+/*
+ * Decide how the watchee described by @f_handle is going to be found on
+ * restore and store the chosen path (if any) in f_handle->path.
+ *
+ * If the handle can be opened, a path is searched with alloc_openable();
+ * without one, tmpfs/devtmpfs mounts are an error and other mounts are
+ * dumped with no path unless opts.force_irmap is set. If the handle
+ * cannot be opened, or irmap is forced, the path comes from
+ * irmap_lookup().
+ *
+ * Returns 0 on success (f_handle->path may stay unset), -1 on error.
+ */
+int check_open_handle(unsigned int s_dev, unsigned long i_ino,
+		      FhEntry *f_handle)
+{
+	int fd = -1;
+	char *path;
+
+	fd = open_handle(s_dev, i_ino, f_handle);
+	if (fd >= 0) {
+		struct mount_info *mi;
+
+		pr_debug("\tHandle 0x%x:0x%lx is openable\n", s_dev, i_ino);
+
+		mi = lookup_mnt_sdev(s_dev);
+		if (mi == NULL) {
+			pr_err("Unable to lookup a mount by dev 0x%x\n", s_dev);
+			goto err;
+		}
+
+		/*
+		 * Always try to fetch watchee path first. There are several reasons:
+		 *
+		 *  - tmpfs/devtmpfs do not save inode numbers between mounts,
+		 *    so it is critical to have the complete path under our
+		 *    hands for restore purpose;
+		 *
+		 *  - in case of migration the inodes might be changed as well
+		 *    so the only portable solution is to carry the whole path
+		 *    to the watchee inside image.
+		 */
+		path = alloc_openable(s_dev, i_ino, f_handle);
+		if (!IS_ERR_OR_NULL(path))
+			goto out;
+
+		if ((mi->fstype->code == FSTYPE__TMPFS) ||
+		    (mi->fstype->code == FSTYPE__DEVTMPFS)) {
+			pr_err("Can't find suitable path for handle (dev %#x ino %#lx): %d\n",
+			       s_dev, i_ino, (int)PTR_ERR(path));
+			goto err;
+		}
+
+		if (!opts.force_irmap)
+			/*
+			 * If we're not forced to do irmap, then
+			 * say we have no path for watch. Otherwise
+			 * do irmap scan even if the handle is
+			 * working.
+			 *
+			 * FIXME -- no need to open-by-handle if
+			 * we are in force-irmap and not on tmpfs
+			 */
+			goto out_nopath;
+	}
+
+	pr_warn("\tHandle 0x%x:0x%lx cannot be opened\n", s_dev, i_ino);
+	path = irmap_lookup(s_dev, i_ino);
+	if (!path) {
+		pr_err("\tCan't dump that handle\n");
+		/* fd is still open here when irmap was forced */
+		goto err;
+	}
+out:
+	pr_debug("\tDumping %s as path for handle\n", path);
+	f_handle->path = path;
+out_nopath:
+	close_safe(&fd);
+	return 0;
+err:
+	close_safe(&fd);
+	return -1;
+}
+
+/*
+ * Accumulator handed to parse_fdinfo() while dumping: the callbacks
+ * chain the parsed entries on @list and count them in @n.
+ */
+struct watch_list {
+ struct fsnotify_params fsn_params; /* faflags/evflags read back by dump_one_fanotify() */
+ struct list_head list;
+ int n;
+};
+
+/*
+ * parse_fdinfo() callback for one inotify watch.
+ *
+ * Logs the watch, resolves how its handle will be found on restore
+ * (check_open_handle()) and queues the entry on the watch_list passed
+ * in @arg. On failure the entry is freed and -1 is returned.
+ */
+static int dump_inotify_entry(union fdinfo_entries *e, void *arg)
+{
+ struct watch_list *wd_list = (struct watch_list *) arg;
+ struct inotify_wd_entry *wd_entry = (struct inotify_wd_entry *) e;
+ InotifyWdEntry *we = &wd_entry->e;
+
+ pr_info("wd: wd 0x%08x s_dev 0x%08x i_ino 0x%16"PRIx64" mask 0x%08x\n",
+ we->wd, we->s_dev, we->i_ino, we->mask);
+ pr_info("\t[fhandle] bytes 0x%08x type 0x%08x __handle 0x%016"PRIx64":0x%016"PRIx64"\n",
+ we->f_handle->bytes, we->f_handle->type,
+ we->f_handle->handle[0], we->f_handle->handle[1]);
+
+ if (we->mask & KERNEL_FS_EVENT_ON_CHILD)
+ pr_warn_once("\t\tDetected FS_EVENT_ON_CHILD bit "
+ "in mask (will be ignored on restore)\n");
+
+ if (check_open_handle(we->s_dev, we->i_ino, we->f_handle)) {
+ free_inotify_wd_entry(e);
+ return -1;
+ }
+
+ /* from here on the entry is owned by the list */
+ list_add_tail(&wd_entry->node, &wd_list->list);
+ wd_list->n++;
+
+ return 0;
+}
+
+/*
+ * Dump one inotify file: collect its watches from fdinfo and write
+ * them together with id, flags and fown as a single InotifyFileEntry.
+ *
+ * Pending events are not dumped, only a warning is printed.
+ * Returns 0 on success, -1 on error.
+ */
+static int dump_one_inotify(int lfd, u32 id, const struct fd_parms *p)
+{
+ struct watch_list wd_list = {.list = LIST_HEAD_INIT(wd_list.list), .n = 0};
+ InotifyFileEntry ie = INOTIFY_FILE_ENTRY__INIT;
+ union fdinfo_entries *we, *tmp;
+ int exit_code = -1, i, ret;
+
+ ret = fd_has_data(lfd);
+ if (ret < 0)
+ return -1;
+ else if (ret > 0)
+ pr_warn("The 0x%08x inotify events will be dropped\n", id);
+
+ ie.id = id;
+ ie.flags = p->flags;
+ ie.fown = (FownEntry *)&p->fown;
+
+ if (parse_fdinfo(lfd, FD_TYPES__INOTIFY, dump_inotify_entry, &wd_list))
+ goto free;
+
+ /* array of pointers into the collected entries, one per watch */
+ ie.wd = xmalloc(sizeof(*ie.wd) * wd_list.n);
+ if (!ie.wd)
+ goto free;
+
+ i = 0;
+ list_for_each_entry(we, &wd_list.list, ify.node)
+ ie.wd[i++] = &we->ify.e;
+ ie.n_wd = wd_list.n;
+
+ pr_info("id 0x%08x flags 0x%08x\n", ie.id, ie.flags);
+ if (pb_write_one(img_from_set(glob_imgset, CR_FD_INOTIFY_FILE), &ie, PB_INOTIFY_FILE))
+ goto free;
+
+ exit_code = 0;
+free:
+ /* reached on success too: the collected entries are always released */
+ xfree(ie.wd);
+ list_for_each_entry_safe(we, tmp, &wd_list.list, ify.node)
+ free_inotify_wd_entry(we);
+
+ return exit_code;
+}
+
+/*
+ * parse_fdinfo_pid() callback used on pre-dump: hand the watch over to
+ * irmap_queue_cache() and release the parsed entry. @arg is not used.
+ */
+static int pre_dump_inotify_entry(union fdinfo_entries *e, void *arg)
+{
+	InotifyWdEntry *wd = &e->ify.e;
+	int rc = irmap_queue_cache(wd->s_dev, wd->i_ino, wd->f_handle);
+
+	free_inotify_wd_entry(e);
+	return rc;
+}
+
+/* Pre-dump of one inotify fd: run pre_dump_inotify_entry() on each of its watches. */
+static int pre_dump_one_inotify(int pid, int lfd)
+{
+ return parse_fdinfo_pid(pid, lfd, FD_TYPES__INOTIFY, pre_dump_inotify_entry, NULL);
+}
+
+/* Dump and pre-dump handlers for inotify file descriptors. */
+const struct fdtype_ops inotify_dump_ops = {
+ .type = FD_TYPES__INOTIFY,
+ .dump = dump_one_inotify,
+ .pre_dump = pre_dump_one_inotify,
+};
+
+/*
+ * parse_fdinfo() callback for one fanotify mark.
+ *
+ * Inode marks get their handle checked with check_open_handle(); mount
+ * marks get s_dev filled in from the mount found by mnt_id. The entry
+ * is then queued on the watch_list passed in @arg. On failure the entry
+ * is freed and -1 is returned.
+ */
+static int dump_fanotify_entry(union fdinfo_entries *e, void *arg)
+{
+	struct watch_list *wl = arg;
+	FanotifyMarkEntry *fme = &e->ffy.e;
+
+	if (fme->type == MARK_TYPE__INODE) {
+		BUG_ON(!fme->ie);
+
+		pr_info("mark: s_dev 0x%08x i_ino 0x%016"PRIx64" mask 0x%08x\n",
+			fme->s_dev, fme->ie->i_ino, fme->mask);
+
+		pr_info("\t[fhandle] bytes 0x%08x type 0x%08x __handle 0x%016"PRIx64":0x%016"PRIx64"\n",
+			fme->ie->f_handle->bytes, fme->ie->f_handle->type,
+			fme->ie->f_handle->handle[0], fme->ie->f_handle->handle[1]);
+
+		if (check_open_handle(fme->s_dev, fme->ie->i_ino, fme->ie->f_handle))
+			goto drop;
+	} else if (fme->type == MARK_TYPE__MOUNT) {
+		struct mount_info *mi;
+
+		BUG_ON(!fme->me);
+
+		mi = lookup_mnt_id(fme->me->mnt_id);
+		if (!mi) {
+			pr_err("Can't find mnt_id 0x%x\n", fme->me->mnt_id);
+			goto drop;
+		}
+		fme->s_dev = mi->s_dev;
+
+		pr_info("mark: s_dev 0x%08x mnt_id 0x%08x mask 0x%08x\n",
+			fme->s_dev, fme->me->mnt_id, fme->mask);
+	}
+
+	list_add_tail(&e->ffy.node, &wl->list);
+	wl->n++;
+	return 0;
+
+drop:
+	free_fanotify_mark_entry(e);
+	return -1;
+}
+
+/*
+ * Dump one fanotify file: collect its marks from fdinfo and write them
+ * together with id, flags, fown and the fanotify init flags as a single
+ * FanotifyFileEntry.
+ *
+ * Pending events are not dumped, only a warning is printed.
+ * Returns 0 on success, negative on error.
+ */
+static int dump_one_fanotify(int lfd, u32 id, const struct fd_parms *p)
+{
+	struct watch_list wd_list = {.list = LIST_HEAD_INIT(wd_list.list), .n = 0};
+	FanotifyFileEntry fe = FANOTIFY_FILE_ENTRY__INIT;
+	union fdinfo_entries *we, *tmp;
+	int ret, i;
+
+	ret = fd_has_data(lfd);
+	if (ret < 0)
+		return -1;
+	else if (ret > 0)
+		pr_warn("The 0x%08x fanotify events will be dropped\n", id);
+	ret = -1;
+
+	fe.id = id;
+	fe.flags = p->flags;
+	fe.fown = (FownEntry *)&p->fown;
+
+	if (parse_fdinfo(lfd, FD_TYPES__FANOTIFY,
+			 dump_fanotify_entry, &wd_list) < 0)
+		goto free;
+
+	fe.mark = xmalloc(sizeof(*fe.mark) * wd_list.n);
+	if (!fe.mark)
+		goto free;
+
+	/*
+	 * dump_fanotify_entry() links entries through ffy.node, so the
+	 * list has to be walked through the same member.
+	 */
+	i = 0;
+	list_for_each_entry(we, &wd_list.list, ffy.node)
+		fe.mark[i++] = &we->ffy.e;
+	fe.n_mark = wd_list.n;
+
+	pr_info("id 0x%08x flags 0x%08x\n", fe.id, fe.flags);
+
+	fe.faflags = wd_list.fsn_params.faflags;
+	fe.evflags = wd_list.fsn_params.evflags;
+
+	ret = pb_write_one(img_from_set(glob_imgset, CR_FD_FANOTIFY_FILE), &fe, PB_FANOTIFY_FILE);
+free:
+	/* reached on success too: the collected entries are always released */
+	xfree(fe.mark);
+	list_for_each_entry_safe(we, tmp, &wd_list.list, ffy.node)
+		free_fanotify_mark_entry(we);
+	return ret;
+}
+
+/*
+ * parse_fdinfo_pid() callback used on pre-dump: inode marks are handed
+ * over to irmap_queue_cache(), other mark types are only released.
+ * @arg is not used.
+ */
+static int pre_dump_fanotify_entry(union fdinfo_entries *e, void *arg)
+{
+ FanotifyMarkEntry *fme = &e->ffy.e;
+ int ret = 0;
+
+ if (fme->type == MARK_TYPE__INODE)
+ ret = irmap_queue_cache(fme->s_dev, fme->ie->i_ino,
+ fme->ie->f_handle);
+
+ free_fanotify_mark_entry(e);
+ return ret;
+}
+
+/*
+ * Pre-dump of one fanotify fd: run pre_dump_fanotify_entry() on each of
+ * its marks. fsn_params is passed as the parser argument and its
+ * contents are not looked at afterwards.
+ */
+static int pre_dump_one_fanotify(int pid, int lfd)
+{
+ struct fsnotify_params fsn_params = { };
+ return parse_fdinfo_pid(pid, lfd, FD_TYPES__FANOTIFY, pre_dump_fanotify_entry, &fsn_params);
+}
+
+/* Dump and pre-dump handlers for fanotify file descriptors. */
+const struct fdtype_ops fanotify_dump_ops = {
+ .type = FD_TYPES__FANOTIFY,
+ .dump = dump_one_fanotify,
+ .pre_dump = pre_dump_one_fanotify,
+};
+
+/*
+ * Open the watch target and build a /proc/self/fd/N path for it.
+ *
+ * The target is opened, in order of preference, via the ghost @remap,
+ * via the path hint stored in @f_handle, or via the file handle itself.
+ * The opened fd is stored in *@target (the caller closes it) and the
+ * proc path is printed into @buf, which must be large enough for
+ * "/proc/self/fd/<int>".
+ *
+ * @who is only used in debug messages.
+ *
+ * Returns @buf on success, NULL if the target could not be opened.
+ */
+static char *get_mark_path(const char *who, struct file_remap *remap,
+			   FhEntry *f_handle, unsigned long i_ino,
+			   unsigned int s_dev, char *buf, int *target)
+{
+	const char *what;	/* what we tried to open, for the error report */
+	char *path = NULL;
+
+	if (remap) {
+		int mntns_root;
+
+		mntns_root = mntns_get_root_by_mnt_id(remap->rmnt_id);
+
+		pr_debug("\t\tRestore %s watch for 0x%08x:0x%016lx (via %s)\n",
+			 who, s_dev, i_ino, remap->rpath);
+		what = remap->rpath;
+		*target = openat(mntns_root, remap->rpath, O_PATH);
+	} else if (f_handle->path) {
+		int mntns_root;
+		char *rel = ".";
+		uint32_t mnt_id = f_handle->has_mnt_id ? f_handle->mnt_id : -1;
+
+		/* irmap cache is collected in the root namespaces. */
+		mntns_root = mntns_get_root_by_mnt_id(mnt_id);
+
+		/* change "/foo" into "foo" and "/" into "." */
+		if (f_handle->path[1] != '\0')
+			rel = f_handle->path + 1;
+
+		pr_debug("\t\tRestore with path hint %d:%s\n", mnt_id, rel);
+		what = f_handle->path;
+		*target = openat(mntns_root, rel, O_PATH);
+	} else {
+		/* no path at all here, so don't feed NULL to %s below */
+		what = "file handle";
+		*target = open_handle(s_dev, i_ino, f_handle);
+	}
+
+	if (*target < 0) {
+		pr_perror("Unable to open %s", what);
+		goto err;
+	}
+
+	/*
+	 * fanotify/inotify open syscalls want path to attach
+	 * watch to. But the only thing we have is an FD obtained
+	 * via fhandle. Fortunately, when trying to attach the
+	 * /proc/pid/fd/ link, we will watch the inode the link
+	 * points to, i.e. -- just what we want.
+	 */
+
+	sprintf(buf, "/proc/self/fd/%d", *target);
+	path = buf;
+
+	if (!pr_quelled(LOG_DEBUG)) {
+		char link[PATH_MAX];
+
+		if (read_fd_link(*target, link, sizeof(link)) < 0)
+			link[0] = '\0';
+
+		pr_debug("\t\tRestore %s watch for 0x%08x:0x%016lx (via %s -> %s)\n",
+			 who, s_dev, i_ino, path, link);
+	}
+err:
+	return path;
+}
+
+/*
+ * Re-create one inotify watch on @inotify_fd with the same watch
+ * descriptor it had at dump time.
+ *
+ * Watches are added and removed again until the wanted descriptor
+ * shows up. Getting a descriptor above the wanted one is an error.
+ * The ghost remap reference (if any) is dropped and the target fd is
+ * closed on every exit path.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+static int restore_one_inotify(int inotify_fd, struct fsnotify_mark_info *info)
+{
+	InotifyWdEntry *iwe = info->iwe;
+	char buf[PSFDS], *path;
+	int target = -1;
+	int ret = -1;
+
+	path = get_mark_path("inotify", info->remap, iwe->f_handle,
+			     iwe->i_ino, iwe->s_dev, buf, &target);
+	if (path) {
+		/*
+		 * FIXME The kernel allocates wd-s sequentially,
+		 * this is suboptimal, but the kernel doesn't
+		 * provide an API for this yet :(
+		 */
+		for (;;) {
+			int wd = inotify_add_watch(inotify_fd, path, iwe->mask);
+
+			if (wd < 0) {
+				pr_perror("Can't add watch for 0x%x with 0x%x", inotify_fd, iwe->wd);
+				break;
+			}
+
+			if (wd == iwe->wd) {
+				ret = 0;
+				break;
+			}
+
+			if (wd > iwe->wd) {
+				pr_err("Unsorted watch 0x%x found for 0x%x with 0x%x\n", wd, inotify_fd, iwe->wd);
+				break;
+			}
+
+			pr_debug("\t\tWatch got 0x%x but 0x%x expected\n", wd, iwe->wd);
+			inotify_rm_watch(inotify_fd, wd);
+		}
+	}
+
+	if (info->remap)
+		remap_put(info->remap);
+
+	close_safe(&target);
+	return ret;
+}
+
+/*
+ * Re-create one fanotify mark on the fanotify descriptor @fd.
+ *
+ * Mount marks are attached to the mount point opened relative to the
+ * mount namespace root, inode marks to the path from get_mark_path().
+ * The event mask and the ignored mask are installed separately, and
+ * only when they are non-zero.
+ *
+ * Returns 0 on success, non-zero on error.
+ */
+static int restore_one_fanotify(int fd, struct fsnotify_mark_info *mark)
+{
+	FanotifyMarkEntry *fme = mark->fme;
+	unsigned int flags = FAN_MARK_ADD;
+	int ret = -1, target = -1;
+	char buf[PSFDS], *path = NULL;
+
+	if (fme->type == MARK_TYPE__MOUNT) {
+		struct mount_info *m;
+		int mntns_root;
+
+		m = lookup_mnt_id(fme->me->mnt_id);
+		if (!m) {
+			pr_err("Can't find mount mnt_id 0x%x\n", fme->me->mnt_id);
+			return -1;
+		}
+
+		mntns_root = mntns_get_root_fd(m->nsid);
+
+		target = openat(mntns_root, m->ns_mountpoint, O_PATH);
+		if (target == -1) {
+			pr_perror("Unable to open %s", m->ns_mountpoint);
+			goto err;
+		}
+
+		flags |= FAN_MARK_MOUNT;
+		snprintf(buf, sizeof(buf), "/proc/self/fd/%d", target);
+		path = buf;
+	} else if (fme->type == MARK_TYPE__INODE) {
+		path = get_mark_path("fanotify", mark->remap,
+				     fme->ie->f_handle, fme->ie->i_ino,
+				     fme->s_dev, buf, &target);
+		if (!path)
+			goto err;
+	} else {
+		pr_err("Bad fsnotify mark type 0x%x\n", fme->type);
+		goto err;
+	}
+
+	flags |= fme->mflags;
+
+	/*
+	 * The target is resolved, so nothing has failed yet. Without
+	 * this a mark with both masks empty would return the initial -1.
+	 */
+	ret = 0;
+
+	if (mark->fme->mask) {
+		ret = fanotify_mark(fd, flags, fme->mask, AT_FDCWD, path);
+		if (ret) {
+			pr_err("Adding fanotify mask 0x%x on 0x%x/%s failed (%d)\n",
+			       fme->mask, fme->id, path, ret);
+			goto err;
+		}
+	}
+
+	if (fme->ignored_mask) {
+		ret = fanotify_mark(fd, flags | FAN_MARK_IGNORED_MASK,
+				    fme->ignored_mask, AT_FDCWD, path);
+		if (ret) {
+			pr_err("Adding fanotify ignored-mask 0x%x on 0x%x/%s failed (%d)\n",
+			       fme->ignored_mask, fme->id, path, ret);
+			goto err;
+		}
+	}
+
+	if (mark->remap)
+		remap_put(mark->remap);
+
+err:
+	close_safe(&target);
+	return ret;
+}
+
+/*
+ * file_desc open callback: create the inotify descriptor, restore all
+ * of its watches and then its owner settings.
+ *
+ * Returns the new descriptor, or -1 on error (the descriptor is closed
+ * in that case).
+ */
+static int open_inotify_fd(struct file_desc *d)
+{
+	struct fsnotify_file_info *info;
+	struct fsnotify_mark_info *wd_info;
+	int tmp;
+
+	info = container_of(d, struct fsnotify_file_info, d);
+
+	tmp = inotify_init1(info->ife->flags);
+	if (tmp < 0) {
+		pr_perror("Can't create inotify for 0x%08x", info->ife->id);
+		return -1;
+	}
+
+	list_for_each_entry(wd_info, &info->marks, list) {
+		pr_info("\tRestore 0x%x wd for 0x%08x\n", wd_info->iwe->wd, wd_info->iwe->id);
+		/* bail out at once, don't run restore_fown() on a closed fd */
+		if (restore_one_inotify(tmp, wd_info))
+			goto err;
+	}
+
+	if (restore_fown(tmp, info->ife->fown))
+		goto err;
+
+	return tmp;
+err:
+	close_safe(&tmp);
+	return -1;
+}
+
+/*
+ * file_desc open callback: create the fanotify descriptor, restore all
+ * of its marks and then its owner settings.
+ *
+ * The init flags come from the dumped faflags; O_CLOEXEC and O_NONBLOCK
+ * from the file flags are translated into their FAN_* counterparts.
+ *
+ * Returns the new descriptor, or -1 on error (the descriptor is closed
+ * in that case).
+ */
+static int open_fanotify_fd(struct file_desc *d)
+{
+	struct fsnotify_file_info *info;
+	struct fsnotify_mark_info *mark;
+	unsigned int flags = 0;
+	int ret;
+
+	info = container_of(d, struct fsnotify_file_info, d);
+
+	flags = info->ffe->faflags;
+	if (info->ffe->flags & O_CLOEXEC)
+		flags |= FAN_CLOEXEC;
+	if (info->ffe->flags & O_NONBLOCK)
+		flags |= FAN_NONBLOCK;
+
+	ret = fanotify_init(flags, info->ffe->evflags);
+	if (ret < 0) {
+		/* errno is already set by fanotify_init(), keep it intact */
+		pr_perror("Can't init fanotify mark (%d)", ret);
+		return -1;
+	}
+
+	list_for_each_entry(mark, &info->marks, list) {
+		pr_info("\tRestore fanotify for 0x%08x\n", mark->fme->id);
+		/* bail out at once, don't run restore_fown() on a closed fd */
+		if (restore_one_fanotify(ret, mark))
+			goto err;
+	}
+
+	if (restore_fown(ret, info->ffe->fown))
+		goto err;
+
+	return ret;
+err:
+	close_safe(&ret);
+	return -1;
+}
+
+/* Restore-side operations for inotify files (passed to file_desc_add()). */
+static struct file_desc_ops inotify_desc_ops = {
+ .type = FD_TYPES__INOTIFY,
+ .open = open_inotify_fd,
+};
+
+/* Restore-side operations for fanotify files (passed to file_desc_add()). */
+static struct file_desc_ops fanotify_desc_ops = {
+ .type = FD_TYPES__FANOTIFY,
+ .open = open_fanotify_fd,
+};
+
+/*
+ * Find the collected inotify file with the given @id.
+ *
+ * The most recent hit is remembered in a function-local static, so a
+ * run of lookups for the same id does not rescan inotify_info_head.
+ * Returns NULL (and logs an error) when the id is unknown.
+ */
+static struct fsnotify_file_info *find_inotify_info(unsigned id)
+{
+	static struct fsnotify_file_info *cached = NULL;
+	struct fsnotify_file_info *cur;
+
+	if (cached != NULL && cached->ife->id == id) {
+		/*
+		 * An optimization for clean dump image -- criu puts
+		 * wd-s for one inotify in one row, thus sometimes
+		 * we can avoid scanning the inotify_info_head.
+		 */
+		pr_debug("\t\tlast ify for 0x%08x found\n", id);
+		return cached;
+	}
+
+	list_for_each_entry(cur, &inotify_info_head, list) {
+		if (cur->ife->id != id)
+			continue;
+
+		cached = cur;
+		return cur;
+	}
+
+	pr_err("Can't find inotify with id 0x%08x\n", id);
+	return NULL;
+}
+
+/*
+ * Attach @mark to the inotify file @p, keeping p->marks sorted by
+ * ascending wd, and look up the ghost remap for the watched inode.
+ * Always returns 0.
+ */
+static int __collect_inotify_mark(struct fsnotify_file_info *p, struct fsnotify_mark_info *mark)
+{
+ struct fsnotify_mark_info *m;
+
+ /*
+ * We should put marks in wd ascending order. See comment
+ * in restore_one_inotify() for explanation.
+ */
+ list_for_each_entry(m, &p->marks, list)
+ if (m->iwe->wd > mark->iwe->wd)
+ break;
+
+ /*
+ * Insert in front of the first mark with a bigger wd. If the loop
+ * ran to the end, &m->list is used as the insertion point without
+ * dereferencing any other field of m.
+ */
+ list_add_tail(&mark->list, &m->list);
+ mark->remap = lookup_ghost_remap(mark->iwe->s_dev, mark->iwe->i_ino);
+ return 0;
+}
+
+/*
+ * Attach @mark to the inotify file it belongs to (matched by id).
+ * Returns -1 when no such file has been collected.
+ */
+static int collect_inotify_mark(struct fsnotify_mark_info *mark)
+{
+	struct fsnotify_file_info *owner = find_inotify_info(mark->iwe->id);
+
+	if (owner == NULL)
+		return -1;
+
+	return __collect_inotify_mark(owner, mark);
+}
+
+/*
+ * Attach @mark to the fanotify file @p. Marks are put at the head of
+ * p->marks; the ghost remap is looked up for inode marks only.
+ * Always returns 0.
+ */
+static int __collect_fanotify_mark(struct fsnotify_file_info *p,
+ struct fsnotify_mark_info *mark)
+{
+ list_add(&mark->list, &p->marks);
+ if (mark->fme->type == MARK_TYPE__INODE)
+ mark->remap = lookup_ghost_remap(mark->fme->s_dev,
+ mark->fme->ie->i_ino);
+ return 0;
+}
+
+/*
+ * Attach @mark to the fanotify file it belongs to (matched by id).
+ *
+ * The fanotify helper has to be used here: the inotify one would read
+ * the mark through the iwe member of the union, which is not what a
+ * fanotify mark holds.
+ *
+ * Returns 0 on success, -1 when no such file has been collected.
+ */
+static int collect_fanotify_mark(struct fsnotify_mark_info *mark)
+{
+	struct fsnotify_file_info *p;
+
+	list_for_each_entry(p, &fanotify_info_head, list) {
+		if (p->ffe->id == mark->fme->id)
+			return __collect_fanotify_mark(p, mark);
+	}
+
+	pr_err("Can't find fanotify with id 0x%08x\n", mark->fme->id);
+	return -1;
+}
+
+/*
+ * Collect callback for an inotify file entry: register the file on
+ * inotify_info_head, create a mark for every watch embedded in the
+ * entry and add the file descriptor with file_desc_add().
+ *
+ * Returns the file_desc_add() result, or -1 on allocation/collect
+ * failure.
+ */
+static int collect_one_inotify(void *o, ProtobufCMessage *msg)
+{
+	struct fsnotify_file_info *info = o;
+	InotifyFileEntry *ife;
+	int i;
+
+	ife = pb_msg(msg, InotifyFileEntry);
+	info->ife = ife;
+	INIT_LIST_HEAD(&info->marks);
+	list_add(&info->list, &inotify_info_head);
+	pr_info("Collected id 0x%08x flags 0x%08x\n", ife->id, ife->flags);
+
+	for (i = 0; i < ife->n_wd; i++) {
+		struct fsnotify_mark_info *mark = xmalloc(sizeof(*mark));
+
+		if (mark == NULL)
+			return -1;
+
+		INIT_LIST_HEAD(&mark->list);
+		mark->iwe = ife->wd[i];
+		mark->remap = NULL;
+
+		if (__collect_inotify_mark(info, mark) != 0)
+			return -1;
+	}
+
+	return file_desc_add(&info->d, ife->id, &inotify_desc_ops);
+}
+
+/* Image collection descriptor for inotify files; entries go to collect_one_inotify(). */
+struct collect_image_info inotify_cinfo = {
+ .fd_type = CR_FD_INOTIFY_FILE,
+ .pb_type = PB_INOTIFY_FILE,
+ .priv_size = sizeof(struct fsnotify_file_info),
+ .collect = collect_one_inotify,
+};
+
+/*
+ * Collect callback for a fanotify file entry: register the file on
+ * fanotify_info_head, create a mark for every mark embedded in the
+ * entry and add the file descriptor with file_desc_add().
+ *
+ * Returns the file_desc_add() result, or -1 on allocation/collect
+ * failure.
+ */
+static int collect_one_fanotify(void *o, ProtobufCMessage *msg)
+{
+	struct fsnotify_file_info *info = o;
+	FanotifyFileEntry *ffe;
+	int i;
+
+	ffe = pb_msg(msg, FanotifyFileEntry);
+	info->ffe = ffe;
+	INIT_LIST_HEAD(&info->marks);
+	list_add(&info->list, &fanotify_info_head);
+	pr_info("Collected id 0x%08x flags 0x%08x\n", ffe->id, ffe->flags);
+
+	for (i = 0; i < ffe->n_mark; i++) {
+		struct fsnotify_mark_info *mark = xmalloc(sizeof(*mark));
+
+		if (mark == NULL)
+			return -1;
+
+		INIT_LIST_HEAD(&mark->list);
+		mark->fme = ffe->mark[i];
+		mark->remap = NULL;
+
+		if (__collect_fanotify_mark(info, mark) != 0)
+			return -1;
+	}
+
+	return file_desc_add(&info->d, ffe->id, &fanotify_desc_ops);
+}
+
+/* Image collection descriptor for fanotify files; entries go to collect_one_fanotify(). */
+struct collect_image_info fanotify_cinfo = {
+ .fd_type = CR_FD_FANOTIFY_FILE,
+ .pb_type = PB_FANOTIFY_FILE,
+ .priv_size = sizeof(struct fsnotify_file_info),
+ .collect = collect_one_fanotify,
+};
+
+/*
+ * Collect callback for a stand-alone inotify watch entry: the mark is
+ * initialized and attached to its inotify file, which is looked up by
+ * the id stored in the entry.
+ */
+static int collect_one_inotify_mark(void *o, ProtobufCMessage *msg)
+{
+ struct fsnotify_mark_info *mark = o;
+
+ mark->iwe = pb_msg(msg, InotifyWdEntry);
+ INIT_LIST_HEAD(&mark->list);
+ mark->remap = NULL;
+
+ /*
+ * The kernel prior 4.3 might export internal event
+ * mask bits which are not part of user-space API. It
+ * is fixed in kernel but we have to keep backward
+ * compatibility with old images. So mask out
+ * inappropriate bits (in particular fdinfo might
+ * have FS_EVENT_ON_CHILD bit set).
+ */
+ mark->iwe->mask &= ~KERNEL_FS_EVENT_ON_CHILD;
+
+ return collect_inotify_mark(mark);
+}
+
+/* Image collection descriptor for stand-alone inotify watch entries. */
+struct collect_image_info inotify_mark_cinfo = {
+ .fd_type = CR_FD_INOTIFY_WD,
+ .pb_type = PB_INOTIFY_WD,
+ .priv_size = sizeof(struct fsnotify_mark_info),
+ .collect = collect_one_inotify_mark,
+};
+
+/*
+ * Collect callback for a stand-alone fanotify mark entry: the mark is
+ * initialized and handed to collect_fanotify_mark().
+ */
+static int collect_one_fanotify_mark(void *o, ProtobufCMessage *msg)
+{
+ struct fsnotify_mark_info *mark = o;
+
+ mark->fme = pb_msg(msg, FanotifyMarkEntry);
+ INIT_LIST_HEAD(&mark->list);
+ mark->remap = NULL;
+
+ return collect_fanotify_mark(mark);
+}
+
+/* Image collection descriptor for stand-alone fanotify mark entries. */
+struct collect_image_info fanotify_mark_cinfo = {
+ .fd_type = CR_FD_FANOTIFY_MARK,
+ .pb_type = PB_FANOTIFY_MARK,
+ .priv_size = sizeof(struct fsnotify_mark_info),
+ .collect = collect_one_fanotify_mark,
+};
diff --git a/criu/image-desc.c b/criu/image-desc.c
new file mode 100644
index 000000000000..677067538120
--- /dev/null
+++ b/criu/image-desc.c
@@ -0,0 +1,117 @@
+#include <stdlib.h>
+
+#include "image-desc.h"
+#include "cr-show.h"
+#include "magic.h"
+#include "image.h"
+
+/*
+ * The cr fd set is the set of files where the information
+ * about dumped processes is stored. Each file carries some
+ * small portion of info about the whole picture, see below
+ * for more details.
+ */
+
+/*
+ * Template slot CR_FD_<_name>: the file name format is _fmt with
+ * ".img" appended and the magic is <_name>_MAGIC.
+ */
+#define FD_ENTRY(_name, _fmt) \
+ [CR_FD_##_name] = { \
+ .fmt = _fmt ".img", \
+ .magic = _name##_MAGIC, \
+ }
+
+/* Same as FD_ENTRY, but additionally sets the open flags to _f. */
+#define FD_ENTRY_F(_name, _fmt, _f) \
+ [CR_FD_##_name] = { \
+ .fmt = _fmt ".img", \
+ .magic = _name##_MAGIC, \
+ .oflags = _f, \
+ }
+
+/*
+ * Image descriptor templates indexed by CR_FD_* type. Most entries are
+ * generated with FD_ENTRY/FD_ENTRY_F (which append ".img" to the name
+ * format); the entries spelled out at the end set their name format
+ * verbatim.
+ */
+struct cr_fd_desc_tmpl imgset_template[CR_FD_MAX] = {
+ FD_ENTRY(INVENTORY, "inventory"),
+ FD_ENTRY(FDINFO, "fdinfo-%d"),
+ FD_ENTRY(PAGEMAP, "pagemap-%ld"),
+ FD_ENTRY(SHMEM_PAGEMAP, "pagemap-shmem-%ld"),
+ FD_ENTRY(REG_FILES, "reg-files"),
+ FD_ENTRY(EXT_FILES, "ext-files"),
+ FD_ENTRY(NS_FILES, "ns-files"),
+ FD_ENTRY(EVENTFD_FILE, "eventfd"),
+ FD_ENTRY(EVENTPOLL_FILE,"eventpoll"),
+ FD_ENTRY(EVENTPOLL_TFD, "eventpoll-tfd"),
+ FD_ENTRY(SIGNALFD, "signalfd"),
+ FD_ENTRY(INOTIFY_FILE, "inotify"),
+ FD_ENTRY(INOTIFY_WD, "inotify-wd"),
+ FD_ENTRY(FANOTIFY_FILE, "fanotify"),
+ FD_ENTRY(FANOTIFY_MARK, "fanotify-mark"),
+ FD_ENTRY(CORE, "core-%d"),
+ FD_ENTRY(IDS, "ids-%d"),
+ FD_ENTRY(MM, "mm-%d"),
+ FD_ENTRY(VMAS, "vmas-%d"),
+ FD_ENTRY(PIPES, "pipes"),
+ FD_ENTRY_F(PIPES_DATA, "pipes-data", O_NOBUF), /* splices data */
+ FD_ENTRY(FIFO, "fifo"),
+ FD_ENTRY_F(FIFO_DATA, "fifo-data", O_NOBUF), /* the same */
+ FD_ENTRY(PSTREE, "pstree"),
+ FD_ENTRY(SIGACT, "sigacts-%d"),
+ FD_ENTRY(UNIXSK, "unixsk"),
+ FD_ENTRY(INETSK, "inetsk"),
+ FD_ENTRY(PACKETSK, "packetsk"),
+ FD_ENTRY(NETLINK_SK, "netlinksk"),
+ FD_ENTRY_F(SK_QUEUES, "sk-queues", O_NOBUF), /* lseeks the image */
+ FD_ENTRY(ITIMERS, "itimers-%d"),
+ FD_ENTRY(POSIX_TIMERS, "posix-timers-%d"),
+ FD_ENTRY(CREDS, "creds-%d"),
+ FD_ENTRY(UTSNS, "utsns-%d"),
+ FD_ENTRY(IPC_VAR, "ipcns-var-%d"),
+ FD_ENTRY(IPCNS_SHM, "ipcns-shm-%d"),
+ FD_ENTRY(IPCNS_MSG, "ipcns-msg-%d"),
+ FD_ENTRY(IPCNS_SEM, "ipcns-sem-%d"),
+ FD_ENTRY(FS, "fs-%d"),
+ FD_ENTRY(REMAP_FPATH, "remap-fpath"),
+ FD_ENTRY_F(GHOST_FILE, "ghost-file-%x", O_NOBUF),
+ FD_ENTRY(TCP_STREAM, "tcp-stream-%x"),
+ FD_ENTRY(MNTS, "mountpoints-%d"),
+ FD_ENTRY(NETDEV, "netdev-%d"),
+ FD_ENTRY(NETNS, "netns-%d"),
+ FD_ENTRY_F(IFADDR, "ifaddr-%d", O_NOBUF),
+ FD_ENTRY_F(ROUTE, "route-%d", O_NOBUF),
+ FD_ENTRY_F(ROUTE6, "route6-%d", O_NOBUF),
+ FD_ENTRY_F(RULE, "rule-%d", O_NOBUF),
+ FD_ENTRY_F(IPTABLES, "iptables-%d", O_NOBUF),
+ FD_ENTRY_F(IP6TABLES, "ip6tables-%d", O_NOBUF),
+ FD_ENTRY_F(TMPFS_IMG, "tmpfs-%d.tar.gz", O_NOBUF),
+ FD_ENTRY_F(TMPFS_DEV, "tmpfs-dev-%d.tar.gz", O_NOBUF),
+ FD_ENTRY(BINFMT_MISC, "binfmt-misc-%d"),
+ FD_ENTRY(TTY_FILES, "tty"),
+ FD_ENTRY(TTY_INFO, "tty-info"),
+ FD_ENTRY(FILE_LOCKS, "filelocks"),
+ FD_ENTRY(RLIMIT, "rlimit-%d"),
+ FD_ENTRY_F(PAGES, "pages-%u", O_NOBUF),
+ FD_ENTRY_F(PAGES_OLD, "pages-%d", O_NOBUF),
+ FD_ENTRY_F(SHM_PAGES_OLD, "pages-shmem-%ld", O_NOBUF),
+ FD_ENTRY(SIGNAL, "signal-s-%d"),
+ FD_ENTRY(PSIGNAL, "signal-p-%d"),
+ FD_ENTRY(TUNFILE, "tunfile"),
+ FD_ENTRY(CGROUP, "cgroup"),
+ FD_ENTRY(TIMERFD, "timerfd"),
+ FD_ENTRY(CPUINFO, "cpuinfo"),
+ FD_ENTRY(SECCOMP, "seccomp"),
+ FD_ENTRY(USERNS, "userns-%d"),
+ FD_ENTRY(NETNF_CT, "netns-ct-%d"),
+ FD_ENTRY(NETNF_EXP, "netns-exp-%d"),
+
+ /* no ".img" suffix in the name format of the two O_SERVICE entries */
+ [CR_FD_STATS] = {
+ .fmt = "stats-%s",
+ .magic = STATS_MAGIC,
+ .oflags = O_SERVICE,
+ },
+
+ [CR_FD_IRMAP_CACHE] = {
+ .fmt = "irmap-cache",
+ .magic = IRMAP_CACHE_MAGIC,
+ .oflags = O_SERVICE,
+ },
+
+ /* shares FILE_LOCKS_MAGIC with the FILE_LOCKS entry above */
+ [CR_FD_FILE_LOCKS_PID] = {
+ .fmt = "filelocks-%d.img",
+ .magic = FILE_LOCKS_MAGIC,
+ },
+};
diff --git a/criu/image.c b/criu/image.c
new file mode 100644
index 000000000000..a164722bba5e
--- /dev/null
+++ b/criu/image.c
@@ -0,0 +1,561 @@
+#include <unistd.h>
+#include <stdarg.h>
+#include <fcntl.h>
+#include "crtools.h"
+#include "cr_options.h"
+#include "imgset.h"
+#include "image.h"
+#include "pstree.h"
+#include "stats.h"
+#include "cgroup.h"
+#include "lsm.h"
+#include "protobuf.h"
+#include "protobuf/inventory.pb-c.h"
+#include "protobuf/pagemap.pb-c.h"
+
+bool fdinfo_per_id = false;
+bool ns_per_id = false;
+bool img_common_magic = true;
+TaskKobjIdsEntry *root_ids;
+u32 root_cg_set;
+Lsmtype image_lsm;
+
+/*
+ * Load the inventory image and publish what it describes through the
+ * file-scope globals (fdinfo_per_id, ns_per_id, root_ids, root_cg_set,
+ * image_lsm, img_common_magic).
+ *
+ * Returns 0 when the inventory was read and its version is supported,
+ * -1 otherwise.
+ */
+int check_img_inventory(void)
+{
+	InventoryEntry *inv;
+	struct cr_img *img;
+	int exit_code = -1;
+
+	img = open_image(CR_FD_INVENTORY, O_RSTR);
+	if (img == NULL)
+		return -1;
+
+	if (pb_read_one(img, &inv, PB_INVENTORY) < 0) {
+		/* Nothing was unpacked, so only the image has to go. */
+		close_image(img);
+		return -1;
+	}
+
+	/* Optional fields fall back to false when absent. */
+	fdinfo_per_id = inv->has_fdinfo_per_id && inv->fdinfo_per_id;
+	ns_per_id = inv->has_ns_per_id && inv->ns_per_id;
+
+	if (inv->root_ids != NULL) {
+		/* Keep a private copy: the entry itself is freed below. */
+		root_ids = xmalloc(sizeof(*root_ids));
+		if (root_ids == NULL)
+			goto free_entry;
+
+		memcpy(root_ids, inv->root_ids, sizeof(*root_ids));
+	}
+
+	if (inv->has_root_cg_set) {
+		if (!inv->root_cg_set) {
+			pr_err("Corrupted root cgset\n");
+			goto free_entry;
+		}
+
+		root_cg_set = inv->root_cg_set;
+	}
+
+	image_lsm = inv->lsmtype;
+
+	if (inv->img_version == CRTOOLS_IMAGES_V1) {
+		/* good old images: no common magic in the head */
+		img_common_magic = false;
+	} else if (inv->img_version != CRTOOLS_IMAGES_V1_1) {
+		/* V1_1 (extra magic in the head) is the only other known one */
+		pr_err("Not supported images version %u\n", inv->img_version);
+		goto free_entry;
+	}
+
+	exit_code = 0;
+
+free_entry:
+	inventory_entry__free_unpacked(inv, NULL);
+	close_image(img);
+	return exit_code;
+}
+
+/*
+ * Write the inventory entry @he into the inventory image.
+ *
+ * he->root_ids is released and the image is closed once the write has
+ * been attempted, whether it succeeded or not. Previously a failing
+ * pb_write_one() returned early and leaked both.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int write_img_inventory(InventoryEntry *he)
+{
+	struct cr_img *img;
+	int ret;
+
+	pr_info("Writing image inventory (version %u)\n", CRTOOLS_IMAGES_V1);
+
+	img = open_image(CR_FD_INVENTORY, O_DUMP);
+	if (!img)
+		return -1;
+
+	ret = pb_write_one(img, he, PB_INVENTORY);
+
+	/* Common cleanup for the success and the failure path. */
+	xfree(he->root_ids);
+	close_image(img);
+
+	return ret < 0 ? -1 : 0;
+}
+
+/*
+ * Fill @he with the description of the images about to be written:
+ * image version, per-id flags, host LSM type, the root cgroup set and
+ * the ids of the calling process.
+ *
+ * On success he->root_ids points at the ids obtained by get_task_ids();
+ * write_img_inventory() releases them with xfree().
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int prepare_inventory(InventoryEntry *he)
+{
+	struct {
+		struct pstree_item i;
+		struct dmp_info d;
+	} crt = { };
+
+	pr_info("Perparing image inventory (version %u)\n", CRTOOLS_IMAGES_V1);
+
+	he->img_version = CRTOOLS_IMAGES_V1_1;
+	he->fdinfo_per_id = true;
+	he->has_fdinfo_per_id = true;
+	he->ns_per_id = true;
+	he->has_ns_per_id = true;
+	he->lsmtype = host_lsm_type();
+
+	/* Describe ourselves as a live task to collect our own ids. */
+	crt.i.state = TASK_ALIVE;
+	crt.i.pid.real = getpid();
+	if (get_task_ids(&crt.i))
+		return -1;
+
+	he->has_root_cg_set = true;
+	if (dump_task_cgroup(NULL, &he->root_cg_set)) {
+		/*
+		 * The ids were not handed over to @he yet, so nobody else
+		 * would free them: drop them here instead of leaking.
+		 */
+		xfree(crt.i.ids);
+		return -1;
+	}
+
+	he->root_ids = crt.i.ids;
+
+	return 0;
+}
+
+/*
+ * Allocate an image set with room for @nr images; every slot starts
+ * out as NULL. Returns the new set or NULL if an allocation failed.
+ */
+static struct cr_imgset *alloc_cr_imgset(int nr)
+{
+	struct cr_imgset *set;
+	unsigned int slot = 0;
+
+	set = xmalloc(sizeof(*set));
+	if (!set)
+		return NULL;
+
+	set->_imgs = xmalloc(nr * sizeof(struct cr_img *));
+	if (!set->_imgs) {
+		xfree(set);
+		return NULL;
+	}
+
+	/* Start with every slot empty. */
+	while (slot < nr)
+		set->_imgs[slot++] = NULL;
+
+	set->fd_nr = nr;
+	return set;
+}
+
+/*
+ * Close every image still held by @cr_imgset and clear its slot.
+ * The set itself is left allocated. A NULL set is ignored.
+ */
+static void __close_cr_imgset(struct cr_imgset *cr_imgset)
+{
+	unsigned int slot;
+
+	if (cr_imgset == NULL)
+		return;
+
+	for (slot = 0; slot < cr_imgset->fd_nr; slot++) {
+		if (cr_imgset->_imgs[slot] != NULL) {
+			close_image(cr_imgset->_imgs[slot]);
+			cr_imgset->_imgs[slot] = NULL;
+		}
+	}
+}
+
+/*
+ * Close all images of the set *@cr_imgset, free the set and reset the
+ * caller's pointer to NULL. Does nothing when @cr_imgset or the set it
+ * points to is NULL.
+ */
+void close_cr_imgset(struct cr_imgset **cr_imgset)
+{
+	struct cr_imgset *set;
+
+	if (cr_imgset == NULL)
+		return;
+
+	set = *cr_imgset;
+	if (set == NULL)
+		return;
+
+	__close_cr_imgset(set);
+
+	xfree(set->_imgs);
+	xfree(set);
+	*cr_imgset = NULL;
+}
+
+/*
+ * Open the images whose types lie strictly between @from and @to for
+ * @pid and collect them in a freshly allocated set.
+ *
+ * When @flags contains O_CREAT a failure to open any image aborts the
+ * whole operation; otherwise the slot of a missing image stays NULL
+ * and the caller has to check for it.
+ *
+ * Returns the set, or NULL on failure.
+ */
+struct cr_imgset *cr_imgset_open_range(int pid, int from, int to,
+				       unsigned long flags)
+{
+	struct cr_imgset *imgset;
+	unsigned int type;
+
+	imgset = alloc_cr_imgset(to - from);
+	if (imgset == NULL)
+		return NULL;
+
+	/* @from itself is excluded: the first real type is the next one. */
+	from += 1;
+	imgset->fd_off = from;
+
+	type = from;
+	while (type < to) {
+		struct cr_img *img = open_image(type, flags, pid);
+
+		if (img != NULL) {
+			imgset->_imgs[type - from] = img;
+		} else if (flags & O_CREAT) {
+			close_cr_imgset(&imgset);
+			return NULL;
+		}
+
+		type++;
+	}
+
+	return imgset;
+}
+
+/* Open the TASK group of images for @pid with the given @mode. */
+struct cr_imgset *cr_task_imgset_open(int pid, int mode)
+{
+	struct cr_imgset *task_set = cr_imgset_open(pid, TASK, mode);
+
+	return task_set;
+}
+
+/* Open the GLOB group of images with the given @mode; no pid applies. */
+struct cr_imgset *cr_glob_imgset_open(int mode)
+{
+	struct cr_imgset *glob_set = cr_imgset_open(-1 /* ignored */, GLOB, mode);
+
+	return glob_set;
+}
+
+static int do_open_image(struct cr_img *img, int dfd, int type, unsigned long flags, char *path);
+
+/*
+ * Open the image of @type inside the directory @dfd.
+ *
+ * The variadic arguments feed the file name format taken from
+ * imgset_template[@type]. When @dfd is -1 the image directory service
+ * fd is used instead, and with O_CREAT in @flags the open is deferred:
+ * the returned image only remembers its path, type and flags until
+ * open_image_lazy() is called on it.
+ *
+ * Returns the image, or NULL on failure.
+ */
+struct cr_img *open_image_at(int dfd, int type, unsigned long flags, ...)
+{
+	struct cr_img *img;
+	unsigned long oflags;
+	char path[PATH_MAX];
+	va_list args;
+	bool lazy = false;
+
+	if (dfd == -1) {
+		dfd = get_service_fd(IMG_FD_OFF);
+		lazy = (flags & O_CREAT);
+	}
+
+	img = xmalloc(sizeof(*img));
+	if (!img)
+		return NULL;
+
+	oflags = flags | imgset_template[type].oflags;
+
+	va_start(args, flags);
+	vsnprintf(path, PATH_MAX, imgset_template[type].fmt, args);
+	va_end(args);
+
+	if (lazy) {
+		img->fd = LAZY_IMG_FD;
+		img->type = type;
+		img->oflags = oflags;
+		img->path = xstrdup(path);
+		if (!img->path) {
+			/*
+			 * A lazy image without a path can neither be opened
+			 * nor unlinked later. Free it directly: close_image()
+			 * would try to unlink the missing path.
+			 */
+			xfree(img);
+			return NULL;
+		}
+		return img;
+	}
+
+	img->fd = EMPTY_IMG_FD;
+
+	if (do_open_image(img, dfd, type, oflags, path)) {
+		close_image(img);
+		return NULL;
+	}
+
+	return img;
+}
+
+/*
+ * Pick the head magic for an image opened with @oflags: the service
+ * one when O_SERVICE is set, the common one otherwise.
+ */
+static inline u32 head_magic(int oflags)
+{
+	if (oflags & O_SERVICE)
+		return IMG_SERVICE_MAGIC;
+
+	return IMG_COMMON_MAGIC;
+}
+
+/*
+ * Read and verify the magic value(s) at the start of @img.
+ *
+ * With img_common_magic enabled every image except the inventory
+ * carries a head magic first, followed by the per-type magic.
+ * @path is only used in error messages.
+ *
+ * Returns 0 when the magic matches, -1 otherwise.
+ */
+static int img_check_magic(struct cr_img *img, int oflags, int type, char *path)
+{
+	u32 seen;
+
+	if (read_img(img, &seen) < 0)
+		return -1;
+
+	if (img_common_magic && type != CR_FD_INVENTORY) {
+		u32 expected_head = head_magic(oflags);
+
+		if (seen != expected_head) {
+			pr_err("Head magic doesn't match for %s\n", path);
+			return -1;
+		}
+
+		/* The per-type magic follows the head one. */
+		if (read_img(img, &seen) < 0)
+			return -1;
+	}
+
+	if (seen == imgset_template[type].magic)
+		return 0;
+
+	pr_err("Magic doesn't match for %s\n", path);
+	return -1;
+}
+
+/*
+ * Write the magic value(s) at the start of @img: the head magic first
+ * (only when img_common_magic is on and the image is not the
+ * inventory), then the per-type magic.
+ *
+ * Returns -1 if writing the head magic fails, otherwise the result of
+ * writing the per-type magic.
+ */
+static int img_write_magic(struct cr_img *img, int oflags, int type)
+{
+	u32 head;
+
+	if (!img_common_magic || type == CR_FD_INVENTORY)
+		return write_img(img, &imgset_template[type].magic);
+
+	head = head_magic(oflags);
+	if (write_img(img, &head))
+		return -1;
+
+	return write_img(img, &imgset_template[type].magic);
+}
+
+/*
+ * Open the file @path relative to @dfd and attach it to @img.
+ *
+ * The O_NOBUF and O_SERVICE bits of @oflags are stripped before the
+ * flags are passed to openat(). A file that is missing while O_CREAT
+ * is not requested is not an error: @img is marked with EMPTY_IMG_FD
+ * and 0 is returned. Unless the template magic for @type is
+ * RAW_IMAGE_MAGIC, the magic is checked when the flags equal O_RDONLY
+ * and written otherwise.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int do_open_image(struct cr_img *img, int dfd, int type, unsigned long oflags, char *path)
+{
+ int ret, flags;
+
+ /* These two bits are handled in this function, not by openat() */
+ flags = oflags & ~(O_NOBUF | O_SERVICE);
+
+ ret = openat(dfd, path, flags, CR_FD_PERM);
+ if (ret < 0) {
+ if (!(flags & O_CREAT) && (errno == ENOENT)) {
+ pr_info("No %s image\n", path);
+ img->_x.fd = EMPTY_IMG_FD;
+ goto skip_magic;
+ }
+
+ pr_perror("Unable to open %s", path);
+ goto err;
+ }
+
+ img->_x.fd = ret;
+ /* O_NOBUF images stay raw, the rest get a read or write buffer */
+ if (oflags & O_NOBUF)
+ bfd_setraw(&img->_x);
+ else {
+ if (flags == O_RDONLY)
+ ret = bfdopenr(&img->_x);
+ else
+ ret = bfdopenw(&img->_x);
+
+ if (ret)
+ goto err;
+ }
+
+ /* Raw images carry no magic at all */
+ if (imgset_template[type].magic == RAW_IMAGE_MAGIC)
+ goto skip_magic;
+
+ if (flags == O_RDONLY)
+ ret = img_check_magic(img, oflags, type, path);
+ else
+ ret = img_write_magic(img, oflags, type);
+ if (ret)
+ goto err;
+
+skip_magic:
+ return 0;
+
+err:
+ return -1;
+}
+
+/*
+ * Perform the deferred open of a lazy image using the path, type and
+ * flags stored in @img. The stored path is detached from @img and
+ * freed in every case.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int open_image_lazy(struct cr_img *img)
+{
+	char *path = img->path;
+	int dfd, status;
+
+	img->path = NULL;
+
+	dfd = get_service_fd(IMG_FD_OFF);
+	status = do_open_image(img, dfd, img->type, img->oflags, path) ? -1 : 0;
+
+	xfree(path);
+	return status;
+}
+
+/*
+ * Release @img. An image that is still lazy has its file unlinked from
+ * the image directory and its stored path freed; an opened, non-empty
+ * image is closed through bclose(). The cr_img itself is always freed.
+ */
+void close_image(struct cr_img *img)
+{
+	if (!lazy_image(img)) {
+		if (!empty_image(img))
+			bclose(&img->_x);
+	} else {
+		/*
+		 * Remove the image file if it's there so that
+		 * subsequent restore doesn't read wrong or fake
+		 * data from it.
+		 */
+		unlinkat(get_service_fd(IMG_FD_OFF), img->path, 0);
+		xfree(img->path);
+	}
+
+	xfree(img);
+}
+
+/*
+ * Wrap an already opened descriptor @fd into a raw (unbuffered) image.
+ * Returns the image, or NULL if the allocation failed.
+ */
+struct cr_img *img_from_fd(int fd)
+{
+	struct cr_img *wrapped = xmalloc(sizeof(*wrapped));
+
+	if (!wrapped)
+		return NULL;
+
+	wrapped->_x.fd = fd;
+	bfd_setraw(&wrapped->_x);
+	return wrapped;
+}
+
+/*
+ * Open the images directory @dir and install it as the IMG_FD_OFF
+ * service descriptor. When opts.img_parent is set, a CR_PARENT_LINK
+ * symlink pointing at it is created inside the directory (an already
+ * existing link is accepted).
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int open_image_dir(char *dir)
+{
+	int fd, ret;
+
+	fd = open(dir, O_RDONLY);
+	if (fd < 0) {
+		pr_perror("Can't open dir %s", dir);
+		return -1;
+	}
+
+	ret = install_service_fd(IMG_FD_OFF, fd);
+	close(fd);
+	if (ret < 0) {
+		/*
+		 * Previously this result was used unchecked, so without a
+		 * parent image the function reported success although no
+		 * image directory descriptor had been installed.
+		 * NOTE(review): assumes install_service_fd() signals failure
+		 * with a negative value - confirm against its definition.
+		 */
+		pr_err("Can't install image dir %s as service fd\n", dir);
+		return -1;
+	}
+	fd = ret;
+
+	if (opts.img_parent) {
+		ret = symlinkat(opts.img_parent, fd, CR_PARENT_LINK);
+		if (ret < 0 && errno != EEXIST) {
+			pr_perror("Can't link parent snapshot");
+			goto err;
+		}
+	}
+
+	return 0;
+
+err:
+	close_image_dir();
+	return -1;
+}
+
+/* Drop the IMG_FD_OFF service descriptor set up by open_image_dir(). */
+void close_image_dir(void)
+{
+	close_service_fd(IMG_FD_OFF);
+}
+
+/* Next id handed out to a newly written pages image. */
+static unsigned long page_ids = 1;
+
+/*
+ * Move the base of the page image ids up by 0x10000.
+ *
+ * When page server and criu dump work on the same dir, the shmem
+ * pagemaps and regular pagemaps may have IDs conflicts. Making the
+ * page server produce page images with higher IDs avoids that.
+ *
+ * Must be called before any id has been handed out.
+ */
+void up_page_ids_base(void)
+{
+	BUG_ON(page_ids != 1);
+	page_ids = page_ids + 0x10000;
+}
+
+/*
+ * Open the pages image that belongs to the pagemap image @pmi.
+ *
+ * For O_RDONLY and O_RDWR the pages id is read from the pagemap head
+ * stored in @pmi; for any other @flags a fresh id is allocated and a
+ * pagemap head carrying it is written to @pmi.
+ *
+ * Returns the opened image, or NULL on failure.
+ */
+struct cr_img *open_pages_image_at(int dfd, unsigned long flags, struct cr_img *pmi)
+{
+	unsigned id;
+
+	if (flags != O_RDONLY && flags != O_RDWR) {
+		PagemapHead head = PAGEMAP_HEAD__INIT;
+
+		/* The id is consumed even if writing the head fails. */
+		head.pages_id = page_ids++;
+		id = head.pages_id;
+		if (pb_write_one(pmi, &head, PB_PAGEMAP_HEAD) < 0)
+			return NULL;
+	} else {
+		PagemapHead *head;
+
+		if (pb_read_one(pmi, &head, PB_PAGEMAP_HEAD) < 0)
+			return NULL;
+		id = head->pages_id;
+		pagemap_head__free_unpacked(head, NULL);
+	}
+
+	return open_image_at(dfd, CR_FD_PAGES, flags, id);
+}
+
+/* Same as open_pages_image_at(), relative to the images directory. */
+struct cr_img *open_pages_image(unsigned long flags, struct cr_img *pmi)
+{
+	int img_dir = get_service_fd(IMG_FD_OFF);
+
+	return open_pages_image_at(img_dir, flags, pmi);
+}
+
+/*
+ * Write @size bytes starting at @ptr into the image @img.
+ * Returns
+ *  0 when everything was written
+ * -1 on error or short write (a message is printed)
+ */
+int write_img_buf(struct cr_img *img, const void *ptr, int size)
+{
+	int written = bwrite(&img->_x, ptr, size);
+
+	if (written != size) {
+		if (written < 0)
+			pr_perror("Can't write img file");
+		else
+			pr_err("Img trimmed %d/%d\n", written, size);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Read @size bytes from the image @img into @ptr.
+ * Returns
+ *  1 when the whole buffer was read
+ *  0 on EOF (silently)
+ * -1 on error or short read (a message is printed)
+ */
+int read_img_buf_eof(struct cr_img *img, void *ptr, int size)
+{
+	int got = bread(&img->_x, ptr, size);
+
+	if (got == size)
+		return 1;
+	if (!got)
+		return 0;
+
+	if (got >= 0)
+		pr_err("Img trimmed %d/%d\n", got, size);
+	else
+		pr_perror("Can't read img file");
+	return -1;
+}
+
+/*
+ * Read @size bytes from the image @img into @ptr, treating EOF as an
+ * error.
+ * Returns
+ *  1 on success
+ * -1 on error or EOF (a message is printed)
+ */
+int read_img_buf(struct cr_img *img, void *ptr, int size)
+{
+	int status = read_img_buf_eof(img, ptr, size);
+
+	if (status != 0)
+		return status;
+
+	pr_err("Unexpected EOF\n");
+	return -1;
+}
+
+/*
+ * read_img_str -- like read_img_buf, but the buffer is allocated here
+ * (@size + 1 bytes) and gets a terminating '\0'. On success *@pstr
+ * points at the new string.
+ *
+ * Returns 0 on success, -1 on failure (in which case *@pstr is left
+ * untouched).
+ */
+
+int read_img_str(struct cr_img *img, char **pstr, int size)
+{
+	char *buf = xmalloc(size + 1);
+
+	if (buf == NULL)
+		return -1;
+
+	if (read_img_buf(img, buf, size) < 0) {
+		xfree(buf);
+		return -1;
+	}
+
+	buf[size] = '\0';
+	*pstr = buf;
+	return 0;
+}
+
diff --git a/criu/include/action-scripts.h b/criu/include/action-scripts.h
new file mode 100644
index 000000000000..8ffc2c58bad6
--- /dev/null
+++ b/criu/include/action-scripts.h
@@ -0,0 +1,29 @@
+#ifndef __CR_ACTION_SCRIPTS_H__
+#define __CR_ACTION_SCRIPTS_H__
+
+/*
+ * One registered action script: a list node, the script path and an
+ * integer argument (see add_script() for how they are supplied).
+ */
+struct script {
+ struct list_head node;
+ char *path;
+ int arg;
+};
+
+#define SCRIPT_RPC_NOTIFY (char *)0x1
+
+/*
+ * Actions a script can be run for. The values are assigned explicitly;
+ * ACT_MAX stays last and therefore equals the number of actions.
+ */
+enum script_actions {
+ ACT_PRE_DUMP = 0,
+ ACT_POST_DUMP = 1,
+ ACT_PRE_RESTORE = 2,
+ ACT_POST_RESTORE = 3,
+ ACT_NET_LOCK = 4,
+ ACT_NET_UNLOCK = 5,
+ ACT_SETUP_NS = 6,
+ ACT_POST_SETUP_NS = 7,
+
+ ACT_MAX
+};
+
+extern int add_script(char *path, int arg);
+extern int run_scripts(enum script_actions);
+extern int send_criu_rpc_script(enum script_actions act, char *name, int arg);
+
+#endif /* __CR_ACTION_SCRIPTS_H__ */
diff --git a/criu/include/aio.h b/criu/include/aio.h
new file mode 100644
index 000000000000..e839ec693da9
--- /dev/null
+++ b/criu/include/aio.h
@@ -0,0 +1,15 @@
+#ifndef __CR_AIO_H__
+#define __CR_AIO_H__
+#include "protobuf/mm.pb-c.h"
+int dump_aio_ring(MmEntry *mme, struct vma_area *vma);
+void free_aios(MmEntry *mme);
+struct parasite_ctl;
+int parasite_check_aios(struct parasite_ctl *, struct vm_area_list *);
+unsigned long aio_rings_args_size(struct vm_area_list *);
+
+/*
+ * Description of one AIO ring: its address, length and number of
+ * requests. NOTE(review): presumably consumed on restore, going by
+ * the rst_ prefix - confirm against the users.
+ */
+struct rst_aio_ring {
+ unsigned long addr;
+ unsigned long len;
+ unsigned int nr_req;
+};
+#endif /* __CR_AIO_H__ */
diff --git a/criu/include/asm-generic/bitops.h b/criu/include/asm-generic/bitops.h
new file mode 100644
index 000000000000..190e1ab638c0
--- /dev/null
+++ b/criu/include/asm-generic/bitops.h
@@ -0,0 +1,123 @@
+/*
+ * Generic bits operations.
+ *
+ * Architectures that don't want their own implementation of those,
+ * should include this file into the arch/$ARCH/include/asm/bitops.h
+ */
+
+#ifndef __CR_GENERIC_BITOPS_H__
+#define __CR_GENERIC_BITOPS_H__
+
+#include "asm/bitsperlong.h"
+
+#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
+#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_LONG)
+
+#define DECLARE_BITMAP(name, bits) \
+ unsigned long name[BITS_TO_LONGS(bits)]
+
+#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 1)
+/* Technically wrong, but this avoids compilation errors on some gcc
+ versions. */
+#define BITOP_ADDR(x) "=m" (*(volatile long *) (x))
+#else
+#define BITOP_ADDR(x) "+m" (*(volatile long *) (x))
+#endif
+
+#define ADDR BITOP_ADDR(addr)
+
+/*
+ * Set bit @nr in the bitmap starting at @addr.
+ *
+ * The mask is built from 1UL so that it is as wide as the word it is
+ * applied to: shifting a plain int 1 is undefined for positions at or
+ * beyond the width of int, which BITS_PER_LONG can exceed.
+ */
+static inline void set_bit(int nr, volatile unsigned long *addr)
+{
+	addr += nr / BITS_PER_LONG;
+	*addr |= (1UL << (nr % BITS_PER_LONG));
+}
+
+/*
+ * Toggle bit @nr in the bitmap starting at @addr.
+ *
+ * The mask is built from 1UL so that it is as wide as the word it is
+ * applied to: shifting a plain int 1 is undefined for positions at or
+ * beyond the width of int, which BITS_PER_LONG can exceed.
+ */
+static inline void change_bit(int nr, volatile unsigned long *addr)
+{
+	addr += nr / BITS_PER_LONG;
+	*addr ^= (1UL << (nr % BITS_PER_LONG));
+}
+
+/*
+ * Test bit @nr in the bitmap starting at @addr.
+ * Returns -1 (i.e. non-zero) when the bit is set and 0 when it is not.
+ *
+ * The mask is built from 1UL so that it is as wide as the word it is
+ * applied to: shifting a plain int 1 is undefined for positions at or
+ * beyond the width of int, which BITS_PER_LONG can exceed.
+ */
+static inline int test_bit(int nr, volatile const unsigned long *addr)
+{
+	addr += nr / BITS_PER_LONG;
+	return (*addr & (1UL << (nr % BITS_PER_LONG))) ? -1 : 0;
+}
+
+/*
+ * Clear bit @nr in the bitmap starting at @addr.
+ *
+ * The mask is built from 1UL so that it is as wide as the word it is
+ * applied to: shifting a plain int 1 is undefined for positions at or
+ * beyond the width of int, which BITS_PER_LONG can exceed.
+ */
+static inline void clear_bit(int nr, volatile unsigned long *addr)
+{
+	addr += nr / BITS_PER_LONG;
+	*addr &= ~(1UL << (nr % BITS_PER_LONG));
+}
+
+/**
+ * __ffs - find first set bit in word
+ * @word: The word to search
+ *
+ * Returns the zero-based index of the lowest set bit. For a zero word
+ * the result is the number of bits in the word, so callers that need a
+ * valid index should check against 0 first.
+ */
+static inline unsigned long __ffs(unsigned long word)
+{
+	int pos = 0;
+
+	/* Shift the word right until its lowest bit is the one we want. */
+	while (pos < 8*sizeof(word) && !(word & 1)) {
+		word >>= 1;
+		pos++;
+	}
+
+	return pos;
+}
+
+#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG)
+
+/*
+ * Find the next set bit in a memory region.
+ *
+ * @addr:   start of the bitmap
+ * @size:   size of the bitmap in bits
+ * @offset: bit number to start searching at
+ *
+ * Returns the number of the first set bit at or after @offset, or
+ * @size when there is none.
+ */
+static inline
+unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
+ unsigned long offset)
+{
+ const unsigned long *p = addr + BITOP_WORD(offset);
+ /* Bit number of the first bit in the word that holds @offset */
+ unsigned long result = offset & ~(BITS_PER_LONG-1);
+ unsigned long tmp;
+
+ if (offset >= size)
+ return size;
+ /* From here on size counts the bits left, starting at result */
+ size -= result;
+ offset %= BITS_PER_LONG;
+ if (offset) {
+ /* Partial first word: ignore the bits below @offset */
+ tmp = *(p++);
+ tmp &= (~0UL << offset);
+ if (size < BITS_PER_LONG)
+ goto found_first;
+ if (tmp)
+ goto found_middle;
+ size -= BITS_PER_LONG;
+ result += BITS_PER_LONG;
+ }
+ /* Scan whole words */
+ while (size & ~(BITS_PER_LONG-1)) {
+ if ((tmp = *(p++)))
+ goto found_middle;
+ result += BITS_PER_LONG;
+ size -= BITS_PER_LONG;
+ }
+ if (!size)
+ return result;
+ tmp = *p;
+
+found_first:
+ /* Partial last word: ignore the bits past the end of the bitmap */
+ tmp &= (~0UL >> (BITS_PER_LONG - size));
+ if (tmp == 0UL) /* Are any bits set? */
+ return result + size; /* Nope. */
+found_middle:
+ return result + __ffs(tmp);
+}
+
+/*
+ * Iterate @i over the set bits of @bitmask using find_next_bit().
+ * @bitmask must be an array, not a pointer, since sizeof() is applied
+ * to it.
+ * NOTE(review): the bound passed is sizeof(bitmask), a byte count,
+ * while find_next_bit() takes its size in bits - confirm this limit
+ * is intended.
+ */
+#define for_each_bit(i, bitmask) \
+ for (i = find_next_bit(bitmask, sizeof(bitmask), 0); \
+ i < sizeof(bitmask); \
+ i = find_next_bit(bitmask, sizeof(bitmask), i + 1))
+
+#endif /* __CR_GENERIC_BITOPS_H__ */
diff --git a/criu/include/asm-generic/int.h b/criu/include/asm-generic/int.h
new file mode 100644
index 000000000000..ac3088d5ac3b
--- /dev/null
+++ b/criu/include/asm-generic/int.h
@@ -0,0 +1,15 @@
+#ifndef __CR_INT_H__
+#define __CR_INT_H__
+
+#include <stdint.h>
+
+/* Short fixed-width aliases: uN is unsigned, sN is signed, N bits wide. */
+typedef uint64_t u64;
+typedef int64_t s64;
+typedef uint32_t u32;
+typedef int32_t s32;
+typedef uint16_t u16;
+typedef int16_t s16;
+typedef uint8_t u8;
+typedef int8_t s8;
+
+#endif /* __CR_INT_H__ */
diff --git a/criu/include/asm-generic/string.h b/criu/include/asm-generic/string.h
new file mode 100644
index 000000000000..0a545e65960d
--- /dev/null
+++ b/criu/include/asm-generic/string.h
@@ -0,0 +1,51 @@
+#ifndef __CR_ASM_GENERIC_STRING_H__
+#define __CR_ASM_GENERIC_STRING_H__
+
+#include "compiler.h"
+
+#ifndef HAS_BUILTIN_MEMCPY
+/*
+ * Byte-by-byte copy of @n bytes from @from to @to; returns @to.
+ * Generic fallback used when the architecture does not define
+ * HAS_BUILTIN_MEMCPY.
+ *
+ * The index has the same unsigned type as @n: a signed int counter
+ * would overflow for counts above INT_MAX.
+ */
+static always_inline void *builtin_memcpy(void *to, const void *from, unsigned int n)
+{
+	unsigned int i;
+	unsigned char *cto = to;
+	const unsigned char *cfrom = from;
+
+	for (i = 0; i < n; ++i, ++cto, ++cfrom) {
+		*cto = *cfrom;
+	}
+
+	return to;
+}
+#endif
+
+#ifndef HAS_BUILTIN_MEMCMP
+/*
+ * Compare @count bytes of @cs and @ct as unsigned chars.
+ * Returns 0 when they are equal, otherwise the difference between the
+ * first pair of bytes that differ.
+ */
+static always_inline int builtin_memcmp(const void *cs, const void *ct, size_t count)
+{
+	const unsigned char *left = cs;
+	const unsigned char *right = ct;
+
+	while (count--) {
+		int diff = *left++ - *right++;
+
+		if (diff)
+			return diff;
+	}
+
+	return 0;
+}
+#endif
+
+#ifndef HAS_BUILTIN_STRCMP
+/*
+ * Compare the NUL-terminated strings @cs and @ct byte by byte, as
+ * unsigned chars. Returns -1, 0 or 1 when @cs sorts before, equal to
+ * or after @ct.
+ */
+static always_inline int builtin_strcmp(const char *cs, const char *ct)
+{
+	for (;; cs++, ct++) {
+		unsigned char a = *cs;
+		unsigned char b = *ct;
+
+		if (a != b)
+			return a < b ? -1 : 1;
+		if (a == '\0')
+			return 0;
+	}
+}
+#endif
+
+#endif /* __CR_ASM_GENERIC_STRING_H__ */
diff --git a/criu/include/asm-generic/vdso.h b/criu/include/asm-generic/vdso.h
new file mode 100644
index 000000000000..bb746055416b
--- /dev/null
+++ b/criu/include/asm-generic/vdso.h
@@ -0,0 +1,12 @@
+#ifndef __CR_ASM_GENERIC_VDSO_H__
+#define __CR_ASM_GENERIC_VDSO_H__
+
+/* Protection bits for the vdso (read + exec) and vvar (read only) areas. */
+#define VDSO_PROT (PROT_READ | PROT_EXEC)
+#define VVAR_PROT (PROT_READ)
+
+/* All-ones markers standing for "no address" / "no pfn". */
+#define VDSO_BAD_ADDR (-1ul)
+#define VVAR_BAD_ADDR VDSO_BAD_ADDR
+#define VDSO_BAD_PFN (-1ull)
+#define VVAR_BAD_PFN VDSO_BAD_PFN
+
+#endif /* __CR_ASM_GENERIC_VDSO_H__ */
diff --git a/criu/include/bfd.h b/criu/include/bfd.h
new file mode 100644
index 000000000000..e9b4d53a43c4
--- /dev/null
+++ b/criu/include/bfd.h
@@ -0,0 +1,40 @@
+#ifndef __CR_BFD_H__
+#define __CR_BFD_H__
+
+#include "err.h"
+
+struct bfd_buf;
+/* Buffer state kept inside a struct bfd. */
+struct xbuf {
+ char *mem; /* buffer */
+ char *data; /* position we see bytes at */
+ unsigned int sz; /* bytes sitting after the current position (presumably ->data; there is no pos field) */
+ struct bfd_buf *buf;
+};
+
+/*
+ * A file descriptor with an optional buffer attached. The descriptor
+ * counts as buffered when b.mem is not NULL (see bfd_buffered()).
+ */
+struct bfd {
+ int fd;
+ bool writable;
+ struct xbuf b;
+};
+
+/* Tell whether @b has a buffer attached, i.e. its b.mem is set. */
+static inline bool bfd_buffered(struct bfd *b)
+{
+	return b->b.mem ? true : false;
+}
+
+/*
+ * Mark @b as unbuffered by clearing its buffer pointer, which is
+ * exactly what bfd_buffered() tests.
+ */
+static inline void bfd_setraw(struct bfd *b)
+{
+	struct xbuf *xb = &b->b;
+
+	xb->mem = NULL;
+}
+
+int bfdopenr(struct bfd *f);
+int bfdopenw(struct bfd *f);
+void bclose(struct bfd *f);
+char *breadline(struct bfd *f);
+char *breadchr(struct bfd *f, char c);
+int bwrite(struct bfd *f, const void *buf, int sz);
+struct iovec;
+int bwritev(struct bfd *f, const struct iovec *iov, int cnt);
+int bread(struct bfd *f, void *buf, int sz);
+int bfd_flush_images(void);
+#endif
diff --git a/criu/include/bitmap.h b/criu/include/bitmap.h
new file mode 100644
index 000000000000..9e701b66cc2c
--- /dev/null
+++ b/criu/include/bitmap.h
@@ -0,0 +1,7 @@
+#ifndef __CR_BITMAP_H__
+#define __CR_BITMAP_H__
+
+extern void bitmap_set(unsigned long *map, int start, int nr);
+extern void bitmap_clear(unsigned long *map, int start, int nr);
+
+#endif /* __CR_BITMAP_H__ */
diff --git a/criu/include/bug.h b/criu/include/bug.h
new file mode 100644
index 000000000000..a479c673bda9
--- /dev/null
+++ b/criu/include/bug.h
@@ -0,0 +1,39 @@
+#ifndef __CR_BUG_H__
+#define __CR_BUG_H__
+
+#include <signal.h>
+#include <stdbool.h>
+
+#include "compiler.h"
+#include "log.h"
+
+#ifndef BUG_ON_HANDLER
+
+#ifdef CR_NOGLIBC
+# define __raise()
+#else
+# define __raise() raise(SIGABRT)
+#endif
+
+/*
+ * BUG_ON_HANDLER(condition): when @condition holds, log the location,
+ * call __raise() and finally store to address NULL so that execution
+ * cannot continue. The stored value encodes the line number.
+ *
+ * Under the clang static analyzer the check is expressed as a plain
+ * assert() instead. The argument is parenthesised there so that an
+ * expression such as "a || b" is negated as a whole.
+ */
+#ifndef __clang_analyzer__
+# define BUG_ON_HANDLER(condition) \
+	do { \
+		if ((condition)) { \
+			pr_err("BUG at %s:%d\n", __FILE__, __LINE__); \
+			__raise(); \
+			*(volatile unsigned long *)NULL = 0xdead0000 + __LINE__; \
+		} \
+	} while (0)
+#else
+# define BUG_ON_HANDLER(condition) \
+	do { \
+		assert(!(condition)); \
+	} while (0)
+#endif
+
+#endif /* BUG_ON_HANDLER */
+
+#define BUG_ON(condition) BUG_ON_HANDLER((condition))
+#define BUG() BUG_ON(true)
+
+#endif /* __CR_BUG_H__ */
diff --git a/criu/include/cgroup.h b/criu/include/cgroup.h
new file mode 100644
index 000000000000..393ee3d9cc05
--- /dev/null
+++ b/criu/include/cgroup.h
@@ -0,0 +1,65 @@
+#ifndef __CR_CGROUP_H__
+#define __CR_CGROUP_H__
+#include "asm/int.h"
+struct pstree_item;
+extern u32 root_cg_set;
+int dump_task_cgroup(struct pstree_item *, u32 *);
+int dump_cgroups(void);
+int prepare_task_cgroup(struct pstree_item *);
+int prepare_cgroup(void);
+/* Restore things like cpu_limit in known cgroups. */
+int prepare_cgroup_properties(void);
+int restore_freezer_state(void);
+void fini_cgroup(void);
+
+struct cg_controller;
+
+/*
+ * One cgroup property: its name and value as strings plus mode, uid
+ * and gid. Linked through @list (struct cgroup_dir keeps a
+ * "properties" list head).
+ */
+struct cgroup_prop {
+ char *name;
+ char *value;
+ mode_t mode;
+ uid_t uid;
+ gid_t gid;
+ struct list_head list;
+};
+
+/* This describes a particular cgroup path, e.g. the '/lxc/u1' part of
+ * 'blkio/lxc/u1' and any properties it has.
+ *
+ * Directories form a tree: @children heads the list of
+ * sub-directories, which are chained to each other via @siblings.
+ */
+struct cgroup_dir {
+ char *path;
+ mode_t mode;
+ uid_t uid;
+ gid_t gid;
+
+ /* list of properties and its length */
+ struct list_head properties;
+ unsigned int n_properties;
+
+ /* this is how children are linked together */
+ struct list_head siblings;
+
+ /* more cgroup_dirs */
+ struct list_head children;
+ unsigned int n_children;
+};
+
+/* This describes a particular cgroup controller, e.g. blkio or cpuset.
+ * The heads are subdirectories organized in their tree format.
+ */
+struct cg_controller {
+ /* controller names and how many of them there are */
+ unsigned int n_controllers;
+ char **controllers;
+
+ /* cgroup_dirs */
+ struct list_head heads;
+ unsigned int n_heads;
+
+ /* for cgroup list in cgroup.c */
+ struct list_head l;
+};
+struct cg_controller *new_controller(const char *name);
+
+/* parse all global cgroup information into structures */
+int parse_cg_info(void);
+int new_cg_root_add(char *controller, char *newroot);
+#endif /* __CR_CGROUP_H__ */
diff --git a/criu/include/compiler.h b/criu/include/compiler.h
new file mode 100644
index 000000000000..6bce93562a48
--- /dev/null
+++ b/criu/include/compiler.h
@@ -0,0 +1,87 @@
+#ifndef __CR_COMPILER_H__
+#define __CR_COMPILER_H__
+
+/*
+ * Various definitions for success build,
+ * picked from various places, mostly from
+ * the linux kernel.
+ */
+
+/* Number of elements of the array @x (must be an array, not a pointer). */
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+/*
+ * Break the build when @condition is true: the char array type then
+ * gets a negative size.
+ */
+#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
+
+#define __stringify_1(x...) #x
+#define __stringify(x...) __stringify_1(x)
+
+#define NORETURN __attribute__((__noreturn__))
+#define __packed __attribute__((__packed__))
+#define __used __attribute__((__used__))
+#define __maybe_unused __attribute__((unused))
+#define __always_unused __attribute__((unused))
+
+#define __section(S) __attribute__ ((__section__(#S)))
+
+#ifndef __always_inline
+# define __always_inline inline __attribute__((always_inline))
+#endif
+
+#define likely(x) __builtin_expect(!!(x), 1)
+#define unlikely(x) __builtin_expect(!!(x), 0)
+
+#ifndef always_inline
+# define always_inline __always_inline
+#endif
+
+#ifndef noinline
+# define noinline __attribute__((noinline))
+#endif
+
+#define __aligned(x) __attribute__((aligned(x)))
+
+/*
+ * Macro to define stack alignment.
+ * aarch64 requires stack to be aligned to 16 bytes.
+ */
+#define __stack_aligned__ __attribute__((aligned(16)))
+
+#ifndef offsetof
+# define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
+#endif
+
+#define barrier() asm volatile("" ::: "memory")
+
+/*
+ * Given @ptr to the field @member of a @type object, yield a pointer
+ * to the enclosing object. The temporary __mptr makes the compiler
+ * check that @ptr is compatible with the type of the member.
+ */
+#define container_of(ptr, type, member) ({ \
+ const typeof( ((type *)0)->member ) *__mptr = (ptr); \
+ (type *)( (char *)__mptr - offsetof(type,member) );})
+
+/*
+ * Rounding helpers. round_up(), round_down() and ALIGN() work with the
+ * bit mask (y) - 1, so they only give meaningful results when the
+ * second argument is a power of two. DIV_ROUND_UP() is a plain
+ * division rounded up and has no such restriction.
+ */
+#define __round_mask(x, y) ((__typeof__(x))((y) - 1))
+#define round_up(x, y) ((((x) - 1) | __round_mask(x, y)) + 1)
+#define round_down(x, y) ((x) & ~__round_mask(x, y))
+#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
+#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))
+
+/*
+ * min()/max() evaluate each argument exactly once. The otherwise
+ * useless pointer comparison makes the compiler complain when the two
+ * arguments have different types.
+ */
+#define min(x, y) ({ \
+ typeof(x) _min1 = (x); \
+ typeof(y) _min2 = (y); \
+ (void) (&_min1 == &_min2); \
+ _min1 < _min2 ? _min1 : _min2; })
+
+#define max(x, y) ({ \
+ typeof(x) _max1 = (x); \
+ typeof(y) _max2 = (y); \
+ (void) (&_max1 == &_max2); \
+ _max1 > _max2 ? _max1 : _max2; })
+
+/* Same as min()/max(), but both arguments are first converted to @type. */
+#define min_t(type, x, y) ({ \
+ type __min1 = (x); \
+ type __min2 = (y); \
+ __min1 < __min2 ? __min1: __min2; })
+
+#define max_t(type, x, y) ({ \
+ type __max1 = (x); \
+ type __max2 = (y); \
+ __max1 > __max2 ? __max1: __max2; })
+
+/* True when @v has at most one bit set; note that this includes 0. */
+#define is_log2(v) (((v) & ((v) - 1)) == 0)
+
+#endif /* __CR_COMPILER_H__ */
diff --git a/criu/include/config-base.h b/criu/include/config-base.h
new file mode 100644
index 000000000000..5e26859658e1
--- /dev/null
+++ b/criu/include/config-base.h
@@ -0,0 +1,40 @@
+#ifndef __CR_CONFIG_BASE_H__
+#define __CR_CONFIG_BASE_H__
+
+#define PAGE_ALLOC_COSTLY_ORDER 3 /* from the kernel source code */
+/*
+ * NOTE(review): presumably a copy of the kernel's pipe buffer
+ * descriptor layout. In this header it is only used for its size, in
+ * the PIPE_MAX_SIZE computation.
+ */
+struct kernel_pipe_buffer {
+ struct page *page;
+ unsigned int offset, len;
+ const struct pipe_buf_operations *ops;
+ unsigned int flags;
+ unsigned long private;
+};
+
+/*
+ * The kernel allocates the linear chunk of memory for pipe buffers.
+ * Allocation of chunks with size more than PAGE_ALLOC_COSTLY_ORDER
+ * fails very often, so we need to restrict the pipe capacity to not
+ * allocate big chunks.
+ */
+#define PIPE_MAX_SIZE ((1 << PAGE_ALLOC_COSTLY_ORDER) * PAGE_SIZE / \
+ sizeof(struct kernel_pipe_buffer))
+
+/* The number of pipes for one chunk */
+#define NR_PIPES_PER_CHUNK 8
+
+/*
+ * These things are required to compile on CentOS-6
+ */
+#ifndef F_LINUX_SPECIFIC_BASE
+# define F_LINUX_SPECIFIC_BASE 1024
+#endif
+
+#ifndef F_SETPIPE_SZ
+# define F_SETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 7)
+#endif
+
+#ifndef F_GETPIPE_SZ
+# define F_GETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 8)
+#endif
+
+#endif /* __CR_CONFIG_BASE_H__ */
diff --git a/criu/include/cpu.h b/criu/include/cpu.h
new file mode 100644
index 000000000000..e94525a9e780
--- /dev/null
+++ b/criu/include/cpu.h
@@ -0,0 +1,13 @@
+#ifndef __CR_CPU_H__
+#define __CR_CPU_H__
+
+#include "asm/cpu.h"
+
+extern bool cpu_has_feature(unsigned int feature);
+extern int cpu_init(void);
+extern int cpu_dump_cpuinfo(void);
+extern int cpu_validate_cpuinfo(void);
+extern int cpuinfo_dump(void);
+extern int cpuinfo_check(void);
+
+#endif /* __CR_CPU_H__ */
diff --git a/criu/include/cr-errno.h b/criu/include/cr-errno.h
new file mode 100644
index 000000000000..1f94988cf37e
--- /dev/null
+++ b/criu/include/cr-errno.h
@@ -0,0 +1,17 @@
+#ifndef __CR_ERRNO_H__
+#define __CR_ERRNO_H__
+
+void set_cr_errno(int err);
+int get_cr_errno(void);
+
+/*
+ * List of symbolic error names:
+ * ESRCH - no process can be found corresponding to that specified by pid
+ * EEXIST - process with such pid already exists
+ * EBADRQC - bad options
+ */
+
+/*
+ * Error code kept in task_entries->cr_err. set_task_cr_err() uses a
+ * compare-and-exchange against 0, so it only stores @new_err while no
+ * error has been recorded yet.
+ */
+#define set_task_cr_err(new_err) atomic_cmpxchg(&task_entries->cr_err, 0, new_err)
+#define get_task_cr_err() atomic_read(&task_entries->cr_err)
+
+#endif /* __CR_ERRNO_H__ */
diff --git a/criu/include/cr-service-const.h b/criu/include/cr-service-const.h
new file mode 100644
index 000000000000..c6d2e398f1f9
--- /dev/null
+++ b/criu/include/cr-service-const.h
@@ -0,0 +1,6 @@
+#ifndef __CR_SERVICE_CONST_H__
+#define __CR_SERVICE_CONST_H__
+
+#define CR_DEFAULT_SERVICE_ADDRESS "./criu_service.socket"
+
+#endif /* __CR_SERVICE_CONST_H__ */
diff --git a/criu/include/cr-service.h b/criu/include/cr-service.h
new file mode 100644
index 000000000000..621cedbe7827
--- /dev/null
+++ b/criu/include/cr-service.h
@@ -0,0 +1,14 @@
+#ifndef __CR_SERVICE_H__
+#define __CR_SERVICE_H__
+
+#include "protobuf/rpc.pb-c.h"
+
+extern int cr_service(bool deamon_mode);
+int cr_service_work(int sk);
+
+extern int send_criu_dump_resp(int socket_fd, bool success, bool restored);
+
+extern struct _cr_service_client *cr_service_client;
+extern unsigned int service_sk_ino;
+
+#endif /* __CR_SERVICE_H__ */
diff --git a/criu/include/cr-show.h b/criu/include/cr-show.h
new file mode 100644
index 000000000000..6ebdb4c2ac92
--- /dev/null
+++ b/criu/include/cr-show.h
@@ -0,0 +1,25 @@
+#ifndef __CR_SHOW_H__
+#define __CR_SHOW_H__
+
+#include <stdbool.h>
+#include "asm/types.h"
+
+struct cr_img;
+
+/*
+ * Describes how to show one kind of image: its magic, the protobuf
+ * type of its entries, an optional payload handler and a format
+ * string. NOTE(review): @single presumably means the image holds a
+ * single entry - confirm against the users.
+ */
+struct show_image_info {
+ u32 magic;
+ int pb_type;
+ bool single;
+ void (*payload)(struct cr_img *, void *);
+ char *fmt;
+};
+
+extern void show_siginfo(struct cr_img *);
+extern void sk_queue_data_handler(struct cr_img *, void *obj);
+extern void ipc_shm_handler(struct cr_img *, void *obj);
+extern void ipc_msg_handler(struct cr_img *, void *obj);
+extern void ipc_sem_handler(struct cr_img *, void *obj);
+extern int cr_parse_fd(struct cr_img *, u32 magic);
+extern void show_tcp_stream(struct cr_img *, void *obj);
+
+#endif /* __CR_SHOW_H__ */
diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h
new file mode 100644
index 000000000000..5c0e6332e279
--- /dev/null
+++ b/criu/include/cr_options.h
@@ -0,0 +1,117 @@
+#ifndef __CR_OPTIONS_H__
+#define __CR_OPTIONS_H__
+
+#include <stdbool.h>
+
+#include "list.h"
+
+/*
+ * CPU capability options.
+ */
+#define CPU_CAP_NONE (0u)
+#define CPU_CAP_ALL (-1u)
+#define CPU_CAP_FPU (1u) /* Only FPU capability required */
+#define CPU_CAP_CPU (2u) /* Strict CPU capability required */
+#define CPU_CAP_INS (4u) /* Instructions CPU capability */
+#define CPU_CAP_DEFAULT (CPU_CAP_FPU)
+
+/*
+ * A (controller, new root) pair of strings kept on a list.
+ * NOTE(review): presumably linked into opts.new_cgroup_roots - confirm.
+ */
+struct cg_root_opt {
+ struct list_head node;
+ char *controller;
+ char *newroot;
+};
+
+/*
+ * Cgroup management options.
+ */
+#define CG_MODE_IGNORE (0u << 0) /* Zero is important here */
+#define CG_MODE_NONE (1u << 0)
+#define CG_MODE_PROPS (1u << 1)
+#define CG_MODE_SOFT (1u << 2)
+#define CG_MODE_FULL (1u << 3)
+#define CG_MODE_STRICT (1u << 4)
+
+#define CG_MODE_DEFAULT (CG_MODE_SOFT)
+
+/*
+ * Ghost file size we allow to carry by default.
+ */
+#define DEFAULT_GHOST_LIMIT (1 << 20)
+
+#define DEFAULT_TIMEOUT 5
+
+struct irmap;
+
+/*
+ * List entry wrapping one struct irmap pointer.
+ * NOTE(review): presumably linked into opts.irmap_scan_paths - confirm.
+ */
+struct irmap_path_opt {
+ struct list_head node;
+ struct irmap *ir;
+};
+
+/*
+ * List entry holding the id string of one external resource.
+ * NOTE(review): presumably linked into opts.external - confirm.
+ */
+struct external {
+ struct list_head node;
+ char *id;
+};
+
+/*
+ * The complete set of run-time options. One global instance, opts, is
+ * declared right after this definition.
+ */
+struct cr_options {
+ int final_state;
+ char *show_dump_file;
+ char *show_fmt;
+ bool check_ms_kernel;
+ bool show_pages_content;
+ /* two names for the same flag */
+ union {
+ bool restore_detach;
+ bool daemon_mode;
+ };
+ bool restore_sibling;
+ bool ext_unix_sk;
+ struct list_head ext_unixsk_ids;
+ bool shell_job;
+ bool handle_file_locks;
+ bool tcp_established_ok;
+ bool evasive_devices;
+ bool link_remap_ok;
+ unsigned int rst_namespaces_flags;
+ bool log_file_per_pid;
+ bool swrk_restore;
+ char *output;
+ char *root;
+ char *pidfile;
+ char *freeze_cgroup;
+ struct list_head veth_pairs;
+ struct list_head scripts;
+ struct list_head ext_mounts;
+ struct list_head inherit_fds;
+ struct list_head external;
+ char *libdir;
+ bool use_page_server;
+ unsigned short port;
+ char *addr;
+ int ps_socket;
+ bool track_mem;
+ char *img_parent;
+ bool auto_dedup;
+ unsigned int cpu_cap;
+ bool force_irmap;
+ char **exec_cmd;
+ unsigned int manage_cgroups;
+ char *new_global_cg_root;
+ struct list_head new_cgroup_roots;
+ bool autodetect_ext_mounts;
+ bool enable_external_sharing;
+ bool enable_external_masters;
+ bool aufs; /* auto-detected, not via cli */
+ bool overlayfs;
+ size_t ghost_limit;
+ struct list_head irmap_scan_paths;
+ bool lsm_supplied;
+ char *lsm_profile;
+ unsigned int timeout;
+};
+
+extern struct cr_options opts;
+
+extern void init_opts(void);
+
+extern int add_external(char *key);
+
+#endif /* __CR_OPTIONS_H__ */
diff --git a/criu/include/criu-log.h b/criu/include/criu-log.h
new file mode 100644
index 000000000000..fd5d6349d887
--- /dev/null
+++ b/criu/include/criu-log.h
@@ -0,0 +1,88 @@
+/*
+ This file defines log levels and logging macros for CRIU.
+ Copyright (C) 2013 Parallels, Inc
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef __CRIU_LOG_H__
+#define __CRIU_LOG_H__
+
+#ifndef CR_NOGLIBC
+
+#include <string.h>
+#include <errno.h>
+
+#endif /* CR_NOGLIBC */
+
+#define LOG_UNSET (-1)
+#define LOG_MSG (0) /* Print message regardless of log level */
+#define LOG_ERROR (1) /* Errors only, when we're in trouble */
+#define LOG_WARN (2) /* Warnings, dazed and confused but trying to continue */
+#define LOG_INFO (3) /* Informative, everything is fine */
+#define LOG_DEBUG (4) /* Debug only */
+
+extern void print_on_level(unsigned int loglevel, const char *format, ...)
+ __attribute__ ((__format__ (__printf__, 2, 3)));
+
+#ifndef LOG_PREFIX
+# define LOG_PREFIX
+#endif
+
+#define print_once(loglevel, fmt, ...) \
+ do { \
+ static bool __printed; \
+ if (!__printed) { \
+ print_on_level(loglevel, fmt, ##__VA_ARGS__); \
+ __printed = 1; \
+ } \
+ } while (0)
+
+#define pr_msg(fmt, ...) \
+ print_on_level(LOG_MSG, \
+ fmt, ##__VA_ARGS__)
+
+#define pr_info(fmt, ...) \
+ print_on_level(LOG_INFO, \
+ LOG_PREFIX fmt, ##__VA_ARGS__)
+
+#define pr_err(fmt, ...) \
+ print_on_level(LOG_ERROR, \
+ "Error (%s:%d): " LOG_PREFIX fmt, \
+ __FILE__, __LINE__, ##__VA_ARGS__)
+
+#define pr_err_once(fmt, ...) \
+ print_once(LOG_ERROR, fmt, ##__VA_ARGS__)
+
+#define pr_warn(fmt, ...) \
+ print_on_level(LOG_WARN, \
+ "Warn (%s:%d): " LOG_PREFIX fmt, \
+ __FILE__, __LINE__, ##__VA_ARGS__)
+
+#define pr_warn_once(fmt, ...) \
+ print_once(LOG_WARN, fmt, ##__VA_ARGS__)
+
+#define pr_debug(fmt, ...) \
+ print_on_level(LOG_DEBUG, \
+ LOG_PREFIX fmt, ##__VA_ARGS__)
+
+#ifndef CR_NOGLIBC
+
+#define pr_perror(fmt, ...) \
+ pr_err(fmt ": %s\n", ##__VA_ARGS__, strerror(errno))
+
+#endif /* CR_NOGLIBC */
+
+#endif /* __CRIU_LOG_H__ */
diff --git a/criu/include/criu-plugin.h b/criu/include/criu-plugin.h
new file mode 100644
index 000000000000..b76f5f83900f
--- /dev/null
+++ b/criu/include/criu-plugin.h
@@ -0,0 +1,132 @@
+/*
+ * This file defines types and macros for CRIU plugins.
+ * Copyright (C) 2013-2014 Parallels, Inc
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef __CRIU_PLUGIN_H__
+#define __CRIU_PLUGIN_H__
+
+#include <limits.h>
+#include <stdbool.h>
+
+#define CRIU_PLUGIN_GEN_VERSION(a,b,c) (((a) << 16) + ((b) << 8) + (c))
+#define CRIU_PLUGIN_VERSION_MAJOR 0
+#define CRIU_PLUGIN_VERSION_MINOR 2
+#define CRIU_PLUGIN_VERSION_SUBLEVEL 0
+
+#define CRIU_PLUGIN_VERSION_OLD CRIU_PLUGIN_GEN_VERSION(0,1,0)
+
+#define CRIU_PLUGIN_VERSION \
+ CRIU_PLUGIN_GEN_VERSION(CRIU_PLUGIN_VERSION_MAJOR, \
+ CRIU_PLUGIN_VERSION_MINOR, \
+ CRIU_PLUGIN_VERSION_SUBLEVEL)
+
+/*
+ * Plugin hook points and their arguments in hooks.
+ */
+enum {
+ CR_PLUGIN_HOOK__DUMP_UNIX_SK = 0,
+ CR_PLUGIN_HOOK__RESTORE_UNIX_SK = 1,
+
+ CR_PLUGIN_HOOK__DUMP_EXT_FILE = 2,
+ CR_PLUGIN_HOOK__RESTORE_EXT_FILE = 3,
+
+ CR_PLUGIN_HOOK__DUMP_EXT_MOUNT = 4,
+ CR_PLUGIN_HOOK__RESTORE_EXT_MOUNT = 5,
+
+ CR_PLUGIN_HOOK__DUMP_EXT_LINK = 6,
+
+ CR_PLUGIN_HOOK__MAX
+};
+
+#define DECLARE_PLUGIN_HOOK_ARGS(__hook, ...) \
+ typedef int (__hook ##_t)(__VA_ARGS__)
+
+DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_UNIX_SK, int fd, int id);
+DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_UNIX_SK, int id);
+DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_EXT_FILE, int fd, int id);
+DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_EXT_FILE, int id);
+DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_EXT_MOUNT, char *mountpoint, int id);
+DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_EXT_MOUNT, int id, char *mountpoint, char *old_root, int *is_file);
+DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_EXT_LINK, int index, int type, char *kind);
+
+enum {
+ CR_PLUGIN_STAGE__DUMP,
+ CR_PLUGIN_STAGE__PRE_DUMP,
+ CR_PLUGIN_STAGE__RESTORE,
+
+ CR_PLUGIN_STAGE_MAX
+};
+
+/*
+ * Plugin descriptor.
+ */
+typedef struct {
+ const char *name;
+ int (*init)(int stage);
+ void (*exit)(int stage, int ret);
+ unsigned int version;
+ unsigned int max_hooks;
+ void *hooks[CR_PLUGIN_HOOK__MAX];
+} cr_plugin_desc_t;
+
+extern cr_plugin_desc_t CR_PLUGIN_DESC;
+
+#define CR_PLUGIN_REGISTER(___name, ___init, ___exit) \
+ cr_plugin_desc_t CR_PLUGIN_DESC = { \
+ .name = ___name, \
+ .init = ___init, \
+ .exit = ___exit, \
+ .version = CRIU_PLUGIN_VERSION, \
+ .max_hooks = CR_PLUGIN_HOOK__MAX, \
+ };
+
+static inline int cr_plugin_dummy_init(int stage) { return 0; }
+static inline void cr_plugin_dummy_exit(int stage, int ret) { }
+
+#define CR_PLUGIN_REGISTER_DUMMY(___name) \
+ cr_plugin_desc_t CR_PLUGIN_DESC = { \
+ .name = ___name, \
+ .init = cr_plugin_dummy_init, \
+ .exit = cr_plugin_dummy_exit, \
+ .version = CRIU_PLUGIN_VERSION, \
+ .max_hooks = CR_PLUGIN_HOOK__MAX, \
+ };
+
+#define CR_PLUGIN_REGISTER_HOOK(__hook, __func) \
+static void __attribute__((constructor)) cr_plugin_register_hook_##__func (void) \
+{ \
+ CR_PLUGIN_DESC.hooks[__hook] = (void *)__func; \
+}
+
+/* Public API */
+extern int criu_get_image_dir(void);
+
+/*
+ * Deprecated, will be removed in next version.
+ */
+typedef int (cr_plugin_init_t)(void);
+typedef void (cr_plugin_fini_t)(void);
+typedef int (cr_plugin_dump_unix_sk_t)(int fd, int id);
+typedef int (cr_plugin_restore_unix_sk_t)(int id);
+typedef int (cr_plugin_dump_file_t)(int fd, int id);
+typedef int (cr_plugin_restore_file_t)(int id);
+typedef int (cr_plugin_dump_ext_mount_t)(char *mountpoint, int id);
+typedef int (cr_plugin_restore_ext_mount_t)(int id, char *mountpoint, char *old_root, int *is_file);
+typedef int (cr_plugin_dump_ext_link_t)(int index, int type, char *kind);
+
+#endif /* __CRIU_PLUGIN_H__ */
diff --git a/criu/include/crtools.h b/criu/include/crtools.h
new file mode 100644
index 000000000000..eaa70dcf4e4e
--- /dev/null
+++ b/criu/include/crtools.h
@@ -0,0 +1,32 @@
+#ifndef __CR_CRTOOLS_H__
+#define __CR_CRTOOLS_H__
+
+#include <sys/types.h>
+
+#include "list.h"
+#include "asm/types.h"
+#include "servicefd.h"
+
+#include "protobuf.h"
+#include "protobuf/inventory.pb-c.h"
+
+#define CR_FD_PERM (S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)
+
+extern int check_img_inventory(void);
+extern int write_img_inventory(InventoryEntry *he);
+extern int prepare_inventory(InventoryEntry *he);
+
+#define LAST_PID_PATH "sys/kernel/ns_last_pid"
+
+extern int cr_dump_tasks(pid_t pid);
+extern int cr_pre_dump_tasks(pid_t pid);
+extern int cr_restore_tasks(void);
+extern int cr_show(int pid);
+extern int convert_to_elf(char *elf_path, int fd_core);
+extern int cr_check(void);
+extern int cr_exec(int pid, char **opts);
+extern int cr_dedup(void);
+
+extern int check_add_feature(char *arg);
+
+#endif /* __CR_CRTOOLS_H__ */
diff --git a/criu/include/err.h b/criu/include/err.h
new file mode 100644
index 000000000000..c5b6165a57ab
--- /dev/null
+++ b/criu/include/err.h
@@ -0,0 +1,53 @@
+/*
+ * Adapted from the Linux kernel
+ */
+#ifndef __CR_ERR_H__
+#define __CR_ERR_H__
+
+#include "compiler.h"
+
+/*
+ * The address of a block returned by malloc or realloc in GNU
+ * systems is always a multiple of eight (or sixteen on 64-bit systems).
+ *
+ * Thus we may encode error number in low bits.
+ */
+#define MAX_ERRNO 4095
+
+#define IS_ERR_VALUE(x) unlikely((x) >= (unsigned long)-MAX_ERRNO)
+
+static inline void *ERR_PTR(long error)
+{
+ return (void *)error;
+}
+
+static inline long PTR_ERR(const void *ptr)
+{
+ return (long)ptr;
+}
+
+static inline long IS_ERR(const void *ptr)
+{
+ return IS_ERR_VALUE((unsigned long)ptr);
+}
+
+static inline long IS_ERR_OR_NULL(const void *ptr)
+{
+ return !ptr || IS_ERR_VALUE((unsigned long)ptr);
+}
+
+static inline void *ERR_CAST(const void *ptr)
+{
+ /* cast away the const */
+ return (void *)ptr;
+}
+
+static inline int PTR_RET(const void *ptr)
+{
+ if (IS_ERR(ptr))
+ return PTR_ERR(ptr);
+ else
+ return 0;
+}
+
+#endif /* __CR_ERR_H__ */
diff --git a/criu/include/errno.h b/criu/include/errno.h
new file mode 100644
index 000000000000..5c2322e9fae9
--- /dev/null
+++ b/criu/include/errno.h
@@ -0,0 +1,9 @@
+#ifndef __CR_ERRNO_H__
+#define __CR_ERRNO_H__
+
+#define ERESTARTSYS 512
+#define ERESTARTNOINTR 513
+#define ERESTARTNOHAND 514
+#define ERESTART_RESTARTBLOCK 516
+
+#endif /* __CR_ERRNO_H__ */
diff --git a/criu/include/eventfd.h b/criu/include/eventfd.h
new file mode 100644
index 000000000000..65e0af7cdc13
--- /dev/null
+++ b/criu/include/eventfd.h
@@ -0,0 +1,10 @@
+#ifndef __CR_EVENTFD_H__
+#define __CR_EVENTFD_H__
+
+#include "files.h"
+
+extern int is_eventfd_link(char *link);
+extern const struct fdtype_ops eventfd_dump_ops;
+extern struct collect_image_info eventfd_cinfo;
+
+#endif /* __CR_EVENTFD_H__ */
diff --git a/criu/include/eventpoll.h b/criu/include/eventpoll.h
new file mode 100644
index 000000000000..96a77bc984c0
--- /dev/null
+++ b/criu/include/eventpoll.h
@@ -0,0 +1,11 @@
+#ifndef __CR_EVENTPOLL_H__
+#define __CR_EVENTPOLL_H__
+
+#include "files.h"
+
+extern int is_eventpoll_link(char *link);
+extern const struct fdtype_ops eventpoll_dump_ops;
+extern struct collect_image_info epoll_tfd_cinfo;
+extern struct collect_image_info epoll_cinfo;
+
+#endif /* __CR_EVENTPOLL_H__ */
diff --git a/criu/include/fault-injection.h b/criu/include/fault-injection.h
new file mode 100644
index 000000000000..989f654b2f51
--- /dev/null
+++ b/criu/include/fault-injection.h
@@ -0,0 +1,19 @@
+#ifndef __CR_FAULT_INJECTION_H__
+#define __CR_FAULT_INJECTION_H__
+#include <stdbool.h>
+
+enum faults {
+ FI_NONE = 0,
+ FI_DUMP_EARLY,
+ FI_RESTORE_ROOT_ONLY,
+ FI_MAX,
+};
+
+extern enum faults fi_strategy;
+extern int fault_injection_init(void);
+
+static inline bool fault_injected(enum faults f)
+{
+ return fi_strategy == f;
+}
+#endif
diff --git a/criu/include/fcntl.h b/criu/include/fcntl.h
new file mode 100644
index 000000000000..6f85c5ee6923
--- /dev/null
+++ b/criu/include/fcntl.h
@@ -0,0 +1,36 @@
+#ifndef __CR_ASM_GENERIC_FCNTL_H__
+#define __CR_ASM_GENERIC_FCNTL_H__
+
+#include <sys/types.h>
+#include <fcntl.h>
+
+#ifndef F_SETOWN_EX
+#define F_SETOWN_EX 15
+#define F_GETOWN_EX 16
+
+struct f_owner_ex {
+ int type;
+ pid_t pid;
+};
+
+#endif
+
+#ifndef F_GETOWNER_UIDS
+#define F_GETOWNER_UIDS 17
+#endif
+
+#ifndef F_LINUX_SPECIFIC_BASE
+#define F_LINUX_SPECIFIC_BASE 1024
+#endif
+#ifndef F_SETPIPE_SZ
+# define F_SETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 7)
+#endif
+#ifndef F_GETPIPE_SZ
+# define F_GETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 8)
+#endif
+
+#ifndef O_PATH
+# define O_PATH 010000000
+#endif
+
+#endif /* __CR_ASM_GENERIC_FCNTL_H__ */
diff --git a/criu/include/fifo.h b/criu/include/fifo.h
new file mode 100644
index 000000000000..776265450612
--- /dev/null
+++ b/criu/include/fifo.h
@@ -0,0 +1,11 @@
+#ifndef __CR_FIFO_H__
+#define __CR_FIFO_H__
+
+struct fd_parms;
+struct cr_imgset;
+
+extern const struct fdtype_ops fifo_dump_ops;
+extern struct collect_image_info fifo_cinfo;
+extern int collect_fifo(void);
+
+#endif /* __CR_FIFO_H__ */
diff --git a/criu/include/file-ids.h b/criu/include/file-ids.h
new file mode 100644
index 000000000000..2da4ceffde07
--- /dev/null
+++ b/criu/include/file-ids.h
@@ -0,0 +1,20 @@
+#ifndef __CR_FILE_IDS_H__
+#define __CR_FILE_IDS_H__
+
+#include "compiler.h"
+#include "asm/types.h"
+#include "rbtree.h"
+
+#include "protobuf/fdinfo.pb-c.h"
+
+#define FD_PID_INVALID (-2U)
+#define FD_DESC_INVALID (-3U)
+
+struct fdinfo_entry;
+struct stat;
+
+struct fd_parms;
+extern int fd_id_generate(pid_t pid, FdinfoEntry *fe, struct fd_parms *p);
+extern int fd_id_generate_special(struct fd_parms *p, u32 *id);
+
+#endif /* __CR_FILE_IDS_H__ */
diff --git a/criu/include/file-lock.h b/criu/include/file-lock.h
new file mode 100644
index 000000000000..e771c0ee584c
--- /dev/null
+++ b/criu/include/file-lock.h
@@ -0,0 +1,64 @@
+#ifndef __FILE_LOCK_H__
+#define __FILE_LOCK_H__
+
+#include "list.h"
+
+#include "protobuf.h"
+#include "protobuf/file-lock.pb-c.h"
+
+#define FL_UNKNOWN -1
+#define FL_POSIX 1
+#define FL_FLOCK 2
+
+/* for posix fcntl() and lockf() */
+#ifndef F_RDLCK
+#define F_RDLCK 0
+#define F_WRLCK 1
+#define F_UNLCK 2
+#endif
+
+/* operations for bsd flock(), also used by the kernel implementation */
+#define LOCK_SH 1 /* shared lock */
+#define LOCK_EX 2 /* exclusive lock */
+#define LOCK_NB 4 /* or'd with one of the above to prevent
+ blocking */
+#define LOCK_UN 8 /* remove lock */
+
+#define LOCK_MAND 32 /* This is a mandatory flock ... */
+#define LOCK_READ 64 /* which allows concurrent read operations */
+#define LOCK_WRITE 128 /* which allows concurrent write operations */
+#define LOCK_RW 192 /* which allows concurrent read & write ops */
+
+struct file_lock {
+ long long fl_id;
+ int fl_kind;
+ int fl_ltype;
+
+ pid_t fl_owner;
+ int maj, min;
+ unsigned long i_no;
+ long long start;
+ char end[32];
+
+ struct list_head list; /* list of all file locks */
+
+ int real_owner;
+ int owners_fd;
+};
+
+extern struct list_head file_lock_list;
+
+extern struct file_lock *alloc_file_lock(void);
+extern void free_file_locks(void);
+
+extern int prepare_file_locks(int pid);
+extern struct collect_image_info file_locks_cinfo;
+
+struct pid;
+struct fd_parms;
+extern int note_file_lock(struct pid *, int fd, int lfd, struct fd_parms *);
+extern int dump_file_locks(void);
+
+#define OPT_FILE_LOCKS "file-locks"
+
+#endif /* __FILE_LOCK_H__ */
diff --git a/criu/include/files-reg.h b/criu/include/files-reg.h
new file mode 100644
index 000000000000..e2f611535294
--- /dev/null
+++ b/criu/include/files-reg.h
@@ -0,0 +1,59 @@
+#ifndef __CR_FILES_REG_H__
+#define __CR_FILES_REG_H__
+
+#include "asm/types.h"
+#include "files.h"
+#include "image.h"
+
+#include "protobuf/regfile.pb-c.h"
+#include "protobuf/ghost-file.pb-c.h"
+
+struct cr_imgset;
+struct fd_parms;
+
+struct file_remap {
+ char *rpath;
+ bool is_dir;
+ int rmnt_id;
+ unsigned int users;
+ uid_t owner;
+};
+
+struct reg_file_info {
+ struct file_desc d;
+ RegFileEntry *rfe;
+ struct file_remap *remap;
+ bool size_checked;
+ char *path;
+};
+
+extern int open_reg_by_id(u32 id);
+extern int open_reg_fd(struct file_desc *);
+extern int open_path(struct file_desc *, int (*open_cb)(int ns_root_fd,
+ struct reg_file_info *, void *), void *arg);
+extern void clear_ghost_files(void);
+
+extern int prepare_shared_reg_files(void);
+
+extern const struct fdtype_ops regfile_dump_ops;
+extern int do_open_reg_noseek_flags(int ns_root_fd, struct reg_file_info *rfi, void *arg);
+extern int dump_one_reg_file(int lfd, u32 id, const struct fd_parms *p);
+
+extern struct file_remap *lookup_ghost_remap(u32 dev, u32 ino);
+extern void remap_put(struct file_remap *remap);
+
+extern struct file_desc *try_collect_special_file(u32 id, int optional);
+#define collect_special_file(id) try_collect_special_file(id, 0)
+
+extern int collect_remaps_and_regfiles(void);
+
+extern void delete_link_remaps(void);
+extern void free_link_remaps(void);
+extern int prepare_remaps(void);
+extern void try_clean_remaps(int ns_fd);
+
+extern int strip_deleted(struct fd_link *link);
+
+extern int prepare_procfs_remaps(void);
+
+#endif /* __CR_FILES_REG_H__ */
diff --git a/criu/include/files.h b/criu/include/files.h
new file mode 100644
index 000000000000..9ea234440a1e
--- /dev/null
+++ b/criu/include/files.h
@@ -0,0 +1,183 @@
+#ifndef __CR_FILES_H__
+#define __CR_FILES_H__
+
+#include "compiler.h"
+#include "asm/types.h"
+#include "fcntl.h"
+#include "lock.h"
+#include "list.h"
+#include "image.h"
+#include "pid.h"
+#include "rst_info.h"
+
+#include "protobuf/fdinfo.pb-c.h"
+#include "protobuf/fown.pb-c.h"
+#include "protobuf/vma.pb-c.h"
+
+struct pstree_item;
+struct file_desc;
+struct cr_imgset;
+struct rst_info;
+struct parasite_ctl;
+
+struct fd_link {
+ union {
+ /* Link info for generic file (path) */
+ struct {
+ char name[PATH_MAX + 1];
+ size_t len;
+ };
+
+ /* Link info for proc-ns file */
+ struct {
+ struct ns_desc *ns_d;
+ unsigned int ns_kid;
+ };
+ };
+};
+
+struct fd_parms {
+ int fd;
+ off_t pos;
+ unsigned int flags;
+ char fd_flags;
+ struct stat stat;
+ pid_t pid;
+ FownEntry fown;
+ struct fd_link *link;
+ long fs_type;
+ int mnt_id;
+
+ struct parasite_ctl *ctl;
+};
+
+#define FD_PARMS_INIT \
+(struct fd_parms) { \
+ .fd = FD_DESC_INVALID, \
+ .fown = FOWN_ENTRY__INIT, \
+ .link = NULL, \
+ .mnt_id = -1, \
+}
+
+extern int fill_fdlink(int lfd, const struct fd_parms *p, struct fd_link *link);
+
+struct file_desc;
+
+struct fdinfo_list_entry {
+ struct list_head desc_list; /* To chain on @fd_info_head */
+ struct file_desc *desc; /* Associated file descriptor */
+ struct list_head ps_list; /* To chain per-task files */
+ int pid;
+ futex_t real_pid;
+ FdinfoEntry *fe;
+};
+
+/* reports whether fd_a takes prio over fd_b */
+static inline int fdinfo_rst_prio(struct fdinfo_list_entry *fd_a, struct fdinfo_list_entry *fd_b)
+{
+ return pid_rst_prio(fd_a->pid, fd_b->pid) ||
+ ((fd_a->pid == fd_b->pid) && (fd_a->fe->fd < fd_b->fe->fd));
+}
+
+struct file_desc_ops {
+ /* fd_types from protobuf/fdinfo.proto */
+ unsigned int type;
+ /*
+ * Opens a file by whatever syscall is required for that.
+ * The returned descriptor may be closed (dup2-ed to another)
+ * so it shouldn't be saved for any post-actions.
+ */
+ int (*open)(struct file_desc *d);
+ /*
+ * Called on a file when all files of that type are opened
+ * and with the fd being the "restored" one.
+ */
+ int (*post_open)(struct file_desc *d, int fd);
+ /*
+ * Report whether the fd in question wants a transport socket
+ * in it instead of a real file. See file_master for details.
+ */
+ int (*want_transport)(FdinfoEntry *fe, struct file_desc *d);
+ /*
+ * Called to collect a new fd before adding it on desc. Clients
+ * may chose to collect it to some specific rst_info list. See
+ * prepare_fds() for details.
+ */
+ void (*collect_fd)(struct file_desc *, struct fdinfo_list_entry *,
+ struct rst_info *);
+ char * (*name)(struct file_desc *, char *b, size_t s);
+};
+
+static inline void collect_gen_fd(struct fdinfo_list_entry *fle, struct rst_info *ri)
+{
+ list_add_tail(&fle->ps_list, &ri->fds);
+}
+
+struct file_desc {
+ u32 id; /* File id, unique */
+ struct hlist_node hash; /* Descriptor hashing and lookup */
+ struct list_head fd_info_head; /* Chain of fdinfo_list_entry-s with same ID and type but different pids */
+ struct file_desc_ops *ops; /* Associated operations */
+};
+
+struct fdtype_ops {
+ unsigned int type;
+ int (*dump)(int lfd, u32 id, const struct fd_parms *p);
+ int (*pre_dump)(int pid, int lfd);
+};
+
+struct cr_img;
+
+extern int do_dump_gen_file(struct fd_parms *p, int lfd,
+ const struct fdtype_ops *ops,
+ struct cr_img *);
+struct parasite_drain_fd;
+int dump_task_files_seized(struct parasite_ctl *ctl, struct pstree_item *item,
+ struct parasite_drain_fd *dfds);
+int predump_task_files(int pid);
+
+extern void file_desc_init(struct file_desc *d, u32 id, struct file_desc_ops *ops);
+extern int file_desc_add(struct file_desc *d, u32 id, struct file_desc_ops *ops);
+extern struct fdinfo_list_entry *file_master(struct file_desc *d);
+extern struct file_desc *find_file_desc_raw(int type, u32 id);
+
+extern int send_fd_to_peer(int fd, struct fdinfo_list_entry *fle, int sock);
+extern int restore_fown(int fd, FownEntry *fown);
+extern int rst_file_params(int fd, FownEntry *fown, int flags);
+
+extern void show_saved_files(void);
+
+extern int prepare_fds(struct pstree_item *me);
+extern int prepare_fd_pid(struct pstree_item *me);
+extern int prepare_ctl_tty(int pid, struct rst_info *rst_info, u32 ctl_tty_id);
+extern int prepare_shared_fdinfo(void);
+extern int get_filemap_fd(struct vma_area *);
+extern int restore_fs(struct pstree_item *);
+extern int prepare_fs_pid(struct pstree_item *);
+extern int set_fd_flags(int fd, int flags);
+
+extern int close_old_fds(void);
+#ifndef AT_EMPTY_PATH
+#define AT_EMPTY_PATH 0x1000
+#endif
+
+#define LREMAP_PARAM "link-remap"
+
+extern int shared_fdt_prepare(struct pstree_item *item);
+
+extern struct collect_image_info ext_file_cinfo;
+extern int dump_unsupp_fd(struct fd_parms *p, int lfd,
+ struct cr_img *, char *more, char *info);
+
+extern int inherit_fd_parse(char *optarg);
+extern int inherit_fd_add(int fd, char *key);
+extern void inherit_fd_log(void);
+extern int inherit_fd_resolve_clash(int fd);
+extern int inherit_fd_fini(void);
+
+extern bool external_lookup_id(char *id);
+extern int inherit_fd_lookup_id(char *id);
+
+extern bool inherited_fd(struct file_desc *, int *fdp);
+
+#endif /* __CR_FILES_H__ */
diff --git a/criu/include/fs-magic.h b/criu/include/fs-magic.h
new file mode 100644
index 000000000000..d6e9e54d181d
--- /dev/null
+++ b/criu/include/fs-magic.h
@@ -0,0 +1,52 @@
+#ifndef __CR_FS_MAGIC_H__
+#define __CR_FS_MAGIC_H__
+
+#include <sys/vfs.h>
+
+/*
+ * Gather magic numbers in case distros
+ * do not provide the appropriate entry in
+ * linux/magic.h.
+ */
+
+#ifndef NFS_SUPER_MAGIC
+# define NFS_SUPER_MAGIC 0x6969
+#endif
+
+#ifndef PIPEFS_MAGIC
+# define PIPEFS_MAGIC 0x50495045
+#endif
+
+#ifndef ANON_INODE_FS_MAGIC
+# define ANON_INODE_FS_MAGIC 0x09041934
+#endif
+
+#ifndef TMPFS_MAGIC
+# define TMPFS_MAGIC 0x01021994
+#endif
+
+#ifndef SOCKFS_MAGIC
+# define SOCKFS_MAGIC 0x534f434b
+#endif
+
+#ifndef DEVPTS_SUPER_MAGIC
+#define DEVPTS_SUPER_MAGIC 0x1cd1
+#endif
+
+#ifndef BTRFS_SUPER_MAGIC
+#define BTRFS_SUPER_MAGIC 0x9123683E
+#endif
+
+#ifndef AUFS_SUPER_MAGIC
+#define AUFS_SUPER_MAGIC 0x61756673
+#endif
+
+#ifndef PROC_SUPER_MAGIC
+#define PROC_SUPER_MAGIC 0x9fa0
+#endif
+
+#ifndef BINFMTFS_MAGIC
+#define BINFMTFS_MAGIC 0x42494e4d
+#endif
+
+#endif /* __CR_FS_MAGIC_H__ */
diff --git a/criu/include/fsnotify.h b/criu/include/fsnotify.h
new file mode 100644
index 000000000000..48e3982cf7aa
--- /dev/null
+++ b/criu/include/fsnotify.h
@@ -0,0 +1,26 @@
+#ifndef __CR_FSNOTIFY_H__
+#define __CR_FSNOTIFY_H__
+
+#include "asm/types.h"
+#include "files.h"
+
+#include "protobuf.h"
+#include "protobuf/fsnotify.pb-c.h"
+
+#define KERNEL_FS_EVENT_ON_CHILD 0x08000000
+
+struct fsnotify_params {
+ u32 faflags;
+ u32 evflags;
+};
+
+extern int is_inotify_link(char *link);
+extern int is_fanotify_link(char *link);
+extern const struct fdtype_ops inotify_dump_ops;
+extern const struct fdtype_ops fanotify_dump_ops;
+extern struct collect_image_info inotify_cinfo;
+extern struct collect_image_info inotify_mark_cinfo;
+extern struct collect_image_info fanotify_cinfo;
+extern struct collect_image_info fanotify_mark_cinfo;
+
+#endif /* __CR_FSNOTIFY_H__ */
diff --git a/criu/include/image-desc.h b/criu/include/image-desc.h
new file mode 100644
index 000000000000..532ced8b4167
--- /dev/null
+++ b/criu/include/image-desc.h
@@ -0,0 +1,119 @@
+#ifndef __CR_IMAGE_DESC_H__
+#define __CR_IMAGE_DESC_H__
+
+#include "asm/int.h"
+
+enum {
+ CR_FD_INVENTORY,
+ CR_FD_STATS,
+ /*
+ * Task entries
+ */
+
+ _CR_FD_TASK_FROM,
+ CR_FD_CORE,
+ CR_FD_IDS,
+ CR_FD_MM,
+ CR_FD_SIGACT,
+ CR_FD_CREDS,
+ CR_FD_FS,
+ _CR_FD_TASK_TO,
+
+ CR_FD_PAGEMAP,
+
+ /*
+ * NS entries
+ */
+ CR_FD_UTSNS,
+ CR_FD_MNTS,
+ CR_FD_USERNS,
+
+ _CR_FD_IPCNS_FROM,
+ CR_FD_IPC_VAR,
+ CR_FD_IPCNS_SHM,
+ CR_FD_IPCNS_MSG,
+ CR_FD_IPCNS_SEM,
+ _CR_FD_IPCNS_TO,
+
+ _CR_FD_NETNS_FROM,
+ CR_FD_NETDEV,
+ CR_FD_IFADDR,
+ CR_FD_ROUTE,
+ CR_FD_ROUTE6,
+ CR_FD_RULE,
+ CR_FD_IPTABLES,
+ CR_FD_IP6TABLES,
+ CR_FD_NETNS,
+ CR_FD_NETNF_CT,
+ CR_FD_NETNF_EXP,
+ _CR_FD_NETNS_TO,
+
+ CR_FD_PSTREE,
+ CR_FD_SHMEM_PAGEMAP,
+ CR_FD_GHOST_FILE,
+ CR_FD_TCP_STREAM,
+ CR_FD_FDINFO,
+
+ _CR_FD_GLOB_FROM,
+ CR_FD_SK_QUEUES,
+ CR_FD_REG_FILES,
+ CR_FD_EXT_FILES,
+ CR_FD_NS_FILES,
+ CR_FD_INETSK,
+ CR_FD_UNIXSK,
+ CR_FD_PACKETSK,
+ CR_FD_NETLINK_SK,
+ CR_FD_PIPES,
+ CR_FD_PIPES_DATA,
+ CR_FD_FIFO,
+ CR_FD_FIFO_DATA,
+ CR_FD_TTY_FILES,
+ CR_FD_TTY_INFO,
+ CR_FD_REMAP_FPATH,
+ CR_FD_EVENTFD_FILE,
+ CR_FD_EVENTPOLL_FILE,
+ CR_FD_SIGNALFD,
+ CR_FD_INOTIFY_FILE,
+ CR_FD_FANOTIFY_FILE,
+ CR_FD_TUNFILE,
+ CR_FD_CGROUP,
+ CR_FD_TIMERFD,
+ CR_FD_FILE_LOCKS,
+ CR_FD_SECCOMP,
+ _CR_FD_GLOB_TO,
+
+ CR_FD_TMPFS_IMG,
+ CR_FD_TMPFS_DEV,
+ CR_FD_BINFMT_MISC,
+ CR_FD_PAGES,
+
+ CR_FD_VMAS,
+ CR_FD_PAGES_OLD,
+ CR_FD_SHM_PAGES_OLD,
+ CR_FD_RLIMIT,
+ CR_FD_ITIMERS,
+ CR_FD_POSIX_TIMERS,
+ CR_FD_FILE_LOCKS_PID,
+
+ CR_FD_IRMAP_CACHE,
+ CR_FD_CPUINFO,
+
+ CR_FD_SIGNAL,
+ CR_FD_PSIGNAL,
+ CR_FD_INOTIFY_WD,
+ CR_FD_FANOTIFY_MARK,
+ CR_FD_EVENTPOLL_TFD,
+
+ CR_FD_MAX
+};
+
+/* file descriptors template */
+struct cr_fd_desc_tmpl {
+ const char *fmt; /* format for the name */
+ u32 magic; /* magic in the header */
+ int oflags; /* flags for image_open */
+};
+
+extern struct cr_fd_desc_tmpl imgset_template[CR_FD_MAX];
+
+#endif /* __CR_IMAGE_DESC_H__ */
diff --git a/criu/include/image.h b/criu/include/image.h
new file mode 100644
index 000000000000..305febf5adb6
--- /dev/null
+++ b/criu/include/image.h
@@ -0,0 +1,190 @@
+#ifndef __CR_IMAGE_H__
+#define __CR_IMAGE_H__
+
+#include <stdbool.h>
+
+#include "compiler.h"
+#include "servicefd.h"
+#include "image-desc.h"
+#include "fcntl.h"
+#include "magic.h"
+#include "bfd.h"
+#include "bug.h"
+
+#ifdef _ARCH_PPC64
+#define PAGE_IMAGE_SIZE 65536
+#else
+#define PAGE_IMAGE_SIZE 4096
+#endif /* _ARCH_PPC64 */
+#define PAGE_RSS 1
+#define PAGE_ANON 2
+
+/*
+ * Top bit set in the tgt id means we've remapped
+ * to a ghost file.
+ */
+#define REMAP_GHOST (1 << 31)
+
+/*
+ * By default, when dumping a unix socket, we should dump its peer
+ * as well. Which in turn means, we should dump the task(s) that have
+ * this peer opened.
+ *
+ * Sometimes, we can break this rule and dump only one end of the
+ * unix sockets pair, and on restore time connect() this end back to
+ * its peer.
+ *
+ * So, to resolve this situation we mark the peers we don't dump
+ * as "external" and require the --ext-unix-sk option.
+ */
+
+#define USK_EXTERN (1 << 0)
+#define USK_SERVICE (1 << 1)
+#define USK_CALLBACK (1 << 2)
+#define USK_INHERIT (1 << 3)
+
+/*
+ * VMA_AREA status:
+ *
+ * - none
+ * VmaEntry is just allocated and has not been used
+ * for anything yet
+ * - regular
+ * VmaEntry represent some memory area which should be
+ * dumped and restored; this is a general sign that we
+ * should not skip the area content from processing, in
+ * contrast to special areas such as vsyscall
+ * - stack
+ * the memory area is used in application stack so we
+ * should be careful about guard page here
+ * - vsyscall
+ * special memory area injected into the task memory
+ * space by the kernel itself, represent virtual syscall
+ * implementation and it is specific to every kernel version,
+ * its contents should not be dumped ever
+ * - vdso,vvar
+ * the vDSO area, it might require additional memory
+ * contents modification especially when tasks are
+ * migrating between different kernel versions
+ * - heap
+ * "heap" area in application, currently for information only
+ * - file private
+ * stands for privately memory mapped files
+ * - file shared
+ * stands for shared memory mapped files
+ * - anon shared
+ * represent shared anonymous memory areas
+ * - anon private
+ * represent private anonymous memory areas
+ * - SysV IPC
+ * IPC shared memory area
+ * - socket
+ * memory map for socket
+ * - AIO ring
+ * memory area serves AIO buffers
+ * - unsupported
+ * stands for any unknown memory areas, usually means
+ * we don't know how to work with it and should stop
+ * processing exiting with error; while the rest of bits
+ * are part of image ABI, this particular one must never
+ * be used in image.
+ */
+#define VMA_AREA_NONE (0 << 0)
+#define VMA_AREA_REGULAR (1 << 0)
+#define VMA_AREA_STACK (1 << 1)
+#define VMA_AREA_VSYSCALL (1 << 2)
+#define VMA_AREA_VDSO (1 << 3)
+#define VMA_AREA_HEAP (1 << 5)
+
+#define VMA_FILE_PRIVATE (1 << 6)
+#define VMA_FILE_SHARED (1 << 7)
+#define VMA_ANON_SHARED (1 << 8)
+#define VMA_ANON_PRIVATE (1 << 9)
+
+#define VMA_AREA_SYSVIPC (1 << 10)
+#define VMA_AREA_SOCKET (1 << 11)
+#define VMA_AREA_VVAR (1 << 12)
+#define VMA_AREA_AIORING (1 << 13)
+
+#define VMA_UNSUPP (1 << 31)
+
+#define CR_CAP_SIZE 2
+
+#define TASK_COMM_LEN 16
+
+#define TASK_ALIVE 0x1
+#define TASK_DEAD 0x2
+#define TASK_STOPPED 0x3
+#define TASK_HELPER 0x4
+
+#define CR_PARENT_LINK "parent"
+
+extern bool fdinfo_per_id;
+extern bool ns_per_id;
+extern bool img_common_magic;
+
+#define O_NOBUF (O_DIRECT)
+#define O_SERVICE (O_DIRECTORY)
+#define O_DUMP (O_WRONLY | O_CREAT | O_TRUNC)
+#define O_SHOW (O_RDONLY | O_NOBUF)
+#define O_RSTR (O_RDONLY)
+
+struct cr_img {
+ union {
+ struct bfd _x;
+ struct {
+ int fd; /* should be first to coincide with _x.fd */
+ int type;
+ unsigned long oflags;
+ char *path;
+ };
+ };
+};
+
+#define EMPTY_IMG_FD (-404)
+#define LAZY_IMG_FD (-505)
+
+static inline bool empty_image(struct cr_img *img)
+{
+ return img && img->_x.fd == EMPTY_IMG_FD;
+}
+
+static inline bool lazy_image(struct cr_img *img)
+{
+ return img->_x.fd == LAZY_IMG_FD;
+}
+
+extern int open_image_lazy(struct cr_img *img);
+
+static inline int img_raw_fd(struct cr_img *img)
+{
+ if (lazy_image(img) && open_image_lazy(img))
+ return -1;
+
+ BUG_ON(bfd_buffered(&img->_x));
+ return img->_x.fd;
+}
+
+extern int open_image_dir(char *dir);
+extern void close_image_dir(void);
+
+extern struct cr_img *open_image_at(int dfd, int type, unsigned long flags, ...);
+#define open_image(typ, flags, ...) open_image_at(-1, typ, flags, ##__VA_ARGS__)
+extern int open_image_lazy(struct cr_img *img);
+extern struct cr_img *open_pages_image(unsigned long flags, struct cr_img *pmi);
+extern struct cr_img *open_pages_image_at(int dfd, unsigned long flags, struct cr_img *pmi);
+extern void up_page_ids_base(void);
+
+extern struct cr_img *img_from_fd(int fd); /* for cr-show mostly */
+
+extern int write_img_buf(struct cr_img *, const void *ptr, int size);
+#define write_img(img, ptr) write_img_buf((img), (ptr), sizeof(*(ptr)))
+extern int read_img_buf_eof(struct cr_img *, void *ptr, int size);
+#define read_img_eof(img, ptr) read_img_buf_eof((img), (ptr), sizeof(*(ptr)))
+extern int read_img_buf(struct cr_img *, void *ptr, int size);
+#define read_img(img, ptr) read_img_buf((img), (ptr), sizeof(*(ptr)))
+extern int read_img_str(struct cr_img *, char **pstr, int size);
+
+extern void close_image(struct cr_img *);
+
+#endif /* __CR_IMAGE_H__ */
diff --git a/criu/include/imgset.h b/criu/include/imgset.h
new file mode 100644
index 000000000000..04be917e2dac
--- /dev/null
+++ b/criu/include/imgset.h
@@ -0,0 +1,37 @@
+#ifndef __CR_IMGSET_H__
+#define __CR_IMGSET_H__
+
+#include "image-desc.h"
+#include "bug.h"
+#include "image.h"
+
+struct cr_imgset {
+ int fd_off; /* image type that maps to _imgs[0], see img_from_set() */
+ int fd_nr; /* bound that img_from_set() checks an index against */
+ struct cr_img **_imgs; /* image pointers, indexed by (type - fd_off) */
+};
+
+/* Look up the image of @type in @imgset; BUG()s when @type is out of range. */
+static inline struct cr_img *img_from_set(const struct cr_imgset *imgset, int type)
+{
+ int idx = type - imgset->fd_off;
+
+ /* Reject negative and one-past-the-end indices (fd_nr taken as slot count) */
+ BUG_ON(idx < 0 || idx >= imgset->fd_nr);
+ return imgset->_imgs[idx];
+}
+
+extern struct cr_imgset *glob_imgset;
+
+extern struct cr_fd_desc_tmpl imgset_template[CR_FD_MAX];
+
+extern struct cr_imgset *cr_task_imgset_open(int pid, int mode);
+extern struct cr_imgset *cr_imgset_open_range(int pid, int from, int to,
+ unsigned long flags);
+#define cr_imgset_open(pid, type, flags) cr_imgset_open_range(pid, \
+ _CR_FD_##type##_FROM, _CR_FD_##type##_TO, flags)
+extern struct cr_imgset *cr_glob_imgset_open(int mode);
+
+extern void close_cr_imgset(struct cr_imgset **cr_imgset);
+
+#endif /* __CR_IMGSET_H__ */
diff --git a/criu/include/inet_diag.h b/criu/include/inet_diag.h
new file mode 100644
index 000000000000..95be2c19df84
--- /dev/null
+++ b/criu/include/inet_diag.h
@@ -0,0 +1,136 @@
+#ifndef __CR_INET_DIAG_H__
+#define __CR_INET_DIAG_H__
+
+#include <linux/types.h>
+
+/* Just some random number */
+#define TCPDIAG_GETSOCK 18
+#define DCCPDIAG_GETSOCK 19
+
+#define INET_DIAG_GETSOCK_MAX 24
+
+/* Socket identity */
+struct inet_diag_sockid {
+ __be16 idiag_sport;
+ __be16 idiag_dport;
+ __be32 idiag_src[4];
+ __be32 idiag_dst[4];
+ __u32 idiag_if;
+ __u32 idiag_cookie[2];
+#define INET_DIAG_NOCOOKIE (~0U)
+};
+
+/* Request structure */
+
+struct inet_diag_req_compat {
+ __u8 idiag_family; /* Family of addresses. */
+ __u8 idiag_src_len;
+ __u8 idiag_dst_len;
+ __u8 idiag_ext; /* Query extended information */
+
+ struct inet_diag_sockid id;
+
+ __u32 idiag_states; /* States to dump */
+ __u32 idiag_dbs; /* Tables to dump (NI) */
+};
+
+struct inet_diag_req_v2 {
+ __u8 sdiag_family;
+ __u8 sdiag_protocol;
+ __u8 idiag_ext;
+ __u8 pad;
+ __u32 idiag_states;
+ struct inet_diag_sockid id;
+};
+
+enum {
+ INET_DIAG_REQ_NONE,
+ INET_DIAG_REQ_BYTECODE,
+};
+
+#define INET_DIAG_REQ_MAX INET_DIAG_REQ_BYTECODE
+
+/* Bytecode is a sequence of 4-byte commands followed by variable arguments.
+ * All the commands identified by "code" are conditional jumps forward:
+ * to offset cc+"yes" or to offset cc+"no". "yes" is supposed to be
+ * length of the command and its arguments.
+ */
+
+struct inet_diag_bc_op {
+ unsigned char code;
+ unsigned char yes;
+ unsigned short no;
+};
+
+enum {
+ INET_DIAG_BC_NOP,
+ INET_DIAG_BC_JMP,
+ INET_DIAG_BC_S_GE,
+ INET_DIAG_BC_S_LE,
+ INET_DIAG_BC_D_GE,
+ INET_DIAG_BC_D_LE,
+ INET_DIAG_BC_AUTO,
+ INET_DIAG_BC_S_COND,
+ INET_DIAG_BC_D_COND,
+};
+
+struct inet_diag_hostcond {
+ __u8 family;
+ __u8 prefix_len;
+ int port;
+ __be32 addr[0];
+};
+
+/* Base info structure. It contains socket identity (addrs/ports/cookie)
+ * and, alas, the information shown by netstat. */
+struct inet_diag_msg {
+ __u8 idiag_family;
+ __u8 idiag_state;
+ __u8 idiag_timer;
+ __u8 idiag_retrans;
+
+ struct inet_diag_sockid id;
+
+ __u32 idiag_expires;
+ __u32 idiag_rqueue;
+ __u32 idiag_wqueue;
+ __u32 idiag_uid;
+ __u32 idiag_inode;
+};
+
+/* Extensions */
+
+enum {
+ INET_DIAG_NONE,
+ INET_DIAG_MEMINFO,
+ INET_DIAG_INFO,
+ INET_DIAG_VEGASINFO,
+ INET_DIAG_CONG,
+ INET_DIAG_TOS,
+ INET_DIAG_TCLASS,
+ INET_DIAG_SKMEMINFO,
+ INET_DIAG_SHUTDOWN,
+};
+
+#define INET_DIAG_MAX INET_DIAG_SHUTDOWN
+
+
+/* INET_DIAG_MEM */
+
+struct inet_diag_meminfo {
+ __u32 idiag_rmem;
+ __u32 idiag_wmem;
+ __u32 idiag_fmem;
+ __u32 idiag_tmem;
+};
+
+/* INET_DIAG_VEGASINFO */
+
+struct tcpvegas_info {
+ __u32 tcpv_enabled;
+ __u32 tcpv_rttcnt;
+ __u32 tcpv_rtt;
+ __u32 tcpv_minrtt;
+};
+
+#endif /* __CR_INET_DIAG_H__ */
diff --git a/criu/include/ipc_ns.h b/criu/include/ipc_ns.h
new file mode 100644
index 000000000000..c8909892637c
--- /dev/null
+++ b/criu/include/ipc_ns.h
@@ -0,0 +1,9 @@
+#ifndef __CR_IPC_NS_H__
+#define __CR_IPC_NS_H__
+
+extern int dump_ipc_ns(int ns_id);
+extern int prepare_ipc_ns(int pid);
+
+extern struct ns_desc ipc_ns_desc;
+
+#endif /* __CR_IPC_NS_H__ */
diff --git a/criu/include/irmap.h b/criu/include/irmap.h
new file mode 100644
index 000000000000..033f71e3722a
--- /dev/null
+++ b/criu/include/irmap.h
@@ -0,0 +1,13 @@
+#ifndef __CR_IRMAP__H__
+#define __CR_IRMAP__H__
+char *irmap_lookup(unsigned int s_dev, unsigned long i_ino);
+struct _FhEntry;
+int irmap_queue_cache(unsigned int dev, unsigned long ino,
+ struct _FhEntry *fh);
+int irmap_predump_prep(void);
+int irmap_predump_run(void);
+int check_open_handle(unsigned int s_dev, unsigned long i_ino,
+ struct _FhEntry *f_handle);
+int irmap_load_cache(void);
+int irmap_scan_path_add(char *path);
+#endif
diff --git a/criu/include/kcmp-ids.h b/criu/include/kcmp-ids.h
new file mode 100644
index 000000000000..afe68d6d3285
--- /dev/null
+++ b/criu/include/kcmp-ids.h
@@ -0,0 +1,29 @@
+#ifndef __CR_KCMP_IDS_H__
+#define __CR_KCMP_IDS_H__
+
+#include "kcmp.h"
+
+struct kid_tree {
+ struct rb_root root;
+ unsigned kcmp_type;
+ unsigned long subid;
+
+};
+
+#define DECLARE_KCMP_TREE(name, type) \
+ struct kid_tree name = { \
+ .root = RB_ROOT, \
+ .kcmp_type = type, \
+ .subid = 1, \
+ }
+
+struct kid_elem {
+ int pid;
+ unsigned genid;
+ unsigned idx;
+};
+
+extern u32 kid_generate_gen(struct kid_tree *tree,
+ struct kid_elem *elem, int *new_id);
+
+#endif /* __CR_KCMP_IDS_H__ */
diff --git a/criu/include/kcmp.h b/criu/include/kcmp.h
new file mode 100644
index 000000000000..76f557bff047
--- /dev/null
+++ b/criu/include/kcmp.h
@@ -0,0 +1,16 @@
+#ifndef __CR_KCMP_H__
+#define __CR_KCMP_H__
+
+enum kcmp_type {
+ KCMP_FILE,
+ KCMP_VM,
+ KCMP_FILES,
+ KCMP_FS,
+ KCMP_SIGHAND,
+ KCMP_IO,
+ KCMP_SYSVSEM,
+
+ KCMP_TYPES,
+};
+
+#endif /* __CR_KCMP_H__ */
diff --git a/criu/include/kerndat.h b/criu/include/kerndat.h
new file mode 100644
index 000000000000..a02d15bc169c
--- /dev/null
+++ b/criu/include/kerndat.h
@@ -0,0 +1,58 @@
+#ifndef __CR_KERNDAT_H__
+#define __CR_KERNDAT_H__
+
+#include "asm/types.h"
+
+struct stat;
+
+/*
+ * kerndat stands for "kernel data" and is a collection
+ * of run-time information about current kernel
+ */
+
+extern int kerndat_init(void);
+extern int kerndat_init_rst(void);
+extern int kerndat_get_dirty_track(void);
+extern int kerndat_fdinfo_has_lock(void);
+extern int kerndat_loginuid(bool only_dump);
+
+enum pagemap_func {
+ PM_UNKNOWN,
+ PM_DISABLED, /* /proc/pid/pagemap doesn't open (user mode) */
+ PM_FLAGS_ONLY, /* pagemap zeroes pfn part (user mode) */
+ PM_FULL,
+};
+
+struct kerndat_s {
+ dev_t shmem_dev;
+ int tcp_max_rshare;
+ int last_cap;
+ u64 zero_page_pfn;
+ bool has_dirty_track;
+ bool has_memfd;
+ bool has_fdinfo_lock;
+ unsigned long task_size;
+ bool ipv6;
+ bool has_loginuid;
+ enum pagemap_func pmap;
+};
+
+extern struct kerndat_s kdat;
+
+enum {
+ KERNDAT_FS_STAT_DEVPTS,
+ KERNDAT_FS_STAT_DEVTMPFS,
+ KERNDAT_FS_STAT_BINFMT_MISC,
+
+ KERNDAT_FS_STAT_MAX
+};
+
+/*
+ * Check whether the fs @which with kdevice @kdev
+ * is the same as host's. If yes, this means that
+ * the fs mount is shared with host, if no -- it's
+ * a new (likely virtualized) fs instance.
+ */
+extern int kerndat_fs_virtualized(unsigned int which, u32 kdev);
+
+#endif /* __CR_KERNDAT_H__ */
diff --git a/criu/include/libnetlink.h b/criu/include/libnetlink.h
new file mode 100644
index 000000000000..92eded420b06
--- /dev/null
+++ b/criu/include/libnetlink.h
@@ -0,0 +1,20 @@
+#ifndef __CR_LIBNETLINK_H__
+#define __CR_LIBNETLINK_H__
+
+#define CR_NLMSG_SEQ 24680 /* arbitrarily chosen */
+
+extern int parse_rtattr(struct rtattr *tb[], int max, struct rtattr *rta, int len);
+#define parse_rtattr_nested(tb, max, rta) \
+ (parse_rtattr((tb), (max), RTA_DATA(rta), RTA_PAYLOAD(rta)))
+extern int do_rtnl_req(int nl, void *req, int size,
+ int (*receive_callback)(struct nlmsghdr *h, void *),
+ int (*error_callback)(int err, void *), void *);
+
+extern int addattr_l(struct nlmsghdr *n, int maxlen, int type,
+ const void *data, int alen);
+
+#define NLMSG_TAIL(nmsg) \
+ ((struct rtattr *) (((void *) (nmsg)) + NLMSG_ALIGN((nmsg)->nlmsg_len)))
+
+
+#endif /* __CR_LIBNETLINK_H__ */
diff --git a/criu/include/list.h b/criu/include/list.h
new file mode 100644
index 000000000000..ce3a3c0cd757
--- /dev/null
+++ b/criu/include/list.h
@@ -0,0 +1,423 @@
+#ifndef __CR_LIST_H__
+#define __CR_LIST_H__
+
+/*
+ * Double linked lists.
+ */
+
+#include "compiler.h"
+
+#define POISON_POINTER_DELTA 0
+#define LIST_POISON1 ((void *) 0x00100100 + POISON_POINTER_DELTA)
+#define LIST_POISON2 ((void *) 0x00200200 + POISON_POINTER_DELTA)
+
+struct list_head {
+ struct list_head *prev, *next;
+};
+
+#define LIST_HEAD_INIT(name) { &(name), &(name) }
+#define LIST_HEAD(name) struct list_head name = LIST_HEAD_INIT(name)
+
+static inline void INIT_LIST_HEAD(struct list_head *list)
+{
+ list->next = list;
+ list->prev = list;
+}
+
+static inline void __list_add(struct list_head *new,
+ struct list_head *prev,
+ struct list_head *next)
+{
+ next->prev = new;
+ new->next = next;
+ new->prev = prev;
+ prev->next = new;
+}
+
+static inline void list_add(struct list_head *new, struct list_head *head)
+{
+ __list_add(new, head, head->next);
+}
+
+static inline void list_add_tail(struct list_head *new, struct list_head *head)
+{
+ __list_add(new, head->prev, head);
+}
+
+static inline void __list_del(struct list_head * prev, struct list_head * next)
+{
+ next->prev = prev;
+ prev->next = next;
+}
+
+static inline void __list_del_entry(struct list_head *entry)
+{
+ __list_del(entry->prev, entry->next);
+}
+
+static inline void list_del(struct list_head *entry)
+{
+ __list_del(entry->prev, entry->next);
+ entry->next = LIST_POISON1;
+ entry->prev = LIST_POISON2;
+}
+
+static inline void list_replace(struct list_head *old,
+ struct list_head *new)
+{
+ new->next = old->next;
+ new->next->prev = new;
+ new->prev = old->prev;
+ new->prev->next = new;
+}
+
+static inline void list_replace_init(struct list_head *old,
+ struct list_head *new)
+{
+ list_replace(old, new);
+ INIT_LIST_HEAD(old);
+}
+
+static inline void list_del_init(struct list_head *entry)
+{
+ __list_del_entry(entry);
+ INIT_LIST_HEAD(entry);
+}
+
+static inline void list_move(struct list_head *list, struct list_head *head)
+{
+ __list_del_entry(list);
+ list_add(list, head);
+}
+
+static inline void list_move_tail(struct list_head *list,
+ struct list_head *head)
+{
+ __list_del_entry(list);
+ list_add_tail(list, head);
+}
+
+static inline int list_is_last(const struct list_head *list,
+ const struct list_head *head)
+{
+ return list->next == head;
+}
+
+static inline int list_is_first(const struct list_head *list,
+ const struct list_head *head)
+{
+ return list->prev == head;
+}
+
+static inline int list_empty(const struct list_head *head)
+{
+ return head->next == head;
+}
+
+static inline int list_empty_careful(const struct list_head *head)
+{
+ struct list_head *next = head->next;
+ return (next == head) && (next == head->prev);
+}
+/* Move the first entry of @head to the tail; an empty list is left alone. */
+static inline void list_rotate_left(struct list_head *head)
+{
+ if (list_empty(head))
+ return;
+
+ /* list_move_tail() unlinks the entry and re-adds it just before @head */
+ list_move_tail(head->next, head);
+}
+
+static inline int list_is_singular(const struct list_head *head)
+{
+ return !list_empty(head) && (head->next == head->prev);
+}
+
+static inline void __list_cut_position(struct list_head *list,
+ struct list_head *head, struct list_head *entry)
+{
+ struct list_head *new_first = entry->next;
+ list->next = head->next;
+ list->next->prev = list;
+ list->prev = entry;
+ entry->next = list;
+ head->next = new_first;
+ new_first->prev = head;
+}
+
+static inline void list_cut_position(struct list_head *list,
+ struct list_head *head, struct list_head *entry)
+{
+ if (list_empty(head))
+ return;
+ /* A one-entry list is only cut when @entry is that entry or @head itself */
+ if (list_is_singular(head) && head->next != entry && head != entry)
+ return;
+ if (entry != head)
+ __list_cut_position(list, head, entry);
+ else
+ INIT_LIST_HEAD(list);
+}
+
+static inline void __list_splice(const struct list_head *list,
+ struct list_head *prev,
+ struct list_head *next)
+{
+ struct list_head *first = list->next;
+ struct list_head *last = list->prev;
+
+ first->prev = prev;
+ prev->next = first;
+
+ last->next = next;
+ next->prev = last;
+}
+
+static inline void list_splice(const struct list_head *list,
+ struct list_head *head)
+{
+ if (!list_empty(list))
+ __list_splice(list, head, head->next);
+}
+
+static inline void list_splice_tail(struct list_head *list,
+ struct list_head *head)
+{
+ if (!list_empty(list))
+ __list_splice(list, head->prev, head);
+}
+
+static inline void list_splice_init(struct list_head *list,
+ struct list_head *head)
+{
+ if (!list_empty(list)) {
+ __list_splice(list, head, head->next);
+ INIT_LIST_HEAD(list);
+ }
+}
+
+static inline void list_splice_tail_init(struct list_head *list,
+ struct list_head *head)
+{
+ if (!list_empty(list)) {
+ __list_splice(list, head->prev, head);
+ INIT_LIST_HEAD(list);
+ }
+}
+
+#define list_entry(ptr, type, member) \
+ container_of(ptr, type, member)
+
+#define list_first_entry(ptr, type, member) \
+ list_entry((ptr)->next, type, member)
+
+#define list_for_each(pos, head) \
+ for (pos = (head)->next; pos != (head); pos = pos->next)
+
+#define __list_for_each(pos, head) \
+ for (pos = (head)->next; pos != (head); pos = pos->next)
+
+#define list_for_each_prev(pos, head) \
+ for (pos = (head)->prev; pos != (head); pos = pos->prev)
+
+#define list_for_each_safe(pos, n, head) \
+ for (pos = (head)->next, n = pos->next; pos != (head); \
+ pos = n, n = pos->next)
+
+#define list_for_each_prev_safe(pos, n, head) \
+ for (pos = (head)->prev, n = pos->prev; \
+ pos != (head); \
+ pos = n, n = pos->prev)
+
+#define list_for_each_entry(pos, head, member) \
+ for (pos = list_entry((head)->next, typeof(*pos), member); \
+ &pos->member != (head); \
+ pos = list_entry(pos->member.next, typeof(*pos), member))
+
+#define list_for_each_entry_reverse(pos, head, member) \
+ for (pos = list_entry((head)->prev, typeof(*pos), member); \
+ &pos->member != (head); \
+ pos = list_entry(pos->member.prev, typeof(*pos), member))
+
+#define list_prepare_entry(pos, head, member) \
+ ((pos) ? : list_entry(head, typeof(*pos), member))
+
+#define list_for_each_entry_continue(pos, head, member) \
+ for (pos = list_entry(pos->member.next, typeof(*pos), member); \
+ &pos->member != (head); \
+ pos = list_entry(pos->member.next, typeof(*pos), member))
+
+#define list_for_each_entry_continue_reverse(pos, head, member) \
+ for (pos = list_entry(pos->member.prev, typeof(*pos), member); \
+ &pos->member != (head); \
+ pos = list_entry(pos->member.prev, typeof(*pos), member))
+
+#define list_for_each_entry_from(pos, head, member) \
+ for (; &pos->member != (head); \
+ pos = list_entry(pos->member.next, typeof(*pos), member))
+
+#define list_for_each_entry_safe(pos, n, head, member) \
+ for (pos = list_entry((head)->next, typeof(*pos), member), \
+ n = list_entry(pos->member.next, typeof(*pos), member); \
+ &pos->member != (head); \
+ pos = n, n = list_entry(n->member.next, typeof(*n), member))
+
+#define list_for_each_entry_safe_continue(pos, n, head, member) \
+ for (pos = list_entry(pos->member.next, typeof(*pos), member), \
+ n = list_entry(pos->member.next, typeof(*pos), member); \
+ &pos->member != (head); \
+ pos = n, n = list_entry(n->member.next, typeof(*n), member))
+
+#define list_for_each_entry_safe_from(pos, n, head, member) \
+ for (n = list_entry(pos->member.next, typeof(*pos), member); \
+ &pos->member != (head); \
+ pos = n, n = list_entry(n->member.next, typeof(*n), member))
+
+#define list_for_each_entry_safe_reverse(pos, n, head, member) \
+ for (pos = list_entry((head)->prev, typeof(*pos), member), \
+ n = list_entry(pos->member.prev, typeof(*pos), member); \
+ &pos->member != (head); \
+ pos = n, n = list_entry(n->member.prev, typeof(*n), member))
+
+#define list_safe_reset_next(pos, n, member) \
+ n = list_entry(pos->member.next, typeof(*pos), member)
+
+/*
+ * Double linked lists with a single pointer list head.
+ */
+
+struct hlist_head {
+ struct hlist_node *first;
+};
+
+struct hlist_node {
+ struct hlist_node *next, **pprev;
+};
+
+#define HLIST_HEAD_INIT { .first = NULL }
+#define HLIST_HEAD(name) struct hlist_head name = { .first = NULL }
+#define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL)
+
+static inline void INIT_HLIST_NODE(struct hlist_node *h)
+{
+ h->next = NULL;
+ h->pprev = NULL;
+}
+
+static inline int hlist_unhashed(const struct hlist_node *h)
+{
+ return !h->pprev;
+}
+
+static inline int hlist_empty(const struct hlist_head *h)
+{
+ return !h->first;
+}
+
+static inline void __hlist_del(struct hlist_node *n)
+{
+ struct hlist_node *next = n->next;
+ struct hlist_node **pprev = n->pprev;
+ *pprev = next;
+ if (next)
+ next->pprev = pprev;
+}
+
+static inline void hlist_del(struct hlist_node *n)
+{
+ __hlist_del(n);
+ n->next = LIST_POISON1;
+ n->pprev = LIST_POISON2;
+}
+
+static inline void hlist_del_init(struct hlist_node *n)
+{
+ if (!hlist_unhashed(n)) {
+ __hlist_del(n);
+ INIT_HLIST_NODE(n);
+ }
+}
+
+static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h)
+{
+ struct hlist_node *first = h->first;
+ n->next = first;
+ if (first)
+ first->pprev = &n->next;
+ h->first = n;
+ n->pprev = &h->first;
+}
+
+/* next must be != NULL */
+static inline void hlist_add_before(struct hlist_node *n,
+ struct hlist_node *next)
+{
+ n->pprev = next->pprev;
+ n->next = next;
+ next->pprev = &n->next;
+ *(n->pprev) = n;
+}
+
+static inline void hlist_add_after(struct hlist_node *n,
+ struct hlist_node *next)
+{
+ next->next = n->next;
+ n->next = next;
+ next->pprev = &n->next;
+
+ if (next->next)
+ next->next->pprev = &next->next;
+}
+
+/* after that we'll appear to be on some hlist and hlist_del will work */
+static inline void hlist_add_fake(struct hlist_node *n)
+{
+ n->pprev = &n->next;
+}
+
+/*
+ * Move a list from one list head to another. Fixup the pprev
+ * reference of the first entry if it exists.
+ */
+static inline void hlist_move_list(struct hlist_head *old,
+ struct hlist_head *new)
+{
+ new->first = old->first;
+ if (new->first)
+ new->first->pprev = &new->first;
+ old->first = NULL;
+}
+
+#define hlist_entry(ptr, type, member) container_of(ptr,type,member)
+
+#define hlist_for_each(pos, head) \
+ for (pos = (head)->first; pos ; pos = pos->next)
+
+#define hlist_for_each_safe(pos, n, head) \
+ for (pos = (head)->first; pos && ({ n = pos->next; 1; }); \
+ pos = n)
+
+/* NULL-tolerant hlist_entry(); parenthesized so ?: cannot bind to neighbours */
+#define hlist_entry_safe(ptr, type, member) ((ptr) ? hlist_entry(ptr, type, member) : NULL)
+
+#define hlist_for_each_entry(pos, head, member) \
+ for (pos = hlist_entry_safe((head)->first, typeof(*(pos)), member); \
+ pos; \
+ pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member))
+
+#define hlist_for_each_entry_continue(pos, member) \
+ for (pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member);\
+ pos; \
+ pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member))
+
+#define hlist_for_each_entry_from(pos, member) \
+ for (; pos; \
+ pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member))
+
+#define hlist_for_each_entry_safe(pos, n, head, member) \
+ for (pos = hlist_entry_safe((head)->first, typeof(*pos), member); \
+ pos && ({ n = pos->member.next; 1; }); \
+ pos = hlist_entry_safe(n, typeof(*pos), member))
+
+#endif /* __CR_LIST_H__ */
diff --git a/criu/include/lock.h b/criu/include/lock.h
new file mode 100644
index 000000000000..1678d10ef988
--- /dev/null
+++ b/criu/include/lock.h
@@ -0,0 +1,157 @@
+#ifndef __CR_LOCK_H__
+#define __CR_LOCK_H__
+
+#include <linux/futex.h>
+#include <sys/time.h>
+#include <limits.h>
+#include <errno.h>
+
+#include "asm/types.h"
+#include "asm/atomic.h"
+#include "bug.h"
+
+#ifdef CR_NOGLIBC
+# include "syscall.h"
+#else
+# include <sys/syscall.h>
+/* Raw futex(2) wrapper: yields the syscall result, or -errno when it is -1. */
+static inline long sys_futex(void *addr1, int op, int val1,
+ struct timespec *timeout, void *addr2, int val3)
+{
+ int res = syscall(SYS_futex, addr1, op, val1, timeout, addr2, val3);
+ return (res == -1) ? -errno : res;
+}
+#endif
+
+typedef struct {
+ atomic_t raw;
+} futex_t;
+
+#define FUTEX_ABORT_FLAG (0x80000000)
+#define FUTEX_ABORT_RAW (-1U)
+
+/* Get current futex @f value */
+static inline u32 futex_get(futex_t *f)
+{
+ return atomic_read(&f->raw);
+}
+
+/* Set futex @f value to @v */
+static inline void futex_set(futex_t *f, u32 v)
+{
+ atomic_set(&f->raw, (int)v);
+}
+
+#define futex_init(f) futex_set(f, 0)
+
+/* Wait until futex @__f satisfies "value @__cond @__v" or has the abort flag set */
+#define futex_wait_if_cond(__f, __v, __cond) \
+ do { \
+ int __ret; \
+ u32 __tmp; \
+ /* __-prefixed locals cannot capture identifiers used in @__v */ \
+ while (1) { \
+ struct timespec __to = {.tv_sec = 120}; \
+ __tmp = (u32)atomic_read(&(__f)->raw); \
+ if ((__tmp & FUTEX_ABORT_FLAG) || \
+ (__tmp __cond (__v))) \
+ break; \
+ __ret = sys_futex((u32 *)&(__f)->raw.counter, FUTEX_WAIT,\
+ __tmp, &__to, NULL, 0); \
+ if (__ret == -ETIMEDOUT) { \
+ pr_warn("blocked for more than 120 seconds\n"); \
+ continue; \
+ } \
+ if (__ret == -EINTR || __ret == -EWOULDBLOCK) \
+ continue; \
+ if (__ret < 0) { \
+ pr_err("futex() returned an unexpected error: %d\n", __ret); \
+ BUG(); \
+ } \
+ } \
+ } while (0)
+
+/* Set futex @f to @v and wake up all waiters */
+static inline void futex_set_and_wake(futex_t *f, u32 v)
+{
+ atomic_set(&f->raw, (int)v);
+ BUG_ON(sys_futex((u32 *)&f->raw.counter, FUTEX_WAKE, INT_MAX, NULL, NULL, 0) < 0);
+}
+
+/* Mark futex @f as wait abort needed and wake up all waiters */
+static inline void futex_abort_and_wake(futex_t *f)
+{
+ BUILD_BUG_ON(!(FUTEX_ABORT_RAW & FUTEX_ABORT_FLAG));
+ futex_set_and_wake(f, FUTEX_ABORT_RAW);
+}
+
+/* Decrement futex @f value and wake up all waiters */
+static inline void futex_dec_and_wake(futex_t *f)
+{
+ atomic_dec(&f->raw);
+ BUG_ON(sys_futex((u32 *)&f->raw.counter, FUTEX_WAKE, INT_MAX, NULL, NULL, 0) < 0);
+}
+
+/* Increment futex @f value and wake up all waiters */
+static inline void futex_inc_and_wake(futex_t *f)
+{
+ atomic_inc(&f->raw);
+ BUG_ON(sys_futex((u32 *)&f->raw.counter, FUTEX_WAKE, INT_MAX, NULL, NULL, 0) < 0);
+}
+
+/* Plain increment futex @f value */
+static inline void futex_inc(futex_t *f) { atomic_inc(&f->raw); }
+
+/* Plain decrement futex @f value */
+static inline void futex_dec(futex_t *f) { atomic_dec(&f->raw); }
+
+/* Wait until futex @f value becomes @v */
+#define futex_wait_until(f, v) futex_wait_if_cond(f, v, ==)
+
+/* Wait while futex @f value is greater than @v */
+#define futex_wait_while_gt(f, v) futex_wait_if_cond(f, v, <=)
+
+/* Wait while futex @f value is less than @v */
+#define futex_wait_while_lt(f, v) futex_wait_if_cond(f, v, >=)
+
+/* Wait while futex @f value is equal to @v */
+#define futex_wait_while_eq(f, v) futex_wait_if_cond(f, v, !=)
+
+/* Sleep while futex @f equals @v; no timeout and no abort-flag check here */
+static inline void futex_wait_while(futex_t *f, u32 v)
+{
+ while ((u32)atomic_read(&f->raw) == v) {
+ int ret = sys_futex((u32 *)&f->raw.counter, FUTEX_WAIT, v, NULL, NULL, 0);
+ BUG_ON(ret < 0 && ret != -EWOULDBLOCK);
+ }
+}
+
+typedef struct {
+ atomic_t raw;
+} mutex_t;
+
+static inline void mutex_init(mutex_t *m)
+{
+ /* 0 is the unlocked state: mutex_lock() succeeds on the 0 -> 1 step */
+ atomic_set(&m->raw, 0);
+}
+
+static inline void mutex_lock(mutex_t *m)
+{
+ u32 seen;
+ /* Moving the counter 0 -> 1 takes the mutex; any other value means wait */
+ while ((seen = (u32)atomic_inc_return(&m->raw)) != 1) {
+ int ret = sys_futex((u32 *)&m->raw.counter, FUTEX_WAIT, seen, NULL, NULL, 0);
+
+ BUG_ON(ret < 0 && ret != -EWOULDBLOCK);
+ }
+}
+
+static inline void mutex_unlock(mutex_t *m)
+{
+ /* Reset to 0 (unlocked), then issue FUTEX_WAKE for at most one waiter */
+ atomic_set(&m->raw, 0);
+ BUG_ON(sys_futex((u32 *)&m->raw.counter, FUTEX_WAKE, 1, NULL, NULL, 0) < 0);
+}
+
+#endif /* __CR_LOCK_H__ */
diff --git a/criu/include/log.h b/criu/include/log.h
new file mode 100644
index 000000000000..fe53a7c928b1
--- /dev/null
+++ b/criu/include/log.h
@@ -0,0 +1,41 @@
+#ifndef __CR_LOG_H__
+#define __CR_LOG_H__
+
+#include <inttypes.h>
+
+#include "criu-log.h"
+
+extern int log_init(const char *output);
+extern void log_fini(void);
+extern int log_init_by_pid(void);
+extern void log_closedir(void);
+
+extern void log_set_fd(int fd);
+extern int log_get_fd(void);
+
+extern void log_set_loglevel(unsigned int loglevel);
+extern unsigned int log_get_loglevel(void);
+
+#define LOG_SIMPLE_CHUNK 72
+
+extern int vprint_num(char *buf, int blen, int num, char **ps);
+extern void simple_sprintf(char output[LOG_SIMPLE_CHUNK], const char *format, ...)
+ __attribute__ ((__format__ (__printf__, 2, 3)));
+
+extern int write_pidfile(int pid);
+
+#define DEFAULT_LOGLEVEL LOG_WARN
+
+#define DEFAULT_LOG_FILENAME "criu.log"
+
+struct cr_img;
+
+extern void print_data(unsigned long addr, unsigned char *data, size_t size);
+extern void print_image_data(struct cr_img *, unsigned int length, int show);
+
+static inline int pr_quelled(unsigned int loglevel)
+{
+ return log_get_loglevel() < loglevel && loglevel != LOG_MSG;
+}
+
+#endif /* __CR_LOG_H__ */
diff --git a/criu/include/lsm.h b/criu/include/lsm.h
new file mode 100644
index 000000000000..bd13ef70b4c6
--- /dev/null
+++ b/criu/include/lsm.h
@@ -0,0 +1,35 @@
+#ifndef __CR_LSM_H__
+#define __CR_LSM_H__
+
+#include "protobuf/inventory.pb-c.h"
+#include "protobuf/creds.pb-c.h"
+
+/*
+ * Get the Lsmtype for the current host.
+ */
+extern Lsmtype host_lsm_type(void);
+
+/*
+ * Initialize the Lsmtype for the current host
+ */
+extern void kerndat_lsm(void);
+
+/*
+ * Read the LSM profile for the pstree item
+ */
+extern int collect_lsm_profile(pid_t, CredsEntry *);
+
+/*
+ * Validate that the LSM profiles can be correctly applied (must happen after
+ * pstree is set up).
+ */
+int validate_lsm(char *profile);
+
+/*
+ * Render the profile name in the way that the LSM wants it written to
+ * /proc/<pid>/attr/current.
+ */
+int render_lsm_profile(char *profile, char **val);
+
+extern int parse_lsm_arg(char *arg);
+#endif /* __CR_LSM_H__ */
diff --git a/criu/include/magic.h b/criu/include/magic.h
new file mode 100644
index 000000000000..b11a70eddebe
--- /dev/null
+++ b/criu/include/magic.h
@@ -0,0 +1,115 @@
+#ifndef __CR_MAGIC_H__
+#define __CR_MAGIC_H__
+
+/*
+ * Basic multi-file images
+ */
+
+#define CRTOOLS_IMAGES_V1 1
+/*
+ * v1.1 has common magic in the head of each image file,
+ * except for inventory
+ */
+#define CRTOOLS_IMAGES_V1_1 2
+
+/*
+ * Raw images are images in which data is stored in some
+ * non-crtool format (ip tool dumps, tarballs, etc.)
+ */
+
+#define RAW_IMAGE_MAGIC 0x0
+
+/*
+ * Images have the IMG_COMMON_MAGIC in the head. Service files
+ * such as stats and irmap-cache have the IMG_SERVICE_MAGIC.
+ */
+
+#define IMG_COMMON_MAGIC 0x54564319 /* Sarov (a.k.a. Arzamas-16) */
+#define IMG_SERVICE_MAGIC 0x55105940 /* Zlatoust */
+
+/*
+ * The magic-s below correspond to coordinates
+ * of various Russian towns in the NNNNEEEE form.
+ */
+
+#define INVENTORY_MAGIC 0x58313116 /* Veliky Novgorod */
+#define PSTREE_MAGIC 0x50273030 /* Kyiv */
+#define FDINFO_MAGIC 0x56213732 /* Dmitrov */
+#define PAGEMAP_MAGIC 0x56084025 /* Vladimir */
+#define SHMEM_PAGEMAP_MAGIC PAGEMAP_MAGIC
+#define PAGES_MAGIC RAW_IMAGE_MAGIC
+#define CORE_MAGIC 0x55053847 /* Kolomna */
+#define IDS_MAGIC 0x54432030 /* Konigsberg */
+#define VMAS_MAGIC 0x54123737 /* Tula */
+#define PIPES_MAGIC 0x56513555 /* Tver */
+#define PIPES_DATA_MAGIC 0x56453709 /* Dubna */
+#define FIFO_MAGIC 0x58364939 /* Kirov */
+#define FIFO_DATA_MAGIC 0x59333054 /* Tosno */
+#define SIGACT_MAGIC 0x55344201 /* Murom */
+#define UNIXSK_MAGIC 0x54373943 /* Ryazan */
+#define INETSK_MAGIC 0x56443851 /* Pereslavl */
+#define PACKETSK_MAGIC 0x60454618 /* Veliky Ustyug */
+#define ITIMERS_MAGIC 0x57464056 /* Kostroma */
+#define POSIX_TIMERS_MAGIC 0x52603957 /* Lipetsk */
+#define SK_QUEUES_MAGIC 0x56264026 /* Suzdal */
+#define UTSNS_MAGIC 0x54473203 /* Smolensk */
+#define CREDS_MAGIC 0x54023547 /* Kozelsk */
+#define IPC_VAR_MAGIC 0x53115007 /* Samara */
+#define IPCNS_SHM_MAGIC 0x46283044 /* Odessa */
+#define IPCNS_MSG_MAGIC 0x55453737 /* Moscow */
+#define IPCNS_SEM_MAGIC 0x59573019 /* St. Petersburg */
+#define REG_FILES_MAGIC 0x50363636 /* Belgorod */
+#define EXT_FILES_MAGIC 0x59255641 /* Usolye */
+#define FS_MAGIC 0x51403912 /* Voronezh */
+#define MM_MAGIC 0x57492820 /* Pskov */
+#define REMAP_FPATH_MAGIC 0x59133954 /* Vologda */
+#define GHOST_FILE_MAGIC 0x52583605 /* Oryol */
+#define TCP_STREAM_MAGIC 0x51465506 /* Orenburg */
+#define EVENTFD_FILE_MAGIC 0x44523722 /* Anapa */
+#define EVENTPOLL_FILE_MAGIC 0x45023858 /* Krasnodar */
+#define EVENTPOLL_TFD_MAGIC 0x44433746 /* Novorossiysk */
+#define SIGNALFD_MAGIC 0x57323820 /* Uglich */
+#define INOTIFY_FILE_MAGIC 0x48424431 /* Volgograd */
+#define INOTIFY_WD_MAGIC 0x54562009 /* Svetlogorsk (Rauschen) */
+#define MNTS_MAGIC 0x55563928 /* Petushki */
+#define NETDEV_MAGIC 0x57373951 /* Yaroslavl */
+#define NETNS_MAGIC 0x55933752 /* Dolgoprudny */
+#define TTY_FILES_MAGIC 0x59433025 /* Pushkin */
+#define TTY_INFO_MAGIC 0x59453036 /* Kolpino */
+#define FILE_LOCKS_MAGIC 0x54323616 /* Kaluga */
+#define RLIMIT_MAGIC 0x57113925 /* Rostov */
+#define FANOTIFY_FILE_MAGIC 0x55096122 /* Chelyabinsk */
+#define FANOTIFY_MARK_MAGIC 0x56506035 /* Yekaterinburg */
+#define SIGNAL_MAGIC 0x59255647 /* Berezniki */
+#define PSIGNAL_MAGIC SIGNAL_MAGIC
+#define NETLINK_SK_MAGIC 0x58005614 /* Perm */
+#define NS_FILES_MAGIC 0x61394011 /* Nyandoma */
+#define TUNFILE_MAGIC 0x57143751 /* Kalyazin */
+#define CGROUP_MAGIC 0x59383330 /* Tikhvin */
+#define TIMERFD_MAGIC 0x50493712 /* Korocha */
+#define CPUINFO_MAGIC 0x61404013 /* Nyandoma */
+#define USERNS_MAGIC 0x55474906 /* Kazan */
+#define SECCOMP_MAGIC 0x64413049 /* Kostomuksha */
+#define BINFMT_MISC_MAGIC 0x67343323 /* Apatity */
+
+#define IFADDR_MAGIC RAW_IMAGE_MAGIC
+#define ROUTE_MAGIC RAW_IMAGE_MAGIC
+#define ROUTE6_MAGIC RAW_IMAGE_MAGIC
+#define RULE_MAGIC RAW_IMAGE_MAGIC
+#define TMPFS_IMG_MAGIC RAW_IMAGE_MAGIC
+#define TMPFS_DEV_MAGIC RAW_IMAGE_MAGIC
+#define IPTABLES_MAGIC RAW_IMAGE_MAGIC
+#define IP6TABLES_MAGIC RAW_IMAGE_MAGIC
+#define NETNF_CT_MAGIC RAW_IMAGE_MAGIC
+#define NETNF_EXP_MAGIC RAW_IMAGE_MAGIC
+
+#define PAGES_OLD_MAGIC PAGEMAP_MAGIC
+#define SHM_PAGES_OLD_MAGIC PAGEMAP_MAGIC
+
+/*
+ * These are special files, not exactly images
+ */
+#define STATS_MAGIC 0x57093306 /* Ostashkov */
+#define IRMAP_CACHE_MAGIC 0x57004059 /* Ivanovo */
+
+#endif /* __CR_MAGIC_H__ */
diff --git a/criu/include/mem.h b/criu/include/mem.h
new file mode 100644
index 000000000000..5269cad029cf
--- /dev/null
+++ b/criu/include/mem.h
@@ -0,0 +1,27 @@
+#ifndef __CR_MEM_H__
+#define __CR_MEM_H__
+
+struct parasite_ctl;
+struct vm_area_list;
+struct page_pipe;
+struct pstree_item;
+
+extern int prepare_mm_pid(struct pstree_item *i);
+extern int do_task_reset_dirty_track(int pid);
+extern unsigned int dump_pages_args_size(struct vm_area_list *vmas);
+extern int parasite_dump_pages_seized(struct parasite_ctl *ctl,
+ struct vm_area_list *vma_area_list,
+ struct page_pipe **pp);
+
+#define PME_PRESENT (1ULL << 63)
+#define PME_SWAP (1ULL << 62)
+#define PME_FILE (1ULL << 61)
+#define PME_SOFT_DIRTY (1ULL << 55)
+#define PME_PSHIFT_BITS (6)
+#define PME_STATUS_BITS (3)
+#define PME_STATUS_OFFSET (64 - PME_STATUS_BITS)
+#define PME_PSHIFT_OFFSET (PME_STATUS_OFFSET - PME_PSHIFT_BITS)
+#define PME_PFRAME_MASK ((1ULL << PME_PSHIFT_OFFSET) - 1)
+#define PME_PFRAME(x) ((x) & PME_PFRAME_MASK)
+
+#endif /* __CR_MEM_H__ */
diff --git a/criu/include/mman.h b/criu/include/mman.h
new file mode 100644
index 000000000000..340d36927152
--- /dev/null
+++ b/criu/include/mman.h
@@ -0,0 +1,17 @@
+#ifndef __CR_MMAN_H__
+#define __CR_MMAN_H__
+
+#ifndef MAP_HUGETLB
+# define MAP_HUGETLB 0x40000
+#endif
+#ifndef MADV_HUGEPAGE
+# define MADV_HUGEPAGE 14
+#endif
+#ifndef MADV_NOHUGEPAGE
+# define MADV_NOHUGEPAGE 15
+#endif
+#ifndef MADV_DONTDUMP
+# define MADV_DONTDUMP 16
+#endif
+
+#endif /* __CR_MMAN_H__ */
diff --git a/criu/include/mount.h b/criu/include/mount.h
new file mode 100644
index 000000000000..b3bbdcea53c7
--- /dev/null
+++ b/criu/include/mount.h
@@ -0,0 +1,129 @@
+#ifndef __CR_MOUNT_H__
+#define __CR_MOUNT_H__
+
+#include <sys/types.h>
+
+#include "asm/types.h"
+#include "list.h"
+
+struct proc_mountinfo;
+struct pstree_item;
+struct fstype;
+struct ns_id;
+
+/*
+ * Structure to keep external mount points resolving info.
+ *
+ * On dump the key is the mountpoint as seen from the mount
+ * namespace, the val is some name that will be put into image
+ * instead of the mount point's root path.
+ *
+ * On restore the key is the name from the image (the one
+ * mentioned above) and the val is the path in criu's mount
+ * namespace that will become the mount point's root, i.e. --
+ * be bind mounted to the respective mountpoint.
+ */
+struct ext_mount {
+ struct list_head list;
+ char *key;
+ char *val;
+};
+
+#define MOUNT_INVALID_DEV (0)
+
+struct mount_info {
+ int mnt_id;
+ int parent_mnt_id;
+ unsigned int s_dev;
+ unsigned int s_dev_rt;
+ char *root;
+ /*
+ * During dump mountpoint contains path with dot at the
+ * beginning. It allows to use openat, statat, etc without
+ * creating a temporary copy of the path.
+ *
+ * On restore mountpoint is prepended with so called ns
+ * root path -- it's a place in fs where the namespace
+ * mount tree is constructed. Check mnt_roots for details.
+ * The ns_mountpoint contains path w/o this prefix.
+ */
+ char *mountpoint;
+ char *ns_mountpoint;
+ unsigned flags;
+ unsigned sb_flags;
+ int master_id;
+ int shared_id;
+ struct fstype *fstype;
+ char *source;
+ char *options;
+ union {
+ bool mounted;
+ bool dumped;
+ };
+ bool need_plugin;
+ bool is_ns_root;
+ bool deleted;
+ struct mount_info *next;
+ struct ns_id *nsid;
+
+ struct ext_mount *external;
+ bool internal_sharing;
+
+ /* tree linkage */
+ struct mount_info *parent;
+ struct mount_info *bind;
+ struct list_head children;
+ struct list_head siblings;
+
+ struct list_head mnt_bind; /* circular list of derivatives of one real mount */
+ struct list_head mnt_share; /* circular list of shared mounts */
+ struct list_head mnt_slave_list; /* list of slave mounts */
+ struct list_head mnt_slave; /* slave list entry */
+ struct mount_info *mnt_master; /* slave is on master->mnt_slave_list */
+
+ struct list_head postpone;
+
+ void *private; /* associated filesystem data */
+};
+
+extern struct mount_info *mntinfo;
+extern struct ns_desc mnt_ns_desc;
+
+extern struct mount_info *mnt_entry_alloc(void);
+extern void mnt_entry_free(struct mount_info *mi);
+
+extern int __mntns_get_root_fd(pid_t pid);
+extern int mntns_get_root_fd(struct ns_id *ns);
+extern int mntns_get_root_by_mnt_id(int mnt_id);
+extern struct ns_id *lookup_nsid_by_mnt_id(int mnt_id);
+
+extern int open_mount(unsigned int s_dev);
+extern int __open_mountpoint(struct mount_info *pm, int mnt_fd);
+extern struct fstype *find_fstype_by_name(char *fst);
+extern bool add_fsname_auto(const char *names);
+
+extern struct mount_info *collect_mntinfo(struct ns_id *ns, bool for_dump);
+extern int prepare_mnt_ns(void);
+
+extern int pivot_root(const char *new_root, const char *put_old);
+
+extern struct mount_info *lookup_overlayfs(char *rpath, unsigned int s_dev,
+ unsigned int st_ino, unsigned int mnt_id);
+extern struct mount_info *lookup_mnt_id(unsigned int id);
+extern struct mount_info *lookup_mnt_sdev(unsigned int s_dev);
+
+extern dev_t phys_stat_resolve_dev(struct ns_id *, dev_t st_dev, const char *path);
+extern bool phys_stat_dev_match(dev_t st_dev, dev_t phys_dev,
+ struct ns_id *, const char *path);
+
+extern int restore_task_mnt_ns(struct pstree_item *current);
+extern void fini_restore_mntns(void);
+extern int depopulate_roots_yard(void);
+
+extern int rst_get_mnt_root(int mnt_id, char *path, int plen);
+extern int ext_mount_add(char *key, char *val);
+extern int mntns_maybe_create_roots(void);
+extern int read_mnt_ns_img(void);
+extern void cleanup_mnt_ns(void);
+
+#endif /* __CR_MOUNT_H__ */
diff --git a/criu/include/namespaces.h b/criu/include/namespaces.h
new file mode 100644
index 000000000000..4ce5a3470b98
--- /dev/null
+++ b/criu/include/namespaces.h
@@ -0,0 +1,130 @@
+#ifndef __CR_NS_H__
+#define __CR_NS_H__
+
+#include "compiler.h"
+#include "files.h"
+
+/* Nested namespaces are supported only for these types */
+#define CLONE_SUBNS (CLONE_NEWNS)
+
+struct ns_desc {
+ unsigned int cflag;
+ char *str;
+ size_t len;
+};
+
+enum ns_type {
+ NS_UNKNOWN = 0,
+ NS_CRIU,
+ NS_ROOT,
+ NS_OTHER,
+};
+
+struct ns_id {
+ unsigned int kid;
+ unsigned int id;
+ pid_t ns_pid;
+ struct ns_desc *nd;
+ struct ns_id *next;
+ enum ns_type type;
+
+ /*
+ * For mount namespaces on restore -- indicates that
+ * the namespace in question is created (all mounts
+ * are mounted) and other tasks may do setns on it
+ * and proceed.
+ */
+ futex_t ns_populated;
+
+ union {
+ struct {
+ struct mount_info *mntinfo_list;
+ struct mount_info *mntinfo_tree;
+ int ns_fd;
+ int root_fd;
+ } mnt;
+
+ struct {
+ int nlsk; /* for sockets collection */
+ int seqsk; /* to talk to parasite daemons */
+ } net;
+ };
+};
+extern struct ns_id *ns_ids;
+
+#define NS_DESC_ENTRY(_cflag, _str) \
+ { \
+ .cflag = _cflag, \
+ .str = _str, \
+ .len = sizeof(_str) - 1, \
+ }
+
+extern bool check_ns_proc(struct fd_link *link);
+
+extern struct ns_desc pid_ns_desc;
+extern struct ns_desc user_ns_desc;
+extern unsigned long root_ns_mask;
+
+extern const struct fdtype_ops nsfile_dump_ops;
+extern struct collect_image_info nsfile_cinfo;
+
+extern int walk_namespaces(struct ns_desc *nd, int (*cb)(struct ns_id *, void *), void *oarg);
+extern int collect_namespaces(bool for_dump);
+extern int collect_mnt_namespaces(bool for_dump);
+extern int dump_mnt_namespaces(void);
+extern int dump_namespaces(struct pstree_item *item, unsigned int ns_flags);
+extern int prepare_namespace_before_tasks(void);
+extern int prepare_namespace(struct pstree_item *item, unsigned long clone_flags);
+extern int try_show_namespaces(int pid);
+
+extern int switch_ns(int pid, struct ns_desc *nd, int *rst);
+extern int restore_ns(int rst, struct ns_desc *nd);
+
+extern int dump_task_ns_ids(struct pstree_item *);
+extern int predump_task_ns_ids(struct pstree_item *);
+extern struct ns_id *rst_new_ns_id(unsigned int id, pid_t pid, struct ns_desc *nd, enum ns_type t);
+extern int rst_add_ns_id(unsigned int id, struct pstree_item *, struct ns_desc *nd);
+extern struct ns_id *lookup_ns_by_id(unsigned int id, struct ns_desc *nd);
+
+extern int collect_user_namespaces(bool for_dump);
+extern int prepare_userns(struct pstree_item *item);
+extern int stop_usernsd(void);
+extern int userns_uid(int uid);
+extern int userns_gid(int gid);
+extern int dump_user_ns(pid_t pid, int ns_id);
+extern void free_userns_maps(void);
+
+typedef int (*uns_call_t)(void *arg, int fd, pid_t pid);
+/*
+ * Async call -- The call is guaranteed to be done by the time
+ * CR_STATE_COMPLETE happens. The function may return even
+ * before the call starts.
+ * W/o flag the call is synchronous -- this function returns
+ * strictly after the call finishes.
+ */
+#define UNS_ASYNC 0x1
+/*
+ * The call returns an FD which should be sent back. Conflicts
+ * with UNS_ASYNC.
+ */
+#define UNS_FDOUT 0x2
+
+#define MAX_UNSFD_MSG_SIZE 4096
+
+/*
+ * When we're restoring inside user namespace, some things are
+ * not allowed to be done there due to insufficient capabilities.
+ * If the operation in question can be offloaded to another process,
+ * this call allows to do that.
+ *
+ * In case we're not in userns, just call the callback immediately
+ * in the context of the calling task.
+ */
+extern int __userns_call(const char *func_name, uns_call_t call, int flags,
+ void *arg, size_t arg_size, int fd);
+
+#define userns_call(__call, __flags, __arg, __arg_size, __fd) \
+ __userns_call(__stringify(__call), __call, __flags, \
+ __arg, __arg_size, __fd)
+
+#endif /* __CR_NS_H__ */
diff --git a/criu/include/net.h b/criu/include/net.h
new file mode 100644
index 000000000000..900b1365634e
--- /dev/null
+++ b/criu/include/net.h
@@ -0,0 +1,33 @@
+#ifndef __CR_NET_H__
+#define __CR_NET_H__
+
+#include "list.h"
+
+struct cr_imgset;
+extern int dump_net_ns(int ns_id);
+extern int prepare_net_ns(int pid);
+extern int netns_keep_nsfd(void);
+
+struct veth_pair {
+ struct list_head node;
+ char *inside;
+ char *outside;
+ char *bridge;
+};
+
+extern int collect_net_namespaces(bool for_dump);
+
+extern int network_lock(void);
+extern void network_unlock(void);
+
+extern struct ns_desc net_ns_desc;
+
+#include "protobuf/netdev.pb-c.h"
+extern int write_netdev_img(NetDeviceEntry *nde, struct cr_imgset *fds);
+extern int read_ns_sys_file(char *path, char *buf, int len);
+extern int restore_link_parms(NetDeviceEntry *nde, int nlsk);
+
+extern int veth_pair_add(char *in, char *out);
+extern int move_veth_to_bridge(void);
+
+#endif /* __CR_NET_H__ */
diff --git a/criu/include/netfilter.h b/criu/include/netfilter.h
new file mode 100644
index 000000000000..f3667fc81ea4
--- /dev/null
+++ b/criu/include/netfilter.h
@@ -0,0 +1,11 @@
+#ifndef __CR_NETFILTER_H__
+#define __CR_NETFILTER_H__
+
+struct inet_sk_desc;
+extern int nf_lock_connection(struct inet_sk_desc *);
+extern int nf_unlock_connection(struct inet_sk_desc *);
+
+struct inet_sk_info;
+extern int nf_unlock_connection_info(struct inet_sk_info *);
+
+#endif /* __CR_NETFILTER_H__ */
diff --git a/criu/include/netlink_diag.h b/criu/include/netlink_diag.h
new file mode 100644
index 000000000000..14ca403b8b3d
--- /dev/null
+++ b/criu/include/netlink_diag.h
@@ -0,0 +1,42 @@
+#ifndef __CR_NETLINK_DIAG_H__
+#define __CR_NETLINK_DIAG_H__
+
+#include <linux/types.h>
+
+struct netlink_diag_req {
+ __u8 sdiag_family;
+ __u8 sdiag_protocol;
+ __u16 pad;
+ __u32 ndiag_ino;
+ __u32 ndiag_show;
+ __u32 ndiag_cookie[2];
+};
+
+struct netlink_diag_msg {
+ __u8 ndiag_family;
+ __u8 ndiag_type;
+ __u8 ndiag_protocol;
+ __u8 ndiag_state;
+
+ __u32 ndiag_portid;
+ __u32 ndiag_dst_portid;
+ __u32 ndiag_dst_group;
+ __u32 ndiag_ino;
+ __u32 ndiag_cookie[2];
+};
+
+enum {
+ NETLINK_DIAG_MEMINFO,
+ NETLINK_DIAG_GROUPS,
+
+ __NETLINK_DIAG_MAX,
+};
+
+#define NETLINK_DIAG_MAX (__NETLINK_DIAG_MAX - 1)
+
+#define NDIAG_PROTO_ALL ((__u8) ~0)
+
+#define NDIAG_SHOW_MEMINFO 0x00000001 /* show memory info of a socket */
+#define NDIAG_SHOW_GROUPS 0x00000002 /* show groups of a netlink socket */
+
+#endif /* __CR_NETLINK_DIAG_H__ */
diff --git a/criu/include/packet_diag.h b/criu/include/packet_diag.h
new file mode 100644
index 000000000000..e5d9193a8c42
--- /dev/null
+++ b/criu/include/packet_diag.h
@@ -0,0 +1,76 @@
+#ifndef __CR_PACKET_DIAG_H__
+#define __CR_PACKET_DIAG_H__
+
+#include <linux/types.h>
+
+struct packet_diag_req {
+ __u8 sdiag_family;
+ __u8 sdiag_protocol;
+ __u16 pad;
+ __u32 pdiag_ino;
+ __u32 pdiag_show;
+ __u32 pdiag_cookie[2];
+};
+
+#define PACKET_SHOW_INFO 0x00000001 /* Basic packet_sk information */
+#define PACKET_SHOW_MCLIST 0x00000002 /* A set of packet_diag_mclist-s */
+#define PACKET_SHOW_RING_CFG 0x00000004 /* Rings configuration parameters */
+#define PACKET_SHOW_FANOUT 0x00000008
+
+struct packet_diag_msg {
+ __u8 pdiag_family;
+ __u8 pdiag_type;
+ __u16 pdiag_num;
+
+ __u32 pdiag_ino;
+ __u32 pdiag_cookie[2];
+};
+
+enum {
+ PACKET_DIAG_INFO,
+ PACKET_DIAG_MCLIST,
+ PACKET_DIAG_RX_RING,
+ PACKET_DIAG_TX_RING,
+ PACKET_DIAG_FANOUT,
+
+ PACKET_DIAG_MAX,
+};
+
+struct packet_diag_info {
+ __u32 pdi_index;
+ __u32 pdi_version;
+ __u32 pdi_reserve;
+ __u32 pdi_copy_thresh;
+ __u32 pdi_tstamp;
+ __u32 pdi_flags;
+
+#define PDI_RUNNING 0x1
+#define PDI_AUXDATA 0x2
+#define PDI_ORIGDEV 0x4
+#define PDI_VNETHDR 0x8
+#define PDI_LOSS 0x10
+};
+
+#ifndef MAX_ADDR_LEN
+#define MAX_ADDR_LEN 32
+#endif
+
+struct packet_diag_mclist {
+ __u32 pdmc_index;
+ __u32 pdmc_count;
+ __u16 pdmc_type;
+ __u16 pdmc_alen;
+ __u8 pdmc_addr[MAX_ADDR_LEN];
+};
+
+struct packet_diag_ring {
+ __u32 pdr_block_size;
+ __u32 pdr_block_nr;
+ __u32 pdr_frame_size;
+ __u32 pdr_frame_nr;
+ __u32 pdr_retire_tmo;
+ __u32 pdr_sizeof_priv;
+ __u32 pdr_features;
+};
+
+#endif /* __CR_PACKET_DIAG_H__ */
diff --git a/criu/include/page-pipe.h b/criu/include/page-pipe.h
new file mode 100644
index 000000000000..a2dc26852dd8
--- /dev/null
+++ b/criu/include/page-pipe.h
@@ -0,0 +1,107 @@
+#ifndef __CR_PAGE_PIPE_H__
+#define __CR_PAGE_PIPE_H__
+
+#include <sys/uio.h>
+#include "list.h"
+
+/*
+ * page_pipe is a descriptor of task's virtual memory
+ * with pipes, containing pages.
+ *
+ * A page-pipe may contain holes -- these are pagemap
+ * entries without pages. Holes are stored in separate
+ * array to optimize paged iovs feed into vmsplice --
+ * they will be sent there in one go.
+ *
+ * A hole is a pagemap entry that doesn't have pages
+ * in it, since they are present in previous (parent)
+ * snapshot.
+ *
+ *
+ * This page-pipe vs holes vs task vmem vs image layout
+ * is described below.
+ *
+ * Task memory: (+ present, - not present pages)
+ * 0 0 0 0 1 1 1
+ * 0 3 6 B 1 8 C
+ * ---+++-----++++++-------++++----
+ *
+ * Page-pipe iovs:
+ *
+ * bufs = 03:3,0B:6,18:4
+ * holes = <empty>
+ *
+ * The pagemap.img would purely contain page-pipe bufs.
+ *
+ * Pages image will contain pages at
+ *
+ * 03,04,05,0B,0C,0D,0E,0F,10,18,19,1A,1B
+ *
+ * stored one by one.
+ *
+ * Now let's imagine task touches some pages and its mem
+ * looks like: (+ present, = old present, - non present)
+ *
+ * 0 0 0 0 11 11 1
+ * 0 3 6 B 12 78 C
+ * ---==+-----====+++-----++===----
+ *
+ * (note new pages at 11 and 17 vaddrs)
+ *
+ * The new --snapshot'ed page-pipe would look like
+ *
+ * bufs = 05:1,0F:3,17:2
+ * holes = 03:2,0B:4,19:3
+ *
+ * So the pagemap.img would look like
+ *
+ * 03:2:P,05:1,0B:4:P,0F:3,17:2,19:3:P
+ *
+ * (the page_xfer_dump_pages generates one)
+ *
+ * where P means "in parent", i.e. respective pages should
+ * be looked up in the parent pagemap (not pages.img, but
+ * the pagemap, and then the offset in previous pages.img
+ * should be calculated, see the read_pagemap_page routine).
+ *
+ * New pages.img file would contain only pages for
+ *
+ * 05,0F,10,11,17,18
+ */
+
+struct page_pipe_buf {
+ int p[2]; /* pipe with pages */
+ unsigned int pipe_size; /* how many pages can be fit into pipe */
+ unsigned int pages_in; /* how many pages are there */
+ unsigned int nr_segs; /* how many iov-s are busy */
+ struct iovec *iov; /* vaddr:len map */
+ struct list_head l; /* links into page_pipe->bufs */
+};
+
+struct page_pipe {
+ unsigned int nr_pipes; /* how many page_pipe_bufs in there */
+ struct list_head bufs; /* list of bufs */
+ struct list_head free_bufs; /* list of bufs */
+ unsigned int nr_iovs; /* number of iovs */
+ unsigned int free_iov; /* first free iov */
+ struct iovec *iovs; /* iovs. They are provided into create_page_pipe
+ and all bufs have their iov-s in there */
+
+ unsigned int nr_holes; /* number of holes allocated */
+ unsigned int free_hole; /* number of holes in use */
+ struct iovec *holes; /* holes */
+
+ bool chunk_mode; /* Restrict the maximum buffer size of pipes
+ and dump memory for a few iterations */
+};
+
+extern struct page_pipe *create_page_pipe(unsigned int nr,
+ struct iovec *, bool chunk_mode);
+extern void destroy_page_pipe(struct page_pipe *p);
+extern int page_pipe_add_page(struct page_pipe *p, unsigned long addr);
+extern int page_pipe_add_hole(struct page_pipe *p, unsigned long addr);
+
+extern void debug_show_page_pipe(struct page_pipe *pp);
+void page_pipe_reinit(struct page_pipe *pp);
+
+#endif /* __CR_PAGE_PIPE_H__ */
diff --git a/criu/include/page-read.h b/criu/include/page-read.h
new file mode 100644
index 000000000000..827e4acd5d47
--- /dev/null
+++ b/criu/include/page-read.h
@@ -0,0 +1,90 @@
+#ifndef __CR_PAGE_READ_H__
+#define __CR_PAGE_READ_H__
+
+#include "protobuf/pagemap.pb-c.h"
+
+/*
+ * page_read -- engine, that reads pages from image file(s)
+ *
+ * Several page-read's can be arranged in a chain to read
+ * pages from a series of snapshot.
+ *
+ * A task's address space vs pagemaps+page image pairs can
+ * look like this (taken from comment in page-pipe.h):
+ *
+ * task:
+ *
+ * 0 0 0 0 1 1 1
+ * 0 3 6 B 2 7 C
+ * ---+++-----+++++++-----+++++----
+ * pm1: ---+++-----++++++-------++++----
+ * pm2: ---==+-----====+++-----++===----
+ *
+ * Here + is present page, - is not present, = is present,
+ * but is not modified since the last snapshot.
+ *
+ * Thus pagemap.img and pages.img entries are
+ *
+ * pm1: 03:3,0B:6,18:4
+ * pm2: 03:2:P,05:1,0B:4:P,0F:3,17:2,19:3:P
+ *
+ * where P means "page is in parent pagemap".
+ *
+ * pg1: 03,04,05,0B,0C,0D,0E,0F,10,18,19,1A,1B
+ * pg2: 05,0F,10,11,17,18
+ *
+ * When trying to restore from these 4 files we'd have
+ * to carefully scan pagemap.img's one by one and read or
+ * skip pages from pages.img where appropriate.
+ *
+ * All this is implemented in read_pagemap_page.
+ */
+
+struct page_read {
+ /*
+ * gets next vaddr:len pair to work on.
+ * Pagemap entries should be returned in sorted order.
+ */
+ int (*get_pagemap)(struct page_read *, struct iovec *iov);
+ /* reads page from current pagemap */
+ int (*read_pages)(struct page_read *, unsigned long vaddr, int nr, void *);
+ /* stop working on current pagemap */
+ void (*put_pagemap)(struct page_read *);
+ void (*close)(struct page_read *);
+
+ /* Private data of reader */
+ struct cr_img *pmi;
+ struct cr_img *pi;
+
+ PagemapEntry *pe; /* current pagemap we are on */
+ struct page_read *parent; /* parent pagemap (if ->in_parent
+ pagemap is met in image, then
+ go to this guy for page, see
+ read_pagemap_page */
+ unsigned long cvaddr; /* vaddr we are on */
+
+ struct iovec bunch; /* record consecutive neighbour
+ iovecs to punch together */
+ unsigned id; /* for logging */
+};
+
+#define PR_SHMEM 0x1
+#define PR_TASK 0x2
+
+#define PR_TYPE_MASK 0x3
+#define PR_MOD 0x4 /* Will need to modify */
+
+/*
+ * -1 -- error
+ * 0 -- no images
+ * 1 -- opened
+ */
+extern int open_page_read(int pid, struct page_read *, int pr_flags);
+extern int open_page_read_at(int dfd, int pid, struct page_read *pr, int pr_flags);
+extern void pagemap2iovec(PagemapEntry *pe, struct iovec *iov);
+extern void iovec2pagemap(struct iovec *iov, PagemapEntry *pe);
+extern int seek_pagemap_page(struct page_read *pr, unsigned long vaddr, bool warn);
+
+extern int dedup_one_iovec(struct page_read *pr, struct iovec *iov);
+extern int punch_hole(struct page_read *pr, unsigned long off, unsigned long len, bool cleanup);
+#endif /* __CR_PAGE_READ_H__ */
diff --git a/criu/include/page-xfer.h b/criu/include/page-xfer.h
new file mode 100644
index 000000000000..8492daaff974
--- /dev/null
+++ b/criu/include/page-xfer.h
@@ -0,0 +1,47 @@
+#ifndef __CR_PAGE_XFER__H__
+#define __CR_PAGE_XFER__H__
+#include "page-read.h"
+
+extern int cr_page_server(bool daemon_mode, int cfd);
+
+/*
+ * page_xfer -- transfer pages into image file.
+ * Two images backends are implemented -- local image file
+ * and page-server image file.
+ */
+
+struct page_xfer {
+ /* transfers one vaddr:len entry */
+ int (*write_pagemap)(struct page_xfer *self, struct iovec *iov);
+ /* transfers pages related to previous pagemap */
+ int (*write_pages)(struct page_xfer *self, int pipe, unsigned long len);
+ /* transfers one hole -- vaddr:len entry w/o pages */
+ int (*write_hole)(struct page_xfer *self, struct iovec *iov);
+ void (*close)(struct page_xfer *self);
+
+ /* private data for every page-xfer engine */
+ union {
+ struct /* local */ {
+ struct cr_img *pmi; /* pagemaps */
+ struct cr_img *pi; /* pages */
+ };
+
+ struct /* page-server */ {
+ int sk;
+ u64 dst_id;
+ };
+ };
+
+ struct page_read *parent;
+};
+
+extern int open_page_xfer(struct page_xfer *xfer, int fd_type, long id);
+struct page_pipe;
+extern int page_xfer_dump_pages(struct page_xfer *, struct page_pipe *,
+ unsigned long off);
+extern int connect_to_page_server(void);
+extern int disconnect_from_page_server(void);
+
+extern int check_parent_page_xfer(int fd_type, long id);
+
+#endif /* __CR_PAGE_XFER__H__ */
diff --git a/criu/include/pagemap-cache.h b/criu/include/pagemap-cache.h
new file mode 100644
index 000000000000..e0880906d74f
--- /dev/null
+++ b/criu/include/pagemap-cache.h
@@ -0,0 +1,30 @@
+#ifndef __CR_PAGEMAP_H__
+#define __CR_PAGEMAP_H__
+
+#include <sys/types.h>
+#include "asm/page.h"
+#include "asm/int.h"
+
+#include "list.h"
+
+struct vma_area;
+
+#define PAGEMAP_PFN_OFF(addr) (PAGE_PFN(addr) * sizeof(u64))
+
+typedef struct {
+ pid_t pid; /* which process it belongs */
+ unsigned long start; /* start of area */
+ unsigned long end; /* end of area */
+ const struct list_head *vma_head; /* list head of VMAs we're serving */
+ u64 *map; /* local buffer */
+ size_t map_len; /* length of a buffer */
+ int fd; /* file to read PMs from */
+} pmc_t;
+
+#define PMC_INIT (pmc_t){ }
+
+extern int pmc_init(pmc_t *pmc, pid_t pid, const struct list_head *vma_head, size_t size);
+extern u64 *pmc_get_map(pmc_t *pmc, const struct vma_area *vma);
+extern void pmc_fini(pmc_t *pmc);
+
+#endif /* __CR_PAGEMAP_H__ */
diff --git a/criu/include/parasite-syscall.h b/criu/include/parasite-syscall.h
new file mode 100644
index 000000000000..57612df7478e
--- /dev/null
+++ b/criu/include/parasite-syscall.h
@@ -0,0 +1,139 @@
+#ifndef __CR_PARASITE_SYSCALL_H__
+#define __CR_PARASITE_SYSCALL_H__
+
+#include "asm/types.h"
+#include "pid.h"
+#include "list.h"
+#include "config.h"
+
+#define BUILTIN_SYSCALL_SIZE 8
+
+struct parasite_dump_thread;
+struct parasite_dump_misc;
+struct parasite_drain_fd;
+struct vm_area_list;
+struct pstree_item;
+struct _CredsEntry;
+struct _CoreEntry;
+struct list_head;
+struct cr_imgset;
+struct fd_opts;
+struct pid;
+
+struct thread_ctx {
+ k_rtsigset_t sigmask;
+ user_regs_struct_t regs;
+};
+
+/* parasite control block */
+struct parasite_ctl {
+ struct pid pid;
+ void *remote_map;
+ void *local_map;
+ void *sigreturn_addr; /* A place for the breakpoint */
+ unsigned long map_length;
+
+ /* thread leader data */
+ bool daemonized;
+
+ struct thread_ctx orig;
+
+ void *rstack; /* thread leader stack*/
+ struct rt_sigframe *sigframe;
+ struct rt_sigframe *rsigframe; /* address in a parasite */
+
+ void *r_thread_stack; /* stack for non-leader threads */
+
+ unsigned long parasite_ip; /* service routine start ip */
+ unsigned long syscall_ip; /* entry point of infection */
+
+ unsigned int *addr_cmd; /* addr for command */
+ void *addr_args; /* address for arguments */
+ unsigned long args_size;
+ int tsock; /* transport socket for transferring fds */
+
+ struct list_head pre_list;
+ struct page_pipe *mem_pp;
+};
+
+extern int parasite_dump_sigacts_seized(struct parasite_ctl *ctl, struct cr_imgset *cr_imgset);
+extern int parasite_dump_itimers_seized(struct parasite_ctl *ctl, struct pstree_item *);
+
+struct proc_posix_timers_stat;
+extern int parasite_dump_posix_timers_seized(struct proc_posix_timers_stat *proc_args,
+ struct parasite_ctl *ctl, struct pstree_item *);
+
+#define parasite_args(ctl, type) \
+ ({ \
+ BUILD_BUG_ON(sizeof(type) > PARASITE_ARG_SIZE_MIN); \
+ ctl->addr_args; \
+ })
+
+extern void *parasite_args_s(struct parasite_ctl *ctl, int args_size);
+extern int parasite_send_fd(struct parasite_ctl *ctl, int fd);
+
+/*
+ * Execute a command in parasite when it's in daemon mode.
+ * The __-ed version is asynchronous (doesn't wait for ack).
+ */
+extern int parasite_execute_daemon(unsigned int cmd, struct parasite_ctl *ctl);
+extern int __parasite_execute_daemon(unsigned int cmd, struct parasite_ctl *ctl);
+
+extern int __parasite_wait_daemon_ack(unsigned int cmd,
+ struct parasite_ctl *ctl);
+
+extern int parasite_dump_misc_seized(struct parasite_ctl *ctl, struct parasite_dump_misc *misc);
+extern int parasite_dump_creds(struct parasite_ctl *ctl, struct _CredsEntry *ce);
+extern int parasite_dump_thread_leader_seized(struct parasite_ctl *ctl, int pid, struct _CoreEntry *core);
+extern int parasite_dump_thread_seized(struct parasite_ctl *ctl, int id,
+ struct pid *tid, struct _CoreEntry *core);
+extern int dump_thread_core(int pid, CoreEntry *core, const struct parasite_dump_thread *dt);
+
+extern int parasite_drain_fds_seized(struct parasite_ctl *ctl,
+ struct parasite_drain_fd *dfds,
+ int *lfds, struct fd_opts *flags);
+extern int parasite_get_proc_fd_seized(struct parasite_ctl *ctl);
+
+extern int parasite_cure_remote(struct parasite_ctl *ctl);
+extern int parasite_cure_local(struct parasite_ctl *ctl);
+extern int parasite_cure_seized(struct parasite_ctl *ctl);
+extern struct parasite_ctl *parasite_infect_seized(pid_t pid,
+ struct pstree_item *item,
+ struct vm_area_list *vma_area_list);
+extern void parasite_ensure_args_size(unsigned long sz);
+extern struct parasite_ctl *parasite_prep_ctl(pid_t pid,
+ struct vm_area_list *vma_area_list);
+extern int parasite_map_exchange(struct parasite_ctl *ctl, unsigned long size);
+
+extern struct parasite_tty_args *parasite_dump_tty(struct parasite_ctl *ctl, int fd, int type);
+
+extern int parasite_init_threads_seized(struct parasite_ctl *ctl, struct pstree_item *item);
+extern int parasite_fini_threads_seized(struct parasite_ctl *ctl);
+
+extern int syscall_seized(struct parasite_ctl *ctl, int nr, unsigned long *ret,
+ unsigned long arg1, unsigned long arg2,
+ unsigned long arg3, unsigned long arg4,
+ unsigned long arg5, unsigned long arg6);
+
+extern int __parasite_execute_syscall(struct parasite_ctl *ctl,
+ user_regs_struct_t *regs);
+extern bool arch_can_dump_task(pid_t pid);
+
+/*
+ * The PTRACE_SYSCALL will trap task twice -- on
+ * enter into and on exit from syscall. If we trace
+ * a single task, we may skip half of all getregs
+ * calls -- on exit we don't need them.
+ */
+enum trace_flags {
+ TRACE_ALL,
+ TRACE_ENTER,
+ TRACE_EXIT,
+};
+
+extern int parasite_stop_daemon(struct parasite_ctl *ctl);
+extern int parasite_stop_on_syscall(int tasks, int sys_nr, enum trace_flags trace);
+extern int parasite_unmap(struct parasite_ctl *ctl, unsigned long addr);
+extern int ptrace_stop_pie(pid_t pid, void *addr, enum trace_flags *tf);
+
+#endif /* __CR_PARASITE_SYSCALL_H__ */
diff --git a/criu/include/parasite-vdso.h b/criu/include/parasite-vdso.h
new file mode 100644
index 000000000000..d4dc89b47ade
--- /dev/null
+++ b/criu/include/parasite-vdso.h
@@ -0,0 +1,93 @@
+#ifndef __CR_PARASITE_VDSO_H__
+#define __CR_PARASITE_VDSO_H__
+
+#include "config.h"
+
+#ifdef CONFIG_VDSO
+
+#include "util-vdso.h"
+#include "protobuf/vma.pb-c.h"
+
+struct parasite_ctl;
+struct vm_area_list;
+
+/* Check if symbol present in symtable */
+static inline bool vdso_symbol_empty(struct vdso_symbol *s)
+{
+ return s->offset == VDSO_BAD_ADDR && s->name[0] == '\0';
+}
+
+/*
+ * Special mark which allows to identify runtime vdso where
+ * calls from proxy vdso are redirected. This mark is usually
+ * placed at the start of vdso area where Elf header lives.
+ * Since such runtime vdso is solely used by proxy and
+ * nobody else is supposed to access it, it's more or less
+ * safe to overwrite the Elf header with @signature and
+ * @proxy_vdso_addr.
+ *
+ * The @proxy_vdso_addr deserves a few comments. When we redirect
+ * the calls from proxy to runtime vdso, on next checkpoint
+ * it won't be possible to find which VMA is proxy, thus
+ * we save its address in this member.
+ */
+struct vdso_mark {
+ u64 signature;
+ unsigned long proxy_vdso_addr;
+
+ unsigned long version;
+
+ /*
+ * In case of new vDSO format the VVAR area address is
+ * needed for easier discovering where it lives without
+ * relying on procfs output.
+ */
+ unsigned long proxy_vvar_addr;
+};
+
+#define VDSO_MARK_SIGNATURE (0x6f73647675697263ULL) /* Magic number (criuvdso) */
+#define VDSO_MARK_SIGNATURE_V2 (0x4f53447675697263ULL) /* Magic number (criuvDSO) */
+#define VDSO_MARK_CUR_VERSION (2)
+
+static inline void vdso_put_mark(void *where, unsigned long proxy_vdso_addr, unsigned long proxy_vvar_addr)
+{
+ struct vdso_mark *m = where;
+
+ m->signature = VDSO_MARK_SIGNATURE_V2;
+ m->proxy_vdso_addr = proxy_vdso_addr;
+ m->version = VDSO_MARK_CUR_VERSION;
+ m->proxy_vvar_addr = proxy_vvar_addr;
+}
+
+static inline bool is_vdso_mark(void *addr)
+{
+ struct vdso_mark *m = addr;
+
+ if (m->signature == VDSO_MARK_SIGNATURE_V2) {
+ /*
+ * New format
+ */
+ return true;
+ } else if (m->signature == VDSO_MARK_SIGNATURE) {
+ /*
+ * Old format -- simply extend the mark up
+ * to the version we support.
+ */
+ vdso_put_mark(m, m->proxy_vdso_addr, VVAR_BAD_ADDR);
+ return true;
+ }
+ return false;
+}
+
+extern int vdso_do_park(struct vdso_symtable *sym_rt, unsigned long park_at, unsigned long park_size);
+extern int vdso_fill_symtable(char *mem, size_t size, struct vdso_symtable *t);
+extern int vdso_proxify(char *who, struct vdso_symtable *sym_rt,
+ unsigned long vdso_rt_parked_at, size_t index,
+ VmaEntry *vmas, size_t nr_vmas);
+
+#else /* CONFIG_VDSO */
+#define vdso_do_park(sym_rt, park_at, park_size) (0)
+
+#endif /* CONFIG_VDSO */
+
+#endif /* __CR_PARASITE_VDSO_H__ */
diff --git a/criu/include/parasite.h b/criu/include/parasite.h
new file mode 100644
index 000000000000..063903b84874
--- /dev/null
+++ b/criu/include/parasite.h
@@ -0,0 +1,253 @@
+#ifndef __CR_PARASITE_H__
+#define __CR_PARASITE_H__
+
+#define PARASITE_STACK_SIZE (16 << 10)
+#define PARASITE_ARG_SIZE_MIN ( 1 << 12)
+
+#define PARASITE_MAX_SIZE (64 << 10)
+
+#ifndef __ASSEMBLY__
+
+#include <sys/un.h>
+#include <sys/time.h>
+#include <time.h>
+#include <signal.h>
+
+#include "image.h"
+#include "util-pie.h"
+
+#include "protobuf/vma.pb-c.h"
+#include "protobuf/tty.pb-c.h"
+
+#define __head __used __section(.head.text)
+
+enum {
+ PARASITE_CMD_IDLE = 0,
+ PARASITE_CMD_ACK,
+
+ PARASITE_CMD_INIT_DAEMON,
+ PARASITE_CMD_DUMP_THREAD,
+ PARASITE_CMD_UNMAP,
+
+ /*
+ * These two must be greater than INITs.
+ */
+ PARASITE_CMD_DAEMONIZED,
+
+ PARASITE_CMD_FINI,
+
+ PARASITE_CMD_MPROTECT_VMAS,
+ PARASITE_CMD_DUMPPAGES,
+
+ PARASITE_CMD_DUMP_SIGACTS,
+ PARASITE_CMD_DUMP_ITIMERS,
+ PARASITE_CMD_DUMP_POSIX_TIMERS,
+ PARASITE_CMD_DUMP_MISC,
+ PARASITE_CMD_DRAIN_FDS,
+ PARASITE_CMD_GET_PROC_FD,
+ PARASITE_CMD_DUMP_TTY,
+ PARASITE_CMD_CHECK_VDSO_MARK,
+ PARASITE_CMD_CHECK_AIOS,
+
+ PARASITE_CMD_MAX,
+};
+
+struct ctl_msg {
+ unsigned int cmd; /* command itself */
+ unsigned int ack; /* ack on command */
+ int err; /* error code on reply */
+};
+
+#define ctl_msg_cmd(_cmd) \
+ (struct ctl_msg){.cmd = _cmd, }
+
+#define ctl_msg_ack(_cmd, _err) \
+ (struct ctl_msg){.cmd = _cmd, .ack = _cmd, .err = _err, }
+
+struct parasite_init_args {
+ int h_addr_len;
+ struct sockaddr_un h_addr;
+
+ int log_level;
+
+ struct rt_sigframe *sigframe;
+
+ void *sigreturn_addr;
+};
+
+struct parasite_unmap_args {
+ void *parasite_start;
+ unsigned long parasite_len;
+};
+
+struct parasite_vma_entry
+{
+ unsigned long start;
+ unsigned long len;
+ int prot;
+};
+
+struct parasite_vdso_vma_entry {
+ unsigned long start;
+ unsigned long len;
+ unsigned long proxy_vdso_addr;
+ unsigned long proxy_vvar_addr;
+ int is_marked;
+ bool try_fill_symtable;
+ bool is_vdso;
+};
+
+struct parasite_dump_pages_args {
+ unsigned int nr_vmas;
+ unsigned int add_prot;
+ unsigned int off;
+ unsigned int nr_segs;
+ unsigned int nr_pages;
+};
+
+static inline struct parasite_vma_entry *pargs_vmas(struct parasite_dump_pages_args *a)
+{
+ return (struct parasite_vma_entry *)(a + 1);
+}
+
+static inline struct iovec *pargs_iovs(struct parasite_dump_pages_args *a)
+{
+ return (struct iovec *)(pargs_vmas(a) + a->nr_vmas);
+}
+
+struct parasite_dump_sa_args {
+ rt_sigaction_t sas[SIGMAX];
+};
+
+struct parasite_dump_itimers_args {
+ struct itimerval real;
+ struct itimerval virt;
+ struct itimerval prof;
+};
+
+struct posix_timer {
+ int it_id;
+ struct itimerspec val;
+ int overrun;
+};
+
+struct parasite_dump_posix_timers_args {
+ int timer_n;
+ struct posix_timer timer[0];
+};
+
+struct parasite_aio {
+ unsigned long ctx;
+ unsigned int max_reqs;
+ unsigned int *vma_nr_reqs;
+};
+
+struct parasite_check_aios_args {
+ unsigned nr_rings;
+ struct parasite_aio ring[0];
+};
+
+static inline int posix_timers_dump_size(int timer_n)
+{
+ return sizeof(int) + sizeof(struct posix_timer) * timer_n;
+}
+
+/*
+ * Misc stuff, that is too small for separate file, but cannot
+ * be read w/o using parasite
+ */
+
+struct parasite_dump_misc {
+ unsigned long brk;
+
+ u32 pid;
+ u32 sid;
+ u32 pgid;
+ u32 umask;
+
+ int dumpable;
+};
+
+/*
+ * Calculate how long we can make the groups array in parasite_dump_creds
+ * and still fit the struct in one page
+ */
+#define PARASITE_MAX_GROUPS \
+ ((PAGE_SIZE - sizeof(struct parasite_dump_thread) - \
+ offsetof(struct parasite_dump_creds, groups)) / sizeof(unsigned int)) /* groups */
+
+struct parasite_dump_creds {
+ unsigned int cap_last_cap;
+
+ u32 cap_inh[CR_CAP_SIZE];
+ u32 cap_prm[CR_CAP_SIZE];
+ u32 cap_eff[CR_CAP_SIZE];
+ u32 cap_bnd[CR_CAP_SIZE];
+
+ int uids[4];
+ int gids[4];
+ unsigned int secbits;
+ unsigned int ngroups;
+ /*
+ * FIXME -- this structure is passed to parasite code
+ * through parasite args area so in parasite_dump_creds()
+ * call we check for size of this data fits the size of
+ * the area. Unfortunately, we _actually_ use more bytes
+ * than the sizeof() -- we put PARASITE_MAX_GROUPS int-s
+ * in there, so the size check is not correct.
+ *
+ * However, all this works simply because we make sure
+ * the PARASITE_MAX_GROUPS is so, that the total amount
+ * of memory in use doesn't exceed the PAGE_SIZE and the
+ * args area is at least one page (PARASITE_ARG_SIZE_MIN).
+ */
+ unsigned int groups[0];
+};
+
+struct parasite_dump_thread {
+ unsigned int *tid_addr;
+ pid_t tid;
+ tls_t tls;
+ stack_t sas;
+ int pdeath_sig;
+ struct parasite_dump_creds creds[0];
+};
+
+static inline void copy_sas(ThreadSasEntry *dst, const stack_t *src)
+{
+ dst->ss_sp = encode_pointer(src->ss_sp);
+ dst->ss_size = (u64)src->ss_size;
+ dst->ss_flags = src->ss_flags;
+}
+
+#define PARASITE_MAX_FDS (PAGE_SIZE / sizeof(int))
+
+struct parasite_drain_fd {
+ int nr_fds;
+ int fds[PARASITE_MAX_FDS];
+};
+
+static inline int drain_fds_size(struct parasite_drain_fd *dfds)
+{
+ return sizeof(dfds->nr_fds) + dfds->nr_fds * sizeof(dfds->fds[0]);
+}
+
+struct parasite_tty_args {
+ int fd;
+ int type;
+
+ int sid;
+ int pgrp;
+ bool hangup;
+
+ int st_pckt;
+ int st_lock;
+ int st_excl;
+};
+
+/* the parasite prefix is added by gen_offsets.sh */
+#define parasite_sym(pblob, name) ((void *)(pblob) + parasite_blob_offset__##name)
+
+#endif /* !__ASSEMBLY__ */
+
+#endif /* __CR_PARASITE_H__ */
diff --git a/criu/include/pid.h b/criu/include/pid.h
new file mode 100644
index 000000000000..d073944cea7d
--- /dev/null
+++ b/criu/include/pid.h
@@ -0,0 +1,32 @@
+#ifndef __CR_PID_H__
+#define __CR_PID_H__
+
+#include "stdbool.h"
+
+struct pid {
+ /*
+ * The @real pid is used to fetch tasks during the dumping stage.
+ * This is a global pid seen from the context where the dumping
+ * is running.
+ */
+ pid_t real;
+
+ /*
+ * The @virt pid is the one used in the image itself and keeps
+ * the pid value to be restored. This pid is fetched from the
+ * dumpee context, because the dumpee might have its own pid namespace.
+ */
+ pid_t virt;
+};
+
+/*
+ * When we have to restore a shared resource, we must select which
+ * task should do it, and make other(s) wait for it. In order to
+ * avoid deadlocks, always make task with lower pid be the restorer.
+ */
+static inline bool pid_rst_prio(unsigned pid_a, unsigned pid_b)
+{
+ return pid_a < pid_b;
+}
+
+#endif /* __CR_PID_H__ */
diff --git a/criu/include/pipes.h b/criu/include/pipes.h
new file mode 100644
index 000000000000..c8786164c4ec
--- /dev/null
+++ b/criu/include/pipes.h
@@ -0,0 +1,57 @@
+#ifndef __CR_PIPES_H__
+#define __CR_PIPES_H__
+
+#include "protobuf/pipe-data.pb-c.h"
+#include "protobuf/pipe.pb-c.h"
+
+extern struct collect_image_info pipe_cinfo;
+extern int collect_pipes(void);
+extern void mark_pipe_master(void);
+extern const struct fdtype_ops pipe_dump_ops;
+
+static inline u32 pipe_id(const struct fd_parms *p)
+{
+ return p->stat.st_ino;
+}
+
+#define NR_PIPES_WITH_DATA 1024
+
+struct pipe_data_dump {
+ int img_type;
+ unsigned int nr;
+ u32 ids[NR_PIPES_WITH_DATA];
+};
+
+extern int dump_one_pipe_data(struct pipe_data_dump *pd, int lfd, const struct fd_parms *p);
+
+struct pipe_data_rst {
+ PipeDataEntry *pde;
+ void *data;
+ struct pipe_data_rst *next;
+};
+
+#define PIPE_DATA_HASH_BITS 5
+#define PIPE_DATA_HASH_SIZE (1 << PIPE_DATA_HASH_BITS)
+#define PIPE_DATA_HASH_MASK (PIPE_DATA_HASH_SIZE - 1)
+
+extern int collect_pipe_data(int img_type, struct pipe_data_rst **hash);
+extern int restore_pipe_data(int img_type, int pfd, u32 id, struct pipe_data_rst **hash);
+
+/*
+ * The sequence of objects which should be restored:
+ * pipe -> files struct-s -> fd-s.
+ * pipe_entry describes pipe's file structs-s.
+ * A pipe doesn't have own properties, so it has no object.
+ */
+
+struct pipe_info {
+ PipeEntry *pe;
+ struct list_head pipe_list; /* All pipe_info with the same pipe_id
+ * This is pure circular list without head */
+ struct list_head list; /* list head for fdinfo_list_entry-s */
+ struct file_desc d;
+ unsigned int create : 1,
+ reopen : 1;
+};
+
+#endif /* __CR_PIPES_H__ */
diff --git a/criu/include/plugin.h b/criu/include/plugin.h
new file mode 100644
index 000000000000..2855836206d9
--- /dev/null
+++ b/criu/include/plugin.h
@@ -0,0 +1,46 @@
+#ifndef __CR_PLUGIN_H__
+#define __CR_PLUGIN_H__
+
+#include "criu-plugin.h"
+#include "compiler.h"
+#include "list.h"
+
+#define CR_PLUGIN_DEFAULT "/var/lib/criu/"
+
+void cr_plugin_fini(int stage, int err);
+int cr_plugin_init(int stage);
+
+typedef struct {
+ struct list_head head;
+ struct list_head hook_chain[CR_PLUGIN_HOOK__MAX];
+} cr_plugin_ctl_t;
+
+extern cr_plugin_ctl_t cr_plugin_ctl;
+
+typedef struct {
+ cr_plugin_desc_t *d;
+ struct list_head list;
+ void *dlhandle;
+ struct list_head link[CR_PLUGIN_HOOK__MAX];
+} plugin_desc_t;
+
+#define run_plugins(__hook, ...) \
+({ \
+ plugin_desc_t *this; \
+ int __ret = -ENOTSUP; \
+ \
+ list_for_each_entry(this, &cr_plugin_ctl.hook_chain[CR_PLUGIN_HOOK__ ##__hook], \
+ link[CR_PLUGIN_HOOK__ ##__hook]) { \
+ pr_debug("plugin: `%s' hook %u -> %p\n", \
+ this->d->name, CR_PLUGIN_HOOK__ ##__hook, \
+ this->d->hooks[CR_PLUGIN_HOOK__ ##__hook]); \
+ __ret = ((CR_PLUGIN_HOOK__ ##__hook ##_t *) \
+ this->d->hooks[CR_PLUGIN_HOOK__ ##__hook])(__VA_ARGS__); \
+ if (__ret == -ENOTSUP) \
+ continue; \
+ break; \
+ } \
+ __ret; \
+})
+
+#endif
diff --git a/criu/include/posix-timer.h b/criu/include/posix-timer.h
new file mode 100644
index 000000000000..568bf4a27e9d
--- /dev/null
+++ b/criu/include/posix-timer.h
@@ -0,0 +1,27 @@
+#ifndef __CR_PROC_POSIX_TIMER_H__
+#define __CR_PROC_POSIX_TIMER_H__
+
+#include "list.h"
+
+struct str_posix_timer {
+ long it_id;
+ int clock_id;
+ int si_signo;
+ int it_sigev_notify;
+ void * sival_ptr;
+};
+
+struct proc_posix_timer {
+ struct list_head list;
+ struct str_posix_timer spt;
+};
+
+struct proc_posix_timers_stat {
+ int timer_n;
+ struct list_head timers;
+};
+
+extern int parse_posix_timers(pid_t pid, struct proc_posix_timers_stat * args);
+void free_posix_timers(struct proc_posix_timers_stat *st);
+
+#endif /* __CR_PROC_POSIX_TIMER_H__ */
diff --git a/criu/include/prctl.h b/criu/include/prctl.h
new file mode 100644
index 000000000000..b48d95286277
--- /dev/null
+++ b/criu/include/prctl.h
@@ -0,0 +1,77 @@
+#ifndef __CR_PRCTL_H__
+#define __CR_PRCTL_H__
+
+#include "asm/int.h"
+
+#ifndef PR_SET_NAME
+# define PR_SET_NAME 15
+#endif
+#ifndef PR_GET_NAME
+# define PR_GET_NAME 16
+#endif
+#ifndef PR_SET_SECCOMP
+# define PR_SET_SECCOMP 22
+#endif
+#ifndef PR_CAPBSET_READ
+# define PR_CAPBSET_READ 23
+#endif
+#ifndef PR_CAPBSET_DROP
+# define PR_CAPBSET_DROP 24
+#endif
+#ifndef PR_GET_SECUREBITS
+# define PR_GET_SECUREBITS 27
+#endif
+#ifndef PR_SET_SECUREBITS
+# define PR_SET_SECUREBITS 28
+#endif
+#ifndef PR_GET_DUMPABLE
+# define PR_GET_DUMPABLE 3
+#endif
+#ifndef PR_SET_DUMPABLE
+# define PR_SET_DUMPABLE 4
+#endif
+
+#ifndef PR_SET_MM
+#define PR_SET_MM 35
+# define PR_SET_MM_START_CODE 1
+# define PR_SET_MM_END_CODE 2
+# define PR_SET_MM_START_DATA 3
+# define PR_SET_MM_END_DATA 4
+# define PR_SET_MM_START_STACK 5
+# define PR_SET_MM_START_BRK 6
+# define PR_SET_MM_BRK 7
+# define PR_SET_MM_ARG_START 8
+# define PR_SET_MM_ARG_END 9
+# define PR_SET_MM_ENV_START 10
+# define PR_SET_MM_ENV_END 11
+# define PR_SET_MM_AUXV 12
+# define PR_SET_MM_EXE_FILE 13
+#endif
+
+#ifndef PR_SET_MM_MAP
+# define PR_SET_MM_MAP 14
+# define PR_SET_MM_MAP_SIZE 15
+
+struct prctl_mm_map {
+ u64 start_code;
+ u64 end_code;
+ u64 start_data;
+ u64 end_data;
+ u64 start_brk;
+ u64 brk;
+ u64 start_stack;
+ u64 arg_start;
+ u64 arg_end;
+ u64 env_start;
+ u64 env_end;
+ u64 *auxv;
+ u32 auxv_size;
+ u32 exe_fd;
+};
+#endif
+
+#ifndef PR_GET_TID_ADDRESS
+# define PR_GET_TID_ADDRESS 40
+#endif
+
+#endif /* __CR_PRCTL_H__ */
diff --git a/criu/include/proc_parse.h b/criu/include/proc_parse.h
new file mode 100644
index 000000000000..33cd07712c89
--- /dev/null
+++ b/criu/include/proc_parse.h
@@ -0,0 +1,217 @@
+#ifndef __CR_PROC_PARSE_H__
+#define __CR_PROC_PARSE_H__
+
+#include <sys/types.h>
+#include "asm/types.h"
+#include "image.h"
+#include "list.h"
+#include "cgroup.h"
+#include "mount.h"
+
+#include "protobuf/eventfd.pb-c.h"
+#include "protobuf/eventpoll.pb-c.h"
+#include "protobuf/signalfd.pb-c.h"
+#include "protobuf/fsnotify.pb-c.h"
+#include "protobuf/timerfd.pb-c.h"
+#include "protobuf/seccomp.pb-c.h"
+
+#define PROC_TASK_COMM_LEN 32
+#define PROC_TASK_COMM_LEN_FMT "(%31s"
+
+struct proc_pid_stat {
+ int pid;
+ char comm[PROC_TASK_COMM_LEN];
+ char state;
+ int ppid;
+ int pgid;
+ int sid;
+ int tty_nr;
+ int tty_pgrp;
+ unsigned int flags;
+ unsigned long min_flt;
+ unsigned long cmin_flt;
+ unsigned long maj_flt;
+ unsigned long cmaj_flt;
+ unsigned long utime;
+ unsigned long stime;
+ long cutime;
+ long cstime;
+ long priority;
+ long nice;
+ int num_threads;
+ int zero0;
+ unsigned long long start_time;
+ unsigned long vsize;
+ long mm_rss;
+ unsigned long rsslim;
+ unsigned long start_code;
+ unsigned long end_code;
+ unsigned long start_stack;
+ unsigned long esp;
+ unsigned long eip;
+ unsigned long sig_pending;
+ unsigned long sig_blocked;
+ unsigned long sig_ignored;
+ unsigned long sig_handled;
+ unsigned long wchan;
+ unsigned long zero1;
+ unsigned long zero2;
+ int exit_signal;
+ int task_cpu;
+ unsigned int rt_priority;
+ unsigned int policy;
+ unsigned long long delayacct_blkio_ticks;
+ unsigned long gtime;
+ long cgtime;
+ unsigned long start_data;
+ unsigned long end_data;
+ unsigned long start_brk;
+ unsigned long arg_start;
+ unsigned long arg_end;
+ unsigned long env_start;
+ unsigned long env_end;
+ int exit_code;
+};
+
+struct seccomp_info {
+ SeccompFilter filter;
+ int id;
+ struct seccomp_info *prev;
+};
+
+#define PROC_CAP_SIZE 2
+
+struct proc_status_creds {
+ unsigned int uids[4];
+ unsigned int gids[4];
+
+ char state;
+ int ppid;
+ unsigned long long sigpnd;
+ unsigned long long shdpnd;
+
+ int seccomp_mode;
+ u32 last_filter;
+
+ /*
+ * Keep them at the end of structure
+ * for fast comparison reason.
+ */
+ u32 cap_inh[PROC_CAP_SIZE];
+ u32 cap_prm[PROC_CAP_SIZE];
+ u32 cap_eff[PROC_CAP_SIZE];
+ u32 cap_bnd[PROC_CAP_SIZE];
+};
+
+bool proc_status_creds_dumpable(struct proc_status_creds *parent,
+ struct proc_status_creds *child);
+
+typedef int (*mount_fn_t)(struct mount_info *mi, const char *src, const
+ char *fstype, unsigned long mountflags);
+
+struct fstype {
+ char *name;
+ int code;
+ int (*dump)(struct mount_info *pm);
+ int (*restore)(struct mount_info *pm);
+ int (*parse)(struct mount_info *pm);
+ mount_fn_t mount;
+};
+
+struct vm_area_list;
+
+#define INVALID_UID ((uid_t)-1)
+
+extern bool add_skip_mount(const char *mountpoint);
+extern struct mount_info *parse_mountinfo(pid_t pid, struct ns_id *nsid, bool for_dump);
+extern int parse_pid_stat(pid_t pid, struct proc_pid_stat *s);
+extern unsigned int parse_pid_loginuid(pid_t pid, int *err, bool ignore_noent);
+extern int parse_pid_oom_score_adj(pid_t pid, int *err);
+extern int prepare_loginuid(unsigned int value, unsigned int loglevel);
+extern int parse_smaps(pid_t pid, struct vm_area_list *vma_area_list);
+extern int parse_self_maps_lite(struct vm_area_list *vms);
+extern int parse_pid_status(pid_t pid, struct proc_status_creds *);
+
+struct inotify_wd_entry {
+ InotifyWdEntry e;
+ FhEntry f_handle;
+ struct list_head node;
+};
+
+struct fanotify_mark_entry {
+ FanotifyMarkEntry e;
+ FhEntry f_handle;
+ struct list_head node;
+ union {
+ FanotifyInodeMarkEntry ie;
+ FanotifyMountMarkEntry me;
+ };
+};
+
+struct eventpoll_tfd_entry {
+ EventpollTfdEntry e;
+ struct list_head node;
+};
+
+union fdinfo_entries {
+ EventfdFileEntry efd;
+ SignalfdEntry sfd;
+ struct inotify_wd_entry ify;
+ struct fanotify_mark_entry ffy;
+ struct eventpoll_tfd_entry epl;
+ TimerfdEntry tfy;
+};
+
+extern void free_inotify_wd_entry(union fdinfo_entries *e);
+extern void free_fanotify_mark_entry(union fdinfo_entries *e);
+extern void free_event_poll_entry(union fdinfo_entries *e);
+
+struct fdinfo_common {
+ off64_t pos;
+ int flags;
+ int mnt_id;
+ int owner;
+};
+
+extern int parse_fdinfo(int fd, int type,
+ int (*cb)(union fdinfo_entries *e, void *arg), void *arg);
+extern int parse_fdinfo_pid(int pid, int fd, int type,
+ int (*cb)(union fdinfo_entries *e, void *arg), void *arg);
+extern int parse_file_locks(void);
+extern int get_fd_mntid(int fd, int *mnt_id);
+
+struct pid;
+extern int parse_threads(int pid, struct pid **_t, int *_n);
+
+extern int check_mnt_id(void);
+
+/*
+ * This struct describes a group controlled by one controller.
+ * The @name is the controller name or 'name=...' for named cgroups.
+ * The @path is the path from the hierarchy root.
+ */
+
+struct cg_ctl {
+ struct list_head l;
+ char *name;
+ char *path;
+};
+
+/*
+ * Returns the list of cg_ctl-s sorted by name
+ */
+
+extern int parse_task_cgroup(int pid, struct list_head *l, unsigned int *n);
+extern void put_ctls(struct list_head *);
+
+int collect_controllers(struct list_head *cgroups, unsigned int *n_cgroups);
+
+/* callback for AUFS support */
+extern int aufs_parse(struct mount_info *mi);
+
+/* callback for OverlayFS support */
+extern int overlayfs_parse(struct mount_info *mi);
+
+int parse_children(pid_t pid, pid_t **_c, int *_n);
+
+#endif /* __CR_PROC_PARSE_H__ */
diff --git a/criu/include/protobuf-desc.h b/criu/include/protobuf-desc.h
new file mode 100644
index 000000000000..bb66a868958d
--- /dev/null
+++ b/criu/include/protobuf-desc.h
@@ -0,0 +1,91 @@
+#ifndef __CR_PROTOBUF_DESC_H__
+#define __CR_PROTOBUF_DESC_H__
+
+#include <sys/types.h>
+#include <google/protobuf-c/protobuf-c.h>
+
+enum {
+ /* PB_AUTOGEN_START */
+ PB_INVENTORY, /* 0 */
+ PB_STATS,
+ PB_FDINFO,
+ PB_CORE,
+ PB_MM,
+ PB_VMA,
+ PB_ITIMER,
+ PB_POSIX_TIMER,
+ PB_CREDS,
+ PB_FS,
+ PB_UTSNS, /* 10 */
+ PB_IPC_VAR,
+ PB_IPC_SHM,
+ PB_IPC_SEM,
+ PB_MNT,
+ PB_PSTREE,
+ PB_GHOST_FILE,
+ PB_TCP_STREAM,
+ PB_REG_FILE,
+ PB_EXT_FILE,
+ PB_NS_FILE, /* 20 */
+ PB_INET_SK,
+ PB_UNIX_SK,
+ PB_PACKET_SOCK,
+ PB_NETLINK_SK,
+ PB_PIPE,
+ PB_FIFO,
+ PB_PIPE_DATA,
+ PB_EVENTFD_FILE,
+ PB_EVENTPOLL_FILE,
+ PB_EVENTPOLL_TFD, /* 30 */
+ PB_SIGNALFD,
+ PB_INOTIFY_FILE,
+ PB_INOTIFY_WD,
+ PB_FANOTIFY_FILE,
+ PB_FANOTIFY_MARK,
+ PB_TTY_FILE,
+ PB_TTY_INFO,
+ PB_FILE_LOCK,
+ PB_RLIMIT,
+ PB_PAGEMAP, /* 40 */
+ PB_SIGINFO,
+ PB_TUNFILE,
+ PB_IRMAP_CACHE,
+ PB_CGROUP,
+ PB_SECCOMP,
+ PB_TIMERFD,
+ PB_CPUINFO,
+ PB_USERNS,
+ PB_NETNS,
+ PB_BINFMT_MISC, /* 50 */
+
+ /* PB_AUTOGEN_STOP */
+
+ PB_PAGEMAP_HEAD,
+ PB_IDS,
+ PB_SIGACT,
+ PB_NETDEV,
+ PB_REMAP_FPATH,
+ PB_SK_QUEUES,
+ PB_IPCNS_MSG,
+ PB_IPCNS_MSG_ENT,
+
+ PB_MAX,
+};
+
+typedef size_t (*pb_getpksize_t)(void *obj);
+typedef size_t (*pb_pack_t)(void *obj, void *where);
+typedef void *(*pb_unpack_t)(void *allocator, size_t size, void *from);
+typedef void (*pb_free_t)(void *obj, void *allocator);
+
+struct cr_pb_message_desc {
+ pb_getpksize_t getpksize;
+ pb_pack_t pack;
+ pb_unpack_t unpack;
+ pb_free_t free;
+ const ProtobufCMessageDescriptor *pb_desc;
+};
+
+extern void cr_pb_init(void);
+extern struct cr_pb_message_desc cr_pb_descs[PB_MAX];
+
+#endif /* __CR_PROTOBUF_DESC_H__ */
diff --git a/criu/include/protobuf.h b/criu/include/protobuf.h
new file mode 100644
index 000000000000..3d76b13eda32
--- /dev/null
+++ b/criu/include/protobuf.h
@@ -0,0 +1,57 @@
+#ifndef __CR_PROTOBUF_H__
+#define __CR_PROTOBUF_H__
+
+#include "protobuf-desc.h"
+
+#include "asm/types.h"
+#include "compiler.h"
+#include "util.h"
+
+struct cr_img;
+
+extern int do_pb_read_one(struct cr_img *, void **objp, int type, bool eof);
+
+#define pb_read_one(fd, objp, type) do_pb_read_one(fd, (void **)objp, type, false)
+#define pb_read_one_eof(fd, objp, type) do_pb_read_one(fd, (void **)objp, type, true)
+
+extern int pb_write_one(struct cr_img *, void *obj, int type);
+
+#define pb_pksize(__obj, __proto_message_name) \
+ (__proto_message_name ##__get_packed_size(__obj) + sizeof(u32))
+
+#define pb_repeated_size(__obj, __member) \
+ ((size_t)(sizeof(*(__obj)->__member) * (__obj)->n_ ##__member))
+
+#define pb_msg(__base, __type) \
+ container_of(__base, __type, base)
+
+#include <google/protobuf-c/protobuf-c.h>
+
+extern void do_pb_show_plain(struct cr_img *, int type, int single_entry,
+ void (*payload_hadler)(struct cr_img *, void *obj),
+ const char *pretty_fmt);
+
+/* Don't have objects at hands to also do typechecking here */
+#define pb_show_plain_payload_pretty(__fd, __type, payload_hadler, pretty) \
+ do_pb_show_plain(__fd, __type, 0, payload_hadler, pretty)
+
+#define pb_show_plain_payload(__fd, __proto_message_name, payload_hadler) \
+ pb_show_plain_payload_pretty(__fd, __proto_message_name, payload_hadler, NULL)
+
+#define pb_show_plain_pretty(__fd, __proto_message_name, __pretty) \
+ pb_show_plain_payload_pretty(__fd, __proto_message_name, NULL, __pretty)
+
+struct collect_image_info {
+ int fd_type;
+ int pb_type;
+ unsigned int priv_size;
+ int (*collect)(void *, ProtobufCMessage *);
+ unsigned flags;
+};
+
+#define COLLECT_SHARED 0x1 /* use shared memory for obj-s */
+#define COLLECT_HAPPENED 0x4 /* image was opened and collected */
+
+extern int collect_image(struct collect_image_info *);
+
+#endif /* __CR_PROTOBUF_H__ */
diff --git a/criu/include/pstree.h b/criu/include/pstree.h
new file mode 100644
index 000000000000..47ce676a9eba
--- /dev/null
+++ b/criu/include/pstree.h
@@ -0,0 +1,102 @@
+#ifndef __CR_PSTREE_H__
+#define __CR_PSTREE_H__
+
+#include "list.h"
+#include "pid.h"
+#include "image.h"
+#include "rst_info.h"
+#include "protobuf/core.pb-c.h"
+
+/*
+ * That's the init process which usually inherits
+ * all orphaned children in the system.
+ */
+#define INIT_PID (1)
+struct pstree_item {
+ struct pstree_item *parent;
+ struct list_head children; /* list of my children */
+ struct list_head sibling; /* linkage in my parent's children list */
+
+ struct pid pid;
+ pid_t pgid;
+ pid_t sid;
+ pid_t born_sid;
+
+ int state; /* TASK_XXX constants */
+
+ int nr_threads; /* number of threads */
+ struct pid *threads; /* array of threads */
+ CoreEntry **core;
+ TaskKobjIdsEntry *ids;
+};
+
+/* See alloc_pstree_item() for details */
+static inline struct rst_info *rsti(struct pstree_item *i)
+{
+ return (struct rst_info *)(i + 1);
+}
+
+struct ns_id;
+struct dmp_info {
+ struct ns_id *netns;
+ /*
+ * We keep the creds here so that we can compare creds while seizing
+ * threads. Dumping tasks with different creds is not supported.
+ */
+ struct proc_status_creds *pi_creds;
+};
+
+static inline struct dmp_info *dmpi(struct pstree_item *i)
+{
+ return (struct dmp_info *)(i + 1);
+}
+
+/* ids is allocated and initialized for all alive tasks */
+static inline int shared_fdtable(struct pstree_item *item)
+{
+ return (item->parent &&
+ item->ids->files_id == item->parent->ids->files_id);
+}
+
+static inline bool task_alive(struct pstree_item *i)
+{
+ return (i->state == TASK_ALIVE) || (i->state == TASK_STOPPED);
+}
+
+extern void free_pstree(struct pstree_item *root_item);
+extern struct pstree_item *__alloc_pstree_item(bool rst);
+#define alloc_pstree_item() __alloc_pstree_item(false)
+#define alloc_pstree_item_with_rst() __alloc_pstree_item(true)
+extern struct pstree_item *alloc_pstree_helper(void);
+
+extern struct pstree_item *root_item;
+extern struct pstree_item *pstree_item_next(struct pstree_item *item);
+#define for_each_pstree_item(pi) \
+ for (pi = root_item; pi != NULL; pi = pstree_item_next(pi))
+
+extern bool restore_before_setsid(struct pstree_item *child);
+extern int prepare_pstree(void);
+
+extern int dump_pstree(struct pstree_item *root_item);
+
+struct pstree_item *pstree_item_by_real(pid_t real);
+struct pstree_item *pstree_item_by_virt(pid_t virt);
+
+extern int pid_to_virt(pid_t pid);
+extern bool pid_in_pstree(pid_t pid);
+
+struct task_entries;
+extern struct task_entries *task_entries;
+
+extern int get_task_ids(struct pstree_item *);
+extern struct _TaskKobjIdsEntry *root_ids;
+
+extern void core_entry_free(CoreEntry *core);
+extern CoreEntry *core_entry_alloc(int alloc_thread_info, int alloc_tc);
+extern int pstree_alloc_cores(struct pstree_item *item);
+extern void pstree_free_cores(struct pstree_item *item);
+
+extern int collect_pstree_ids(void);
+
+extern int preorder_pstree_traversal(struct pstree_item *item, int (*f)(struct pstree_item *));
+#endif /* __CR_PSTREE_H__ */
diff --git a/criu/include/ptrace.h b/criu/include/ptrace.h
new file mode 100644
index 000000000000..047b1e2ab008
--- /dev/null
+++ b/criu/include/ptrace.h
@@ -0,0 +1,84 @@
+#ifndef __CR_PTRACE_H__
+#define __CR_PTRACE_H__
+
+#include <linux/types.h>
+#include <sys/ptrace.h>
+
+#include "config.h"
+#include "proc_parse.h"
+
+/* some constants for ptrace */
+#ifndef PTRACE_SEIZE
+# define PTRACE_SEIZE 0x4206
+#endif
+
+#ifndef PTRACE_O_SUSPEND_SECCOMP
+# define PTRACE_O_SUSPEND_SECCOMP (1 << 21)
+#endif
+
+#ifndef PTRACE_INTERRUPT
+# define PTRACE_INTERRUPT 0x4207
+#endif
+
+#ifndef PTRACE_LISTEN
+#define PTRACE_LISTEN 0x4208
+#endif
+
+#ifndef PTRACE_PEEKSIGINFO
+#define PTRACE_PEEKSIGINFO 0x4209
+
+/* Read signals from a shared (process wide) queue */
+#define PTRACE_PEEKSIGINFO_SHARED (1 << 0)
+#endif
+
+#ifndef CONFIG_HAS_PEEKSIGINFO_ARGS
+struct ptrace_peeksiginfo_args {
+ __u64 off; /* from which siginfo to start */
+ __u32 flags;
+ __u32 nr; /* how many siginfos to take */
+};
+#endif
+
+#ifndef PTRACE_GETREGSET
+# define PTRACE_GETREGSET 0x4204
+# define PTRACE_SETREGSET 0x4205
+#endif
+
+#define PTRACE_GETSIGMASK 0x420a
+#define PTRACE_SETSIGMASK 0x420b
+
+#ifndef PTRACE_SECCOMP_GET_FILTER
+#define PTRACE_SECCOMP_GET_FILTER 0x420c
+#endif
+
+#define PTRACE_SEIZE_DEVEL 0x80000000
+
+#define PTRACE_EVENT_FORK 1
+#define PTRACE_EVENT_VFORK 2
+#define PTRACE_EVENT_CLONE 3
+#define PTRACE_EVENT_EXEC 4
+#define PTRACE_EVENT_VFORK_DONE 5
+#define PTRACE_EVENT_EXIT 6
+#define PTRACE_EVENT_STOP 128
+
+#define PTRACE_O_TRACESYSGOOD 0x00000001
+#define PTRACE_O_TRACEFORK 0x00000002
+#define PTRACE_O_TRACEVFORK 0x00000004
+#define PTRACE_O_TRACECLONE 0x00000008
+#define PTRACE_O_TRACEEXEC 0x00000010
+#define PTRACE_O_TRACEVFORKDONE 0x00000020
+#define PTRACE_O_TRACEEXIT 0x00000040
+
+#define SI_EVENT(_si_code) (((_si_code) & 0xFFFF) >> 8)
+
+extern int processes_to_wait;
+
+extern int seize_catch_task(pid_t pid);
+extern int seize_wait_task(pid_t pid, pid_t ppid, struct proc_status_creds **creds);
+extern int suspend_seccomp(pid_t pid);
+extern int unseize_task(pid_t pid, int orig_state, int state);
+extern int ptrace_peek_area(pid_t pid, void *dst, void *addr, long bytes);
+extern int ptrace_poke_area(pid_t pid, void *src, void *addr, long bytes);
+extern int ptrace_swap_area(pid_t pid, void *dst, void *src, long bytes);
+
+#endif /* __CR_PTRACE_H__ */
diff --git a/criu/include/rbtree.h b/criu/include/rbtree.h
new file mode 100644
index 000000000000..f6082103298f
--- /dev/null
+++ b/criu/include/rbtree.h
@@ -0,0 +1,89 @@
+/*
+ * RBtree implementation adopted from the Linux kernel sources.
+ */
+
+#ifndef __CR_RBTREE_H__
+#define __CR_RBTREE_H__
+
+#include <stddef.h>
+
+#include "compiler.h"
+#include "asm/types.h"
+
+#define RB_RED 0
+#define RB_BLACK 1
+#define RB_MASK 3
+
+struct rb_node {
+ unsigned long rb_parent_color; /* Keeps both parent and color */
+ struct rb_node *rb_right;
+ struct rb_node *rb_left;
+} __aligned(sizeof(long));
+
+struct rb_root {
+ struct rb_node *rb_node;
+};
+
+#define rb_parent(r) ((struct rb_node *)((r)->rb_parent_color & ~RB_MASK))
+#define rb_color(r) ((r)->rb_parent_color & RB_BLACK)
+#define rb_is_red(r) (!rb_color(r))
+#define rb_is_black(r) (rb_color(r))
+#define rb_set_red(r) do { (r)->rb_parent_color &= ~RB_BLACK; } while (0)
+#define rb_set_black(r) do { (r)->rb_parent_color |= RB_BLACK; } while (0)
+
+static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p)
+{
+ rb->rb_parent_color = (rb->rb_parent_color & RB_MASK) | (unsigned long)p;
+}
+
+static inline void rb_set_color(struct rb_node *rb, int color)
+{
+ rb->rb_parent_color = (rb->rb_parent_color & ~RB_BLACK) | color;
+}
+
+#define RB_ROOT (struct rb_root){ NULL, }
+#define rb_entry(ptr, type, member) container_of(ptr, type, member)
+
+#define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL)
+#define RB_EMPTY_NODE(node) (rb_parent(node) == node)
+#define RB_CLEAR_NODE(node) (rb_set_parent(node, node))
+
+static inline void rb_init_node(struct rb_node *node)
+{
+ *node = (struct rb_node){ };
+
+ RB_CLEAR_NODE(node);
+}
+
+extern void rb_insert_color(struct rb_node *node, struct rb_root *root);
+extern void rb_erase(struct rb_node *node, struct rb_root *root);
+
+/* Find logical next and previous nodes in a tree */
+extern struct rb_node *rb_first(const struct rb_root *root);
+extern struct rb_node *rb_last(const struct rb_root *root);
+extern struct rb_node *rb_next(const struct rb_node *node);
+extern struct rb_node *rb_prev(const struct rb_node *node);
+
+/* Fast replacement of a single node without remove/rebalance/add/rebalance */
+extern void rb_replace_node(struct rb_node *victim, struct rb_node *new,
+ struct rb_root *root);
+
+static inline void rb_link_node(struct rb_node *node, struct rb_node *parent,
+ struct rb_node **rb_link)
+{
+ node->rb_parent_color = (unsigned long)parent;
+ node->rb_left = node->rb_right = NULL;
+
+ *rb_link = node;
+}
+
+static inline void rb_link_and_balance(struct rb_root *root,
+ struct rb_node *node,
+ struct rb_node *parent,
+ struct rb_node **rb_link)
+{
+ rb_link_node(node, parent, rb_link);
+ rb_insert_color(node, root);
+}
+
+#endif /* __CR_RBTREE_H__ */
diff --git a/criu/include/restorer.h b/criu/include/restorer.h
new file mode 100644
index 000000000000..4c4377cdaf67
--- /dev/null
+++ b/criu/include/restorer.h
@@ -0,0 +1,241 @@
+#ifndef __CR_RESTORER_H__
+#define __CR_RESTORER_H__
+
+#include <signal.h>
+#include <limits.h>
+#include <sys/resource.h>
+
+#include "compiler.h"
+#include "asm/types.h"
+#include "asm/fpu.h"
+#include "image.h"
+#include "lock.h"
+#include "util.h"
+#include "asm/restorer.h"
+#include "rst_info.h"
+#include "config.h"
+
+#include "posix-timer.h"
+#include "timerfd.h"
+#include "shmem.h"
+#include "sigframe.h"
+#include "parasite-vdso.h"
+
+#include <time.h>
+
+#include "protobuf/mm.pb-c.h"
+#include "protobuf/vma.pb-c.h"
+#include "protobuf/creds.pb-c.h"
+#include "protobuf/core.pb-c.h"
+
+struct task_restore_core_args;
+struct thread_restore_args;
+
+typedef long (*task_restore_fcall_t) (struct task_restore_core_args *args);
+typedef long (*thread_restore_fcall_t) (struct thread_restore_args *args);
+
+#define RESTORE_CMD__NONE 0
+#define RESTORE_CMD__GET_SELF_LEN 1
+#define RESTORE_CMD__RESTORE_CORE 2
+#define RESTORE_CMD__RESTORE_THREAD 3
+
+/*
+ * These *must* be power of two values.
+ */
+#define RESTORE_ARGS_SIZE (512)
+#define RESTORE_STACK_REDZONE (128)
+#define RESTORE_STACK_SIZE (KILO(32))
+
+struct restore_mem_zone {
+ u8 redzone[RESTORE_STACK_REDZONE];
+ u8 stack[RESTORE_STACK_SIZE];
+ u8 rt_sigframe[RESTORE_STACK_SIGFRAME];
+} __stack_aligned__;
+
+struct rst_sched_param {
+ int policy;
+ int nice;
+ int prio;
+};
+
+struct restore_posix_timer {
+ struct str_posix_timer spt;
+ struct itimerspec val;
+ int overrun;
+};
+
+struct task_restore_core_args;
+
+/*
+ * We should be able to construct fpu sigframe in sigreturn_prep_fpu_frame,
+ * so the mem_zone.rt_sigframe should be 64-bytes aligned. To make things
+ * simpler, force both _args alignment be 64 bytes.
+ */
+
+struct thread_creds_args {
+ CredsEntry creds;
+
+ unsigned int cap_last_cap;
+
+ u32 cap_inh[CR_CAP_SIZE];
+ u32 cap_prm[CR_CAP_SIZE];
+ u32 cap_eff[CR_CAP_SIZE];
+ u32 cap_bnd[CR_CAP_SIZE];
+
+ unsigned int secbits;
+ char *lsm_profile;
+ unsigned int *groups;
+
+ unsigned long mem_lsm_profile_pos;
+ unsigned long mem_groups_pos;
+
+ unsigned long mem_pos_next;
+};
+
+struct thread_restore_args {
+ struct restore_mem_zone mem_zone;
+
+ int pid;
+ UserRegsEntry gpregs;
+ u64 clear_tid_addr;
+
+ bool has_futex;
+ u64 futex_rla;
+ u32 futex_rla_len;
+
+ struct rst_sched_param sp;
+
+ struct task_restore_args *ta;
+
+ tls_t tls;
+
+ siginfo_t *siginfo;
+ unsigned int siginfo_n;
+
+ int pdeath_sig;
+
+ struct thread_creds_args *creds_args;
+} __aligned(64);
+
+struct task_restore_args {
+ struct thread_restore_args *t; /* thread group leader */
+
+ int fd_exe_link; /* opened self->exe file */
+ int logfd;
+ unsigned int loglevel;
+
+ /* threads restoration */
+ int nr_threads; /* number of threads */
+ thread_restore_fcall_t clone_restore_fn; /* helper address for clone() call */
+ struct thread_restore_args *thread_args; /* array of thread arguments */
+ struct task_entries *task_entries;
+ void *rst_mem;
+ unsigned long rst_mem_size;
+
+ /* Below arrays get remapped from RM_PRIVATE in sigreturn_restore */
+ VmaEntry *vmas;
+ unsigned int vmas_n;
+
+ struct restore_posix_timer *posix_timers;
+ unsigned int posix_timers_n;
+
+ struct restore_timerfd *timerfd;
+ unsigned int timerfd_n;
+
+ siginfo_t *siginfo;
+ unsigned int siginfo_n;
+
+ struct rst_tcp_sock *tcp_socks;
+ unsigned int tcp_socks_n;
+
+ struct rst_aio_ring *rings;
+ unsigned int rings_n;
+
+ struct rlimit *rlims;
+ unsigned int rlims_n;
+
+ pid_t *helpers /* the TASK_HELPERS to wait on at the end of restore */;
+ unsigned int helpers_n;
+
+ pid_t *zombies;
+ unsigned int zombies_n;
+
+ struct sock_fprog *seccomp_filters;
+ unsigned int seccomp_filters_n;
+
+ /* * * * * * * * * * * * * * * * * * * * */
+
+ unsigned long task_size;
+ unsigned long premmapped_addr;
+ unsigned long premmapped_len;
+ rt_sigaction_t sigchld_act;
+
+ void *bootstrap_start;
+ unsigned long bootstrap_len;
+
+ struct itimerval itimers[3];
+
+ MmEntry mm;
+ auxv_t mm_saved_auxv[AT_VECTOR_SIZE];
+ u32 mm_saved_auxv_size;
+ char comm[TASK_COMM_LEN];
+
+ /*
+ * proc_fd is a handle to /proc that the restorer blob can use to open
+ * files there, because some of them can't be opened before the
+ * restorer blob is called.
+ */
+ int proc_fd;
+
+ int seccomp_mode;
+
+#ifdef CONFIG_VDSO
+ unsigned long vdso_rt_size;
+ struct vdso_symtable vdso_sym_rt; /* runtime vdso symbols */
+ unsigned long vdso_rt_parked_at; /* safe place to keep vdso */
+#endif
+ void **breakpoint;
+} __aligned(64);
+
+/*
+ * For arm64 the stack needs to be aligned to 16 bytes.
+ * Hence align to 16 bytes for all architectures.
+*/
+#define RESTORE_ALIGN_STACK(start, size) \
+ (ALIGN((start) + (size) - 16, 16))
+
+static inline unsigned long restorer_stack(struct thread_restore_args *a)
+{
+ return RESTORE_ALIGN_STACK((long)a->mem_zone.stack, RESTORE_STACK_SIZE);
+}
+
+enum {
+ CR_STATE_FAIL = -1,
+ CR_STATE_RESTORE_NS = 0, /* is used for executing "setup-namespace" scripts */
+ CR_STATE_RESTORE_SHARED,
+ CR_STATE_FORKING,
+ CR_STATE_RESTORE,
+ CR_STATE_RESTORE_SIGCHLD,
+ /*
+ * For security reasons processes can be resumed only when all
+ * credentials are restored. Otherwise someone could attach to a
+ * process whose credentials are not restored yet and execute
+ * some code.
+ */
+ CR_STATE_RESTORE_CREDS,
+ CR_STATE_COMPLETE
+};
+
+#define restore_finish_stage(__stage) ({ \
+ futex_dec_and_wake(&task_entries->nr_in_progress); \
+ futex_wait_while(&task_entries->start, __stage); \
+ (s32) futex_get(&task_entries->start); \
+ })
+
+
+/* the restorer_blob_offset__ prefix is added by gen_offsets.sh */
+#define __blob_offset(name) restorer_blob_offset__ ## name
+#define _blob_offset(name) __blob_offset(name)
+#define restorer_sym(rblob, name) ((void *)((rblob) + _blob_offset(name)))
+
+#endif /* __CR_RESTORER_H__ */
diff --git a/criu/include/rst-malloc.h b/criu/include/rst-malloc.h
new file mode 100644
index 000000000000..001fa4183c60
--- /dev/null
+++ b/criu/include/rst-malloc.h
@@ -0,0 +1,74 @@
+#ifndef __CR_RST_MALLOC__H__
+#define __CR_RST_MALLOC__H__
+
+/*
+ * On restore we need different types of memory allocation.
+ * Here's an engine that tries to generalize them all. The
+ * main difference is in how the buffer with objects is being
+ * grown up.
+ *
+ * Buffers, that are to be used by restorer will be remapped
+ * into restorer address space with rst_mem_remap() call. Thus
+ * we have to either keep track of all the buffers and objects,
+ * or keep objects one-by-one in a plain linear buffer. The
+ * engine uses the 2nd approach.
+ */
+
+enum {
+ /*
+ * Shared non-remappable allocations. These can happen only
+ * in "global" context, i.e. when objects are allocated to
+ * be used by any process to be restored. The objects are
+ * not going to be used in restorer blob, thus allocation
+ * engine grows buffers in a simple manner.
+ */
+ RM_SHARED,
+ /*
+ * Shared objects, that are about to be used in restorer
+ * blob. For these the *_remap_* stuff below is used to get
+ * the actual pointer on any object. Growing a buffer is
+ * done with mremap, so that we don't have to keep track
+ * of all the buffer chunks and can remap them in restorer
+ * in one call.
+ */
+ RM_SHREMAP,
+ /*
+ * Privately used objects. Buffer grow and remap is the
+ * same as for SHREMAP, but memory regions are MAP_PRIVATE.
+ */
+ RM_PRIVATE,
+
+ RST_MEM_TYPES,
+};
+
+/*
+ * Disables SHARED and SHREMAP allocations, turns on PRIVATE
+ */
+extern void rst_mem_switch_to_private(void);
+/*
+ * Reports a cookie of a current shared buffer position, that
+ * can later be used in rst_mem_remap_ptr() to find out the object
+ * pointer in the restorer blob.
+ */
+extern unsigned long rst_mem_align_cpos(int type);
+extern void *rst_mem_remap_ptr(unsigned long pos, int type);
+/*
+ * Allocate and free objects. We don't need to free arbitrary
+ * object, thus allocation is simple (linear) and only the
+ * last object can be freed (pop-ed from buffer).
+ */
+extern void *rst_mem_alloc(unsigned long size, int type);
+extern void rst_mem_free_last(int type);
+
+/* Word-align the current freelist pointer for the next allocation. If we don't
+ * align pointers, some futex and atomic operations can fail.
+ */
+extern void rst_mem_align(int type);
+
+/*
+ * Routines to remap SHREMAP and PRIVATE into restorer address space
+ */
+extern unsigned long rst_mem_lock(void);
+extern int rst_mem_remap(void *to);
+
+#endif /* __CR_RST_MALLOC__H__ */
diff --git a/criu/include/rst_info.h b/criu/include/rst_info.h
new file mode 100644
index 000000000000..b72e5d0c868e
--- /dev/null
+++ b/criu/include/rst_info.h
@@ -0,0 +1,71 @@
+#ifndef __CR_RST_INFO_H__
+#define __CR_RST_INFO_H__
+
+#include "lock.h"
+#include "list.h"
+#include "vma.h"
+
+struct task_entries {
+ int nr_threads, nr_tasks, nr_helpers;
+ atomic_t nr_zombies;
+ futex_t nr_in_progress;
+ futex_t start;
+ atomic_t cr_err;
+ mutex_t userns_sync_lock;
+};
+
+struct fdt {
+ int nr; /* How many tasks share this fd table */
+ pid_t pid; /* Who should restore this fd table */
+ /*
+ * The fd table is ready for restoring, if fdt_lock is equal to nr.
+ * The fd table was restored, if fdt_lock is equal to nr + 1.
+ */
+ futex_t fdt_lock;
+};
+
+struct _MmEntry;
+
+struct rst_info {
+ struct list_head fds;
+ struct list_head eventpoll;
+ struct list_head tty_slaves;
+ struct list_head tty_ctty;
+
+ void *premmapped_addr;
+ unsigned long premmapped_len;
+ unsigned long clone_flags;
+
+ void *munmap_restorer;
+
+ int nr_zombies;
+
+ int service_fd_id;
+ struct fdt *fdt;
+
+ struct vm_area_list vmas;
+ struct _MmEntry *mm;
+
+ u32 cg_set;
+
+ union {
+ struct pstree_item *pgrp_leader;
+ futex_t pgrp_set;
+ };
+
+ struct file_desc *cwd;
+ struct file_desc *root;
+ bool has_umask;
+ u32 umask;
+
+ /*
+ * We set this flag when process has seccomp filters
+ * so that we know to suspend them before we unmap the
+ * restorer blob.
+ */
+ bool has_seccomp;
+
+ void *breakpoint;
+};
+
+#endif /* __CR_RST_INFO_H__ */
diff --git a/criu/include/seccomp.h b/criu/include/seccomp.h
new file mode 100644
index 000000000000..b5b26c80996d
--- /dev/null
+++ b/criu/include/seccomp.h
@@ -0,0 +1,32 @@
+#ifndef __CR_SECCOMP_H__
+#define __CR_SECCOMP_H__
+
+#include <linux/seccomp.h>
+#include <linux/filter.h>
+
+#include "protobuf/core.pb-c.h"
+
+#ifndef SECCOMP_MODE_DISABLED
+#define SECCOMP_MODE_DISABLED 0
+#endif
+
+#ifndef SECCOMP_MODE_STRICT
+#define SECCOMP_MODE_STRICT 1
+#endif
+
+#ifndef SECCOMP_MODE_FILTER
+#define SECCOMP_MODE_FILTER 2
+#endif
+
+#ifndef SECCOMP_SET_MODE_FILTER
+#define SECCOMP_SET_MODE_FILTER 1
+#endif
+
+#ifndef SECCOMP_FILTER_FLAG_TSYNC
+#define SECCOMP_FILTER_FLAG_TSYNC 1
+#endif
+
+extern int collect_seccomp_filters(void);
+extern int prepare_seccomp_filters(void);
+extern int seccomp_filters_get_rst_pos(CoreEntry *item, int *count, unsigned long *pos);
+#endif
diff --git a/criu/include/seize.h b/criu/include/seize.h
new file mode 100644
index 000000000000..315fab2d36d9
--- /dev/null
+++ b/criu/include/seize.h
@@ -0,0 +1,8 @@
+#ifndef __CR_SEIZE_H__
+#define __CR_SEIZE_H__
+
+extern int collect_pstree(pid_t pid);
+extern void pstree_switch_state(struct pstree_item *root_item, int st);
+extern const char *get_real_freezer_state(void);
+
+#endif
diff --git a/criu/include/servicefd.h b/criu/include/servicefd.h
new file mode 100644
index 000000000000..a9e35a223420
--- /dev/null
+++ b/criu/include/servicefd.h
@@ -0,0 +1,35 @@
+#ifndef __CR_SERVICE_FD_H__
+#define __CR_SERVICE_FD_H__
+
+#include <stdbool.h>
+
+enum sfd_type {
+ SERVICE_FD_MIN,
+
+ LOG_FD_OFF,
+ IMG_FD_OFF,
+ PROC_FD_OFF, /* fd with /proc for all proc_ calls */
+ CTL_TTY_OFF,
+ SELF_STDIN_OFF,
+ CR_PROC_FD_OFF, /* some other's proc fd.
+ * For dump -- target ns' proc
+ * For restore -- CRIU ns' proc
+ */
+ ROOT_FD_OFF, /* Root of the namespace we dump/restore */
+ CGROUP_YARD,
+ USERNSD_SK, /* Socket for usernsd */
+ NS_FD_OFF, /* Node's net namespace fd */
+
+ SERVICE_FD_MAX
+};
+
+extern int clone_service_fd(int id);
+extern int init_service_fd(void);
+extern int get_service_fd(enum sfd_type type);
+extern int reserve_service_fd(enum sfd_type type);
+extern int install_service_fd(enum sfd_type type, int fd);
+extern int close_service_fd(enum sfd_type type);
+extern bool is_service_fd(int fd, enum sfd_type type);
+extern bool is_any_service_fd(int fd);
+
+#endif /* __CR_SERVICE_FD_H__ */
diff --git a/criu/include/setproctitle.h b/criu/include/setproctitle.h
new file mode 100644
index 000000000000..bc634331bde4
--- /dev/null
+++ b/criu/include/setproctitle.h
@@ -0,0 +1,19 @@
+#ifndef __CR_SETPROCTITLE_H__
+#define __CR_SETPROCTITLE_H__
+
+#ifdef CONFIG_HAS_LIBBSD
+#include <bsd/unistd.h>
+#else
+
+/*
+ * setproctitle_init is in the libbsd since v0.6.0. This macro allows to
+ * compile criu with libbsd<0.6.0.
+ */
+#ifndef CONFIG_HAS_SETPROCTITLE_INIT
+#define setproctitle_init(argc, argv, envp)
+#endif
+
+#define setproctitle(fmt, ...)
+#endif
+
+#endif /* __CR_SETPROCTITLE_H__ */
diff --git a/criu/include/shmem.h b/criu/include/shmem.h
new file mode 100644
index 000000000000..47dd0fd3b396
--- /dev/null
+++ b/criu/include/shmem.h
@@ -0,0 +1,15 @@
+#ifndef __CR_SHMEM_H__
+#define __CR_SHMEM_H__
+
+#include "lock.h"
+#include "protobuf/vma.pb-c.h"
+
+struct _VmaEntry;
+extern int collect_shmem(int pid, struct _VmaEntry *vi);
+extern void show_saved_shmems(void);
+extern int get_shmem_fd(int pid, VmaEntry *vi);
+
+extern int cr_dump_shmem(void);
+extern int add_shmem_area(pid_t pid, VmaEntry *vma);
+
+#endif /* __CR_SHMEM_H__ */
diff --git a/criu/include/sigframe.h b/criu/include/sigframe.h
new file mode 100644
index 000000000000..5ab09b1fd662
--- /dev/null
+++ b/criu/include/sigframe.h
@@ -0,0 +1,66 @@
+/*
+ * Generic sigframe bits.
+ */
+
+#ifndef __CR_SIGFRAME_H__
+#define __CR_SIGFRAME_H__
+
+#include "asm/types.h"
+#include "protobuf/core.pb-c.h"
+
+struct rt_sigframe;
+
+/* sigframe should be aligned on 64 bytes for x86 and 8 bytes for arm */
+#define RESTORE_STACK_SIGFRAME ALIGN(sizeof(struct rt_sigframe) + SIGFRAME_OFFSET, 64)
+
+#ifndef __ARCH_SI_PREAMBLE_SIZE
+#define __ARCH_SI_PREAMBLE_SIZE (3 * sizeof(int))
+#endif
+
+#define SI_MAX_SIZE 128
+#ifndef SI_PAD_SIZE
+#define SI_PAD_SIZE ((SI_MAX_SIZE - __ARCH_SI_PREAMBLE_SIZE) / sizeof(int))
+#endif
+
+typedef struct rt_siginfo {
+ int si_signo;
+ int si_errno;
+ int si_code;
+ int _pad[SI_PAD_SIZE];
+} rt_siginfo_t;
+
+typedef struct rt_sigaltstack {
+ void *ss_sp;
+ int ss_flags;
+ size_t ss_size;
+} rt_stack_t;
+
+struct rt_ucontext {
+ unsigned long uc_flags;
+ struct rt_ucontext *uc_link;
+ rt_stack_t uc_stack;
+ struct rt_sigcontext uc_mcontext;
+ k_rtsigset_t uc_sigmask; /* mask last for extensibility */
+ int __unused[32 - (sizeof (k_rtsigset_t) / sizeof (int))];
+ unsigned long uc_regspace[128] __attribute__((__aligned__(8)));
+};
+
+extern int construct_sigframe(struct rt_sigframe *sigframe,
+ struct rt_sigframe *rsigframe,
+ CoreEntry *core);
+
+/*
+ * FIXME Convert it to inline helper, which requires
+ * to unweave types mess we've generated for
+ * run-time data.
+ */
+#define setup_sas(sigframe, sas) \
+do { \
+ if ((sas)) { \
+ RT_SIGFRAME_UC((sigframe)).uc_stack.ss_sp = (void *)decode_pointer((sas)->ss_sp); \
+ RT_SIGFRAME_UC((sigframe)).uc_stack.ss_flags = (int)(sas)->ss_flags; \
+ RT_SIGFRAME_UC((sigframe)).uc_stack.ss_size = (size_t)(sas)->ss_size; \
+ } \
+} while (0)
+
+#endif /* __CR_SIGFRAME_H__ */
diff --git a/criu/include/signalfd.h b/criu/include/signalfd.h
new file mode 100644
index 000000000000..c7af81977b29
--- /dev/null
+++ b/criu/include/signalfd.h
@@ -0,0 +1,10 @@
+#ifndef __CR_SIGNALFD_H__
+#define __CR_SIGNALFD_H__
+
+struct cr_imgset;
+struct fd_parms;
+extern int is_signalfd_link(char *link);
+extern const struct fdtype_ops signalfd_dump_ops;
+extern struct collect_image_info signalfd_cinfo;
+
+#endif /* __CR_SIGNALFD_H__ */
diff --git a/criu/include/sk-inet.h b/criu/include/sk-inet.h
new file mode 100644
index 000000000000..5b5fca63870c
--- /dev/null
+++ b/criu/include/sk-inet.h
@@ -0,0 +1,88 @@
+#ifndef __CR_SK_INET_H__
+#define __CR_SK_INET_H__
+
+#include <netinet/tcp.h>
+
+#include "sockets.h"
+#include "files.h"
+#include "list.h"
+#include "protobuf.h"
+#include "protobuf/sk-inet.pb-c.h"
+
+#define INET_ADDR_LEN 40
+#ifndef TCP_REPAIR
+#define TCP_REPAIR 19 /* TCP sock is under repair right now */
+#define TCP_REPAIR_QUEUE 20
+#define TCP_QUEUE_SEQ 21
+#define TCP_REPAIR_OPTIONS 22
+#endif
+
+struct inet_sk_desc {
+ struct socket_desc sd;
+ unsigned int type;
+ unsigned int src_port;
+ unsigned int dst_port;
+ unsigned int state;
+ unsigned int rqlen;
+ unsigned int wqlen; /* sent + unsent data */
+ unsigned int uwqlen; /* unsent data */
+ unsigned int src_addr[4];
+ unsigned int dst_addr[4];
+ unsigned short shutdown;
+
+ int rfd;
+ int cpt_reuseaddr;
+ struct list_head rlist;
+};
+
+struct inet_port;
+struct inet_sk_info {
+ InetSkEntry *ie;
+ struct file_desc d;
+ struct inet_port *port;
+ /*
+ * This is an fd by which the socket is opened.
+ * It will be carried down to restorer code to
+ * repair-off the socket at the very end.
+ */
+ int sk_fd;
+ struct list_head rlist;
+};
+
+extern int inet_bind(int sk, struct inet_sk_info *);
+extern int inet_connect(int sk, struct inet_sk_info *);
+
+#ifdef CR_NOGLIBC
+#define setsockopt sys_setsockopt
+#endif
+static inline void tcp_repair_off(int fd)
+{
+ int aux = 0, ret;
+
+ ret = setsockopt(fd, SOL_TCP, TCP_REPAIR, &aux, sizeof(aux));
+ if (ret < 0)
+ pr_err("Failed to turn off repair mode on socket (%d)\n", ret);
+}
+
+extern void tcp_locked_conn_add(struct inet_sk_info *);
+extern void rst_unlock_tcp_connections(void);
+extern void cpt_unlock_tcp_connections(void);
+
+extern int dump_one_tcp(int sk, struct inet_sk_desc *sd);
+extern int restore_one_tcp(int sk, struct inet_sk_info *si);
+
+#define SK_EST_PARAM "tcp-established"
+
+extern int check_tcp(void);
+extern mutex_t *inet_get_reuseaddr_lock(struct inet_sk_info *ii);
+
+int rst_tcp_socks_prep(void);
+extern unsigned long rst_tcp_socks_cpos;
+extern unsigned int rst_tcp_socks_nr;
+
+struct rst_tcp_sock {
+ int sk;
+ bool reuseaddr;
+};
+
+#endif /* __CR_SK_INET_H__ */
diff --git a/criu/include/sk-packet.h b/criu/include/sk-packet.h
new file mode 100644
index 000000000000..6c4398c604e4
--- /dev/null
+++ b/criu/include/sk-packet.h
@@ -0,0 +1,39 @@
+#ifndef __CR_SK_PACKET_H__
+#define __CR_SK_PACKET_H__
+
+#ifndef PACKET_TIMESTAMP
+#define PACKET_TIMESTAMP 17
+#endif
+
+struct cr_imgset;
+struct fd_parms;
+struct vma_area;
+
+extern struct collect_image_info packet_sk_cinfo;
+
+extern int dump_socket_map(struct vma_area *vma);
+extern int get_socket_fd(int pid, VmaEntry *vma);
+
+extern int packet_receive_one(struct nlmsghdr *h, void *arg);
+
+#ifndef PACKET_VNET_HDR
+#define PACKET_VNET_HDR 15
+#endif
+
+#ifndef PACKET_FANOUT
+#define PACKET_FANOUT 18
+#endif
+
+#ifndef TPACKET3_HDRLEN
+struct tpacket_req3 {
+ unsigned int tp_block_size;
+ unsigned int tp_block_nr;
+ unsigned int tp_frame_size;
+ unsigned int tp_frame_nr;
+ unsigned int tp_retire_blk_tov;
+ unsigned int tp_sizeof_priv;
+ unsigned int tp_feature_req_word;
+};
+#endif
+
+#endif /* __CR_SK_PACKET_H__ */
diff --git a/criu/include/sk-queue.h b/criu/include/sk-queue.h
new file mode 100644
index 000000000000..9044de0b0fb6
--- /dev/null
+++ b/criu/include/sk-queue.h
@@ -0,0 +1,8 @@
+#ifndef __CR_SK_QUEUE_H__
+#define __CR_SK_QUEUE_H__
+
+extern int read_sk_queues(void);
+extern int dump_sk_queue(int sock_fd, int sock_id);
+extern int restore_sk_queue(int fd, unsigned int peer_id);
+
+#endif /* __CR_SK_QUEUE_H__ */
diff --git a/criu/include/sockets.h b/criu/include/sockets.h
new file mode 100644
index 000000000000..b726e2f7a0b3
--- /dev/null
+++ b/criu/include/sockets.h
@@ -0,0 +1,89 @@
+#ifndef __CR_SOCKETS_H__
+#define __CR_SOCKETS_H__
+
+#include <stdbool.h>
+#include <sys/socket.h>
+
+#include "asm/types.h"
+
+#include "protobuf.h"
+#include "protobuf/sk-opts.pb-c.h"
+
+struct fdinfo_list_entry;
+struct sk_opts_entry;
+struct file_desc;
+struct fd_parms;
+struct cr_imgset;
+struct nlmsghdr;
+struct cr_img;
+
+struct socket_desc {
+ unsigned int family;
+ unsigned int ino;
+ struct socket_desc *next;
+ int already_dumped;
+};
+
+extern int dump_socket(struct fd_parms *p, int lfd, struct cr_img *);
+extern int dump_socket_opts(int sk, SkOptsEntry *soe);
+extern int restore_socket_opts(int sk, SkOptsEntry *soe);
+extern void release_skopts(SkOptsEntry *);
+extern int restore_prepare_socket(int sk);
+extern void preload_socket_modules();
+
+extern bool socket_test_collect_bit(unsigned int family, unsigned int proto);
+
+extern int sk_collect_one(int ino, int family, struct socket_desc *d);
+struct ns_id;
+extern int collect_sockets(struct ns_id *);
+extern int collect_inet_sockets(void);
+extern struct collect_image_info unix_sk_cinfo;
+extern int collect_unix_sockets(void);
+extern int fix_external_unix_sockets(void);
+extern int resolve_unix_peers(void);
+
+extern struct collect_image_info netlink_sk_cinfo;
+
+extern struct socket_desc *lookup_socket(int ino, int family, int proto);
+
+extern const struct fdtype_ops unix_dump_ops;
+extern const struct fdtype_ops inet_dump_ops;
+extern const struct fdtype_ops inet6_dump_ops;
+extern const struct fdtype_ops netlink_dump_ops;
+extern const struct fdtype_ops packet_dump_ops;
+
+extern int inet_collect_one(struct nlmsghdr *h, int family, int type);
+extern int unix_receive_one(struct nlmsghdr *h, void *);
+extern int netlink_receive_one(struct nlmsghdr *hdr, void *arg);
+
+extern int unix_sk_id_add(ino_t ino);
+extern int unix_sk_ids_parse(char *optarg);
+
+extern int do_dump_opt(int sk, int level, int name, void *val, int len);
+#define dump_opt(s, l, n, f) do_dump_opt(s, l, n, f, sizeof(*f))
+extern int do_restore_opt(int sk, int level, int name, void *val, int len);
+#define restore_opt(s, l, n, f) do_restore_opt(s, l, n, f, sizeof(*f))
+
+#define sk_encode_shutdown(img, mask) do { \
+ /* \
+ * protobuf SK_SHUTDOWN__ bits match those \
+ * reported by kernel \
+ */ \
+ (img)->shutdown = mask; \
+ if ((img)->shutdown != SK_SHUTDOWN__NONE) \
+ (img)->has_shutdown = true; \
+ } while (0)
+
+static inline int sk_decode_shutdown(int val)
+{
+ static const int hows[] = {-1, SHUT_RD, SHUT_WR, SHUT_RDWR}; /* val -> shutdown(2) "how"; slot 0 has none */
+ return (val < 0 || val >= (int)(sizeof(hows) / sizeof(hows[0]))) ? -1 : hows[val]; /* never index out of bounds */
+}
+
+#define USK_EXT_PARAM "ext-unix-sk"
+
+#ifndef NETLINK_SOCK_DIAG
+#define NETLINK_SOCK_DIAG NETLINK_INET_DIAG
+#endif
+
+#endif /* __CR_SOCKETS_H__ */
diff --git a/criu/include/stats.h b/criu/include/stats.h
new file mode 100644
index 000000000000..e417636e6d1e
--- /dev/null
+++ b/criu/include/stats.h
@@ -0,0 +1,48 @@
+#ifndef __CR_STATS_H__
+#define __CR_STATS_H__
+
+enum {
+ TIME_FREEZING,
+ TIME_FROZEN,
+ TIME_MEMDUMP,
+ TIME_MEMWRITE,
+ TIME_IRMAP_RESOLVE,
+
+ DUMP_TIME_NR_STATS,
+};
+
+enum {
+ TIME_FORK,
+ TIME_RESTORE,
+
+ RESTORE_TIME_NS_STATS,
+};
+
+extern void timing_start(int t);
+extern void timing_stop(int t);
+
+enum {
+ CNT_PAGES_SCANNED,
+ CNT_PAGES_SKIPPED_PARENT,
+ CNT_PAGES_WRITTEN,
+
+ DUMP_CNT_NR_STATS,
+};
+
+enum {
+ CNT_PAGES_COMPARED,
+ CNT_PAGES_SKIPPED_COW,
+ CNT_PAGES_RESTORED,
+
+ RESTORE_CNT_NR_STATS,
+};
+
+extern void cnt_add(int c, unsigned long val);
+
+#define DUMP_STATS 1
+#define RESTORE_STATS 2
+
+extern int init_stats(int what);
+extern void write_stats(int what);
+
+#endif /* __CR_STATS_H__ */
diff --git a/criu/include/string.h b/criu/include/string.h
new file mode 100644
index 000000000000..b469bfe55a84
--- /dev/null
+++ b/criu/include/string.h
@@ -0,0 +1,21 @@
+#ifndef __CR_STRING_H__
+#define __CR_STRING_H__
+
+#include <sys/types.h>
+#include <string.h>
+
+#ifdef CONFIG_HAS_LIBBSD
+# include <bsd/string.h>
+#endif
+
+#include "config.h"
+
+#ifndef CONFIG_HAS_STRLCPY
+extern size_t strlcpy(char *dest, const char *src, size_t size);
+#endif
+
+#ifndef CONFIG_HAS_STRLCAT
+extern size_t strlcat(char *dest, const char *src, size_t count);
+#endif
+
+#endif /* __CR_STRING_H__ */
diff --git a/criu/include/syscall-types.h b/criu/include/syscall-types.h
new file mode 100644
index 000000000000..e3a114d6c280
--- /dev/null
+++ b/criu/include/syscall-types.h
@@ -0,0 +1,85 @@
+/*
+ * Please add here type definitions if
+ * syscall prototypes need them.
+ *
+ * Anything else should go to plain type.h
+ */
+
+#ifndef __CR_SYSCALL_TYPES_H__
+#define __CR_SYSCALL_TYPES_H__
+
+#include <sys/time.h>
+#include <arpa/inet.h>
+#include <sched.h>
+
+#include "asm/types.h"
+
+struct cap_header {
+ u32 version;
+ int pid;
+};
+
+struct cap_data {
+ u32 eff;
+ u32 prm;
+ u32 inh;
+};
+
+struct sockaddr;
+struct msghdr;
+struct rusage;
+struct file_handle;
+struct robust_list_head;
+struct io_event;
+struct timespec;
+
+typedef unsigned long aio_context_t;
+
+struct itimerspec;
+
+#ifndef F_GETFD
+#define F_GETFD 1
+#endif
+
+#ifndef CLONE_NEWNS
+#define CLONE_NEWNS 0x00020000
+#endif
+
+#ifndef CLONE_NEWPID
+#define CLONE_NEWPID 0x20000000
+#endif
+
+#ifndef CLONE_NEWUTS
+#define CLONE_NEWUTS 0x04000000
+#endif
+
+#ifndef CLONE_NEWIPC
+#define CLONE_NEWIPC 0x08000000
+#endif
+
+#ifndef CLONE_NEWNET
+#define CLONE_NEWNET 0x40000000
+#endif
+
+#ifndef CLONE_NEWUSER
+#define CLONE_NEWUSER 0x10000000
+#endif
+
+#define CLONE_ALLNS (CLONE_NEWPID | CLONE_NEWNET | CLONE_NEWIPC | CLONE_NEWUTS | CLONE_NEWNS | CLONE_NEWUSER)
+
+#define setns sys_setns
+
+struct rlimit;
+struct rlimit64;
+
+struct krlimit {
+ unsigned long rlim_cur;
+ unsigned long rlim_max;
+};
+
+struct siginfo;
+
+/* Type of timers in the kernel. */
+typedef int kernel_timer_t;
+
+#endif /* __CR_SYSCALL_TYPES_H__ */
diff --git a/criu/include/sysctl.h b/criu/include/sysctl.h
new file mode 100644
index 000000000000..b949a409eeb3
--- /dev/null
+++ b/criu/include/sysctl.h
@@ -0,0 +1,39 @@
+#ifndef __CR_SYSCTL_H__
+#define __CR_SYSCTL_H__
+
+struct sysctl_req {
+ char *name;
+ void *arg;
+ int type;
+ int flags;
+};
+
+extern int sysctl_op(struct sysctl_req *req, size_t nr_req, int op, unsigned int ns);
+
+enum {
+ CTL_READ,
+ CTL_WRITE,
+};
+
+#define CTL_SHIFT 4 /* Up to 16 types */
+
+#define CTL_U32 1 /* Single u32 */
+#define CTL_U64 2 /* Single u64 */
+#define __CTL_U32A 3 /* Array of u32 */
+#define __CTL_U64A 4 /* Array of u64 */
+#define __CTL_STR 5 /* String */
+#define CTL_32 6 /* Single s32 */
+
+#define CTL_U32A(n) (__CTL_U32A | ((n) << CTL_SHIFT))
+#define CTL_U64A(n) (__CTL_U64A | ((n) << CTL_SHIFT))
+#define CTL_STR(len) (__CTL_STR | ((len) << CTL_SHIFT))
+
+#define CTL_LEN(t) ((t) >> CTL_SHIFT)
+#define CTL_TYPE(t) ((t) & ((1 << CTL_SHIFT) - 1))
+
+/*
+ * Some entries might be missing; mark them as optional.
+ */
+#define CTL_FLAGS_OPTIONAL 1
+
+#endif /* __CR_SYSCTL_H__ */
diff --git a/criu/include/sysfs_parse.h b/criu/include/sysfs_parse.h
new file mode 100644
index 000000000000..4d74c4ee5422
--- /dev/null
+++ b/criu/include/sysfs_parse.h
@@ -0,0 +1,14 @@
+#ifndef __CR_SYSFS_PARSE_H__
+#define __CR_SYSFS_PARSE_H__
+
+#define SYSFS_AUFS "/sys/fs/aufs/"
+#define SBINFO_LEN (3 + 16 + 1) /* si_%lx */
+#define SBINFO_PATH_LEN (sizeof SYSFS_AUFS + SBINFO_LEN) /* /sys/fs/aufs/<sbinfo> */
+#define AUFSBR_PATH_LEN (SBINFO_PATH_LEN + 6 + 1) /* /sys/fs/aufs/<sbinfo>/br%3d */
+
+extern int parse_aufs_branches(struct mount_info *mi);
+extern int fixup_aufs_vma_fd(struct vma_area *vma);
+extern void free_aufs_branches(void);
+
+#endif /* __CR_SYSFS_PARSE_H__ */
+
diff --git a/criu/include/timerfd.h b/criu/include/timerfd.h
new file mode 100644
index 000000000000..67b9187179cf
--- /dev/null
+++ b/criu/include/timerfd.h
@@ -0,0 +1,39 @@
+#ifndef __CR_TIMERFD_H__
+#define __CR_TIMERFD_H__
+
+#include <time.h>
+#include <sys/ioctl.h>
+
+#include "files.h"
+
+struct pstree_item;
+
+struct restore_timerfd {
+ int id;
+ int fd;
+ int clockid;
+ int settime_flags;
+ unsigned long ticks;
+ struct itimerspec val;
+};
+
+extern const struct fdtype_ops timerfd_dump_ops;
+extern struct collect_image_info timerfd_cinfo;
+
+int rst_timerfd_prep(void);
+extern unsigned long rst_timerfd_cpos;
+extern unsigned int rst_timerfd_nr;
+
+
+extern int check_timerfd(void);
+extern int is_timerfd_link(char *link);
+
+#ifndef TFD_TIMER_ABSTIME
+# define TFD_TIMER_ABSTIME (1 << 0)
+#endif
+
+#ifndef TFD_IOC_SET_TICKS
+# define TFD_IOC_SET_TICKS _IOW('T', 0, u64)
+#endif
+
+#endif /* __CR_TIMERFD_H__ */
diff --git a/criu/include/tty.h b/criu/include/tty.h
new file mode 100644
index 000000000000..c8b620992d75
--- /dev/null
+++ b/criu/include/tty.h
@@ -0,0 +1,34 @@
+#ifndef __CR_TTY_H__
+#define __CR_TTY_H__
+
+#include <linux/major.h>
+#include <linux/vt.h>
+
+#include "files.h"
+
+/* Kernel's limit */
+#define TERMIOS_NCC 19
+
+extern const struct fdtype_ops tty_dump_ops;
+
+struct tty_driver;
+struct tty_driver *get_tty_driver(dev_t rdev, dev_t dev);
+static inline int is_tty(dev_t rdev, dev_t dev)
+{
+ return get_tty_driver(rdev, dev) != NULL;
+}
+
+extern int dump_verify_tty_sids(void);
+extern struct collect_image_info tty_info_cinfo;
+extern struct collect_image_info tty_cinfo;
+extern int prepare_shared_tty(void);
+extern int tty_setup_slavery(void);
+
+extern int tty_verify_active_pairs(void);
+
+extern int tty_prep_fds(void);
+extern void tty_fini_fds(void);
+
+#define OPT_SHELL_JOB "shell-job"
+
+#endif /* __CR_TTY_H__ */
diff --git a/criu/include/tun.h b/criu/include/tun.h
new file mode 100644
index 000000000000..d70f8f2103c4
--- /dev/null
+++ b/criu/include/tun.h
@@ -0,0 +1,16 @@
+#ifndef __CR_TUN_H__
+#define __CR_TUN_H__
+
+#ifndef TUN_MINOR
+#define TUN_MINOR 200
+#endif
+
+#include "protobuf/netdev.pb-c.h"
+
+extern const struct fdtype_ops tunfile_dump_ops;
+extern int dump_tun_link(NetDeviceEntry *nde, struct cr_imgset *fds);
+extern int restore_one_tun(NetDeviceEntry *nde, int nlsk);
+extern struct collect_image_info tunfile_cinfo;
+extern int check_tun_cr(int no_tun_err);
+
+#endif /* __CR_TUN_H__ */
diff --git a/criu/include/unix_diag.h b/criu/include/unix_diag.h
new file mode 100644
index 000000000000..3f2468330e2b
--- /dev/null
+++ b/criu/include/unix_diag.h
@@ -0,0 +1,67 @@
+#ifndef __CR_UNIX_DIAG_H__
+#define __CR_UNIX_DIAG_H__
+
+#include "asm/types.h"
+
+struct unix_diag_req {
+ u8 sdiag_family;
+ u8 sdiag_protocol;
+ u16 pad;
+ u32 udiag_states;
+ u32 udiag_ino;
+ u32 udiag_show;
+ u32 udiag_cookie[2];
+};
+
+#define UDIAG_SHOW_NAME 0x00000001 /* show name (not path) */
+#define UDIAG_SHOW_VFS 0x00000002 /* show VFS inode info */
+#define UDIAG_SHOW_PEER 0x00000004 /* show peer socket info */
+#define UDIAG_SHOW_ICONS 0x00000008 /* show pending connections */
+#define UDIAG_SHOW_RQLEN 0x00000010 /* show skb receive queue len */
+#define UDIAG_SHOW_MEMINFO 0x00000020 /* show memory info of a socket */
+
+struct unix_diag_msg {
+ u8 udiag_family;
+ u8 udiag_type;
+ u8 udiag_state;
+ u8 pad;
+
+ u32 udiag_ino;
+ u32 udiag_cookie[2];
+};
+
+enum {
+ SK_MEMINFO_RMEM_ALLOC,
+ SK_MEMINFO_RCVBUF,
+ SK_MEMINFO_WMEM_ALLOC,
+ SK_MEMINFO_SNDBUF,
+ SK_MEMINFO_FWD_ALLOC,
+ SK_MEMINFO_WMEM_QUEUED,
+ SK_MEMINFO_OPTMEM,
+
+ SK_MEMINFO_VARS,
+};
+
+enum {
+ UNIX_DIAG_NAME,
+ UNIX_DIAG_VFS,
+ UNIX_DIAG_PEER,
+ UNIX_DIAG_ICONS,
+ UNIX_DIAG_RQLEN,
+ UNIX_DIAG_MEMINFO,
+ UNIX_DIAG_SHUTDOWN,
+
+ UNIX_DIAG_MAX,
+};
+
+struct unix_diag_vfs {
+ u32 udiag_vfs_ino;
+ u32 udiag_vfs_dev;
+};
+
+struct unix_diag_rqlen {
+ u32 udiag_rqueue;
+ u32 udiag_wqueue;
+};
+
+#endif /* __CR_UNIX_DIAG_H__ */
diff --git a/criu/include/util-pie.h b/criu/include/util-pie.h
new file mode 100644
index 000000000000..cbaed4224cfd
--- /dev/null
+++ b/criu/include/util-pie.h
@@ -0,0 +1,66 @@
+#ifndef __CR_UTIL_NET_H__
+#define __CR_UTIL_NET_H__
+
+#include <sys/socket.h>
+#include <sys/un.h>
+
+#include "asm/types.h"
+
+#define UNIX_PATH_MAX (sizeof(struct sockaddr_un) - \
+ (size_t)((struct sockaddr_un *) 0)->sun_path)
+
+#ifndef SO_PEEK_OFF
+#define SO_PEEK_OFF 42
+#endif
+
+/*
+ * Because the kernel does kmalloc for user data passed in SCM
+ * messages, and the kernel's SCM_MAX_FD limits the number of
+ * descriptors passed at once, we try to reduce the pressure on
+ * the kernel memory manager by using a predefined message
+ * buffer size that is known to work well.
+ */
+#define CR_SCM_MSG_SIZE (1024)
+#define CR_SCM_MAX_FD (252)
+
+struct fd_opts {
+ char flags;
+ struct {
+ u32 uid;
+ u32 euid;
+ u32 signum;
+ u32 pid_type;
+ u32 pid;
+ } fown;
+};
+
+struct scm_fdset {
+ struct msghdr hdr;
+ struct iovec iov;
+ char msg_buf[CR_SCM_MSG_SIZE];
+ struct fd_opts opts[CR_SCM_MAX_FD];
+};
+
+extern int send_fds(int sock, struct sockaddr_un *saddr, int saddr_len,
+ int *fds, int nr_fds, bool with_flags);
+extern int recv_fds(int sock, int *fds, int nr_fds, struct fd_opts *opts);
+
+static inline int send_fd(int sock, struct sockaddr_un *saddr, int saddr_len, int fd)
+{
+ return send_fds(sock, saddr, saddr_len, &fd, 1, false);
+}
+
+static inline int recv_fd(int sock)
+{
+ int fd, ret;
+
+ ret = recv_fds(sock, &fd, 1, NULL);
+ if (ret)
+ return -1;
+
+ return fd;
+}
+
+extern int open_detach_mount(char *dir);
+
+#endif /* __CR_UTIL_NET_H__ */
diff --git a/criu/include/util-vdso.h b/criu/include/util-vdso.h
new file mode 100644
index 000000000000..c8dfa054f825
--- /dev/null
+++ b/criu/include/util-vdso.h
@@ -0,0 +1,65 @@
+#ifndef __CR_UTIL_VDSO_H__
+#define __CR_UTIL_VDSO_H__
+
+/*
+ * VDSO management common definitions.
+ *
+ * This header file is included by the criu main code and the parasite code.
+ * It contains definitions shared by these 2 parts.
+ *
+ * This file should not be included except in pie/util-vdso.c, include/vdso.h
+ * and include/parasite-vdso.h
+ */
+
+#include <sys/types.h>
+
+/*
+ * Each architecture must export:
+ * VDSO_SYMBOL_MAX, the number of vDSO symbols to manage
+ * ARCH_VDSO_SYMBOLS, a table of strings containing the vDSO symbol names
+ * vdso_redirect_calls, a service called to redirect the vDSO symbols in
+ * the parasite code.
+ */
+#include "asm/vdso.h"
+
+struct vdso_symbol {
+ char name[32];
+ unsigned long offset;
+};
+
+struct vdso_symtable {
+ unsigned long vma_start;
+ unsigned long vma_end;
+ unsigned long vvar_start;
+ unsigned long vvar_end;
+ struct vdso_symbol symbols[VDSO_SYMBOL_MAX];
+};
+
+#define VDSO_SYMBOL_INIT { .offset = VDSO_BAD_ADDR, }
+
+#define VDSO_SYMTABLE_INIT \
+ { \
+ .vma_start = VDSO_BAD_ADDR, \
+ .vma_end = VDSO_BAD_ADDR, \
+ .vvar_start = VVAR_BAD_ADDR, \
+ .vvar_end = VVAR_BAD_ADDR, \
+ .symbols = { \
+ [0 ... VDSO_SYMBOL_MAX - 1] = \
+ (struct vdso_symbol)VDSO_SYMBOL_INIT, \
+ }, \
+ }
+
+/* Size of VMA associated with vdso */
+static inline unsigned long vdso_vma_size(struct vdso_symtable *t)
+{
+ return t->vma_end - t->vma_start;
+}
+
+static inline unsigned long vvar_vma_size(struct vdso_symtable *t)
+{
+ return t->vvar_end - t->vvar_start;
+}
+
+extern int vdso_fill_symtable(char *mem, size_t size, struct vdso_symtable *t);
+
+#endif /* __CR_UTIL_VDSO_H__ */
diff --git a/criu/include/util.h b/criu/include/util.h
new file mode 100644
index 000000000000..a64782783615
--- /dev/null
+++ b/criu/include/util.h
@@ -0,0 +1,284 @@
+#ifndef __CR_UTIL_H__
+#define __CR_UTIL_H__
+
+/*
+ * Some bits are stolen from perf and kvm tools
+ */
+#include <signal.h>
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/statfs.h>
+#include <dirent.h>
+
+#include "compiler.h"
+#include "asm/types.h"
+#include "xmalloc.h"
+#include "bug.h"
+#include "log.h"
+#include "err.h"
+
+#include "protobuf/vma.pb-c.h"
+
+#define PREF_SHIFT_OP(pref, op, size) ((size) op (pref ##BYTES_SHIFT))
+#define KBYTES_SHIFT 10
+#define MBYTES_SHIFT 20
+#define GBYTES_SHIFT 30
+
+#define KBYTES(size) PREF_SHIFT_OP(K, >>, size)
+#define MBYTES(size) PREF_SHIFT_OP(M, >>, size)
+#define GBYTES(size) PREF_SHIFT_OP(G, >>, size)
+
+#define KILO(size) PREF_SHIFT_OP(K, <<, size)
+#define MEGA(size) PREF_SHIFT_OP(M, <<, size)
+#define GIGA(size) PREF_SHIFT_OP(G, <<, size)
+
+struct vma_area;
+struct list_head;
+
+extern void pr_vma(unsigned int loglevel, const struct vma_area *vma_area);
+
+#define pr_info_vma(vma_area) pr_vma(LOG_INFO, vma_area)
+#define pr_msg_vma(vma_area) pr_vma(LOG_MSG, vma_area)
+
+#define pr_vma_list(level, head) \
+ do { \
+ struct vma_area *vma; \
+ list_for_each_entry(vma, head, list) \
+ pr_vma(level, vma); \
+ } while (0)
+#define pr_info_vma_list(head) pr_vma_list(LOG_INFO, head)
+
+extern int move_img_fd(int *img_fd, int want_fd);
+extern int close_safe(int *fd);
+
+extern int reopen_fd_as_safe(char *file, int line, int new_fd, int old_fd, bool allow_reuse_fd);
+#define reopen_fd_as(new_fd, old_fd) reopen_fd_as_safe(__FILE__, __LINE__, new_fd, old_fd, false)
+#define reopen_fd_as_nocheck(new_fd, old_fd) reopen_fd_as_safe(__FILE__, __LINE__, new_fd, old_fd, true)
+
+extern void close_proc(void);
+extern int open_pid_proc(pid_t pid);
+extern int close_pid_proc(void);
+extern int set_proc_fd(int fd);
+
+/*
+ * Values for pid argument of the proc opening routines below.
+ * SELF would open file under /proc/self
+ * GEN would open a file under /proc itself
+ * NONE is internal, don't use it ;)
+ */
+
+#define PROC_SELF 0
+#define PROC_GEN -1
+#define PROC_NONE -2
+
+extern int do_open_proc(pid_t pid, int flags, const char *fmt, ...);
+
+#define __open_proc(pid, ier, flags, fmt, ...) \
+ ({ \
+ int __fd = do_open_proc(pid, flags, \
+ fmt, ##__VA_ARGS__); \
+ if (__fd < 0 && (errno != ier)) \
+ pr_perror("Can't open %d/" fmt " on procfs", \
+ pid, ##__VA_ARGS__); \
+ \
+ __fd; \
+ })
+
+/* int open_proc(pid_t pid, const char *fmt, ...); */
+#define open_proc(pid, fmt, ...) \
+ __open_proc(pid, 0, O_RDONLY, fmt, ##__VA_ARGS__)
+
+/* int open_proc_rw(pid_t pid, const char *fmt, ...); */
+#define open_proc_rw(pid, fmt, ...) \
+ __open_proc(pid, 0, O_RDWR, fmt, ##__VA_ARGS__)
+
+#define open_proc_path(pid, fmt, ...) \
+ __open_proc(pid, 0, O_PATH, fmt, ##__VA_ARGS__)
+
+/* DIR *opendir_proc(pid_t pid, const char *fmt, ...); */
+#define opendir_proc(pid, fmt, ...) \
+ ({ \
+ int __fd = open_proc(pid, fmt, ##__VA_ARGS__); \
+ DIR *__d = NULL; \
+ \
+ if (__fd >= 0) { \
+ __d = fdopendir(__fd); \
+ if (__d == NULL) \
+ pr_perror("Can't fdopendir %d " \
+ "(%d/" fmt " on procfs)", \
+ __fd, pid, ##__VA_ARGS__); \
+ } \
+ __d; \
+ })
+
+/* FILE *fopen_proc(pid_t pid, const char *fmt, ...); */
+#define fopen_proc(pid, fmt, ...) \
+ ({ \
+ int __fd = open_proc(pid, fmt, ##__VA_ARGS__); \
+ FILE *__f = NULL; \
+ \
+ if (__fd >= 0) { \
+ __f = fdopen(__fd, "r"); \
+ if (__f == NULL) \
+ pr_perror("Can't fdopen %d " \
+ "(%d/" fmt " on procfs)", \
+ __fd, pid, ##__VA_ARGS__); \
+ } \
+ __f; \
+ })
+
+#define pr_img_head(type, ...) pr_msg("\n"#type __VA_ARGS__ "\n----------------\n")
+#define pr_img_tail(type) pr_msg("----------------\n")
+
+#define DEVZERO (makedev(1, 5))
+
+#define KDEV_MINORBITS 20
+#define KDEV_MINORMASK ((1UL << KDEV_MINORBITS) - 1)
+#define MKKDEV(ma, mi) (((ma) << KDEV_MINORBITS) | (mi))
+
+static inline u32 kdev_major(u32 kdev)
+{
+ return kdev >> KDEV_MINORBITS;
+}
+
+static inline u32 kdev_minor(u32 kdev)
+{
+ return kdev & KDEV_MINORMASK;
+}
+
+static inline dev_t kdev_to_odev(u32 kdev)
+{
+ /*
+ * New kernels encode devices in a new form.
+ * See the kernel's fs/stat.c for details; the
+ * choose_32_64 helpers there are the key.
+ */
+ unsigned major = kdev_major(kdev);
+ unsigned minor = kdev_minor(kdev);
+
+ return makedev(major, minor);
+}
+
+extern int copy_file(int fd_in, int fd_out, size_t bytes);
+extern int is_anon_link_type(char *link, char *type);
+
+#define is_hex_digit(c) \
+ (((c) >= '0' && (c) <= '9') || \
+ ((c) >= 'a' && (c) <= 'f') || \
+ ((c) >= 'A' && (c) <= 'F'))
+
+extern void *shmalloc(size_t bytes);
+extern void shfree_last(void *ptr);
+
+#define CRS_CAN_FAIL 0x1 /* cmd can validly exit with non zero code */
+
+extern int cr_system(int in, int out, int err, char *cmd, char *const argv[], unsigned flags);
+extern int cr_system_userns(int in, int out, int err, char *cmd,
+ char *const argv[], unsigned flags, int userns_pid);
+extern int cr_daemon(int nochdir, int noclose, int *keep_fd, int close_fd);
+extern int is_root_user(void);
+
+static inline bool dir_dots(struct dirent *de)
+{
+ return !strcmp(de->d_name, ".") || !strcmp(de->d_name, "..");
+}
+
+extern int is_empty_dir(int dirfd);
+
+/*
+ * Size of buffer to carry the worst case of /proc/self/fd/N
+ * path. Since fd is an integer, we can easily estimate one :)
+ */
+#define PSFDS (sizeof("/proc/self/fd/2147483647"))
+
+extern int read_fd_link(int lfd, char *buf, size_t size);
+
+#define USEC_PER_SEC 1000000L
+#define NSEC_PER_SEC 1000000000L
+
+int vaddr_to_pfn(unsigned long vaddr, u64 *pfn);
+
+/*
+ * Check whether @str starts with @sub and report the character of @str
+ * following the match in @end (forced to '/' when @sub is the root "/")
+ */
+static inline bool strstartswith2(const char *str, const char *sub, char *end)
+{
+ const char *osub = sub;
+
+ while (1) {
+ if (*sub == '\0') /* end of sub -- match */ {
+ if (end) {
+ if (sub == osub + 1 && *osub == '/') /* pure root, not just any 1-char sub */
+ *end = '/';
+ else
+ *end = *str;
+ }
+
+ return true;
+ }
+ if (*str == '\0') /* end of str, sub is NOT ended -- miss */
+ return false;
+ if (*str != *sub)
+ return false;
+
+ str++;
+ sub++;
+ }
+}
+
+static inline bool strstartswith(const char *str, const char *sub)
+{
+ return strstartswith2(str, sub, NULL);
+}
+
+/*
+ * Checks whether the @path has @sub_path as a sub path, i.e.
+ * sub_path is the beginning of path and the last component
+ * match is full (next character terminates path component).
+ *
+ * Paths shouldn't contain excessive /-s, i.e. only one slash
+ * between path components and no slash at the end (except for
+ * the "/" path). This is a pretty good assumption about the
+ * paths used by criu.
+ */
+
+static inline bool issubpath(const char *path, const char *sub_path)
+{
+ char end;
+ return strstartswith2(path, sub_path, &end) &&
+ (end == '/' || end == '\0');
+}
+
+/*
+ * mkdir -p
+ */
+int mkdirpat(int fd, const char *path);
+
+/*
+ * Tests whether a path is a prefix of another path. This is different than
+ * strstartswith because "/foo" is _not_ a path prefix of "/foobar", since they
+ * refer to different directories.
+ */
+bool is_path_prefix(const char *path, const char *prefix);
+FILE *fopenat(int dirfd, char *path, char *cflags);
+void split(char *str, char token, char ***out, int *n);
+
+int fd_has_data(int lfd);
+size_t read_into_buffer(int fd, char *buff, size_t size);
+
+int make_yard(char *path);
+
+void tcp_nodelay(int sk, bool on);
+void tcp_cork(int sk, bool on);
+
+const char *ns_to_string(unsigned int ns);
+
+char *xstrcat(char *str, const char *fmt, ...)
+ __attribute__ ((__format__ (__printf__, 2, 3)));
+char *xsprintf(const char *fmt, ...)
+ __attribute__ ((__format__ (__printf__, 1, 2)));
+
+#endif /* __CR_UTIL_H__ */
diff --git a/criu/include/uts_ns.h b/criu/include/uts_ns.h
new file mode 100644
index 000000000000..ab054ffe87d1
--- /dev/null
+++ b/criu/include/uts_ns.h
@@ -0,0 +1,9 @@
+#ifndef __CR_UTS_NS_H__
+#define __CR_UTS_NS_H__
+
+extern int dump_uts_ns(int ns_id);
+extern int prepare_utsns(int pid);
+
+extern struct ns_desc uts_ns_desc;
+
+#endif /* __CR_UTS_NS_H__ */
diff --git a/criu/include/vdso.h b/criu/include/vdso.h
new file mode 100644
index 000000000000..ea6bfabbf3ec
--- /dev/null
+++ b/criu/include/vdso.h
@@ -0,0 +1,27 @@
+#ifndef __CR_VDSO_H__
+#define __CR_VDSO_H__
+
+#include <sys/mman.h>
+#include <stdbool.h>
+
+#include "config.h"
+
+#ifdef CONFIG_VDSO
+
+#include "util-vdso.h"
+
+extern struct vdso_symtable vdso_sym_rt;
+
+extern int vdso_init(void);
+
+extern int parasite_fixup_vdso(struct parasite_ctl *ctl, pid_t pid,
+ struct vm_area_list *vma_area_list);
+
+#else /* CONFIG_VDSO */
+
+#define vdso_init() (0)
+#define parasite_fixup_vdso(ctl, pid, vma_area_list) (0)
+
+#endif /* CONFIG_VDSO */
+
+#endif /* __CR_VDSO_H__ */
diff --git a/criu/include/vma.h b/criu/include/vma.h
new file mode 100644
index 000000000000..6c28136612e9
--- /dev/null
+++ b/criu/include/vma.h
@@ -0,0 +1,110 @@
+#ifndef __CR_VMA_H__
+#define __CR_VMA_H__
+
+#include "asm/types.h"
+#include "image.h"
+#include "list.h"
+
+#include "protobuf/vma.pb-c.h"
+
+struct vm_area_list {
+ struct list_head h;
+ unsigned nr;
+ unsigned int nr_aios;
+ unsigned long priv_size; /* nr of pages in private VMAs */
+ unsigned long longest; /* nr of pages in longest VMA */
+};
+
+#define VM_AREA_LIST(name) struct vm_area_list name = { .h = LIST_HEAD_INIT(name.h), .nr = 0, }
+
+/* Reset @vml to an empty VMA list with every counter zeroed. */
+static inline void vm_area_list_init(struct vm_area_list *vml)
+{
+	INIT_LIST_HEAD(&vml->h);
+	vml->nr = vml->nr_aios = 0;	/* both element counters start at zero */
+	vml->priv_size = vml->longest = 0;
+}
+
+struct file_desc;
+
+struct vma_area {
+ struct list_head list;
+ VmaEntry *e;
+
+ union {
+ struct /* for dump */ {
+ union {
+ /*
+ * These two cannot be assigned at once.
+ * The file_fd is an fd for a regular file and
+ * the socket_id is the inode number of the
+ * mapped (PF_PACKET) socket.
+ *
+ * The aio_nr_req is only for aio rings.
+ */
+ int vm_file_fd;
+ int vm_socket_id;
+ unsigned int aio_nr_req;
+ };
+
+ char *aufs_rpath; /* path from aufs root */
+ char *aufs_fpath; /* full path from global root */
+
+ /*
+ * When several subsequent vmas have the same
+ * dev:ino pair all 'tail' ones set this to true
+ * and the vmst points to the head's stat buf.
+ */
+ bool file_borrowed;
+ struct stat *vmst;
+ int mnt_id;
+ };
+
+ struct /* for restore */ {
+ struct file_desc *vmfd;
+ unsigned long *page_bitmap; /* existent pages */
+ unsigned long *ppage_bitmap; /* parent's existent pages */
+ unsigned long premmaped_addr; /* restore only */
+ };
+ };
+};
+
+extern struct vma_area *alloc_vma_area(void);
+extern int collect_mappings(pid_t pid, struct vm_area_list *vma_area_list);
+extern void free_mappings(struct vm_area_list *vma_area_list);
+
+#define vma_area_is(vma_area, s) vma_entry_is((vma_area)->e, s)
+#define vma_area_len(vma_area) vma_entry_len((vma_area)->e)
+#define vma_entry_is(vma, s) (((vma)->status & (s)) == (s))
+#define vma_entry_len(vma) ((vma)->end - (vma)->start)
+
+/*
+ * vma_premmaped_start() can be used only in restorer.
+ * In other cases vma_area->premmaped_addr must be used.
+ * This hack is required, because vma_area isn't transferred to restorer and
+ * shmid is used to determine which vma-s are cowed.
+ */
+#define vma_premmaped_start(vma) ((vma)->shmid)
+
+static inline int in_vma_area(struct vma_area *vma, unsigned long addr)
+{
+	unsigned long first = vma->e->start, past = vma->e->end;
+	return first <= addr && addr < past;	/* half-open range [start, end) */
+}
+
+/* True for a regular anon- or file-private mapping ending at or below @task_size. */
+static inline bool vma_entry_is_private(VmaEntry *entry, unsigned long task_size)
+{
+	if (!vma_entry_is(entry, VMA_AREA_REGULAR) || entry->end > task_size)
+		return false;
+	return vma_entry_is(entry, VMA_ANON_PRIVATE) ||
+	       vma_entry_is(entry, VMA_FILE_PRIVATE);
+}
+
+static inline bool vma_area_is_private(struct vma_area *vma,
+ unsigned long task_size)
+{
+ return vma_entry_is_private(vma->e, task_size);
+}
+
+#endif /* __CR_VMA_H__ */
diff --git a/criu/include/xmalloc.h b/criu/include/xmalloc.h
new file mode 100644
index 000000000000..e5ce279fab1a
--- /dev/null
+++ b/criu/include/xmalloc.h
@@ -0,0 +1,67 @@
+#ifndef __CR_XMALLOC_H__
+#define __CR_XMALLOC_H__
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "log.h"
+
+#define __xalloc(op, size, ...) \
+ ({ \
+ void *___p = op( __VA_ARGS__ ); \
+ if (!___p) \
+ pr_err("%s: Can't allocate %li bytes\n", \
+ __func__, (long)(size)); \
+ ___p; \
+ })
+
+#define xstrdup(str) __xalloc(strdup, strlen(str) + 1, str)
+#define xmalloc(size) __xalloc(malloc, size, size)
+#define xzalloc(size) __xalloc(calloc, size, 1, size)
+#define xrealloc(p, size) __xalloc(realloc, size, p, size)
+
+#define xfree(p) free(p)
+
+#define xrealloc_safe(pptr, size) \
+ ({ \
+ int __ret = -1; \
+ void *new = xrealloc(*pptr, size); \
+ if (new) { \
+ *pptr = new; \
+ __ret = 0; \
+ } \
+ __ret; \
+ })
+
+#define xmemdup(ptr, size) \
+ ({ \
+ void *new = xmalloc(size); \
+ if (new) \
+ memcpy(new, ptr, size); \
+ new; \
+ })
+
+#define memzero_p(p) memset(p, 0, sizeof(*p))
+#define memzero(p, size) memset(p, 0, size)
+
+/*
+ * Helper for allocating trees with single xmalloc.
+ * This one advances the void *pointer on s bytes and
+ * returns the previous value. Use like this
+ *
+ * m = xmalloc(total_size);
+ * a = xptr_pull(&m, tree_root_t);
+ * a->b = xptr_pull(&m, leaf_a_t);
+ * a->c = xptr_pull(&m, leaf_c_t);
+ * ...
+ */
+static inline void *xptr_pull_s(void **m, size_t s)
+{
+ void *ret = (*m);
+ (*m) += s;
+ return ret;
+}
+
+#define xptr_pull(m, type) xptr_pull_s(m, sizeof(type))
+
+#endif /* __CR_XMALLOC_H__ */
diff --git a/criu/ipc_ns.c b/criu/ipc_ns.c
new file mode 100644
index 000000000000..9abb40311a18
--- /dev/null
+++ b/criu/ipc_ns.c
@@ -0,0 +1,936 @@
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+#include <sys/wait.h>
+#include <sys/msg.h>
+#include <sys/sem.h>
+#include <sys/shm.h>
+#include <sched.h>
+
+#include "util.h"
+#include "cr_options.h"
+#include "imgset.h"
+#include "namespaces.h"
+#include "sysctl.h"
+#include "ipc_ns.h"
+
+#include "protobuf.h"
+#include "protobuf/ipc-var.pb-c.h"
+#include "protobuf/ipc-shm.pb-c.h"
+#include "protobuf/ipc-sem.pb-c.h"
+#include "protobuf/ipc-msg.pb-c.h"
+
+#if defined (__GLIBC__) && __GLIBC__ >= 2
+#define KEY __key
+#else
+#define KEY key
+#endif
+
+#ifndef MSGMAX
+#define MSGMAX 8192
+#endif
+
+#ifndef MSG_COPY
+#define MSG_COPY 040000
+#endif
+
+static void pr_ipc_desc_entry(unsigned int loglevel, const IpcDescEntry *desc)
+{
+ print_on_level(loglevel, "id: %-10d key: 0x%08x uid: %-10d gid: %-10d "
+ "cuid: %-10d cgid: %-10d mode: %-10o ",
+ desc->id, desc->key, desc->uid, desc->gid,
+ desc->cuid, desc->cgid, desc->mode);
+}
+
+static void fill_ipc_desc(int id, IpcDescEntry *desc, const struct ipc_perm *ipcp)
+{
+ desc->id = id;
+ desc->key = ipcp->KEY;
+ desc->uid = userns_uid(ipcp->uid);
+ desc->gid = userns_gid(ipcp->gid);
+ desc->cuid = userns_uid(ipcp->cuid);
+ desc->cgid = userns_gid(ipcp->cgid);
+ desc->mode = ipcp->mode;
+}
+
+static void pr_ipc_sem_array(unsigned int loglevel, int nr, u16 *values)
+{
+ while (nr--)
+ print_on_level(loglevel, " %-5d", values[nr]);
+ print_on_level(loglevel, "\n");
+}
+
+#define pr_info_ipc_sem_array(nr, values) pr_ipc_sem_array(LOG_INFO, nr, values)
+#define pr_msg_ipc_sem_array(nr, values) pr_ipc_sem_array(LOG_MSG, nr, values)
+
+static void pr_info_ipc_sem_entry(const IpcSemEntry *sem)
+{
+ pr_ipc_desc_entry(LOG_INFO, sem->desc);
+ print_on_level(LOG_INFO, "nsems: %-10d\n", sem->nsems);
+}
+
+/*
+ * Fetch all values of semaphore set @sem with GETALL and append them to @img,
+ * zero-padded up to a u64 boundary. A negative return value means failure.
+ */
+static int dump_ipc_sem_set(struct cr_img *img, const IpcSemEntry *sem)
+{
+	size_t rounded;
+	int ret, size;
+	u16 *values;
+
+	size = sizeof(u16) * sem->nsems;
+	rounded = round_up(size, sizeof(u64));
+	values = xmalloc(rounded);
+	if (values == NULL) {
+		pr_err("Failed to allocate memory for semaphore set values\n");
+		return -ENOMEM;
+	}
+	ret = semctl(sem->desc->id, 0, GETALL, values);
+	if (ret < 0) {
+		pr_perror("Failed to get semaphore set values");
+		ret = -errno;
+		goto out;
+	}
+	pr_info_ipc_sem_array(sem->nsems, values);
+	memzero((void *)values + size, rounded - size);
+	ret = write_img_buf(img, values, rounded);
+	if (ret < 0)
+		pr_err("Failed to write IPC semaphore set values\n");
+out:
+	xfree(values);
+	return ret;
+}
+
+static int dump_ipc_sem_desc(struct cr_img *img, int id, const struct semid_ds *ds)
+{
+ IpcSemEntry sem = IPC_SEM_ENTRY__INIT;
+ IpcDescEntry desc = IPC_DESC_ENTRY__INIT;
+ int ret;
+
+ sem.desc = &desc;
+ sem.nsems = ds->sem_nsems;
+
+ fill_ipc_desc(id, sem.desc, &ds->sem_perm);
+ pr_info_ipc_sem_entry(&sem);
+
+ ret = pb_write_one(img, &sem, PB_IPC_SEM);
+ if (ret < 0) {
+ pr_err("Failed to write IPC semaphores set\n");
+ return ret;
+ }
+ return dump_ipc_sem_set(img, &sem);
+}
+
+static int dump_ipc_sem(struct cr_img *img)
+{
+ int i, maxid;
+ struct seminfo info;
+ int slot;
+
+ maxid = semctl(0, 0, SEM_INFO, &info);
+ if (maxid < 0) {
+ pr_perror("semctl failed");
+ return -errno;
+ }
+
+ pr_info("IPC semaphore sets: %d\n", info.semusz);
+ for (i = 0, slot = 0; i <= maxid; i++) {
+ struct semid_ds ds;
+ int id, ret;
+
+ id = semctl(i, 0, SEM_STAT, &ds);
+ if (id < 0) {
+ if (errno == EINVAL)
+ continue;
+ pr_perror("Failed to get stats for IPC semaphore set");
+ break;
+ }
+ ret = dump_ipc_sem_desc(img, id, &ds);
+ if (!ret)
+ slot++;
+ }
+ if (slot != info.semusz) {
+ pr_err("Failed to collect %d (only %d succeeded)\n", info.semusz, slot);
+ return -EFAULT;
+ }
+ return info.semusz;
+}
+
+static void pr_info_ipc_msg(int nr, const IpcMsg *msg)
+{
+ print_on_level(LOG_INFO, " %-5d: type: %-20"PRId64" size: %-10d\n",
+ nr++, msg->mtype, msg->msize);
+}
+
+static void pr_info_ipc_msg_entry(const IpcMsgEntry *msg)
+{
+ pr_ipc_desc_entry(LOG_INFO, msg->desc);
+ print_on_level(LOG_INFO, "qbytes: %-10d qnum: %-10d\n",
+ msg->qbytes, msg->qnum);
+}
+
+/*
+ * Append @msg_nr messages of queue @msq to @img, each fetched with MSG_COPY and
+ * written as IpcMsg header + u64-padded payload. Returns 0, or negative on error.
+ */
+static int dump_ipc_msg_queue_messages(struct cr_img *img, const IpcMsgEntry *msq,
+				       unsigned int msg_nr)
+{
+	struct msgbuf *message = NULL;
+	unsigned int msgmax;
+	int ret, msg_cnt = 0;
+	struct sysctl_req req[] = {
+		{ "kernel/msgmax", &msgmax, CTL_U32 },
+	};
+
+	ret = sysctl_op(req, ARRAY_SIZE(req), CTL_READ, CLONE_NEWIPC);
+	if (ret < 0) {
+		pr_err("Failed to read max IPC message size\n");
+		goto err;
+	}
+	msgmax += sizeof(struct msgbuf);
+	message = xmalloc(round_up(msgmax, sizeof(u64)));
+	if (message == NULL) {
+		pr_err("Failed to allocate memory for IPC message\n");
+		return -ENOMEM;
+	}
+
+	for (msg_cnt = 0; msg_cnt < msg_nr; msg_cnt++) {
+		IpcMsg msg = IPC_MSG__INIT;
+		size_t rounded;
+
+		ret = msgrcv(msq->desc->id, message, msgmax, msg_cnt, IPC_NOWAIT | MSG_COPY);
+		if (ret < 0) {
+			pr_perror("Failed to copy IPC message");
+			goto err;
+		}
+		msg.msize = ret;
+		msg.mtype = message->mtype;
+		pr_info_ipc_msg(msg_cnt, &msg);
+
+		ret = pb_write_one(img, &msg, PB_IPCNS_MSG);
+		if (ret < 0) {
+			pr_err("Failed to write IPC message header\n");
+			goto err;	/* a break would fall into "ret = 0" below */
+		}
+		rounded = round_up(msg.msize, sizeof(u64));
+		memzero(((void *)message->mtext + msg.msize), rounded - msg.msize);
+		ret = write_img_buf(img, message->mtext, rounded);
+		if (ret < 0) {
+			pr_err("Failed to write IPC message data\n");
+			goto err;	/* same: keep the failure code */
+		}
+	}
+	ret = 0;
+err:
+	xfree(message);
+	return ret;
+}
+
+static int dump_ipc_msg_queue(struct cr_img *img, int id, const struct msqid_ds *ds)
+{
+ IpcMsgEntry msg = IPC_MSG_ENTRY__INIT;
+ IpcDescEntry desc = IPC_DESC_ENTRY__INIT;
+ int ret;
+
+ msg.desc = &desc;
+ fill_ipc_desc(id, msg.desc, &ds->msg_perm);
+ msg.qbytes = ds->msg_qbytes;
+ msg.qnum = ds->msg_qnum;
+
+ pr_info_ipc_msg_entry(&msg);
+
+ ret = pb_write_one(img, &msg, PB_IPCNS_MSG_ENT);
+ if (ret < 0) {
+ pr_err("Failed to write IPC message queue\n");
+ return ret;
+ }
+ return dump_ipc_msg_queue_messages(img, &msg, ds->msg_qnum);
+}
+
+static int dump_ipc_msg(struct cr_img *img)
+{
+ int i, maxid;
+ struct msginfo info;
+ int slot;
+
+ maxid = msgctl(0, MSG_INFO, (struct msqid_ds *)&info);
+ if (maxid < 0) {
+ pr_perror("msgctl failed");
+ return -errno;
+ }
+
+ pr_info("IPC message queues: %d\n", info.msgpool);
+ for (i = 0, slot = 0; i <= maxid; i++) {
+ struct msqid_ds ds;
+ int id, ret;
+
+ id = msgctl(i, MSG_STAT, &ds);
+ if (id < 0) {
+ if (errno == EINVAL)
+ continue;
+ pr_perror("Failed to get stats for IPC message queue");
+ break;
+ }
+ ret = dump_ipc_msg_queue(img, id, &ds);
+ if (!ret)
+ slot++;
+ }
+ if (slot != info.msgpool) {
+ pr_err("Failed to collect %d message queues (only %d succeeded)\n", info.msgpool, slot);
+ return -EFAULT;
+ }
+ return info.msgpool;
+}
+
+static void pr_info_ipc_shm(const IpcShmEntry *shm)
+{
+ pr_ipc_desc_entry(LOG_INFO, shm->desc);
+ print_on_level(LOG_INFO, "size: %-10"PRIu64"\n", shm->size);
+}
+
+static int ipc_sysctl_req(IpcVarEntry *e, int op)
+{
+ struct sysctl_req req[] = {
+ { "kernel/sem", e->sem_ctls, CTL_U32A(e->n_sem_ctls) },
+ { "kernel/msgmax", &e->msg_ctlmax, CTL_U32 },
+ { "kernel/msgmnb", &e->msg_ctlmnb, CTL_U32 },
+ { "kernel/auto_msgmni", &e->auto_msgmni, CTL_U32 },
+ { "kernel/msgmni", &e->msg_ctlmni, CTL_U32 },
+ { "kernel/shmmax", &e->shm_ctlmax, CTL_U64 },
+ { "kernel/shmall", &e->shm_ctlall, CTL_U64 },
+ { "kernel/shmmni", &e->shm_ctlmni, CTL_U32 },
+ { "kernel/shm_rmid_forced", &e->shm_rmid_forced, CTL_U32 },
+ };
+
+ struct sysctl_req req_mq[] = {
+ { "fs/mqueue/queues_max", &e->mq_queues_max, CTL_U32 },
+ { "fs/mqueue/msg_max", &e->mq_msg_max, CTL_U32 },
+ { "fs/mqueue/msgsize_max", &e->mq_msgsize_max, CTL_U32 },
+ };
+
+ int ret;
+
+ ret = sysctl_op(req, ARRAY_SIZE(req), op, CLONE_NEWIPC);
+ if (ret)
+ return ret;
+
+ if (access("/proc/sys/fs/mqueue", X_OK)) {
+ pr_info("Mqueue sysctls are missing\n");
+ return 0;
+ }
+
+ return sysctl_op(req_mq, ARRAY_SIZE(req_mq), op, CLONE_NEWIPC);
+}
+
+/*
+ * Attach segment @shm read-only, write its contents (size rounded up to u32)
+ * to @img and always detach again. Returns 0 on success, negative on error.
+ * TODO: dump only dirty pages once an updated sys_mincore() can locate them.
+ */
+static int dump_ipc_shm_pages(struct cr_img *img, const IpcShmEntry *shm)
+{
+	void *data;
+	int ret;
+
+	data = shmat(shm->desc->id, NULL, SHM_RDONLY);
+	if (data == (void *)-1) {
+		pr_perror("Failed to attach IPC shared memory");
+		return -errno;
+	}
+	ret = write_img_buf(img, data, round_up(shm->size, sizeof(u32)));
+	if (ret < 0)
+		pr_err("Failed to write IPC shared memory data\n");
+	if (shmdt(data)) {	/* detach on the failure path as well */
+		pr_perror("Failed to detach IPC shared memory");
+		if (ret >= 0)	/* an earlier write error takes precedence */
+			ret = -errno;
+	}
+	return ret < 0 ? ret : 0;
+}
+
+static int dump_ipc_shm_seg(struct cr_img *img, int id, const struct shmid_ds *ds)
+{
+ IpcShmEntry shm = IPC_SHM_ENTRY__INIT;
+ IpcDescEntry desc = IPC_DESC_ENTRY__INIT;
+ int ret;
+
+ shm.desc = &desc;
+ shm.size = ds->shm_segsz;
+ fill_ipc_desc(id, shm.desc, &ds->shm_perm);
+ pr_info_ipc_shm(&shm);
+
+ ret = pb_write_one(img, &shm, PB_IPC_SHM);
+ if (ret < 0) {
+ pr_err("Failed to write IPC shared memory segment\n");
+ return ret;
+ }
+ return dump_ipc_shm_pages(img, &shm);
+}
+
+static int dump_ipc_shm(struct cr_img *img)
+{
+ int i, maxid, slot;
+ struct shm_info info;
+
+ maxid = shmctl(0, SHM_INFO, (void *)&info);
+ if (maxid < 0) {
+ pr_perror("shmctl(SHM_INFO) failed");
+ return -errno;
+ }
+
+ pr_info("IPC shared memory segments: %d\n", info.used_ids);
+ for (i = 0, slot = 0; i <= maxid; i++) {
+ struct shmid_ds ds;
+ int id, ret;
+
+ id = shmctl(i, SHM_STAT, &ds);
+ if (id < 0) {
+ if (errno == EINVAL)
+ continue;
+ pr_perror("Failed to get stats for IPC shared memory");
+ break;
+ }
+
+ ret = dump_ipc_shm_seg(img, id, &ds);
+ if (ret < 0)
+ return ret;
+ slot++;
+ }
+ if (slot != info.used_ids) {
+ pr_err("Failed to collect %d (only %d succeeded)\n",
+ info.used_ids, slot);
+ return -EFAULT;
+ }
+ return 0;
+}
+
+static int dump_ipc_var(struct cr_img *img)
+{
+ IpcVarEntry var = IPC_VAR_ENTRY__INIT;
+ int ret = -1;
+
+ var.n_sem_ctls = 4;
+ var.sem_ctls = xmalloc(pb_repeated_size(&var, sem_ctls));
+ if (!var.sem_ctls)
+ goto err;
+
+ ret = ipc_sysctl_req(&var, CTL_READ);
+ if (ret < 0) {
+ pr_err("Failed to read IPC variables\n");
+ goto err;
+ }
+
+ ret = pb_write_one(img, &var, PB_IPC_VAR);
+ if (ret < 0) {
+ pr_err("Failed to write IPC variables\n");
+ goto err;
+ }
+
+err:
+ xfree(var.sem_ctls);
+ return ret;
+}
+
+/*
+ * Dump the IPC sysctl variables, then shared memory, message queues and
+ * semaphore sets, each into its own image from @imgset. The first failing
+ * step ends the sequence and its negative code is returned; otherwise 0.
+ */
+static int dump_ipc_data(const struct cr_imgset *imgset)
+{
+	int ret = dump_ipc_var(img_from_set(imgset, CR_FD_IPC_VAR));
+
+	if (ret >= 0)
+		ret = dump_ipc_shm(img_from_set(imgset, CR_FD_IPCNS_SHM));
+	if (ret >= 0)
+		ret = dump_ipc_msg(img_from_set(imgset, CR_FD_IPCNS_MSG));
+	if (ret >= 0)
+		ret = dump_ipc_sem(img_from_set(imgset, CR_FD_IPCNS_SEM));
+
+	return ret < 0 ? ret : 0;
+}
+
+/*
+ * Open the IPC namespace image set for @ns_id, dump the IPC data into it
+ * and close the set again. Returns 0 on success and -1 on any failure.
+ */
+int dump_ipc_ns(int ns_id)
+{
+	struct cr_imgset *set = cr_imgset_open(ns_id, IPCNS, O_DUMP);
+	int failed;
+
+	if (!set)
+		return -1;
+
+	failed = dump_ipc_data(set) < 0;
+	if (failed)
+		pr_err("Failed to write IPC namespace data\n");
+
+	close_cr_imgset(&set);
+	return failed ? -1 : 0;
+}
+
+void ipc_sem_handler(struct cr_img *img, void *obj)
+{
+ IpcSemEntry *e = obj;
+ u16 *values;
+ int size;
+
+ pr_msg("\n");
+ size = round_up(sizeof(u16) * e->nsems, sizeof(u64));
+ values = xmalloc(size);
+ if (values == NULL)
+ return;
+ if (read_img_buf(img, values, size) <= 0) {
+ xfree(values);
+ return;
+ }
+ pr_msg_ipc_sem_array(e->nsems, values);
+ xfree(values);
+}
+
+static void ipc_msg_data_handler(struct cr_img *img, void *obj)
+{
+ IpcMsg *e = obj;
+ print_image_data(img, round_up(e->msize, sizeof(u64)), opts.show_pages_content);
+}
+
+void ipc_msg_handler(struct cr_img *img, void *obj)
+{
+ IpcMsgEntry *e = obj;
+ int msg_nr = 0;
+
+ pr_msg("\n");
+ while (msg_nr++ < e->qnum)
+ pb_show_plain_payload(img, PB_IPCNS_MSG, ipc_msg_data_handler);
+
+}
+
+void ipc_shm_handler(struct cr_img *img, void *obj)
+{
+ IpcShmEntry *e = obj;
+ print_image_data(img, round_up(e->size, sizeof(u32)), opts.show_pages_content);
+}
+
+/*
+ * Read the u64-padded value array of semaphore set @sem from @img and apply
+ * it with SETALL. A negative return value means failure.
+ */
+static int prepare_ipc_sem_values(struct cr_img *img, const IpcSemEntry *sem)
+{
+	int ret, size;
+	u16 *values;
+
+	size = round_up(sizeof(u16) * sem->nsems, sizeof(u64));
+	values = xmalloc(size);
+	if (values == NULL) {
+		pr_err("Failed to allocate memory for semaphores set values\n");
+		return -ENOMEM;
+	}
+	ret = read_img_buf(img, values, size);
+	if (ret < 0) {
+		pr_err("Failed to read semaphores set values\n");
+		ret = -EIO;	/* an image read problem, not an allocation one */
+		goto out;
+	}
+	pr_info_ipc_sem_array(sem->nsems, values);
+	ret = semctl(sem->desc->id, 0, SETALL, values);
+	if (ret < 0) {
+		pr_perror("Failed to set semaphores set values");
+		ret = -errno;
+	}
+out:
+	xfree(values);
+	return ret;
+}
+
+static int prepare_ipc_sem_desc(struct cr_img *img, const IpcSemEntry *sem)
+{
+ int ret, id;
+ struct sysctl_req req[] = {
+ { "kernel/sem_next_id", &sem->desc->id, CTL_U32 },
+ };
+ struct semid_ds semid;
+
+ ret = sysctl_op(req, ARRAY_SIZE(req), CTL_WRITE, CLONE_NEWIPC);
+ if (ret < 0) {
+ pr_err("Failed to set desired IPC sem ID\n");
+ return ret;
+ }
+
+ id = semget(sem->desc->key, sem->nsems,
+ sem->desc->mode | IPC_CREAT | IPC_EXCL);
+ if (id == -1) {
+ pr_perror("Failed to create sem set");
+ return -errno;
+ }
+
+ if (id != sem->desc->id) {
+ pr_err("Failed to restore sem id (%d instead of %d)\n",
+ id, sem->desc->id);
+ return -EFAULT;
+ }
+
+ ret = semctl(id, sem->nsems, IPC_STAT, &semid);
+ if (ret == -1) {
+ pr_err("Failed to get sem stat structure\n");
+ return -EFAULT;
+ }
+
+ semid.sem_perm.uid = sem->desc->uid;
+ semid.sem_perm.gid = sem->desc->gid;
+
+ ret = semctl(id, sem->nsems, IPC_SET, &semid);
+ if (ret == -1) {
+ pr_err("Failed to set sem uid and gid\n");
+ return -EFAULT;
+ }
+
+ ret = prepare_ipc_sem_values(img, sem);
+ if (ret < 0) {
+ pr_err("Failed to update sem pages\n");
+ return ret;
+ }
+ return 0;
+}
+
+static int prepare_ipc_sem(int pid)
+{
+ int ret;
+ struct cr_img *img;
+
+ pr_info("Restoring IPC semaphores sets\n");
+ img = open_image(CR_FD_IPCNS_SEM, O_RSTR, pid);
+ if (!img)
+ return -1;
+
+ while (1) {
+ IpcSemEntry *sem;
+
+ ret = pb_read_one_eof(img, &sem, PB_IPC_SEM);
+ if (ret < 0) {
+ ret = -EIO;
+ goto err;
+ }
+ if (ret == 0)
+ break;
+
+ pr_info_ipc_sem_entry(sem);
+
+ ret = prepare_ipc_sem_desc(img, sem);
+ ipc_sem_entry__free_unpacked(sem, NULL);
+
+ if (ret < 0) {
+ pr_err("Failed to prepare semaphores set\n");
+ goto err;
+ }
+ }
+
+ close_image(img);
+ return 0;
+
+err:
+ close_image(img);
+ return ret;
+}
+
+/*
+ * Re-send the @msq->qnum messages stored in @img to the restored queue. Every
+ * unpacked IpcMsg is released as soon as its message has been sent.
+ */
+static int prepare_ipc_msg_queue_messages(struct cr_img *img, const IpcMsgEntry *msq)
+{
+	IpcMsg *msg = NULL;
+	int msg_nr = 0, ret = 0;
+
+	while (msg_nr < msq->qnum) {
+		struct msgbuf {
+			long mtype;
+			char mtext[MSGMAX];
+		} data;
+
+		ret = pb_read_one(img, &msg, PB_IPCNS_MSG);
+		if (ret <= 0)
+			return -EIO;
+		pr_info_ipc_msg(msg_nr, msg);
+		if (msg->msize > MSGMAX) {
+			ret = -1;
+			pr_err("Unsupported message size: %d (MAX: %d)\n",
+			       msg->msize, MSGMAX);
+			break;
+		}
+		ret = read_img_buf(img, data.mtext, round_up(msg->msize, sizeof(u64)));
+		if (ret < 0) {
+			pr_err("Failed to read IPC message data\n");
+			break;
+		}
+		data.mtype = msg->mtype;
+		ret = msgsnd(msq->desc->id, &data, msg->msize, IPC_NOWAIT);
+		if (ret < 0) {
+			pr_perror("Failed to send IPC message");
+			ret = -errno;
+			break;
+		}
+		ipc_msg__free_unpacked(msg, NULL);	/* one allocation per message */
+		msg = NULL;
+		msg_nr++;
+	}
+	if (msg)	/* still set only when the loop was left via break */
+		ipc_msg__free_unpacked(msg, NULL);
+	return ret;
+}
+
+static int prepare_ipc_msg_queue(struct cr_img *img, const IpcMsgEntry *msq)
+{
+ int ret, id;
+ struct sysctl_req req[] = {
+ { "kernel/msg_next_id", &msq->desc->id, CTL_U32 },
+ };
+ struct msqid_ds msqid;
+
+ ret = sysctl_op(req, ARRAY_SIZE(req), CTL_WRITE, CLONE_NEWIPC);
+ if (ret < 0) {
+ pr_err("Failed to set desired IPC msg ID\n");
+ return ret;
+ }
+
+ id = msgget(msq->desc->key, msq->desc->mode | IPC_CREAT | IPC_EXCL);
+ if (id == -1) {
+ pr_perror("Failed to create msg set");
+ return -errno;
+ }
+
+ if (id != msq->desc->id) {
+ pr_err("Failed to restore msg id (%d instead of %d)\n",
+ id, msq->desc->id);
+ return -EFAULT;
+ }
+
+ ret = msgctl(id, IPC_STAT, &msqid);
+ if (ret == -1) {
+ pr_err("Failed to get msq stat structure\n");
+ return -EFAULT;
+ }
+
+ msqid.msg_perm.uid = msq->desc->uid;
+ msqid.msg_perm.gid = msq->desc->gid;
+
+ ret = msgctl(id, IPC_SET, &msqid);
+ if (ret == -1) {
+ pr_err("Failed to set msq queue uid and gid\n");
+ return -EFAULT;
+ }
+
+ ret = prepare_ipc_msg_queue_messages(img, msq);
+ if (ret < 0) {
+ pr_err("Failed to update message queue messages\n");
+ return ret;
+ }
+ return 0;
+}
+
+static int prepare_ipc_msg(int pid)
+{
+ int ret;
+ struct cr_img *img;
+
+ pr_info("Restoring IPC message queues\n");
+ img = open_image(CR_FD_IPCNS_MSG, O_RSTR, pid);
+ if (!img)
+ return -1;
+
+ while (1) {
+ IpcMsgEntry *msq;
+
+ ret = pb_read_one_eof(img, &msq, PB_IPCNS_MSG_ENT);
+ if (ret < 0) {
+ pr_err("Failed to read IPC messages queue\n");
+ ret = -EIO;
+ goto err;
+ }
+ if (ret == 0)
+ break;
+
+ pr_info_ipc_msg_entry(msq);
+
+ ret = prepare_ipc_msg_queue(img, msq);
+ ipc_msg_entry__free_unpacked(msq, NULL);
+
+ if (ret < 0) {
+ pr_err("Failed to prepare messages queue\n");
+ goto err;
+ }
+ }
+
+ close_image(img);
+ return 0;
+err:
+ close_image(img);
+ return ret;
+}
+
+/* Attach @shm, fill it from @img and detach (also on read failure); 0 or negative. */
+static int prepare_ipc_shm_pages(struct cr_img *img, const IpcShmEntry *shm)
+{
+	int ret;
+	void *data;
+
+	data = shmat(shm->desc->id, NULL, 0);
+	if (data == (void *)-1) {
+		pr_perror("Failed to attach IPC shared memory");
+		return -errno;
+	}
+	ret = read_img_buf(img, data, round_up(shm->size, sizeof(u32)));
+	if (ret < 0)
+		pr_err("Failed to read IPC shared memory data\n");
+	if (shmdt(data)) {
+		pr_perror("Failed to detach IPC shared memory");
+		if (ret >= 0)	/* an earlier read error takes precedence */
+			ret = -errno;
+	}
+	return ret < 0 ? ret : 0;
+}
+
+static int prepare_ipc_shm_seg(struct cr_img *img, const IpcShmEntry *shm)
+{
+ int ret, id;
+ struct sysctl_req req[] = {
+ { "kernel/shm_next_id", &shm->desc->id, CTL_U32 },
+ };
+ struct shmid_ds shmid;
+
+ ret = sysctl_op(req, ARRAY_SIZE(req), CTL_WRITE, CLONE_NEWIPC);
+ if (ret < 0) {
+ pr_err("Failed to set desired IPC shm ID\n");
+ return ret;
+ }
+
+ id = shmget(shm->desc->key, shm->size,
+ shm->desc->mode | IPC_CREAT | IPC_EXCL);
+ if (id == -1) {
+ pr_perror("Failed to create shm set");
+ return -errno;
+ }
+
+ if (id != shm->desc->id) {
+ pr_err("Failed to restore shm id (%d instead of %d)\n",
+ id, shm->desc->id);
+ return -EFAULT;
+ }
+
+ ret = shmctl(id, IPC_STAT, &shmid);
+ if (ret == -1) {
+ pr_err("Failed to get shm stat structure\n");
+ return -EFAULT;
+ }
+
+ shmid.shm_perm.uid = shm->desc->uid;
+ shmid.shm_perm.gid = shm->desc->gid;
+
+ ret = shmctl(id, IPC_SET, &shmid);
+ if (ret == -1) {
+ pr_err("Failed to set shm uid and gid\n");
+ return -EFAULT;
+ }
+
+ ret = prepare_ipc_shm_pages(img, shm);
+ if (ret < 0) {
+ pr_err("Failed to update shm pages\n");
+ return ret;
+ }
+ return 0;
+}
+
+static int prepare_ipc_shm(int pid)
+{
+ int ret;
+ struct cr_img *img;
+
+ pr_info("Restoring IPC shared memory\n");
+ img = open_image(CR_FD_IPCNS_SHM, O_RSTR, pid);
+ if (!img)
+ return -1;
+
+ while (1) {
+ IpcShmEntry *shm;
+
+ ret = pb_read_one_eof(img, &shm, PB_IPC_SHM);
+ if (ret < 0) {
+ pr_err("Failed to read IPC shared memory segment\n");
+ ret = -EIO;
+ goto err;
+ }
+ if (ret == 0)
+ break;
+
+ pr_info_ipc_shm(shm);
+
+ ret = prepare_ipc_shm_seg(img, shm);
+ ipc_shm_entry__free_unpacked(shm, NULL);
+
+ if (ret < 0) {
+ pr_err("Failed to prepare shm segment\n");
+ goto err;
+ }
+ }
+
+ close_image(img);
+ return 0;
+err:
+ close_image(img);
+ return ret;
+}
+
+static int prepare_ipc_var(int pid)
+{
+ int ret;
+ struct cr_img *img;
+ IpcVarEntry *var;
+
+ pr_info("Restoring IPC variables\n");
+ img = open_image(CR_FD_IPC_VAR, O_RSTR, pid);
+ if (!img)
+ return -1;
+
+ ret = pb_read_one(img, &var, PB_IPC_VAR);
+ close_image(img);
+ if (ret <= 0) {
+ pr_err("Failed to read IPC namespace variables\n");
+ return -EFAULT;
+ }
+
+ ret = ipc_sysctl_req(var, CTL_WRITE);
+ ipc_var_entry__free_unpacked(var, NULL);
+
+ if (ret < 0) {
+ pr_err("Failed to prepare IPC namespace variables\n");
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+/*
+ * Restore the IPC namespace of @pid: sysctl variables first, then shared
+ * memory, message queues and semaphore sets. Stops at the first step that
+ * fails and returns its negative code; returns 0 when all steps succeed.
+ */
+int prepare_ipc_ns(int pid)
+{
+	int ret;
+
+	pr_info("Restoring IPC namespace\n");
+	ret = prepare_ipc_var(pid);
+	if (ret >= 0)
+		ret = prepare_ipc_shm(pid);
+	if (ret >= 0)
+		ret = prepare_ipc_msg(pid);
+	if (ret >= 0)
+		ret = prepare_ipc_sem(pid);
+	return ret < 0 ? ret : 0;
+}
+
+struct ns_desc ipc_ns_desc = NS_DESC_ENTRY(CLONE_NEWIPC, "ipc");
diff --git a/criu/irmap.c b/criu/irmap.c
new file mode 100644
index 000000000000..81d245c3e851
--- /dev/null
+++ b/criu/irmap.c
@@ -0,0 +1,489 @@
+/*
+ * IRMAP -- inode reverse mapping.
+ *
+ * Helps us to map inode number (and device) back to path
+ * so that we can restore inotify/fanotify-s.
+ *
+ * Scanning _is_ slow, so we limit it with hints, which are
+ * heuristically known places where notifies are typically put.
+ */
+
+#include <stdbool.h>
+#include <fcntl.h>
+#include <dirent.h>
+#include <string.h>
+#include <stdio.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "xmalloc.h"
+#include "irmap.h"
+#include "mount.h"
+#include "log.h"
+#include "util.h"
+#include "image.h"
+#include "stats.h"
+#include "pstree.h"
+#include "cr_options.h"
+
+#include "protobuf.h"
+#include "protobuf/fsnotify.pb-c.h"
+#include "protobuf/fh.pb-c.h"
+
+#undef LOG_PREFIX
+#define LOG_PREFIX "irmap: "
+
+#define IRMAP_CACHE_BITS 5
+#define IRMAP_CACHE_SIZE (1 << IRMAP_CACHE_BITS)
+#define IRMAP_CACHE_MASK (IRMAP_CACHE_SIZE - 1)
+
+static inline int irmap_hashfn(unsigned int s_dev, unsigned long i_ino)
+{
+ return (s_dev + i_ino) & IRMAP_CACHE_MASK;
+}
+
+struct irmap {
+ unsigned int dev;
+ unsigned long ino;
+ char *path;
+ struct irmap *next;
+ bool revalidate;
+ int nr_kids;
+ struct irmap *kids;
+};
+
+static struct irmap *cache[IRMAP_CACHE_SIZE];
+
+static struct irmap hints[] = {
+ { .path = "/etc", .nr_kids = -1, },
+ { .path = "/var/spool", .nr_kids = -1, },
+ { .path = "/var/log", .nr_kids = -1, },
+ { .path = "/usr/share/dbus-1/system-services", .nr_kids = -1 },
+ { .path = "/var/lib/polkit-1/localauthority", .nr_kids = -1 },
+ { .path = "/usr/share/polkit-1/actions", .nr_kids = -1 },
+ { .path = "/lib/udev", .nr_kids = -1, },
+ { .path = "/.", .nr_kids = 0, },
+ { .path = "/no-such-path", .nr_kids = -1, },
+ { },
+};
+
+/*
+ * Update inode (and device) number and cache the entry
+ */
+static int irmap_update_stat(struct irmap *i)
+{
+ struct stat st;
+ int mntns_root;
+ unsigned hv;
+
+ if (i->ino)
+ return 0;
+
+ mntns_root = get_service_fd(ROOT_FD_OFF);
+
+ pr_debug("Refresh stat for %s\n", i->path);
+ if (fstatat(mntns_root, i->path + 1, &st, AT_SYMLINK_NOFOLLOW)) {
+ pr_perror("Can't stat %s", i->path);
+ return -1;
+ }
+
+ i->revalidate = false;
+ i->dev = st.st_dev;
+ i->ino = st.st_ino;
+ if (!S_ISDIR(st.st_mode))
+ i->nr_kids = 0; /* don't irmap_update_dir */
+
+ hv = irmap_hashfn(i->dev, i->ino);
+ i->next = cache[hv];
+ cache[hv] = i;
+
+ return 0;
+}
+
+/*
+ * Update list of children, but don't cache any. Later
+ * we'll scan them one-by-one and cache.
+ * Does nothing when the list is already known (nr_kids >= 0). On failure no
+ * half-built list is left behind: t->kids is released and reset to NULL.
+ */
+static int irmap_update_dir(struct irmap *t)
+{
+	int fd, nr = 0, mntns_root;
+	DIR *dfd;
+	struct dirent *de;
+
+	if (t->nr_kids >= 0)
+		return 0;
+
+	mntns_root = get_service_fd(ROOT_FD_OFF);
+	pr_debug("Refilling %s dir\n", t->path);
+	fd = openat(mntns_root, t->path + 1, O_RDONLY);
+	if (fd < 0) {
+		pr_perror("Can't open %s", t->path);
+		return -1;
+	}
+	dfd = fdopendir(fd);
+	if (!dfd) {
+		pr_perror("Can't opendir %s", t->path);
+		close(fd);	/* still ours: fdopendir() failed to take it over */
+		return -1;
+	}
+
+	errno = 0;
+	while ((de = readdir(dfd)) != NULL) {
+		struct irmap *k;
+
+		if (dir_dots(de))
+			continue;
+		/* nr counts only fully set up kids, so out_err can free their paths */
+		if (xrealloc_safe(&t->kids, (nr + 1) * sizeof(struct irmap)))
+			goto out_err;
+
+		k = &t->kids[nr];
+		k->kids = NULL;		/* for xrealloc above */
+		k->ino = 0;		/* for irmap_update_stat */
+		k->nr_kids = -1;	/* for irmap_update_dir */
+		k->path = xsprintf("%s/%s", t->path, de->d_name);
+		if (!k->path)
+			goto out_err;
+		nr++;
+	}
+	if (errno) {
+		pr_perror("Readdir failed");
+		goto out_err;
+	}
+
+	closedir(dfd);		/* closes fd too, no separate close() needed */
+	t->nr_kids = nr;
+	return 0;
+
+out_err:
+	while (nr-- > 0)
+		xfree(t->kids[nr].path);
+	xfree(t->kids);
+	t->kids = NULL;		/* nr_kids stays < 0, so a retry reallocs from scratch */
+	closedir(dfd);
+	return -1;
+}
+
+/*
+ * Depth-first search of the tree rooted at @t for the entry whose dev:ino
+ * pair equals @dev:@ino. Stat data and child lists are filled in on the way.
+ * Returns the matching entry, or NULL when none is found or an update fails.
+ */
+static struct irmap *irmap_scan(struct irmap *t, unsigned int dev, unsigned long ino)
+{
+	struct irmap *hit = NULL;
+	int k;
+
+	if (irmap_update_stat(t) != 0)
+		return NULL;
+	if (t->dev == dev && t->ino == ino)
+		return t;
+	if (irmap_update_dir(t) != 0)
+		return NULL;
+
+	for (k = 0; !hit && k < t->nr_kids; k++)
+		hit = irmap_scan(&t->kids[k], dev, ino);
+
+	return hit;
+}
+
+static int irmap_revalidate(struct irmap *c, struct irmap **p)
+{
+ struct stat st;
+ int mntns_root;
+
+ mntns_root = get_service_fd(ROOT_FD_OFF);
+
+ pr_debug("Revalidate stat for %s\n", c->path);
+ if (fstatat(mntns_root, c->path + 1, &st, AT_SYMLINK_NOFOLLOW)) {
+ /* File can be (re)moved, so just treat it as invalid */
+ pr_perror("Can't stat %s", c->path);
+ goto invalid;
+ }
+
+ if (c->dev != st.st_dev)
+ goto invalid;
+ if (c->ino != st.st_ino)
+ goto invalid;
+
+ c->revalidate = false;
+ return 0;
+
+invalid:
+ pr_debug("\t%x:%lx is invalid\n", c->dev, c->ino);
+ *p = c->next;
+ xfree(c->path);
+ xfree(c);
+ return 1;
+}
+
+static bool doing_predump = false;
+
+/*
+ * Resolve @s_dev:@i_ino to a path: the cache is consulted first, then the
+ * user-provided scan paths, then the built-in hints. Returns NULL on a miss.
+ */
+char *irmap_lookup(unsigned int s_dev, unsigned long i_ino)
+{
+	struct irmap *c, *h, **p;
+	char *path = NULL;
+	int hv;
+	struct irmap_path_opt *o;
+
+	s_dev = kdev_to_odev(s_dev);
+	pr_debug("Resolving %x:%lx path\n", s_dev, i_ino);
+
+	/*
+	 * In predump the processes already run and root_item is freed by then, but
+	 * the root service fd was set by irmap_predump_prep, so go ahead and scan.
+	 */
+	if (!doing_predump &&
+			__mntns_get_root_fd(root_item->pid.real) < 0)
+		return NULL;	/* timer not started yet, so don't stop it */
+
+	timing_start(TIME_IRMAP_RESOLVE);
+	hv = irmap_hashfn(s_dev, i_ino);
+	for (p = &cache[hv]; *p; ) {
+		c = *p;
+		if (!(c->dev == s_dev && c->ino == i_ino)) {
+			p = &c->next;
+			continue;
+		}
+		/* A stale entry gets unlinked, so *p already is its successor */
+		if (c->revalidate && irmap_revalidate(c, p))
+			continue;
+		pr_debug("\tFound %s in cache\n", c->path);
+		path = c->path;
+		goto out;
+	}
+
+	/* Let's scan any user provided paths first; since the user told us
+	 * about them, hopefully they're more interesting than our hints.
+	 */
+	list_for_each_entry(o, &opts.irmap_scan_paths, node) {
+		c = irmap_scan(o->ir, s_dev, i_ino);
+		if (c) {
+			pr_debug("\tScanned %s\n", c->path);
+			path = c->path;
+			goto out;
+		}
+	}
+
+	for (h = hints; h->path; h++) {
+		pr_debug("Scanning %s hint\n", h->path);
+		c = irmap_scan(h, s_dev, i_ino);
+		if (c) {
+			pr_debug("\tScanned %s\n", c->path);
+			path = c->path;
+			goto out;
+		}
+	}
+out:
+	timing_stop(TIME_IRMAP_RESOLVE);
+	return path;
+}
+
+/*
+ * IRMAP pre-cache -- do early irmap scan on pre-dump to reduce
+ * the freeze time on dump
+ */
+
+struct irmap_predump {
+ unsigned int dev; /* device of the queued inode */
+ unsigned long ino; /* inode number */
+ FhEntry fh; /* file handle entry copied from the caller by irmap_queue_cache() */
+ struct irmap_predump *next; /* next queued request */
+};
+
+static struct irmap_predump *predump_queue; /* list head; irmap_queue_cache() pushes at the front */
+
+int irmap_queue_cache(unsigned int dev, unsigned long ino,
+ FhEntry *fh)
+{
+ struct irmap_predump *ip;
+ /* Queue dev:ino for irmap_predump_run(); returns 0, or -1 if the allocation fails. */
+ ip = xmalloc(sizeof(*ip));
+ if (!ip)
+ return -1;
+
+ ip->dev = dev;
+ ip->ino = ino;
+ ip->fh = *fh; /* shallow copy: the queued entry now carries fh->handle */
+ fh->handle = NULL; /* don't free in free_fhandle */
+
+ pr_debug("Queue %x:%lx for pre-dump\n", dev, ino);
+
+ ip->next = predump_queue;
+ predump_queue = ip;
+ return 0;
+}
+
+int irmap_predump_prep(void)
+{
+ /*
+ * Tasks are about to get released soon, but
+ * we'll need to do FS scan for irmaps. In this
+ * scan we will need to know the root dir tasks
+ * live in. Need to make sure the respective fd
+ * (service) is set to that root, so that the
+ * scan works and doesn't race with the tasks
+ * dying or changing root.
+ */
+
+ doing_predump = true; /* from now on irmap_lookup() skips its own root-fd setup */
+ return __mntns_get_root_fd(root_item->pid.real) < 0 ? -1 : 0;
+}
+
+int irmap_predump_run(void)
+{
+ int ret = 0;
+ struct cr_img *img;
+ struct irmap_predump *ip;
+ /* Resolve every queued inode and write each resolved path to the irmap cache image. */
+ img = open_image_at(AT_FDCWD, CR_FD_IRMAP_CACHE, O_DUMP);
+ if (!img)
+ return -1;
+
+ pr_info("Running irmap pre-dump\n");
+
+ for (ip = predump_queue; ip; ip = ip->next) {
+ pr_debug("\tchecking %x:%lx\n", ip->dev, ip->ino);
+ ret = check_open_handle(ip->dev, ip->ino, &ip->fh);
+ if (ret) {
+ pr_err("Failed to resolve %x:%lx\n", ip->dev, ip->ino);
+ break;
+ }
+ /* Only entries that ended up with a path are written out. */
+ if (ip->fh.path) {
+ IrmapCacheEntry ic = IRMAP_CACHE_ENTRY__INIT;
+
+ pr_info("Irmap cache %x:%lx -> %s\n", ip->dev, ip->ino, ip->fh.path);
+ ic.dev = ip->dev;
+ ic.inode = ip->ino;
+ ic.path = ip->fh.path;
+
+ ret = pb_write_one(img, &ic, PB_IRMAP_CACHE);
+ if (ret)
+ break;
+ }
+ }
+
+ close_image(img);
+ return ret;
+}
+
+static int irmap_cache_one(IrmapCacheEntry *ie)
+{
+	struct irmap *ic;
+	unsigned hv;
+	/* Copy one image entry into the in-memory cache hash; returns 0, or -1 if an allocation fails. */
+	ic = xmalloc(sizeof(*ic));
+	if (!ic)
+		return -1;
+
+	ic->dev = ie->dev;
+	ic->ino = ie->inode;
+	ic->path = xstrdup(ie->path);
+	if (!ic->path) { /* check the copy we just made, not the source string */
+		xfree(ic);
+		return -1;
+	}
+
+	ic->nr_kids = 0;
+	/*
+	 * We've loaded entry from cache, thus we'll need to check
+	 * whether it's still valid when find it in cache.
+	 */
+	ic->revalidate = true;
+
+	pr_debug("Pre-cache %x:%lx -> %s\n", ic->dev, ic->ino, ic->path);
+
+	hv = irmap_hashfn(ic->dev, ic->ino);
+	ic->next = cache[hv];
+	cache[hv] = ic;
+
+	return 0;
+}
+
+static int open_irmap_cache(struct cr_img **img)
+{
+ int dir = AT_FDCWD;
+ /* Returns 1 with *img opened, 0 if there is no cache image, -1 on error. */
+ pr_info("Searching irmap cache in work dir\n");
+in:
+ *img = open_image_at(dir, CR_FD_IRMAP_CACHE, O_RSTR);
+ if (dir != AT_FDCWD)
+ close(dir);
+
+ if (empty_image(*img)) {
+ close_image(*img);
+ if (dir == AT_FDCWD) { /* first pass only: retry once under the CR_PARENT_LINK directory */
+ pr_info("Searching irmap cache in parent\n");
+ dir = openat(get_service_fd(IMG_FD_OFF),
+ CR_PARENT_LINK, O_RDONLY);
+ if (dir >= 0)
+ goto in;
+ if (errno != ENOENT)
+ return -1;
+ }
+
+ pr_info("No irmap cache\n");
+ return 0;
+ }
+
+ if (!*img)
+ return -1;
+
+ pr_info("... done\n");
+ return 1;
+}
+
+int irmap_load_cache(void)
+{
+	int ret;
+	struct cr_img *img;
+	/* Load every entry of the irmap cache image, if one exists; 0 on success or no cache, <0 on error. */
+	ret = open_irmap_cache(&img);
+	if (ret <= 0)
+		return ret;
+
+	pr_info("Loading irmap cache\n");
+	while (1) {
+		IrmapCacheEntry *ic;
+
+		ret = pb_read_one_eof(img, &ic, PB_IRMAP_CACHE);
+		if (ret <= 0)
+			break;
+
+		ret = irmap_cache_one(ic);
+		/* irmap_cache_one() duplicates what it keeps, so release the unpacked entry on failure too */
+		irmap_cache_entry__free_unpacked(ic, NULL);
+		if (ret < 0)
+			break;
+	}
+
+	close_image(img);
+	return ret;
+}
+
+int irmap_scan_path_add(char *path)
+{
+ struct irmap_path_opt *o;
+ /* Register @path as a user-supplied scan root; the pointer is stored as is, not copied. */
+ o = xzalloc(sizeof(*o));
+ if (!o)
+ return -1;
+
+ o->ir = xzalloc(sizeof(*o->ir));
+ if (!o->ir) {
+ xfree(o);
+ return -1;
+ }
+
+ o->ir->path = path;
+ o->ir->nr_kids = -1; /* NOTE(review): presumably marks "kids not scanned yet" -- confirm in irmap_update_dir() */
+ list_add(&o->node, &opts.irmap_scan_paths);
+ return 0;
+}
diff --git a/criu/kcmp-ids.c b/criu/kcmp-ids.c
new file mode 100644
index 000000000000..853879fe071b
--- /dev/null
+++ b/criu/kcmp-ids.c
@@ -0,0 +1,153 @@
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/syscall.h>
+
+#include "asm/types.h"
+#include "rbtree.h"
+#include "util.h"
+#include "kcmp-ids.h"
+
+/*
+ * We track shared files by global rbtree, where each node might
+ * be a root for subtree. The reason for that is the nature of data
+ * we obtain from operating system.
+ *
+ * Basically OS provides us two ways to distinguish files
+ *
+ * - information obtained from fstat call
+ * - shiny new sys_kcmp system call (which may compare the file descriptor
+ * pointers inside the kernel and provide us order info)
+ *
+ * So, to speedup procedure of searching for shared file descriptors
+ * we use both techniques. From fstat call we get that named general file
+ * IDs (genid) which are carried in the main rbtree.
+ *
+ * In case if two genid are the same -- we need to use a second way and
+ * call for sys_kcmp. Thus, if kernel tells us that files have identical
+ * genid but in real they are different from kernel point of view -- we assign
+ * a second unique key (subid) to such file descriptor and put it into a subtree.
+ *
+ * So the tree will look like
+ *
+ * (root)
+ * genid-1
+ * / \
+ * genid-2 genid-3
+ * / \ / \
+ *
+ * Where each genid node might be a sub-rbtree as well
+ *
+ * (genid-N)
+ * / \
+ * subid-1 subid-2
+ * / \ / \
+ *
+ * Carrying two rbtree at once allow us to minimize the number
+ * of sys_kcmp syscalls, also to collect and dump file descriptors
+ * in one pass.
+ */
+
+struct kid_entry {
+ struct rb_node node; /* link in the main (genid) tree */
+
+ struct rb_root subtree_root; /* root of the subtree of entries sharing this genid */
+ struct rb_node subtree_node; /* link in such a subtree */
+
+ u32 subid; /* subid is always unique */
+ struct kid_elem elem; /* copy of the element this entry was created for */
+} __aligned(sizeof(long));
+
+static struct kid_entry *alloc_kid_entry(struct kid_tree *tree, struct kid_elem *elem)
+{
+ struct kid_entry *e;
+ /* Allocate an entry for @elem with the next subid of @tree; returns NULL if the allocation fails. */
+ e = xmalloc(sizeof(*e));
+ if (!e)
+ goto err;
+
+ e->subid = tree->subid++;
+ e->elem = *elem;
+
+ /* Make sure no overflow here */
+ BUG_ON(!e->subid);
+
+ rb_init_node(&e->node);
+ rb_init_node(&e->subtree_node);
+ e->subtree_root = RB_ROOT;
+ rb_link_and_balance(&e->subtree_root, &e->subtree_node,
+ NULL, &e->subtree_root.rb_node); /* a new entry is the root of its own subtree */
+err:
+ return e;
+}
+
+static u32 kid_generate_sub(struct kid_tree *tree, struct kid_entry *e,
+ struct kid_elem *elem, int *new_id)
+{
+ struct rb_node *node = e->subtree_root.rb_node;
+ struct kid_entry *sub = NULL;
+
+ struct rb_node **new = &e->subtree_root.rb_node;
+ struct rb_node *parent = NULL;
+ /* Look @elem up in @e's subtree using kcmp ordering, inserting it if absent; returns its subid, 0 on error. */
+ BUG_ON(!node);
+
+ while (node) {
+ struct kid_entry *this = rb_entry(node, struct kid_entry, subtree_node);
+ int ret = syscall(SYS_kcmp, this->elem.pid, elem->pid, tree->kcmp_type,
+ this->elem.idx, elem->idx);
+
+ parent = *new;
+ if (ret == 1)
+ node = node->rb_left, new = &((*new)->rb_left);
+ else if (ret == 2)
+ node = node->rb_right, new = &((*new)->rb_right);
+ else if (ret == 0)
+ return this->subid; /* same kernel object: reuse the existing id, *new_id is left untouched */
+ else {
+ pr_perror("kcmp failed: pid (%d %d) type %u idx (%u %u)",
+ this->elem.pid, elem->pid, tree->kcmp_type,
+ this->elem.idx, elem->idx);
+ return 0;
+ }
+ }
+
+ sub = alloc_kid_entry(tree, elem);
+ if (!sub)
+ return 0;
+
+ rb_link_and_balance(&e->subtree_root, &sub->subtree_node, parent, new);
+ *new_id = 1;
+ return sub->subid;
+}
+
+u32 kid_generate_gen(struct kid_tree *tree,
+ struct kid_elem *elem, int *new_id)
+{
+ struct rb_node *node = tree->root.rb_node;
+ struct kid_entry *e = NULL;
+
+ struct rb_node **new = &tree->root.rb_node;
+ struct rb_node *parent = NULL;
+ /* Return the id for @elem (0 on error); *new_id is set to 1 when a new entry had to be created. */
+ while (node) {
+ struct kid_entry *this = rb_entry(node, struct kid_entry, node);
+
+ parent = *new;
+ if (elem->genid < this->elem.genid)
+ node = node->rb_left, new = &((*new)->rb_left);
+ else if (elem->genid > this->elem.genid)
+ node = node->rb_right, new = &((*new)->rb_right);
+ else
+ return kid_generate_sub(tree, this, elem, new_id); /* same genid: disambiguate in the subtree */
+ }
+
+ e = alloc_kid_entry(tree, elem);
+ if (!e)
+ return 0;
+
+ rb_link_and_balance(&tree->root, &e->node, parent, new);
+ *new_id = 1;
+ return e->subid;
+
+}
+
diff --git a/criu/kerndat.c b/criu/kerndat.c
new file mode 100644
index 000000000000..eb296033e5d9
--- /dev/null
+++ b/criu/kerndat.c
@@ -0,0 +1,556 @@
+#include <unistd.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <errno.h>
+#include <sys/syscall.h>
+
+#include "log.h"
+#include "bug.h"
+#include "kerndat.h"
+#include "fs-magic.h"
+#include "mem.h"
+#include "compiler.h"
+#include "sysctl.h"
+#include "asm/types.h"
+#include "cr_options.h"
+#include "util.h"
+#include "lsm.h"
+#include "proc_parse.h"
+#include "config.h"
+
+struct kerndat_s kdat = {
+ /*
+ * TCP send receive buffers are calculated
+ * dynamically by the kernel taking into account
+ * the size of memory present on the machine.
+ *
+ * On machines with huge amount of memory it grants
+ * up to 4M for sending buffer and 6M for receiving.
+ * But in turn for low mem machines these limits
+ * are quite small down to 16K for sending and
+ * 87380 for receiving.
+ *
+ * We will find out precise limits in tcp_read_sysctl_limits
+ * but by default lets stick for small data to not fail
+ * on restore: better to slowdown restore procedure than
+ * failing completely.
+ */
+ .tcp_max_rshare = 87380,
+};
+
+static int check_pagemap(void)
+{
+	int ret, fd;
+	u64 pfn = 0;
+	/* Probe /proc/self/pagemap and record in kdat.pmap how much of it is usable; 0 on success, -1 on error. */
+	fd = __open_proc(PROC_SELF, EPERM, O_RDONLY, "pagemap");
+	if (fd < 0) {
+		if (errno == EPERM) {
+			pr_info("Pagemap disabled\n");
+			kdat.pmap = PM_DISABLED;
+			return 0;
+		}
+
+		return -1;
+	}
+
+	/* Get the PFN of some present page. Stack is here, so try it :) */
+	ret = pread(fd, &pfn, sizeof(pfn), (((unsigned long)&ret) / page_size()) * sizeof(pfn));
+	if (ret != sizeof(pfn)) {
+		pr_perror("Can't read pagemap");
+		close(fd); /* after pr_perror so errno is reported intact */
+		return -1;
+	}
+
+	close(fd);
+	if ((pfn & PME_PFRAME_MASK) == 0) {
+		pr_info("Pagemap provides flags only\n");
+		kdat.pmap = PM_FLAGS_ONLY;
+	} else {
+		pr_info("Pagemap is fully functional\n");
+		kdat.pmap = PM_FULL;
+	}
+
+	return 0;
+}
+
+/*
+ * Anonymous shared mappings are backed by hidden tmpfs
+ * mount. Find out its dev to distinguish such mappings
+ * from real tmpfs files maps.
+ */
+
+static int parse_self_maps(unsigned long vm_start, dev_t *device)
+{
+ FILE *maps;
+ char buf[1024];
+ /* Find the mapping that starts at vm_start in our own maps file and store its device in *device. */
+ maps = fopen_proc(PROC_SELF, "maps");
+ if (maps == NULL) {
+ pr_perror("Can't open self maps");
+ return -1;
+ }
+
+ while (fgets(buf, sizeof(buf), maps) != NULL) {
+ char *end, *aux;
+ unsigned long start;
+ int maj, min;
+
+ start = strtoul(buf, &end, 16);
+ if (vm_start > start)
+ continue;
+ if (vm_start < start)
+ break; /* already past vm_start: stops early, so entries are assumed to be in ascending order */
+
+ /* It's ours */
+ aux = strchr(end + 1, ' '); /* end prot */
+ aux = strchr(aux + 1, ' '); /* prot pgoff */
+ aux = strchr(aux + 1, ' '); /* pgoff dev */
+ /* NOTE(review): the strchr() results are not checked; a malformed line would dereference NULL. */
+ maj = strtoul(aux + 1, &end, 16);
+ min = strtoul(end + 1, NULL, 16);
+
+ *device = makedev(maj, min);
+ fclose(maps);
+ return 0;
+ }
+
+ fclose(maps);
+ return -1;
+}
+
+static int kerndat_get_shmemdev(void)
+{
+ void *map;
+ char maps[128];
+ struct stat buf;
+ dev_t dev;
+ /* Map one anonymous shared page and record the device that backs it in kdat.shmem_dev. */
+ map = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_ANONYMOUS, 0, 0);
+ if (map == MAP_FAILED) {
+ pr_perror("Can't mmap memory for shmemdev test");
+ return -1;
+ }
+ /* First choice: stat the map_files entry of the new mapping. */
+ sprintf(maps, "/proc/self/map_files/%lx-%lx",
+ (unsigned long)map, (unsigned long)map + page_size());
+ if (stat(maps, &buf) < 0) {
+ int e = errno;
+ if (errno == EPERM) {
+ /*
+ * Kernel disables messing with map_files.
+ * OK, let's go the slower route.
+ */
+
+ if (parse_self_maps((unsigned long)map, &dev) < 0) {
+ pr_err("Can't read self maps\n");
+ goto err;
+ }
+ } else {
+ pr_perror("Can't stat self map_files %d", e);
+ goto err;
+ }
+ } else
+ dev = buf.st_dev;
+
+ munmap(map, PAGE_SIZE);
+ kdat.shmem_dev = dev;
+ pr_info("Found anon-shmem device at %"PRIx64"\n", kdat.shmem_dev);
+ return 0;
+
+err:
+ munmap(map, PAGE_SIZE);
+ return -1;
+}
+
+static dev_t get_host_dev(unsigned int which)
+{
+ static struct kst {
+ const char *name;
+ const char *path;
+ unsigned int magic;
+ dev_t fs_dev;
+ } kstat[KERNDAT_FS_STAT_MAX] = {
+ [KERNDAT_FS_STAT_DEVPTS] = {
+ .name = "devpts",
+ .path = "/dev/pts",
+ .magic = DEVPTS_SUPER_MAGIC,
+ },
+ [KERNDAT_FS_STAT_DEVTMPFS] = {
+ .name = "devtmpfs",
+ .path = "/dev",
+ .magic = TMPFS_MAGIC,
+ },
+ [KERNDAT_FS_STAT_BINFMT_MISC] = {
+ .name = "binfmt_misc",
+ .path = "/proc/sys/fs/binfmt_misc",
+ .magic = BINFMTFS_MAGIC,
+ },
+ };
+ /* Return the device of fs @which as mounted at its fixed path (cached in the static table), 0 on failure. */
+ if (which >= KERNDAT_FS_STAT_MAX) {
+ pr_err("Wrong fs type %u passed\n", which);
+ return 0;
+ }
+
+ if (kstat[which].fs_dev == 0) {
+ struct statfs fst;
+ struct stat st;
+
+ if (statfs(kstat[which].path, &fst)) {
+ pr_perror("Unable to statefs %s", kstat[which].path);
+ return 0;
+ }
+
+ /*
+ * XXX: If the fs we need is not there, it still
+ * may mean that it's virtualized, but just not
+ * mounted on the host.
+ */
+
+ if (fst.f_type != kstat[which].magic) {
+ pr_err("%s isn't mount on the host\n", kstat[which].name);
+ return 0;
+ }
+
+ if (stat(kstat[which].path, &st)) {
+ pr_perror("Unable to stat %s", kstat[which].path);
+ return 0;
+ }
+ /* 0 doubles as the "not looked up yet" marker, so a real device must never be 0. */
+ BUG_ON(st.st_dev == 0);
+ kstat[which].fs_dev = st.st_dev;
+ }
+
+ return kstat[which].fs_dev;
+}
+
+int kerndat_fs_virtualized(unsigned int which, u32 kdev)
+{
+ dev_t host_fs_dev;
+ /* Returns 0 if @kdev is the device get_host_dev() reports for @which, 1 if it differs, -1 on error. */
+ host_fs_dev = get_host_dev(which);
+ if (host_fs_dev == 0)
+ return -1;
+
+ return (kdev_to_odev(kdev) == host_fs_dev) ? 0 : 1;
+}
+
+/*
+ * Check whether pagemap reports soft dirty bit. Kernel has
+ * this functionality under CONFIG_MEM_SOFT_DIRTY option.
+ */
+
+int kerndat_get_dirty_track(void)
+{
+	char *map;
+	int pm2;
+	u64 pmap = 0;
+	int ret = -1;
+
+	map = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE,
+		   MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
+	if (map == MAP_FAILED) {
+		pr_perror("Can't mmap memory for pagemap test");
+		return ret;
+	}
+
+	/*
+	 * Kernel shows soft-dirty bits only if this soft-dirty
+	 * was at least once re-set. (this is to be removed in
+	 * a couple of kernel releases)
+	 */
+	ret = do_task_reset_dirty_track(getpid());
+	if (ret) {
+		/* leaving before the probe: the test page must not leak */
+		munmap(map, PAGE_SIZE);
+		if (ret < 0)
+			return ret;
+		goto no_dt;
+	}
+	ret = -1;
+	pm2 = open("/proc/self/pagemap", O_RDONLY);
+	if (pm2 < 0) {
+		pr_perror("Can't open pagemap file");
+		munmap(map, PAGE_SIZE);
+		return ret;
+	}
+
+	map[0] = '\0'; /* dirty the page so its soft-dirty bit can be observed */
+	lseek(pm2, (unsigned long)map / PAGE_SIZE * sizeof(u64), SEEK_SET);
+	ret = read(pm2, &pmap, sizeof(pmap));
+	if (ret < 0)
+		pr_perror("Read pmap err!");
+	close(pm2);
+	munmap(map, PAGE_SIZE);
+
+	if (pmap & PME_SOFT_DIRTY) {
+		pr_info("Dirty track supported on kernel\n");
+		kdat.has_dirty_track = true;
+	} else {
+no_dt:
+		pr_info("Dirty tracking support is OFF\n");
+		if (opts.track_mem) {
+			pr_err("Tracking memory is not available\n");
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Strictly speaking, if there is a machine with huge amount
+ * of memory, we're allowed to send up to 4M and read up to
+ * 6M of tcp data at once. But we will figure out precise size
+ * of a limit a bit later when restore starts.
+ *
+ * Meanwhile set it up to 2M and 3M, which is safe enough to
+ * proceed without errors.
+ */
+
+static int tcp_read_sysctl_limits(void)
+{
+ u32 vect[3] = { };
+ int ret;
+ /* Lower kdat.tcp_max_rshare to the third value of net/ipv4/tcp_rmem when that sysctl is readable. */
+ struct sysctl_req req[] = {
+ { "net/ipv4/tcp_rmem", &vect, CTL_U32A(ARRAY_SIZE(vect)), CTL_FLAGS_OPTIONAL },
+ };
+
+ /*
+ * Lets figure out which exactly amount of memory is
+ * available for send/read queues on restore.
+ */
+ ret = sysctl_op(req, ARRAY_SIZE(req), CTL_READ, 0);
+ if (ret || vect[0] == 0) {
+ pr_warn("TCP mem sysctls are not available. Using defaults.\n");
+ goto out;
+ }
+ /* Never raises the limit above the built-in default, only lowers it. */
+ kdat.tcp_max_rshare = min(kdat.tcp_max_rshare, (int)vect[2]);
+
+ if (kdat.tcp_max_rshare < 128)
+ pr_warn("The memory limits for TCP queues are suspiciously small\n");
+out:
+ pr_debug("TCP recv queue memory limit is %d\n", kdat.tcp_max_rshare);
+ return 0;
+}
+
+/* The page frame number (PFN) is constant for the zero page */
+static int init_zero_page_pfn(void)
+{
+	void *addr;
+	int ret;
+	/* Record the PFN of a never-written anonymous page in kdat.zero_page_pfn (needs a PM_FULL pagemap). */
+	addr = mmap(NULL, PAGE_SIZE, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	if (addr == MAP_FAILED) {
+		pr_perror("Unable to map zero page");
+		return 0;
+	}
+
+	if (*((int *) addr) != 0) {
+		BUG();
+		return -1;
+	}
+
+	if (kdat.pmap != PM_FULL) {
+		pr_info("Zero page detection failed, optimization turns off.\n");
+		munmap(addr, PAGE_SIZE); /* this exit used to leak the probe page */
+		return 0;
+	}
+
+	ret = vaddr_to_pfn((unsigned long)addr, &kdat.zero_page_pfn);
+	munmap(addr, PAGE_SIZE);
+	if (kdat.zero_page_pfn == 0)
+		ret = -1;
+
+	return ret;
+}
+
+static int get_last_cap(void)
+{
+ struct sysctl_req req[] = {
+ { "kernel/cap_last_cap", &kdat.last_cap, CTL_U32 },
+ };
+ /* Read the kernel/cap_last_cap sysctl into kdat.last_cap; returns whatever sysctl_op() returns. */
+ return sysctl_op(req, ARRAY_SIZE(req), CTL_READ, 0);
+}
+
+#ifdef CONFIG_HAS_MEMFD
+static int kerndat_has_memfd_create(void)
+{
+	int ret;
+	/* Probe with a NULL name: ENOSYS means there is no such syscall, EFAULT means it exists. */
+	ret = syscall(SYS_memfd_create, NULL, 0);
+
+	if (ret == -1 && errno == ENOSYS)
+		kdat.has_memfd = false;
+	else if (ret == -1 && errno == EFAULT)
+		kdat.has_memfd = true;
+	else {
+		pr_err("Unexpected error %d from memfd_create(NULL, 0)\n", ret);
+		return -1; /* int return type, so callers really see -1 rather than true */
+	}
+
+	return 0;
+}
+#else
+static int kerndat_has_memfd_create(void)
+{
+	kdat.has_memfd = false;
+	return 0;
+}
+#endif
+
+static int get_task_size(void)
+{
+ kdat.task_size = task_size(); /* cache the value in kdat; this step cannot fail */
+ pr_debug("Found task size of %lx\n", kdat.task_size);
+ return 0;
+}
+
+int kerndat_fdinfo_has_lock()
+{
+ int fd, pfd = -1, exit_code = -1, len;
+ char buf[PAGE_SIZE];
+ /* Take a shared flock on /proc/locks and check whether our fdinfo for that fd mentions "lock:". */
+ fd = open("/proc/locks", O_RDONLY);
+ if (fd < 0) {
+ pr_perror("Unable to open /proc/locks");
+ return -1;
+ }
+
+ if (flock(fd, LOCK_SH)) {
+ pr_perror("Can't take a lock");
+ goto out;
+ }
+
+ pfd = open_proc(PROC_SELF, "fdinfo/%d", fd);
+ if (pfd < 0)
+ goto out;
+
+ len = read(pfd, buf, sizeof(buf) - 1);
+ if (len < 0) {
+ pr_perror("Unable to read");
+ goto out;
+ }
+ buf[len] = 0;
+
+ kdat.has_fdinfo_lock = (strstr(buf, "lock:") != NULL);
+
+ exit_code = 0;
+out:
+ close(pfd); /* pfd can still be -1 on the early error paths */
+ close(fd);
+
+ return exit_code;
+}
+
+static int get_ipv6()
+{
+ if (access("/proc/sys/net/ipv6", F_OK) < 0) { /* kdat.ipv6 records whether this directory exists */
+ if (errno == ENOENT) {
+ pr_debug("ipv6 is disabled\n");
+ kdat.ipv6 = false;
+ return 0;
+ }
+ pr_perror("Unable to access /proc/sys/net/ipv6"); /* any other errno is a real error */
+ return -1;
+ }
+ kdat.ipv6 = true;
+ return 0;
+}
+
+int kerndat_loginuid(bool only_dump)
+{
+ unsigned int saved_loginuid;
+ int ret;
+ /* Sets kdat.has_loginuid; always returns 0, a failed probe only leaves the flag false. */
+ kdat.has_loginuid = false;
+
+ /* No such file: CONFIG_AUDITSYSCALL disabled */
+ saved_loginuid = parse_pid_loginuid(getpid(), &ret, true);
+ if (ret < 0)
+ return 0;
+ /* With only_dump set, being able to read the value is enough. */
+ if (only_dump) {
+ kdat.has_loginuid = true;
+ return 0;
+ }
+
+ /*
+ * From kernel v3.13-rc2 it's possible to unset loginuid value,
+ * on that rely dump/restore code.
+ * See also: marc.info/?l=git-commits-head&m=138509506407067
+ */
+ if (prepare_loginuid(INVALID_UID, LOG_WARN) < 0)
+ return 0;
+ /* Cleaning value back as it was */
+ if (prepare_loginuid(saved_loginuid, LOG_WARN) < 0)
+ return 0;
+
+ kdat.has_loginuid = true;
+ return 0;
+}
+
+int kerndat_init(void)
+{
+ int ret;
+ /* Run the probes in order; the first non-zero result stops the chain and is returned. */
+ ret = check_pagemap();
+ if (!ret)
+ ret = kerndat_get_shmemdev();
+ if (!ret)
+ ret = kerndat_get_dirty_track();
+ if (!ret)
+ ret = init_zero_page_pfn();
+ if (!ret)
+ ret = get_last_cap();
+ if (!ret)
+ ret = kerndat_fdinfo_has_lock();
+ if (!ret)
+ ret = get_task_size();
+ if (!ret)
+ ret = get_ipv6();
+ if (!ret)
+ ret = kerndat_loginuid(true);
+ /* LSM detection runs regardless of ret. */
+ kerndat_lsm();
+
+ return ret;
+}
+
+int kerndat_init_rst(void)
+{
+ int ret;
+
+ /*
+ * Read TCP sysctls before anything else,
+ * since the limits we're interested in are
+ * not available inside namespaces.
+ */
+ /* As in kerndat_init(): the first non-zero probe result stops the chain and is returned. */
+ ret = check_pagemap();
+ if (!ret)
+ ret = tcp_read_sysctl_limits();
+ if (!ret)
+ ret = get_last_cap();
+ if (!ret)
+ ret = kerndat_has_memfd_create();
+ if (!ret)
+ ret = get_task_size();
+ if (!ret)
+ ret = get_ipv6();
+ if (!ret)
+ ret = kerndat_loginuid(false);
+ /* LSM detection runs regardless of ret. */
+ kerndat_lsm();
+
+ return ret;
+}
diff --git a/criu/libnetlink.c b/criu/libnetlink.c
new file mode 100644
index 000000000000..49c804fd7053
--- /dev/null
+++ b/criu/libnetlink.c
@@ -0,0 +1,160 @@
+#include <linux/types.h>
+#include <sys/socket.h>
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "libnetlink.h"
+#include "util.h"
+
+int parse_rtattr(struct rtattr *tb[], int max, struct rtattr *rta, int len)
+{
+ memset(tb, 0, sizeof(struct rtattr *) * (max + 1)); /* tb[] must have max + 1 slots */
+ while (RTA_OK(rta, len)) {
+ if ((rta->rta_type <= max) && (!tb[rta->rta_type]))
+ tb[rta->rta_type] = rta; /* the first attribute of each type wins */
+ rta = RTA_NEXT(rta, len);
+ }
+ if (len) /* bytes left over that RTA_OK() did not accept as an attribute */
+ pr_warn("Trimmed RTA: len %d, rta_len %d\n", len, rta->rta_len);
+ return 0;
+}
+
+static int nlmsg_receive(char *buf, int len, int (*cb)(struct nlmsghdr *, void *),
+ int (*err_cb)(int, void *), void *arg)
+{
+ struct nlmsghdr *hdr;
+ /* Returns 0 when the reply is complete, 1 when more data is expected, <0 or err_cb()'s value on error. */
+ for (hdr = (struct nlmsghdr *)buf; NLMSG_OK(hdr, len); hdr = NLMSG_NEXT(hdr, len)) {
+ if (hdr->nlmsg_seq != CR_NLMSG_SEQ)
+ continue;
+ if (hdr->nlmsg_type == NLMSG_DONE) {
+ int *len = (int *)NLMSG_DATA(hdr);
+ /* This inner len shadows the parameter; it points at the DONE payload. */
+ if (*len < 0) {
+ pr_err("ERROR %d reported by netlink (%s)\n",
+ *len, strerror(-*len));
+ return *len;
+ }
+
+ return 0;
+ }
+ if (hdr->nlmsg_type == NLMSG_ERROR) {
+ struct nlmsgerr *err = (struct nlmsgerr *)NLMSG_DATA(hdr);
+
+ if (hdr->nlmsg_len - sizeof(*hdr) < sizeof(struct nlmsgerr)) {
+ pr_err("ERROR truncated\n");
+ return -1;
+ }
+ /* An error code of 0 is treated as success. */
+ if (err->error == 0)
+ return 0;
+
+ return err_cb(err->error, arg);
+ }
+ if (cb(hdr, arg))
+ return -1;
+ }
+
+ return 1;
+}
+
+static int rtnl_return_err(int err, void *arg)
+{
+ pr_warn("ERROR %d reported by netlink\n", err); /* default error callback of do_rtnl_req(): log and pass through */
+ return err;
+}
+
+int do_rtnl_req(int nl, void *req, int size,
+ int (*receive_callback)(struct nlmsghdr *h, void *),
+ int (*error_callback)(int err, void *), void *arg)
+{
+ struct msghdr msg;
+ struct sockaddr_nl nladdr;
+ struct iovec iov;
+ static char buf[16384];
+ int err;
+ /* Send @req on socket @nl and feed the replies to the callbacks; returns 0 or a negative error. */
+ if (!error_callback)
+ error_callback = rtnl_return_err;
+
+ memset(&msg, 0, sizeof(msg));
+ msg.msg_name = &nladdr;
+ msg.msg_namelen = sizeof(nladdr);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ memset(&nladdr, 0, sizeof(nladdr));
+ nladdr.nl_family = AF_NETLINK;
+
+ iov.iov_base = req;
+ iov.iov_len = size;
+
+ if (sendmsg(nl, &msg, 0) < 0) {
+ err = -errno;
+ pr_perror("Can't send request message");
+ goto err;
+ }
+ /* From here on iov describes the static receive buffer. */
+ iov.iov_base = buf;
+ iov.iov_len = sizeof(buf);
+
+ while (1) {
+
+ memset(&msg, 0, sizeof(msg));
+ msg.msg_name = &nladdr;
+ msg.msg_namelen = sizeof(nladdr);
+ msg.msg_iov = &iov;
+ msg.msg_iovlen = 1;
+
+ err = recvmsg(nl, &msg, 0);
+ if (err < 0) {
+ if (errno == EINTR)
+ continue;
+ else {
+ err = -errno;
+ pr_perror("Error receiving nl report");
+ goto err;
+ }
+ }
+ if (err == 0)
+ break;
+
+ if (msg.msg_flags & MSG_TRUNC) {
+ pr_err("Message truncated\n");
+ err = -EMSGSIZE;
+ goto err;
+ }
+ /* nlmsg_receive(): <0 error, 0 reply complete, >0 keep receiving. */
+ err = nlmsg_receive(buf, err, receive_callback, error_callback, arg);
+ if (err < 0)
+ goto err;
+ if (err == 0)
+ break;
+ }
+
+ return 0;
+
+err:
+ return err;
+}
+
+int addattr_l(struct nlmsghdr *n, int maxlen, int type, const void *data,
+ int alen)
+{
+ int len = RTA_LENGTH(alen);
+ struct rtattr *rta;
+ /* Append one attribute (@type, @alen bytes of @data) to message @n; -1 if it would exceed @maxlen. */
+ if (NLMSG_ALIGN(n->nlmsg_len) + RTA_ALIGN(len) > maxlen) {
+ pr_err("addattr_l ERROR: message exceeded bound of %d\n", maxlen);
+ return -1;
+ }
+
+ rta = NLMSG_TAIL(n);
+ rta->rta_type = type;
+ rta->rta_len = len;
+ memcpy(RTA_DATA(rta), data, alen);
+ n->nlmsg_len = NLMSG_ALIGN(n->nlmsg_len) + RTA_ALIGN(len);
+ return 0;
+}
diff --git a/criu/log.c b/criu/log.c
new file mode 100644
index 000000000000..1435401abac5
--- /dev/null
+++ b/criu/log.c
@@ -0,0 +1,199 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <errno.h>
+#include <unistd.h>
+#include <stdbool.h>
+#include <limits.h>
+
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include <fcntl.h>
+
+#include "compiler.h"
+#include "asm/types.h"
+#include "util.h"
+#include "cr_options.h"
+#include "servicefd.h"
+
+#define DEFAULT_LOGFD STDERR_FILENO
+/* Enable timestamps if verbosity is increased from default */
+#define LOG_TIMESTAMP (DEFAULT_LOGLEVEL + 1)
+
+static unsigned int current_loglevel = DEFAULT_LOGLEVEL;
+
+static char buffer[PAGE_SIZE * 2];
+static char buf_off = 0;
+
+static struct timeval start;
+/*
+ * Manual buf len as sprintf will _always_ put '\0' at the end,
+ * but we want a "constant" pid to be there on restore
+ */
+#define TS_BUF_OFF 12
+
+static void timediff(struct timeval *from, struct timeval *to)
+{
+ to->tv_sec -= from->tv_sec; /* in place: *to becomes *to - *from */
+ if (to->tv_usec >= from->tv_usec)
+ to->tv_usec -= from->tv_usec;
+ else {
+ to->tv_sec--; /* borrow one second for the microseconds part */
+ to->tv_usec += 1000000 - from->tv_usec;
+ }
+}
+
+static void print_ts(void)
+{
+ struct timeval t;
+ /* Write the time elapsed since 'start' into the first TS_BUF_OFF bytes of buffer[]. */
+ gettimeofday(&t, NULL);
+ timediff(&start, &t);
+ snprintf(buffer, TS_BUF_OFF,
+ "(%02u.%06u)", (unsigned)t.tv_sec, (unsigned)t.tv_usec);
+ buffer[TS_BUF_OFF - 1] = ' '; /* kill the '\0' produced by snprintf */
+}
+
+int log_get_fd(void)
+{
+ int fd = get_service_fd(LOG_FD_OFF);
+ /* Fall back to DEFAULT_LOGFD (stderr) when no log service fd is available. */
+ return fd < 0 ? DEFAULT_LOGFD : fd;
+}
+
+static void reset_buf_off(void)
+{
+ if (current_loglevel >= LOG_TIMESTAMP)
+ /* reserve space for a timestamp */
+ buf_off = TS_BUF_OFF;
+ else
+ buf_off = 0; /* no timestamp: messages start at the beginning of buffer[] */
+}
+
+int log_init(const char *output)
+{
+ int new_logfd, fd;
+ /* Install the log service fd: file @output if given, else a dup of DEFAULT_LOGFD. 0 or -1. */
+ gettimeofday(&start, NULL);
+ reset_buf_off();
+
+ if (output) {
+ new_logfd = open(output, O_CREAT|O_TRUNC|O_WRONLY|O_APPEND, 0600);
+ if (new_logfd < 0) {
+ pr_perror("Can't create log file %s", output);
+ return -1;
+ }
+ } else {
+ new_logfd = dup(DEFAULT_LOGFD);
+ if (new_logfd < 0) {
+ pr_perror("Can't dup log file");
+ return -1;
+ }
+ }
+ /* new_logfd is closed right after the install attempt, whether or not it succeeded. */
+ fd = install_service_fd(LOG_FD_OFF, new_logfd);
+ close(new_logfd);
+ if (fd < 0)
+ goto err;
+
+ return 0;
+
+err:
+ pr_perror("Log engine failure, can't duplicate descriptor");
+ return -1;
+}
+
+int log_init_by_pid(void)
+{
+ char path[PATH_MAX];
+
+ /*
+ * reset buf_off as this fn is called on each fork while
+ * restoring process tree
+ */
+ reset_buf_off();
+ /* Shared log file: put a fixed "pid: " prefix into buffer[] instead of opening a new file. */
+ if (!opts.log_file_per_pid) {
+ buf_off += snprintf(buffer + buf_off, sizeof buffer - buf_off, "%6d: ", getpid());
+ return 0;
+ }
+
+ if (!opts.output)
+ return 0;
+ /* Per-pid log file: "<opts.output>.<pid>". */
+ snprintf(path, PATH_MAX, "%s.%d", opts.output, getpid());
+
+ return log_init(path);
+}
+
+void log_fini(void)
+{
+ close_service_fd(LOG_FD_OFF); /* log_get_fd() falls back to DEFAULT_LOGFD while no log service fd is set */
+}
+
+void log_set_loglevel(unsigned int level)
+{
+ if (level == LOG_UNSET) /* LOG_UNSET selects the default level */
+ current_loglevel = DEFAULT_LOGLEVEL;
+ else
+ current_loglevel = level;
+}
+
+unsigned int log_get_loglevel(void)
+{
+ return current_loglevel; /* as last set by log_set_loglevel(), DEFAULT_LOGLEVEL initially */
+}
+
+static void __print_on_level(unsigned int loglevel, const char *format, va_list params)
+{
+	int fd, size, ret, off = 0;
+	int __errno = errno;
+	/* Format one message after the fixed prefix in buffer[] and write it out; errno is preserved. */
+	if (unlikely(loglevel == LOG_MSG)) {
+		fd = STDOUT_FILENO;
+		off = buf_off; /* skip dangling timestamp */
+	} else {
+		if (loglevel > current_loglevel)
+			return;
+		fd = log_get_fd();
+		if (current_loglevel >= LOG_TIMESTAMP)
+			print_ts();
+	}
+	size = vsnprintf(buffer + buf_off, sizeof buffer - buf_off, format, params);
+	if (size >= (int)(sizeof buffer - buf_off)) /* truncated: vsnprintf reports the untruncated length */
+		size = sizeof buffer - buf_off - 1;
+	size = buf_off + (size < 0 ? 0 : size); /* on a formatting error emit just the prefix */
+	while (off < size) {
+		ret = write(fd, buffer + off, size - off);
+		if (ret <= 0)
+			break;
+		off += ret;
+	}
+	errno = __errno;
+}
+
+void print_on_level(unsigned int loglevel, const char *format, ...)
+{
+ va_list params;
+ /* Variadic front end of __print_on_level(). */
+ va_start(params, format);
+ __print_on_level(loglevel, format, params);
+ va_end(params);
+}
+
+int write_pidfile(int pid)
+{
+ int fd;
+ /* Create opts.pidfile (O_EXCL: it must not exist yet) and write @pid into it as decimal text. */
+ fd = open(opts.pidfile, O_WRONLY | O_EXCL | O_CREAT, 0600);
+ if (fd == -1) {
+ pr_perror("Can't open %s", opts.pidfile);
+ return -1;
+ }
+ /* NOTE(review): the dprintf() result is not checked, so a failed write still returns 0. */
+ dprintf(fd, "%d", pid);
+ close(fd);
+ return 0;
+}
diff --git a/criu/lsm.c b/criu/lsm.c
new file mode 100644
index 000000000000..158caf0733d5
--- /dev/null
+++ b/criu/lsm.c
@@ -0,0 +1,251 @@
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "config.h"
+#include "pstree.h"
+#include "util.h"
+#include "cr_options.h"
+
+#include "protobuf.h"
+#include "protobuf/inventory.pb-c.h"
+#include "protobuf/creds.pb-c.h"
+
+#undef CONFIG_HAS_SELINUX
+
+#ifdef CONFIG_HAS_SELINUX
+#include <selinux/selinux.h>
+#endif
+
+static Lsmtype lsmtype;
+static int (*get_label)(pid_t, char **) = NULL;
+static char *name = NULL;
+
+static int apparmor_get_label(pid_t pid, char **profile_name)
+{
+ FILE *f;
+ char *space;
+ /* Read @pid's attr/current into a malloc'ed *profile_name (NULL if "unconfined"); 0 or -1. */
+ f = fopen_proc(pid, "attr/current");
+ if (!f)
+ return -1;
+
+ if (fscanf(f, "%ms", profile_name) != 1) {
+ fclose(f);
+ pr_perror("err scanfing");
+ return -1;
+ }
+
+ fclose(f);
+
+ /*
+ * A profile name can be followed by an enforcement mode, e.g.
+ * lxc-default-with-nesting (enforced)
+ * but the profile name is just the part before the space.
+ */
+ space = strstr(*profile_name, " ");
+ if (space)
+ *space = 0;
+
+ /*
+ * An "unconfined" value means there is no profile, so we don't need to
+ * worry about trying to restore one.
+ */
+ if (strcmp(*profile_name, "unconfined") == 0) {
+ free(*profile_name);
+ *profile_name = NULL;
+ }
+
+ return 0;
+}
+
+#ifdef CONFIG_HAS_SELINUX
+static int selinux_get_label(pid_t pid, char **output)
+{
+ security_context_t ctx;
+ char *pos, *last;
+ int i;
+ /* Accept only unconfined contexts: returns 0 with *output == NULL for those, -1 otherwise. */
+ if (getpidcon_raw(pid, &ctx) < 0) {
+ pr_perror("getting selinux profile failed");
+ return -1;
+ }
+
+ *output = NULL;
+
+ /*
+ * Since SELinux attributes can be finer grained than at the task
+ * level, and we currently don't try to dump any of these other bits,
+ * let's only allow unconfined profiles, which look something like:
+ *
+ * unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023
+ */
+ pos = (char*)ctx;
+ for (i = 0; i < 3; i++) {
+ last = pos;
+ pos = strstr(pos, ":");
+ if (!pos) {
+ pr_err("Invalid selinux context %s\n", (char *)ctx);
+ freecon(ctx);
+ return -1;
+ }
+ /* Cut the field out in place so it can be checked on its own. */
+ *pos = 0;
+ if (!strstartswith(last, "unconfined_")) {
+ pr_err("Non unconfined selinux contexts not supported %s\n", last);
+ freecon(ctx);
+ return -1;
+ }
+
+ pos++;
+ }
+ freecon(ctx);
+
+ return 0;
+}
+#endif
+
+void kerndat_lsm(void)
+{
+ /* On restore, if someone passes --lsm-profile, we might end up doing
+ * detection twice, once during flag parsing and once for
+ * kerndat_init_rst(). Let's detect when we've already done detection
+ * and not do it again.
+ */
+ if (name)
+ return;
+ /* Each detection branch sets get_label, lsmtype and name together. */
+ if (access("/sys/kernel/security/apparmor", F_OK) == 0) {
+ get_label = apparmor_get_label;
+ lsmtype = LSMTYPE__APPARMOR;
+ name = "apparmor";
+ return;
+ }
+
+#ifdef CONFIG_HAS_SELINUX
+ /*
+ * This seems to be the canonical place to mount this fs if it is
+ * enabled, although we may (?) want to check /selinux for posterity as
+ * well.
+ */
+ if (access("/sys/fs/selinux", F_OK) == 0) {
+ get_label = selinux_get_label;
+ lsmtype = LSMTYPE__SELINUX;
+ name = "selinux";
+ return;
+ }
+#endif
+ /* Nothing detected. */
+ get_label = NULL;
+ lsmtype = LSMTYPE__NO_LSM;
+ name = "none";
+}
+
+Lsmtype host_lsm_type(void)
+{
+ return lsmtype; /* as set by kerndat_lsm() */
+}
+
+int collect_lsm_profile(pid_t pid, CredsEntry *ce)
+{
+ ce->lsm_profile = NULL;
+ /* Without an LSM there is nothing to collect; get_label is NULL in that case. */
+ if (lsmtype == LSMTYPE__NO_LSM)
+ return 0;
+
+ if (get_label(pid, &ce->lsm_profile) < 0)
+ return -1;
+ /* get_label() may succeed and still leave the profile NULL. */
+ if (ce->lsm_profile)
+ pr_info("%d has lsm profile %s\n", pid, ce->lsm_profile);
+
+ return 0;
+}
+
+// in inventory.c
+extern Lsmtype image_lsm;
+
+int validate_lsm(char *lsm_profile)
+{
+ if (image_lsm == LSMTYPE__NO_LSM || image_lsm == lsmtype)
+ return 0;
+ /* From here on the image's LSM differs from the detected one. */
+ /*
+ * This is really only a problem if the processes have actually
+ * specified an LSM profile. If not, we won't restore anything anyway,
+ * so it's fine.
+ */
+ if (lsm_profile) {
+ pr_err("mismatched lsm types and lsm profile specified\n");
+ return -1;
+ }
+
+ return 0;
+}
+
+int render_lsm_profile(char *profile, char **val)
+{
+ *val = NULL;
+ /* Build in *val (asprintf-allocated) the string for @profile under the detected LSM; 0 or -1. */
+ switch (lsmtype) {
+ case LSMTYPE__APPARMOR:
+ if (strcmp(profile, "unconfined") != 0 && asprintf(val, "changeprofile %s", profile) < 0) {
+ *val = NULL;
+ return -1;
+ }
+ break; /* for "unconfined" *val stays NULL and 0 is returned */
+ case LSMTYPE__SELINUX:
+ if (asprintf(val, "%s", profile) < 0) {
+ *val = NULL;
+ return -1;
+ }
+ break;
+ default:
+ return -1;
+ }
+
+ return 0;
+}
+
+int parse_lsm_arg(char *arg)
+{
+	char *aux;
+	/* Parse "<lsm>:<profile>" in place (@arg is split at the ':') into opts.lsm_profile; 0 or -1. */
+	kerndat_lsm();
+
+	aux = strchr(arg, ':');
+	if (aux == NULL) {
+		pr_err("invalid argument %s for --lsm-profile\n", arg);
+		return -1;
+	}
+
+	*aux = '\0';
+	aux++;
+
+	if (strcmp(arg, "apparmor") == 0) {
+		if (lsmtype != LSMTYPE__APPARMOR) {
+			pr_err("apparmor LSM specified but apparmor not supported by kernel\n");
+			return -1;
+		}
+
+		opts.lsm_profile = aux;
+	} else if (strcmp(arg, "selinux") == 0) {
+		if (lsmtype != LSMTYPE__SELINUX) {
+			pr_err("selinux LSM specified but selinux not supported by kernel\n");
+			return -1;
+		}
+
+		opts.lsm_profile = aux;
+	} else if (strcmp(arg, "none") == 0) {
+		opts.lsm_profile = NULL;
+	} else {
+		pr_err("unknown lsm %s\n", arg);
+		return -1;
+	}
+
+	opts.lsm_supplied = true;
+
+	return 0;
+}
diff --git a/criu/mem.c b/criu/mem.c
new file mode 100644
index 000000000000..332f1928bb2d
--- /dev/null
+++ b/criu/mem.c
@@ -0,0 +1,473 @@
+#include <unistd.h>
+#include <stdio.h>
+#include <sys/mman.h>
+#include <errno.h>
+#include <fcntl.h>
+
+#include "cr_options.h"
+#include "servicefd.h"
+#include "mem.h"
+#include "parasite-syscall.h"
+#include "parasite.h"
+#include "page-pipe.h"
+#include "page-xfer.h"
+#include "log.h"
+#include "kerndat.h"
+#include "stats.h"
+#include "vma.h"
+#include "shmem.h"
+#include "pstree.h"
+#include "restorer.h"
+#include "files-reg.h"
+#include "pagemap-cache.h"
+
+#include "protobuf.h"
+#include "protobuf/pagemap.pb-c.h"
+
+/*
+ * Reset dirty tracking for @pid when memory tracking is requested.
+ *
+ * A no-op returning 0 if opts.track_mem is off. Otherwise the kernel
+ * must have dirty tracking (kdat.has_dirty_track) and the reset must
+ * not report "unsupported" (1); both are treated as bugs.
+ */
+static int task_reset_dirty_track(int pid)
+{
+	int err = 0;
+
+	if (opts.track_mem) {
+		BUG_ON(!kdat.has_dirty_track);
+
+		err = do_task_reset_dirty_track(pid);
+		BUG_ON(err == 1);
+	}
+
+	return err;
+}
+
+/*
+ * Write "4" into the "clear_refs" proc entry of @pid.
+ *
+ * Returns 0 on success, 1 if the entry can't be opened due to EACCES
+ * or the write fails with EINVAL (no clear-soft-dirty in kernel), and
+ * -1 on any other error.
+ */
+int do_task_reset_dirty_track(int pid)
+{
+	char cmd[] = "4";
+	int fd, ret = 0;
+
+	pr_info("Reset %d's dirty tracking\n", pid);
+
+	fd = __open_proc(pid, EACCES, O_RDWR, "clear_refs");
+	if (fd < 0) {
+		if (errno == EACCES)
+			return 1;
+		return -1;
+	}
+
+	if (write(fd, cmd, sizeof(cmd)) >= 0) {
+		pr_info(" ... done\n");
+	} else if (errno == EINVAL) {
+		/* No clear-soft-dirty in kernel */
+		ret = 1;
+	} else {
+		pr_perror("Can't reset %d's dirty memory tracker (%d)\n", pid, errno);
+		ret = -1;
+	}
+
+	close(fd);
+	return ret;
+}
+
+/*
+ * Size of the parasite argument area needed to dump @vmas: the fixed
+ * header, one parasite_vma_entry per vma, and the iovec array.
+ */
+unsigned int dump_pages_args_size(struct vm_area_list *vmas)
+{
+	size_t total = sizeof(struct parasite_dump_pages_args);
+
+	total += vmas->nr * sizeof(struct parasite_vma_entry);
+	/* In the worst case I need one iovec for each page */
+	total += (vmas->priv_size + 1) * sizeof(struct iovec);
+
+	return total;
+}
+
+/*
+ * Tell whether the page described by pagemap entry @pme, belonging to
+ * the area @vmae, has to be put into the dump.
+ */
+static inline bool should_dump_page(VmaEntry *vmae, u64 pme)
+{
+#ifdef CONFIG_VDSO
+	/*
+	 * vDSO area must be always dumped because on restore
+	 * we might need to generate a proxy.
+	 */
+	if (vma_entry_is(vmae, VMA_AREA_VDSO))
+		return true;
+
+	/*
+	 * In turn VVAR area is special and referenced from
+	 * vDSO area by IP addressing (at least on x86) thus
+	 * never ever dump its content but always use one provided
+	 * by the kernel on restore, ie runtime VVAR area must
+	 * be remapped into proper place..
+	 */
+	if (vma_entry_is(vmae, VMA_AREA_VVAR))
+		return false;
+#endif
+	/*
+	 * Optimisation for private mapping pages that haven't
+	 * been COW-ed yet
+	 */
+	if (vma_entry_is(vmae, VMA_FILE_PRIVATE) && (pme & PME_FILE))
+		return false;
+
+	/* Swapped-out pages, and present pages other than the zero page */
+	return (pme & PME_SWAP) ||
+	       ((pme & PME_PRESENT) && ((pme & PME_PFRAME_MASK) != kdat.zero_page_pfn));
+}
+
+/*
+ * True when the page behind @pme is not soft-dirty while both memory
+ * tracking and parent images are in use.
+ */
+static inline bool page_in_parent(u64 pme)
+{
+	/*
+	 * If we do memory tracking, but w/o parent images,
+	 * then we have to dump all memory
+	 */
+	if (!opts.track_mem || !opts.img_parent)
+		return false;
+
+	return !(pme & PME_SOFT_DIRTY);
+}
+
+/*
+ * This routine finds out what memory regions to grab from the
+ * dumpee. The iovs generated are then fed into vmsplice to
+ * put the memory into the page-pipe's pipe.
+ *
+ * "Holes" in page-pipe are regions that should be dumped, but
+ * whose memory contents are present in the parent image set.
+ *
+ * @vma: area being scanned
+ * @pp: page-pipe that receives pages and holes
+ * @map: pagemap entries of @vma, indexed by page number
+ * @off: in/out byte offset into @vma; advanced past every page
+ * handled, so the scan can be resumed after a non-zero return
+ * @has_parent: when false, pages are never turned into holes
+ *
+ * Returns 0 once the rest of the vma is scanned, otherwise the
+ * non-zero value from page_pipe_add_hole()/page_pipe_add_page().
+ */
+
+static int generate_iovs(struct vma_area *vma, struct page_pipe *pp, u64 *map, u64 *off, bool has_parent)
+{
+ u64 *at = &map[PAGE_PFN(*off)];
+ unsigned long pfn, nr_to_scan;
+ /* pages[0] counts holes, pages[1] counts pages to grab */
+ unsigned long pages[2] = {};
+
+ nr_to_scan = (vma_area_len(vma) - *off) / PAGE_SIZE;
+
+ for (pfn = 0; pfn < nr_to_scan; pfn++) {
+ unsigned long vaddr;
+ int ret;
+
+ if (!should_dump_page(vma->e, at[pfn]))
+ continue;
+
+ vaddr = vma->e->start + *off + pfn * PAGE_SIZE;
+
+ /*
+ * If we're doing incremental dump (parent images
+ * specified) and page is not soft-dirty -- we dump
+ * hole and expect the parent images to contain this
+ * page. The latter would be checked in page-xfer.
+ */
+
+ if (has_parent && page_in_parent(at[pfn])) {
+ ret = page_pipe_add_hole(pp, vaddr);
+ pages[0]++;
+ } else {
+ ret = page_pipe_add_page(pp, vaddr);
+ pages[1]++;
+ }
+
+ if (ret) {
+ /*
+ * Remember where we stopped. The counters below are
+ * not updated on this path.
+ */
+ *off += pfn * PAGE_SIZE;
+ return ret;
+ }
+ }
+
+ *off += pfn * PAGE_SIZE;
+
+ cnt_add(CNT_PAGES_SCANNED, nr_to_scan);
+ cnt_add(CNT_PAGES_SKIPPED_PARENT, pages[0]);
+ cnt_add(CNT_PAGES_WRITTEN, pages[1]);
+
+ pr_info("Pagemap generated: %lu pages %lu holes\n", pages[1], pages[0]);
+ return 0;
+}
+
+/*
+ * Set up the parasite argument area for dumping @vma_area_list and
+ * fill its vma array with the private areas that lack PROT_READ.
+ *
+ * Returns the argument area obtained from parasite_args_s().
+ */
+static struct parasite_dump_pages_args *prep_dump_pages_args(struct parasite_ctl *ctl,
+		struct vm_area_list *vma_area_list)
+{
+	struct parasite_dump_pages_args *args;
+	struct parasite_vma_entry *slot;
+	struct vma_area *vma;
+
+	args = parasite_args_s(ctl, dump_pages_args_size(vma_area_list));
+	args->nr_vmas = 0;
+
+	slot = pargs_vmas(args);
+	list_for_each_entry(vma, &vma_area_list->h, list) {
+		/* Only private areas that are not readable are listed */
+		if (!vma_area_is_private(vma, kdat.task_size) ||
+		    (vma->e->prot & PROT_READ))
+			continue;
+
+		slot->start = vma->e->start;
+		slot->len = vma_area_len(vma);
+		slot->prot = vma->e->prot;
+
+		slot++;
+		args->nr_vmas++;
+	}
+
+	return args;
+}
+
+/*
+ * Run PARASITE_CMD_DUMPPAGES once per buffer of @pp, handing the
+ * parasite the write end of that buffer's pipe, and then, if @xfer is
+ * given, write the collected pages out through it.
+ *
+ * args->off is advanced by the segments of each buffer and is not
+ * reset here.
+ *
+ * Returns 0 on success, -1 if a parasite step fails, or the result of
+ * page_xfer_dump_pages().
+ */
+static int dump_pages(struct page_pipe *pp, struct parasite_ctl *ctl,
+ struct parasite_dump_pages_args *args, struct page_xfer *xfer)
+{
+ struct page_pipe_buf *ppb;
+ int ret = 0;
+
+ debug_show_page_pipe(pp);
+
+ /* Step 2 -- grab pages into page-pipe */
+ list_for_each_entry(ppb, &pp->bufs, l) {
+ args->nr_segs = ppb->nr_segs;
+ args->nr_pages = ppb->pages_in;
+ pr_debug("PPB: %d pages %d segs %u pipe %d off\n",
+ args->nr_pages, args->nr_segs, ppb->pipe_size, args->off);
+
+ /* Start the command, pass the pipe fd, then wait for the ack */
+ ret = __parasite_execute_daemon(PARASITE_CMD_DUMPPAGES, ctl);
+ if (ret < 0)
+ return -1;
+ ret = parasite_send_fd(ctl, ppb->p[1]);
+ if (ret)
+ return -1;
+
+ ret = __parasite_wait_daemon_ack(PARASITE_CMD_DUMPPAGES, ctl);
+ if (ret < 0)
+ return -1;
+
+ args->off += args->nr_segs;
+ }
+
+ /*
+ * Step 3 -- write pages into image (or delay writing for
+ * pre-dump action (see pre_dump_one_task)
+ */
+ if (xfer) {
+ timing_start(TIME_MEMWRITE);
+ ret = page_xfer_dump_pages(xfer, pp, 0);
+ timing_stop(TIME_MEMWRITE);
+ }
+
+ return ret;
+}
+
+/*
+ * Dump the private memory of the seized task behind @ctl.
+ *
+ * @args: parasite argument area prepared by prep_dump_pages_args()
+ * @vma_area_list: areas of the task
+ * @pp_ret: when NULL the pages are written out through a page-xfer and
+ * the page-pipe is destroyed here; when non-NULL writing is delayed
+ * and the filled page-pipe is handed back in *pp_ret on success
+ *
+ * Returns 0 on success and non-zero on failure.
+ */
+static int __parasite_dump_pages_seized(struct parasite_ctl *ctl,
+		struct parasite_dump_pages_args *args,
+		struct vm_area_list *vma_area_list,
+		struct page_pipe **pp_ret)
+{
+	pmc_t pmc = PMC_INIT;
+	struct page_pipe *pp;
+	struct vma_area *vma_area;
+	struct page_xfer xfer = { .parent = NULL };
+	int ret = -1;
+
+	pr_info("\n");
+	pr_info("Dumping pages (type: %d pid: %d)\n", CR_FD_PAGES, ctl->pid.real);
+	pr_info("----------------------------------------\n");
+
+	timing_start(TIME_MEMDUMP);
+
+	pr_debug(" Private vmas %lu/%lu pages\n",
+		 vma_area_list->longest, vma_area_list->priv_size);
+
+	/*
+	 * Step 0 -- prepare
+	 */
+
+	if (pmc_init(&pmc, ctl->pid.real, &vma_area_list->h,
+		     vma_area_list->longest * PAGE_SIZE))
+		return -1;
+
+	ret = -1;
+	pp = create_page_pipe(vma_area_list->priv_size,
+			      pargs_iovs(args), pp_ret == NULL);
+	if (!pp)
+		goto out;
+
+	if (pp_ret == NULL) {
+		ret = open_page_xfer(&xfer, CR_FD_PAGEMAP, ctl->pid.virt);
+		if (ret < 0)
+			goto out_pp;
+	} else {
+		ret = check_parent_page_xfer(CR_FD_PAGEMAP, ctl->pid.virt);
+		if (ret < 0)
+			goto out_pp;
+
+		/*
+		 * No xfer is opened in this mode; xfer.parent is only
+		 * used below as a non-NULL "has parent" flag.
+		 */
+		if (ret)
+			xfer.parent = NULL + 1;
+	}
+
+	/*
+	 * Step 1 -- generate the pagemap
+	 */
+	args->off = 0;
+	list_for_each_entry(vma_area, &vma_area_list->h, list) {
+		u64 off = 0;
+		u64 *map;
+
+		if (!vma_area_is_private(vma_area, kdat.task_size))
+			continue;
+
+		map = pmc_get_map(&pmc, vma_area);
+		if (!map) {
+			/*
+			 * ret still holds the (non-negative) result of
+			 * the previous step, so set the error explicitly.
+			 */
+			ret = -1;
+			goto out_xfer;
+		}
+again:
+		ret = generate_iovs(vma_area, pp, map, &off, xfer.parent);
+		if (ret == -EAGAIN) {
+			/* Page-pipe is full: flush it and resume from off */
+			BUG_ON(pp_ret);
+
+			ret = dump_pages(pp, ctl, args, &xfer);
+			if (ret)
+				goto out_xfer;
+			page_pipe_reinit(pp);
+			goto again;
+		}
+		if (ret < 0)
+			goto out_xfer;
+	}
+
+	ret = dump_pages(pp, ctl, args, pp_ret ? NULL : &xfer);
+	if (ret)
+		goto out_xfer;
+
+	timing_stop(TIME_MEMDUMP);
+
+	if (pp_ret)
+		*pp_ret = pp;
+
+	/*
+	 * Step 4 -- clean up
+	 */
+
+	ret = task_reset_dirty_track(ctl->pid.real);
+out_xfer:
+	if (pp_ret == NULL)
+		xfer.close(&xfer);
+out_pp:
+	/* The pipe survives only when it is handed back successfully */
+	if (ret || !pp_ret)
+		destroy_page_pipe(pp);
+out:
+	pmc_fini(&pmc);
+	pr_info("----------------------------------------\n");
+	return ret;
+}
+
+/*
+ * Dump the memory of a seized task: make the listed areas readable,
+ * dump the pages, then restore the original protection.
+ *
+ * Returns 0 on success; non-zero if unprotecting or dumping fails, and
+ * -1 if the protection can't be rolled back.
+ */
+int parasite_dump_pages_seized(struct parasite_ctl *ctl,
+		struct vm_area_list *vma_area_list, struct page_pipe **pp)
+{
+	struct parasite_dump_pages_args *pargs = prep_dump_pages_args(ctl, vma_area_list);
+	int ret;
+
+	/*
+	 * Add PROT_READ protection for all VMAs we're about to
+	 * dump if they don't have one. Otherwise we'll not be
+	 * able to read the memory contents.
+	 */
+	pargs->add_prot = PROT_READ;
+	ret = parasite_execute_daemon(PARASITE_CMD_MPROTECT_VMAS, ctl);
+	if (ret != 0) {
+		pr_err("Can't dump unprotect vmas with parasite\n");
+		return ret;
+	}
+
+	ret = __parasite_dump_pages_seized(ctl, pargs, vma_area_list, pp);
+	if (ret != 0)
+		pr_err("Can't dump page with parasite\n");
+
+	/* Afterwards -- reprotect memory back, even if the dump failed */
+	pargs->add_prot = 0;
+	if (parasite_execute_daemon(PARASITE_CMD_MPROTECT_VMAS, ctl) != 0) {
+		pr_err("Can't rollback unprotected vmas with parasite\n");
+		ret = -1;
+	}
+
+	return ret;
+}
+
+/*
+ * Look up the file descriptor for @vma by its shmid via
+ * collect_special_file() and store it in vma->vmfd.
+ *
+ * Returns 0 on success, -1 if the file isn't found.
+ */
+static inline int collect_filemap(struct vma_area *vma)
+{
+	struct file_desc *desc = collect_special_file(vma->e->shmid);
+
+	if (desc == NULL)
+		return -1;
+
+	vma->vmfd = desc;
+	return 0;
+}
+
+/*
+ * Read the mm image of task @i and collect its VMAs into the task's
+ * restore info (rsti(i)->vmas).
+ *
+ * VMAs come from the mm entry itself or, when it carries none (old
+ * image), from the separate vma image.
+ *
+ * Returns 0 on success (also when the mm image holds no entry) and a
+ * negative value on error.
+ */
+int prepare_mm_pid(struct pstree_item *i)
+{
+	pid_t pid = i->pid.virt;
+	int ret = -1, vn = 0;
+	struct cr_img *img;
+	struct rst_info *ri = rsti(i);
+
+	img = open_image(CR_FD_MM, O_RSTR, pid);
+	if (!img)
+		return -1;
+
+	ret = pb_read_one_eof(img, &ri->mm, PB_MM);
+	close_image(img);
+	if (ret <= 0)
+		return ret;
+
+	if (collect_special_file(ri->mm->exe_file_id) == NULL)
+		return -1;
+
+	pr_debug("Found %zd VMAs in image\n", ri->mm->n_vmas);
+	img = NULL;
+	if (ri->mm->n_vmas == 0) {
+		/*
+		 * Old image. Read VMAs from vma-.img
+		 */
+		img = open_image(CR_FD_VMAS, O_RSTR, pid);
+		if (!img)
+			return -1;
+	}
+
+	while (vn < ri->mm->n_vmas || img != NULL) {
+		struct vma_area *vma;
+
+		ret = -1;
+		vma = alloc_vma_area();
+		if (!vma)
+			break;
+
+		ret = 0;
+		if (!img)
+			vma->e = ri->mm->vmas[vn++];
+		else {
+			ret = pb_read_one_eof(img, &vma->e, PB_VMA);
+			if (ret <= 0) {
+				/* EOF or read error: nothing to collect */
+				xfree(vma);
+				break;
+			}
+		}
+
+		/*
+		 * Account the area only once its entry is in hand, so
+		 * that vmas.nr matches the number of list members even
+		 * when the loop ends on EOF of the old-format image.
+		 */
+		ri->vmas.nr++;
+		list_add_tail(&vma->list, &ri->vmas.h);
+
+		if (vma_area_is_private(vma, kdat.task_size)) {
+			ri->vmas.priv_size += vma_area_len(vma);
+			if (vma->e->flags & MAP_GROWSDOWN)
+				ri->vmas.priv_size += PAGE_SIZE;
+		}
+
+		pr_info("vma 0x%"PRIx64" 0x%"PRIx64"\n", vma->e->start, vma->e->end);
+
+		if (vma_area_is(vma, VMA_ANON_SHARED) &&
+		    !vma_area_is(vma, VMA_AREA_SYSVIPC))
+			ret = collect_shmem(pid, vma->e);
+		else if (vma_area_is(vma, VMA_FILE_PRIVATE) ||
+			 vma_area_is(vma, VMA_FILE_SHARED))
+			ret = collect_filemap(vma);
+		else
+			ret = 0;
+		if (ret)
+			break;
+	}
+
+	/* Single exit for the old-format image: no leak on error paths */
+	if (img)
+		close_image(img);
+
+	return ret;
+}
+
+
diff --git a/criu/mount.c b/criu/mount.c
new file mode 100644
index 000000000000..05cf6cf0a0f9
--- /dev/null
+++ b/criu/mount.c
@@ -0,0 +1,3455 @@
+#include <stdio.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <string.h>
+#include <stdlib.h>
+#include <sys/mount.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sched.h>
+
+#include "cr_options.h"
+#include "asm/types.h"
+#include "util.h"
+#include "util-pie.h"
+#include "log.h"
+#include "plugin.h"
+#include "mount.h"
+#include "pstree.h"
+#include "proc_parse.h"
+#include "image.h"
+#include "namespaces.h"
+#include "protobuf.h"
+#include "kerndat.h"
+#include "fs-magic.h"
+#include "sysfs_parse.h"
+
+#include "protobuf/mnt.pb-c.h"
+#include "protobuf/binfmt-misc.pb-c.h"
+
+#define AUTODETECTED_MOUNT "CRIU:AUTOGENERATED"
+#define MS_PROPAGATE (MS_SHARED | MS_PRIVATE | MS_UNBINDABLE | MS_SLAVE)
+
+#undef LOG_PREFIX
+#define LOG_PREFIX "mnt: "
+
+/*
+ * Register an external mount mapping @key -> @val in opts.ext_mounts.
+ * The strings are stored by pointer, not copied.
+ *
+ * Returns 0 on success, -1 if the entry can't be allocated.
+ */
+int ext_mount_add(char *key, char *val)
+{
+	struct ext_mount *em = xmalloc(sizeof(*em));
+
+	if (em == NULL)
+		return -1;
+
+	em->val = val;
+	em->key = key;
+	list_add_tail(&em->list, &opts.ext_mounts);
+
+	pr_info("Added %s:%s ext mount mapping\n", key, val);
+	return 0;
+}
+
+/*
+ * Find the entry of opts.ext_mounts whose key equals @key.
+ * Returns NULL if there is none.
+ */
+static struct ext_mount *ext_mount_lookup(char *key)
+{
+	struct ext_mount *em;
+
+	list_for_each_entry(em, &opts.ext_mounts, list) {
+		if (strcmp(em->key, key) == 0)
+			return em;
+	}
+
+	return NULL;
+}
+
+/*
+ * Single linked list of mount points get from proc/images
+ */
+struct mount_info *mntinfo;
+
+/*
+ * Append @new to the tail of the global mntinfo list.
+ * new->next is left as the caller set it.
+ */
+static void mntinfo_add_list(struct mount_info *new)
+{
+	struct mount_info **link = &mntinfo;
+
+	/* Walk to the terminating link. (FIXME -- make O(1) ) */
+	while (*link != NULL)
+		link = &(*link)->next;
+
+	*link = new;
+}
+
+static int open_mountpoint(struct mount_info *pm);
+
+static struct mount_info *mnt_build_tree(struct mount_info *list, struct mount_info *roots_mp);
+static int validate_mounts(struct mount_info *info, bool for_dump);
+
+/*
+ * True if @p is exactly "/".
+ * Absolute paths are used on dump and relative paths are used on restore.
+ */
+static inline int is_root(char *p)
+{
+	return strcmp(p, "/") == 0;
+}
+
+/*
+ * True for the root mount (the topmost one): its mountpoint, with the
+ * first character skipped, is "/".
+ */
+static inline int is_root_mount(struct mount_info *mi)
+{
+	char *path = mi->mountpoint + 1;
+
+	return is_root(path);
+}
+
+/*
+ * True if the mount exposes the root of its filesystem, i.e. mi->root
+ * is "/".
+ *
+ * Used to decide whether mounting has to be postponed: a bind mount of
+ * some subdir of a disk needs the disk's root mount first and is then
+ * bind-mounted from it. See do_mount_one().
+ */
+static inline int fsroot_mounted(struct mount_info *mi)
+{
+	char *fs_path = mi->root;
+
+	return is_root(fs_path);
+}
+
+/*
+ * Search @list for an overlayfs mount under which @rpath resolves to
+ * the file identified by @st_dev/@st_ino.
+ *
+ * Returns the last matching entry, NULL if none matches, or an ERR_PTR
+ * if the root fd can't be obtained or the path doesn't fit PATH_MAX.
+ * @mnt_id is not used here.
+ */
+static struct mount_info *__lookup_overlayfs(struct mount_info *list, char *rpath,
+ unsigned int st_dev, unsigned int st_ino,
+ unsigned int mnt_id)
+{
+ /*
+ * Goes through all entries in the mountinfo table
+ * looking for a mount point that contains the file specified
+ * in rpath. Uses the device number st_dev and the inode number st_ino
+ * to make sure the file is correct.
+ */
+ struct mount_info *mi_ret = NULL;
+ struct mount_info *m;
+ int mntns_root = -1;
+
+ for (m = list; m != NULL; m = m->next) {
+ struct stat f_stat;
+ int ret_stat;
+
+ if (m->fstype->code != FSTYPE__OVERLAYFS)
+ continue;
+
+ /*
+ * We need the mntns root fd of the process to be dumped,
+ * to make sure we stat the correct file
+ */
+ if (mntns_root == -1) {
+ /* Fetched lazily, on the first overlayfs entry only */
+ mntns_root = __mntns_get_root_fd(root_item->pid.real);
+ if (mntns_root < 0) {
+ pr_err("Unable to get the root file descriptor of pid %d\n", root_item->pid.real);
+ return ERR_PTR(-ENOENT);
+ }
+ }
+
+ /* Concatenates m->mountpoint with rpath and attempts to stat the resulting path */
+ if (is_root_mount(m)) {
+ ret_stat = fstatat(mntns_root, rpath, &f_stat, 0);
+ } else {
+ char _full_path[PATH_MAX];
+ int n = snprintf(_full_path, PATH_MAX, "%s/%s", m->mountpoint, rpath);
+
+ if (n >= PATH_MAX) {
+ pr_err("Not enough space to concatenate %s and %s\n", m->mountpoint, rpath);
+ return ERR_PTR(-ENOSPC);
+ }
+ ret_stat = fstatat(mntns_root, _full_path, &f_stat, 0);
+ }
+
+ /* No early exit: a later match overrides an earlier one */
+ if (ret_stat == 0 && st_dev == f_stat.st_dev && st_ino == f_stat.st_ino)
+ mi_ret = m;
+ }
+
+ return mi_ret;
+}
+
+/*
+ * Looks up the mount of a file that lives in an overlayFS directory.
+ *
+ * This is useful in order to fix the OverlayFS bug present in the
+ * Linux Kernel before version 4.2. See fixup_overlayfs for details.
+ *
+ * First the (st_dev, mnt_id) pair is checked against the mountinfo
+ * table: if some entry already carries both, the mnt_id is correct,
+ * no fixup is needed and NULL is returned.
+ *
+ * Otherwise the overlayFS entries of the table are searched: each
+ * mountpoint is concatenated with the file name and the resulting path
+ * is stat-ed to compare the device id and inode number. The result of
+ * that search (see __lookup_overlayfs) is returned as is.
+ */
+struct mount_info *lookup_overlayfs(char *rpath, unsigned int st_dev,
+		unsigned int st_ino, unsigned int mnt_id)
+{
+	struct mount_info *m = mntinfo;
+
+	while (m != NULL) {
+		/* A matching entry means the mnt_id is already right */
+		if (st_dev == m->s_dev && mnt_id == m->mnt_id)
+			return NULL;
+		m = m->next;
+	}
+
+	return __lookup_overlayfs(mntinfo, rpath, st_dev, st_ino, mnt_id);
+}
+
+/*
+ * Return the first entry of @list whose mnt_id equals @id, or NULL.
+ */
+static struct mount_info *__lookup_mnt_id(struct mount_info *list, int id)
+{
+	struct mount_info *cur = list;
+
+	while (cur != NULL && cur->mnt_id != id)
+		cur = cur->next;
+
+	return cur;
+}
+
+/* Find a mount by its mnt_id in the global mntinfo list. */
+struct mount_info *lookup_mnt_id(unsigned int id)
+{
+	struct mount_info *found = __lookup_mnt_id(mntinfo, id);
+
+	return found;
+}
+
+/*
+ * Return the first entry of the global mntinfo list whose s_dev equals
+ * @s_dev, or NULL.
+ */
+struct mount_info *lookup_mnt_sdev(unsigned int s_dev)
+{
+	struct mount_info *cur = mntinfo;
+
+	while (cur != NULL) {
+		if (cur->s_dev == s_dev)
+			return cur;
+		cur = cur->next;
+	}
+
+	return NULL;
+}
+
+/*
+ * Walk down @mntinfo_tree following @path: at each level step into the
+ * first child whose mountpoint (first character skipped) is a path
+ * prefix of @path, and stop when no child matches.
+ *
+ * Returns the deepest mount reached; the tree root if nothing matches.
+ */
+static struct mount_info *mount_resolve_path(struct mount_info *mntinfo_tree, const char *path)
+{
+ size_t pathlen = strlen(path);
+ struct mount_info *m = mntinfo_tree, *c;
+
+ while (1) {
+ list_for_each_entry(c, &m->children, siblings) {
+ size_t n;
+
+ n = strlen(c->mountpoint + 1);
+ if (n > pathlen)
+ continue;
+
+ if (strncmp(c->mountpoint + 1, path, min(n, pathlen)))
+ continue;
+ /* The prefix must end on a path component boundary */
+ if (n < pathlen && path[n] != '/')
+ continue;
+
+ m = c;
+ break;
+ }
+ /*
+ * True only when the scan above ran off the end of the list,
+ * i.e. no child matched. After a match m == c, and c's
+ * siblings link differs from its own children head.
+ */
+ if (&c->siblings == &m->children)
+ break;
+ }
+
+ pr_debug("Path `%s' resolved to `%s' mountpoint\n", path, m->mountpoint);
+ return m;
+}
+
+/*
+ * Translate the @st_dev of a file at @path in namespace @ns into the
+ * device id to compare against mountinfo.
+ */
+dev_t phys_stat_resolve_dev(struct ns_id *ns, dev_t st_dev, const char *path)
+{
+	struct mount_info *m = mount_resolve_path(ns->mnt.mntinfo_tree, path);
+	int on_btrfs = (strcmp(m->fstype->name, "btrfs") == 0);
+
+	/*
+	 * BTRFS returns subvolume dev-id instead of
+	 * superblock dev-id, in such case return device
+	 * obtained from mountinfo (ie subvolume0).
+	 */
+	return on_btrfs ? m->s_dev : MKKDEV(major(st_dev), minor(st_dev));
+}
+
+/*
+ * Tell whether @st_dev refers to the same device as @phys_dev, either
+ * directly (after kdev_to_odev() conversion) or after resolving
+ * @st_dev through the mount tree of @ns for @path.
+ */
+bool phys_stat_dev_match(dev_t st_dev, dev_t phys_dev,
+		struct ns_id *ns, const char *path)
+{
+	/* The tree lookup is only done when the direct comparison fails */
+	return st_dev == kdev_to_odev(phys_dev) ||
+	       phys_dev == phys_stat_resolve_dev(ns, st_dev, path);
+}
+
+/*
+ * Compare super-blocks mounted at two places: same device, same
+ * fstype, same source and same options.
+ */
+static bool mounts_sb_equal(struct mount_info *a, struct mount_info *b)
+{
+	if (a->s_dev != b->s_dev || a->fstype != b->fstype)
+		return false;
+	if (strcmp(a->source, b->source) != 0)
+		return false;
+
+	return strcmp(a->options, b->options) == 0;
+}
+
+/*
+ * Compare superblocks AND the way they are mounted: the roots and the
+ * last path component of the mountpoints must match too.
+ */
+static bool mounts_equal(struct mount_info *a, struct mount_info *b)
+{
+	return mounts_sb_equal(a, b) &&
+	       strcmp(a->root, b->root) == 0 &&
+	       strcmp(basename(a->mountpoint), basename(b->mountpoint)) == 0;
+}
+
+/*
+ * mnt_roots is a temporary directory for restoring sub-trees of
+ * non-root namespaces.
+ */
+static char *mnt_roots;
+
+/*
+ * Link the entries of @list into a tree through their
+ * mnt_id/parent_mnt_id relation.
+ *
+ * @tmp_root_mount, if given, becomes the parent of every parentless
+ * namespace root met after the main root, and is itself attached under
+ * the main root at the end.
+ *
+ * Returns the root of the tree, or NULL on error.
+ */
+static struct mount_info *mnt_build_ids_tree(struct mount_info *list, struct mount_info *tmp_root_mount)
+{
+ struct mount_info *m, *root = NULL;
+
+ /*
+ * Just resolve the mnt_id:parent_mnt_id relations
+ */
+
+ pr_debug("\tBuilding plain mount tree\n");
+ for (m = list; m != NULL; m = m->next) {
+ struct mount_info *parent;
+
+ pr_debug("\t\tWorking on %d->%d\n", m->mnt_id, m->parent_mnt_id);
+
+ if (m->mnt_id != m->parent_mnt_id)
+ parent = __lookup_mnt_id(list, m->parent_mnt_id);
+ else /* a circular mount reference. It's rootfs or smth like it. */
+ parent = NULL;
+
+ if (!parent) {
+ /* This should be / */
+ if (root == NULL && is_root_mount(m)) {
+ root = m;
+ continue;
+ }
+
+ pr_debug("Mountpoint %d (@%s) w/o parent %d\n",
+ m->mnt_id, m->mountpoint, m->parent_mnt_id);
+
+ /* Only accepted once the main root has been seen */
+ if (root && m->is_ns_root) {
+ if (!mounts_sb_equal(root, m) ||
+ strcmp(root->root, m->root)) {
+ pr_err("Nested mount namespaces with different "
+ "roots %d (@%s %s) %d (@%s %s) are not supported yet\n",
+ root->mnt_id, root->mountpoint, root->root,
+ m->mnt_id, m->mountpoint, m->root);
+ return NULL;
+ }
+
+ /*
+ * A root of a sub mount namespace is
+ * mounted in a temporary directory in the
+ * root mount namespace, so its parent is
+ * the main root.
+ */
+ parent = tmp_root_mount;
+ if (unlikely(!tmp_root_mount)) {
+ pr_err("Nested mount %d (@%s %s) w/o root insertion detected\n",
+ m->mnt_id, m->mountpoint, m->root);
+ return NULL;
+ }
+
+ pr_debug("Mountpoint %d (@%s) get parent %d (@%s)\n",
+ m->mnt_id, m->mountpoint,
+ parent->mnt_id, parent->mountpoint);
+ } else {
+ pr_err("No root found for mountpoint %d (@%s)\n",
+ m->mnt_id, m->mountpoint);
+ return NULL;
+ }
+ }
+
+ /* parent is never NULL here: the branches above set it or bail out */
+ m->parent = parent;
+ list_add_tail(&m->siblings, &parent->children);
+ }
+
+ if (!root) {
+ pr_err("No root found for tree\n");
+ return NULL;
+ }
+
+ if (tmp_root_mount) {
+ tmp_root_mount->parent = root;
+ list_add_tail(&tmp_root_mount->siblings, &root->children);
+ }
+
+ return root;
+}
+
+/* Number of '/' characters in the mountpoint path of @m. */
+static unsigned int mnt_depth(struct mount_info *m)
+{
+	unsigned int slashes = 0;
+	const char *pos = m->mountpoint;
+
+	while (*pos != '\0') {
+		if (*pos == '/')
+			slashes++;
+		pos++;
+	}
+
+	return slashes;
+}
+
+/*
+ * Recursively reorder the children of every node below @tree by the
+ * depth of their mountpoint path, deepest first.
+ */
+static void mnt_resort_siblings(struct mount_info *tree)
+{
+	struct mount_info *m, *p;
+	LIST_HEAD(list);
+
+	/*
+	 * Put siblings of each node in an order they can be (u)mounted
+	 * I.e. if we have mounts on foo/bar/, foo/bar/foobar/ and foo/
+	 * we should put them in the foo/bar/foobar/, foo/bar/, foo/ order.
+	 * Otherwise we will not be able to (u)mount them in a sequence.
+	 *
+	 * Funny, but all we need for this is to sort them in the descending
+	 * order of the amount of /-s in a path =)
+	 *
+	 * Use stupid insertion sort here, we're not expecting mount trees
+	 * to contain hundreds (or more) elements.
+	 */
+
+	pr_info("\tResorting siblings on %d\n", tree->mnt_id);
+	while (!list_empty(&tree->children)) {
+		unsigned int depth;
+
+		m = list_first_entry(&tree->children, struct mount_info, siblings);
+		list_del(&m->siblings);
+
+		/*
+		 * Find the first already sorted entry that is shallower
+		 * than m. If there is none, p ends up as the list head
+		 * itself and m goes to the very end.
+		 */
+		depth = mnt_depth(m);
+		list_for_each_entry(p, &list, siblings)
+			if (mnt_depth(p) < depth)
+				break;
+
+		/*
+		 * Insert m right before p. list_add() would place it
+		 * after p and break the descending order; the strict
+		 * comparison above keeps equally deep mounts in their
+		 * original relative order.
+		 */
+		list_add_tail(&m->siblings, &p->siblings);
+		mnt_resort_siblings(m);
+	}
+
+	list_splice(&list, &tree->children);
+}
+
+/*
+ * Log the mount tree rooted at @tree, indenting each node by @off
+ * columns and each level of children by one more.
+ */
+static void mnt_tree_show(struct mount_info *tree, int off)
+{
+	struct mount_info *child;
+	int child_off = off + 1;
+
+	pr_info("%*s[%s](%d->%d)\n", off, "",
+		tree->mountpoint, tree->mnt_id, tree->parent_mnt_id);
+
+	list_for_each_entry(child, &tree->children, siblings) {
+		mnt_tree_show(child, child_off);
+	}
+
+	pr_info("%*s<--\n", off, "");
+}
+
+/*
+ * Look @info up among the registered external mounts, keyed by its
+ * mountpoint without the leading '.'.
+ *
+ * Returns 0 and sets info->external on a hit, -ENOTSUP otherwise.
+ */
+static int try_resolve_ext_mount(struct mount_info *info)
+{
+	struct ext_mount *em = ext_mount_lookup(info->mountpoint + 1 /* trim the . */);
+
+	if (em != NULL) {
+		pr_info("Found %s mapping for %s mountpoint\n",
+			em->val, info->mountpoint);
+		info->external = em;
+		return 0;
+	}
+
+	return -ENOTSUP;
+}
+
+/*
+ * Return the first mount on @m's mnt_share list for which
+ * issubpath(m->root, p->root) holds, or NULL if there is none.
+ */
+static struct mount_info *find_widest_shared(struct mount_info *m)
+{
+	struct mount_info *p;
+
+	/*
+	 * Try to find a mount, which is wider or equal.
+	 * A is wider than B, if A->root is a subpath of B->root.
+	 */
+	list_for_each_entry(p, &m->mnt_share, mnt_share) {
+		if (!issubpath(m->root, p->root))
+			continue;
+		return p;
+	}
+
+	return NULL;
+}
+
+/*
+ * Among the children of @m find the one whose mountpoint, with the
+ * first @m_mpnt_l characters skipped, equals @ct_mountpoint.
+ *
+ * Only the first child with a matching path is considered: it is
+ * returned if mounts_equal() to @ct, otherwise the result is NULL.
+ */
+static struct mount_info *find_shared_peer(struct mount_info *m,
+		struct mount_info *ct, char *ct_mountpoint, int m_mpnt_l)
+{
+	struct mount_info *cm;
+
+	list_for_each_entry(cm, &m->children, siblings) {
+		if (strcmp(ct_mountpoint, cm->mountpoint + m_mpnt_l) != 0)
+			continue;
+
+		return mounts_equal(cm, ct) ? cm : NULL;
+	}
+
+	return NULL;
+}
+
+/*
+ * Length of @path with one trailing '/' (if any) not counted.
+ * An empty string yields 0.
+ */
+static inline int path_length(char *path)
+{
+	int off;
+
+	off = strlen(path);
+	/*
+	 * If we're pure / then set length to zero so that adding this
+	 * value as sub-path offset would produce the correct result.
+	 * E.g. the tail path of the "/foo/bar" relative to the "/foo"
+	 * will be the "/foo/bar" + len("/foo") == "/bar", while the
+	 * same relative to the "/" should be +0 to be the "/foo/bar",
+	 * not +1 and the "foo/bar".
+	 *
+	 * The off > 0 check keeps an empty string from being read at
+	 * index -1.
+	 */
+	if (off > 0 && path[off - 1] == '/')
+		off--;
+
+	return off;
+}
+
+/*
+ * Check that @m has the same set of visible children as a wider mount
+ * of its shared group (see find_widest_shared()).
+ *
+ * Returns 0 if the sets match or @m has no wider peer, -1 otherwise.
+ * The children list of @m may come back reordered.
+ */
+static int validate_shared(struct mount_info *m)
+{
+ struct mount_info *t, *ct;
+ int t_root_l, m_root_l, t_mpnt_l, m_mpnt_l;
+ char *m_root_rpath;
+ LIST_HEAD(children);
+
+ /*
+ * Check that all mounts in one shared group have the same set of
+ * children. Only visible children are accounted. A non-root bind-mount
+ * doesn't see children out of its root and it's an expected case.
+ *
+ * Here are a few conditions:
+ * 1. t is wider than m
+ * 2. We search a wider mount in the same direction, so when we
+ * enumerate all mounts, we can't be sure that all of them
+ * have the same set of children.
+ */
+
+ t = find_widest_shared(m);
+ if (!t)
+ /*
+ * The current mount is the widest one in its shared group,
+ * all others will be compared to it or with some other,
+ * which will be compared to it.
+ */
+ return 0;
+
+ /* The set of children which are visible for both should be the same */
+
+ t_root_l = path_length(t->root);
+ m_root_l = path_length(m->root);
+ t_mpnt_l = path_length(t->mountpoint);
+ m_mpnt_l = path_length(m->mountpoint);
+
+ /* For example:
+ * t->root = / t->mp = ./zdtm/live/static/mntns_root_bind.test
+ * m->root = /test m->mp = ./zdtm/live/static/mntns_root_bind.test/test.bind
+ * t_root_l = 0 t_mpnt_l = 39
+ * m_root_l = 5 m_mpnt_l = 49
+ * ct->root = / ct->mp = ./zdtm/live/static/mntns_root_bind.test/test/sub
+ * tp = /test/sub mp = /test len=5
+ */
+
+ /*
+ * ct: | t->root | child mount point |
+ * cm: | m->root | child mount point |
+ * ct: | | /test/sub |
+ * cm: | /test | /sub |
+ * | A | B |
+ * | ct->mountpoint + t_mpnt_l
+ * | m->root + strlen(t->root)
+ */
+
+ m_root_rpath = m->root + t_root_l; /* path from t->root to m->root */
+
+ /* Search a child, which is visible in both mounts. */
+ list_for_each_entry(ct, &t->children, siblings) {
+ char *ct_mpnt_rpath;
+ struct mount_info *cm;
+
+ if (ct->is_ns_root)
+ continue;
+
+ ct_mpnt_rpath = ct->mountpoint + t_mpnt_l; /* path from t->mountpoint to ct->mountpoint */
+
+ /*
+ * Check whether ct can be visible at m, i.e. the
+ * ct's rpath starts (as path) with m's rpath.
+ */
+
+ if (!issubpath(ct_mpnt_rpath, m_root_rpath))
+ continue;
+
+ /*
+ * The ct has peer in m but with the mount path deeper according
+ * to m's depth relative to t. Thus -- trim this difference (the
+ * length of m_root_rpath) from ct's mountpoint path.
+ */
+
+ ct_mpnt_rpath += m_root_l - t_root_l;
+
+ /*
+ * Find in m the mountpoint that fully matches with ct (with the
+ * described above path corrections).
+ */
+
+ cm = find_shared_peer(m, ct, ct_mpnt_rpath, m_mpnt_l);
+ if (!cm)
+ goto err;
+
+ /*
+ * Keep this one aside. At the end of t's children scan we should
+ * move _all_ m's children here (the list_empty check below).
+ */
+ list_move(&cm->siblings, &children);
+ }
+
+ /* Anything left on m's list has no counterpart under t */
+ if (!list_empty(&m->children))
+ goto err;
+
+ list_splice(&children, &m->children);
+ return 0;
+
+err:
+ /* Give the set-aside children back before reporting */
+ list_splice(&children, &m->children);
+ pr_err("%d:%s and %d:%s have different set of mounts\n",
+ m->mnt_id, m->mountpoint, t->mnt_id, t->mountpoint);
+ return -1;
+}
+
+/*
+ * Find the mount_info from which the bind-mount @bm can be created.
+ * That is either an FS-root mount, or the root of the tree -- the
+ * latter only if its root path is a prefix of the bind mount's root.
+ *
+ * Candidates are taken from @bm's mnt_bind list; returns NULL if none
+ * fits.
+ */
+
+static struct mount_info *find_fsroot_mount_for(struct mount_info *bm)
+{
+	struct mount_info *sm;
+
+	list_for_each_entry(sm, &bm->mnt_bind, mnt_bind) {
+		if (fsroot_mounted(sm))
+			return sm;
+		if (sm->parent == NULL && strstartswith(bm->root, sm->root))
+			return sm;
+	}
+
+	return NULL;
+}
+
+/*
+ * Check every mount of the @info list (root and namespace-root mounts
+ * excepted): shared groups must be consistent, the mount must be
+ * creatable (known FS type, a bind source, external, or handled by a
+ * plugin) and must not be overmounted by a sibling.
+ *
+ * @for_dump selects whether plugins are asked (dump) or the recorded
+ * need_plugin flag is trusted (restore).
+ *
+ * Returns 0 if all mounts pass, -1 otherwise.
+ */
+static int validate_mounts(struct mount_info *info, bool for_dump)
+{
+ struct mount_info *m, *t;
+
+ for (m = info; m; m = m->next) {
+ if (m->parent == NULL || m->is_ns_root)
+ /* root mount can be any */
+ continue;
+
+ if (m->shared_id && validate_shared(m))
+ return -1;
+
+ if (m->external)
+ goto skip_fstype;
+
+ /*
+ * Mountpoint can point to / of an FS. In that case this FS
+ * should be of some known type so that we can just mount one.
+ *
+ * Otherwise it's a bindmount mountpoint and we try to find
+ * what fsroot mountpoint it's bound to. If this point is the
+ * root mount, the path to bindmount root should be accessible
+ * from the rootmount path (the strstartswith check in the
+ * else branch below).
+ */
+
+ if (fsroot_mounted(m)) {
+ if (m->fstype->code == FSTYPE__UNSUPPORTED) {
+ pr_err("FS mnt %s dev %#x root %s unsupported id %d\n",
+ m->mountpoint, m->s_dev, m->root, m->mnt_id);
+ return -1;
+ }
+ } else {
+ t = find_fsroot_mount_for(m);
+ if (!t) {
+ int ret;
+
+ /*
+ * No root-mount found for this bind and it's neither
+ * marked nor auto-resolved as external one. So last
+ * chance not to fail is to talk to plugins.
+ */
+
+ if (for_dump) {
+ ret = run_plugins(DUMP_EXT_MOUNT, m->mountpoint, m->mnt_id);
+ if (ret == 0)
+ m->need_plugin = true;
+ } else
+ /*
+ * Plugin should take care of this one
+ * in restore_ext_mount, or do_bind_mount
+ * will mount it as external
+ */
+ ret = m->need_plugin ? 0 : -ENOTSUP;
+
+ if (ret < 0) {
+ /* Other negative codes fail without a message here */
+ if (ret == -ENOTSUP)
+ pr_err("%d:%s doesn't have a proper root mount\n",
+ m->mnt_id, m->mountpoint);
+ return -1;
+ }
+ }
+ }
+skip_fstype:
+ /* Reject the mount if any sibling passes the issubpath() test */
+ list_for_each_entry(t, &m->parent->children, siblings) {
+ if (m == t)
+ continue;
+ if (!issubpath(m->mountpoint, t->mountpoint))
+ continue;
+
+ pr_err("%d:%s is overmounted\n", m->mnt_id, m->mountpoint);
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Return the tail of @target_root that follows the part it has in
+ * common with @source_root. The result points into @target_root.
+ */
+static char *cut_root_for_bind(char *target_root, char *source_root)
+{
+	int common = 0;
+
+	/*
+	 * Cut common part of root.
+	 * For non-root binds the source is always "/" (checked)
+	 * so this will result in this slash removal only.
+	 */
+	for (;;) {
+		if (target_root[common] != source_root[common])
+			break;
+
+		common++;
+		if (source_root[common] == '\0')
+			break;
+
+		/* The target must not end before the source does */
+		BUG_ON(target_root[common] == '\0');
+	}
+
+	return target_root + common;
+
+}
+
+/*
+ * Pick from @list the mount that @info can best be bind-mounted from:
+ * same superblock, a root accepted by issubpath(info->root, it->root),
+ * and preferably a matching sharing group.
+ *
+ * Returns the entry with an exact sharing match if one exists,
+ * otherwise the last acceptable entry, or NULL if none is acceptable.
+ */
+static struct mount_info *find_best_external_match(struct mount_info *list, struct mount_info *info)
+{
+ struct mount_info *it, *candidate = NULL;
+
+ for (it = list; it; it = it->next) {
+ if (!mounts_sb_equal(info, it))
+ continue;
+
+ /*
+ * This means we have a situation like:
+ *
+ * root@criu:~# mount --bind bind1/subdir/ bind2
+ * root@criu:~# mount --bind bind1/ bind3
+ *
+ * outside the container, and bind1 is directly bind mounted
+ * inside the container. mounts_equal() considers these mounts
+ * equal for bind purposes, but their roots are different, and
+ * we want to match the one with the right root.
+ */
+ if (!issubpath(info->root, it->root))
+ continue;
+
+ candidate = it;
+
+ /*
+ * Consider the case of:
+ *
+ * mount /xxx
+ * mount --bind /xxx /yyy
+ * mount --make-shared /yyy
+ * mount --bind /xxx /zzz
+ * mount --make-shared /zzz
+ * bind mount a shared mount into the namespace
+ *
+ * Here, we want to return the /right/ mount, not just a mount
+ * that's equal. However, in the case:
+ *
+ * bind mount a shared mount into the namespace
+ * inside the namespace, remount MS_PRIVATE
+ * inside the namespace, remount MS_SHARED
+ *
+ * there will be no external mount with matching sharing
+ * because the sharing is only internal; we still want to bind
+ * mount from this mountinfo so we should return it, but we
+ * should make the sharing namespace private after that bind
+ * mount.
+ *
+ * Below are the cases where we found an exact match.
+ */
+ if (info->flags & MS_SHARED && info->shared_id == it->shared_id)
+ return candidate;
+
+ if (info->flags & MS_SLAVE && info->master_id == it->shared_id)
+ return candidate;
+ }
+
+ return candidate;
+}
+
+/*
+ * Find criu's own mount namespace (type NS_CRIU with the mnt_ns_desc
+ * descriptor) in ns_ids and make sure its mountinfo is collected.
+ *
+ * Returns the namespace, or NULL if it isn't on the list or its
+ * mountinfo can't be collected.
+ */
+static struct ns_id *find_ext_ns_id(void)
+{
+	struct ns_id *ns;
+
+	/*
+	 * Walk every entry, the last one included, and cope with an
+	 * empty list: testing ns->next instead would skip the tail
+	 * element and dereference NULL when ns_ids is empty.
+	 */
+	for (ns = ns_ids; ns != NULL; ns = ns->next) {
+		if (ns->type != NS_CRIU || ns->nd != &mnt_ns_desc)
+			continue;
+
+		/* Collect the mountinfo on first use */
+		if (!ns->mnt.mntinfo_list &&
+		    !collect_mntinfo(ns, true))
+			break;
+		return ns;
+	}
+
+	pr_err("Failed to find criu pid's mount ns\n");
+	return NULL;
+}
+
+/*
+ * Mark the mounts of the @info list that are external: either mapped
+ * explicitly (see try_resolve_ext_mount()) or, when
+ * opts.autodetect_ext_mounts is set, matched against the mounts of
+ * criu's own mount namespace.
+ *
+ * For an autodetected mount a new ext_mount is allocated and the
+ * mount's source is replaced with the detected path.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+static int resolve_external_mounts(struct mount_info *info)
+{
+ struct ns_id *ext_ns = NULL;
+ struct mount_info *m;
+
+ if (opts.autodetect_ext_mounts) {
+ ext_ns = find_ext_ns_id();
+ if (!ext_ns)
+ return -1;
+ }
+
+ for (m = info; m; m = m->next) {
+ int ret;
+ char *p, *cut_root;
+ struct ext_mount *em;
+ struct mount_info *match;
+
+ if (m->parent == NULL || m->is_ns_root)
+ continue;
+
+ /*
+ * Only an unmapped mount (-ENOTSUP) with autodetection on
+ * (ext_ns set) gets past this chain.
+ */
+ ret = try_resolve_ext_mount(m);
+ if (ret < 0 && ret != -ENOTSUP) {
+ return -1;
+ } else if (ret == -ENOTSUP && !ext_ns) {
+ continue;
+ } else if (ret == 0) {
+ continue;
+ }
+
+ match = find_best_external_match(ext_ns->mnt.mntinfo_list, m);
+ if (!match)
+ continue;
+
+ if (m->flags & MS_SHARED) {
+ if (!opts.enable_external_sharing)
+ continue;
+
+ if (m->shared_id != match->shared_id)
+ m->internal_sharing = true;
+ }
+
+ if (m->flags & MS_SLAVE) {
+ if (!opts.enable_external_masters)
+ continue;
+
+ /*
+ * In order to support something like internal slavery,
+ * we need to teach can_mount_now and do_mount_one
+ * about slavery relationships in external mounts. This
+ * seems like an uncommon case, so we punt for now.
+ */
+ if (m->master_id != match->shared_id)
+ continue;
+ }
+
+ cut_root = cut_root_for_bind(m->root, match->root);
+
+ /* Path of the bind source as seen from criu's namespace */
+ p = xsprintf("%s/%s", match->mountpoint + 1, cut_root);
+ if (!p)
+ return -1;
+
+ em = xmalloc(sizeof(struct ext_mount));
+ if (!em) {
+ free(p);
+ return -1;
+ }
+
+ em->val = AUTODETECTED_MOUNT;
+ em->key = p;
+
+ m->external = em;
+
+ /* p is shared between em->key and m->source */
+ xfree(m->source);
+ m->source = p;
+
+ pr_info("autodetected external mount %s for %s\n", p, m->mountpoint);
+ }
+
+ return 0;
+}
+/*
+ * Link every mount in the list to its sharing peers, to its master and
+ * to the mounts that are bind-mounts of the same superblock.
+ *
+ * A mount whose master_id equals a non-zero root_master_id gets its
+ * master_id reset to -1. Returns 0 on success, or -1 when a mount that
+ * has a parent still needs a master which is not in the list and the
+ * mount was not resolved as external.
+ */
+static int resolve_shared_mounts(struct mount_info *info, int root_master_id)
+{
+ struct mount_info *m, *t;
+
+ /*
+ * If we have shared mounts, both the master and
+ * the slave targets have to be present in the mount
+ * list, otherwise we can't be sure that we can
+ * recreate the scheme later on restore.
+ */
+ for (m = info; m; m = m->next) {
+ bool need_share, need_master;
+
+ /* the root master_id can be ignored, because it's already created */
+ if (root_master_id && root_master_id == m->master_id)
+ m->master_id = -1;
+
+ need_share = m->shared_id && list_empty(&m->mnt_share); /* group not collected yet */
+ need_master = m->master_id > 0;
+
+ pr_debug("Inspecting sharing on %2d shared_id %d master_id %d (@%s)\n",
+ m->mnt_id, m->shared_id, m->master_id, m->mountpoint);
+
+ for (t = info; t && (need_share || need_master); t = t->next) {
+ if (t == m)
+ continue;
+ if (need_master && t->shared_id == m->master_id) {
+ pr_debug("\tThe mount %3d is slave for %3d (@%s -> @%s)\n",
+ m->mnt_id, t->mnt_id,
+ m->mountpoint, t->mountpoint);
+ list_add(&m->mnt_slave, &t->mnt_slave_list);
+ m->mnt_master = t;
+ need_master = false; /* only the first matching master is used */
+ }
+
+ /* Collect all mounts from this group */
+ if (need_share && t->shared_id == m->shared_id) {
+ pr_debug("\tMount %3d is shared with %3d group %3d (@%s -> @%s)\n",
+ m->mnt_id, t->mnt_id, m->shared_id,
+ t->mountpoint, m->mountpoint);
+ list_add(&t->mnt_share, &m->mnt_share);
+ }
+ }
+
+ /*
+ * If we haven't already determined this mount is external,
+ * then we don't know where it came from.
+ */
+ if (need_master && m->parent && !m->external) {
+ pr_err("Mount %d %s (master_id: %d shared_id: %d) "
+ "has unreachable sharing. Try --enable-external-masters.\n", m->mnt_id,
+ m->mountpoint, m->master_id, m->shared_id);
+ return -1;
+ }
+
+ /* Search bind-mounts */
+ if (list_empty(&m->mnt_bind)) {
+ /*
+ * A first mounted point will be set up as a source point
+ * for others. Look at propagate_mount()
+ */
+ for (t = m->next; t; t = t->next) { /* only mounts after m in the list are scanned */
+ if (mounts_sb_equal(m, t)) {
+ list_add(&t->mnt_bind, &m->mnt_bind);
+ pr_debug("\tThe mount %3d is bind for %3d (@%s -> @%s)\n",
+ t->mnt_id, m->mnt_id,
+ t->mountpoint, m->mountpoint);
+ }
+ }
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Organize a flat list of mounts into a tree, in a sequence in which they
+ * can be mounted/umounted, and log the result.
+ *
+ * Returns the root of the tree, or NULL if the tree can't be built.
+ */
+static struct mount_info *mnt_build_tree(struct mount_info *list, struct mount_info *roots_mp)
+{
+	struct mount_info *root;
+
+	pr_info("Building mountpoints tree\n");
+
+	root = mnt_build_ids_tree(list, roots_mp);
+	if (root != NULL) {
+		mnt_resort_siblings(root);
+		pr_info("Done:\n");
+		mnt_tree_show(root, 0);
+	}
+
+	return root;
+}
+
+/*
+ * Open a mountpoint and verify that the descriptor refers to the expected
+ * filesystem.
+ *
+ * mnt_fd is a file descriptor on the mountpoint, which is closed in an error case.
+ * If mnt_fd is -1, the mountpoint will be opened by this function
+ * (relative to the root of the mount namespace of pm).
+ *
+ * The device that phys_stat_resolve_dev() reports for the descriptor is
+ * compared with pm->s_dev_rt. Returns the descriptor on success and
+ * -1 on failure.
+ */
+int __open_mountpoint(struct mount_info *pm, int mnt_fd)
+{
+ dev_t dev;
+ struct stat st;
+ int ret;
+
+ if (mnt_fd == -1) {
+ int mntns_root;
+
+ mntns_root = mntns_get_root_fd(pm->nsid);
+ if (mntns_root < 0)
+ return -1;
+
+ mnt_fd = openat(mntns_root, pm->ns_mountpoint, O_RDONLY);
+ if (mnt_fd < 0) {
+ pr_perror("Can't open %s", pm->ns_mountpoint);
+ return -1;
+ }
+ }
+
+ ret = fstat(mnt_fd, &st);
+ if (ret < 0) {
+ pr_perror("fstat(%s) failed", pm->ns_mountpoint);
+ goto err;
+ }
+
+ if (pm->s_dev_rt == MOUNT_INVALID_DEV) {
+ pr_err("Resolving over unvalid device for %#x %s %s\n",
+ pm->s_dev, pm->fstype->name, pm->ns_mountpoint);
+ goto err;
+ }
+
+ dev = phys_stat_resolve_dev(pm->nsid, st.st_dev, pm->ns_mountpoint + 1);
+ /*
+ * Always check for @s_dev_rt here, because the @s_dev
+ * from the image (in case of restore) has all rights
+ * to not match the device (say it's migrated and kernel
+ * allocates new device ID).
+ */
+ if (dev != pm->s_dev_rt) {
+ pr_err("The file system %#x %#x (%#x) %s %s is inaccessible\n",
+ pm->s_dev, pm->s_dev_rt, (int)dev,
+ pm->fstype->name, pm->ns_mountpoint);
+ goto err;
+ }
+
+ return mnt_fd;
+err:
+ close(mnt_fd); /* also closes a descriptor that was passed in by the caller */
+ return -1;
+}
+
+/*
+ * Open the mountpoint of the mount that lookup_mnt_sdev() finds for s_dev.
+ *
+ * Returns -ENOENT when no such mount is known, otherwise the result of
+ * __open_mountpoint().
+ */
+int open_mount(unsigned int s_dev)
+{
+	struct mount_info *mi = lookup_mnt_sdev(s_dev);
+
+	return mi ? __open_mountpoint(mi, -1) : -ENOENT;
+}
+
+/*
+ * Bind-mount a mount point in a temporary place without children.
+ *
+ * The temporary directory is created from the mnt_path_tmp template; if
+ * that fails with ENOENT the mnt_path_root template is tried instead.
+ * Returns the path of the new bind-mount (it points into one of the
+ * template buffers) or NULL on failure.
+ */
+static char *get_clean_mnt(struct mount_info *mi, char *mnt_path_tmp, char *mnt_path_root)
+{
+	char *dir = mkdtemp(mnt_path_tmp);
+
+	if (!dir && errno == ENOENT)
+		dir = mkdtemp(mnt_path_root);
+
+	if (!dir) {
+		pr_perror("Can't create a temporary directory");
+		return NULL;
+	}
+
+	/* No MS_REC: the submounts are intentionally left out */
+	if (mount(mi->mountpoint, dir, NULL, MS_BIND, NULL) == 0)
+		return dir;
+
+	pr_perror("Can't bind-mount %d:%s to %s",
+		  mi->mnt_id, mi->mountpoint, dir);
+	rmdir(dir);
+	return NULL;
+}
+/*
+ * Open the mountpoint of pm even when other mounts sit on top of it.
+ *
+ * A mount without children is opened directly. Otherwise a temporary
+ * non-recursive bind-mount is made inside the target mount namespace,
+ * opened via open_detach_mount(), and the resulting descriptor is
+ * verified by __open_mountpoint(). Returns a descriptor or -1.
+ */
+static int open_mountpoint(struct mount_info *pm)
+{
+ int fd = -1, ns_old = -1;
+ char mnt_path_tmp[] = "/tmp/cr-tmpfs.XXXXXX";
+ char mnt_path_root[] = "/cr-tmpfs.XXXXXX";
+ char *mnt_path = mnt_path_tmp; /* overwritten by get_clean_mnt() before it is used */
+ int cwd_fd;
+
+ /*
+ * If a mount doesn't have children, we can open a mount point,
+ * otherwise we need to create a "private" copy.
+ */
+ if (list_empty(&pm->children))
+ return __open_mountpoint(pm, -1);
+
+ pr_info("Something is mounted on top of %s\n", pm->mountpoint);
+
+ /*
+ * To create a "private" copy, the target mount is bind-mounted
+ * in a temporary place w/o MS_REC (non-recursively).
+ * A mount point can't be bind-mounted in criu's namespace, it will be
+ * mounted in a target namespace. The sequence of actions is
+ * mkdtemp, setns(tgt), mount, open, detach, setns(old).
+ */
+
+ cwd_fd = open(".", O_DIRECTORY); /* remembered so that cwd can be restored with fchdir() below */
+ if (cwd_fd < 0) {
+ pr_perror("Unable to open cwd");
+ return -1;
+ }
+
+ if (switch_ns(pm->nsid->ns_pid, &mnt_ns_desc, &ns_old) < 0)
+ goto out;
+
+ mnt_path = get_clean_mnt(pm, mnt_path_tmp, mnt_path_root);
+ if (mnt_path == NULL)
+ goto out;
+
+ fd = open_detach_mount(mnt_path);
+ if (fd < 0)
+ goto out;
+
+ if (restore_ns(ns_old, &mnt_ns_desc)) {
+ ns_old = -1; /* don't try to restore the namespace again on the error path */
+ goto out;
+ }
+ if (fchdir(cwd_fd)) {
+ pr_perror("Unable to restore cwd");
+ close(cwd_fd);
+ close(fd);
+ return -1;
+ }
+ close(cwd_fd);
+
+ return __open_mountpoint(pm, fd);
+out:
+ if (ns_old >= 0)
+ restore_ns(ns_old, &mnt_ns_desc);
+ close_safe(&fd);
+ if (fchdir(cwd_fd))
+ pr_perror("Unable to restore cwd");
+ close(cwd_fd);
+ return -1;
+}
+
+/*
+ * Append opt to the option string of pm, separating it with a comma when
+ * the string is not empty.
+ *
+ * Returns 0 on success, -1 when xstrcat() yields NULL.
+ */
+static int attach_option(struct mount_info *pm, char *opt)
+{
+	if (pm->options[0] != '\0')
+		pm->options = xstrcat(pm->options, ",%s", opt);
+	else
+		pm->options = xstrcat(pm->options, "%s", opt);
+
+	if (pm->options == NULL)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Find out whether devpts is mounted with or without the newinstance
+ * option and record the option when it is.
+ *
+ * Returns the non-positive result of kerndat_fs_virtualized() as is,
+ * otherwise the result of attach_option().
+ */
+static int devpts_parse(struct mount_info *pm)
+{
+	int virt = kerndat_fs_virtualized(KERNDAT_FS_STAT_DEVPTS, pm->s_dev);
+
+	/*
+	 * Kernel hides this option, but if the fs instance
+	 * is new (virtualized) we know that it was created
+	 * with -o newinstance.
+	 */
+	if (virt > 0)
+		return attach_option(pm, "newinstance");
+
+	return virt;
+}
+/*
+ * Archive the content of a tmpfs mount into the CR_FD_TMPFS_DEV image by
+ * running tar on the opened mountpoint.
+ *
+ * Returns the result of cr_system_userns() (non-zero is logged as a
+ * failure), or -1 when an earlier step fails.
+ */
+static int tmpfs_dump(struct mount_info *pm)
+{
+ int ret = -1, fd = -1, userns_pid = -1;
+ char tmpfs_path[PSFDS];
+ struct cr_img *img;
+
+ fd = open_mountpoint(pm);
+ if (fd < 0)
+ return -1;
+
+ /* if fd happens to be 0 here, we need to move it to something
+ * non-zero, because cr_system_userns closes STDIN_FILENO as we are not
+ * interested in passing stdin to tar.
+ */
+ if (move_img_fd(&fd, STDIN_FILENO) < 0)
+ goto out;
+
+ /* NOTE(review): the result of F_GETFD is not checked for -1 before it is masked */
+ if (fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) & ~FD_CLOEXEC) == -1) {
+ pr_perror("Can not drop FD_CLOEXEC");
+ goto out;
+ }
+
+ img = open_image(CR_FD_TMPFS_DEV, O_DUMP, pm->s_dev);
+ if (!img)
+ goto out;
+
+ sprintf(tmpfs_path, "/proc/self/fd/%d", fd); /* presumably PSFDS is large enough for this path - TODO confirm */
+
+ if (root_ns_mask & CLONE_NEWUSER)
+ userns_pid = root_item->pid.real;
+
+ ret = cr_system_userns(-1, img_raw_fd(img), -1, "tar", (char *[])
+ { "tar", "--create",
+ "--gzip",
+ "--no-unquote",
+ "--no-wildcards",
+ "--one-file-system",
+ "--check-links",
+ "--preserve-permissions",
+ "--sparse",
+ "--numeric-owner",
+ "--directory", tmpfs_path, ".", NULL }, 0, userns_pid);
+
+ if (ret)
+ pr_err("Can't dump tmpfs content\n");
+
+ close_image(img);
+out:
+ close_safe(&fd);
+ return ret;
+}
+
+/*
+ * Virtualized devtmpfs on any side (dump or restore)
+ * means that we should try to handle it as a plain
+ * tmpfs.
+ *
+ * Interesting case -- shared on dump and virtual on
+ * restore -- will fail, since no tarball with the fs
+ * contents will be found.
+ *
+ * Returns whatever kerndat_fs_virtualized() reports for the
+ * device of this mount; the callers in this file treat 1 as
+ * "virtualized".
+ */
+
+static int devtmpfs_virtual(struct mount_info *pm)
+{
+ return kerndat_fs_virtualized(KERNDAT_FS_STAT_DEVTMPFS, pm->s_dev);
+}
+
+/*
+ * Dump a devtmpfs mount: a virtualized instance is dumped as a plain
+ * tmpfs, any other result of devtmpfs_virtual() is returned unchanged.
+ */
+static int devtmpfs_dump(struct mount_info *pm)
+{
+	int virt = devtmpfs_virtual(pm);
+
+	if (virt != 1)
+		return virt;
+
+	return tmpfs_dump(pm);
+}
+
+/*
+ * Unpack the saved content of a tmpfs mount into pm->mountpoint with tar.
+ *
+ * The image keyed by the device id is tried first; if it is empty, the
+ * image keyed by the mount id is used instead.
+ * Returns 0 on success, -1 when no usable image exists or tar fails.
+ */
+static int tmpfs_restore(struct mount_info *pm)
+{
+	char *tar_argv[] = {
+		"tar", "--extract", "--gzip",
+		"--no-unquote", "--no-wildcards",
+		"--directory", pm->mountpoint, NULL,
+	};
+	struct cr_img *tar_img;
+	int err;
+
+	tar_img = open_image(CR_FD_TMPFS_DEV, O_RSTR, pm->s_dev);
+	if (empty_image(tar_img)) {
+		close_image(tar_img);
+		tar_img = open_image(CR_FD_TMPFS_IMG, O_RSTR, pm->mnt_id);
+	}
+
+	if (tar_img == NULL)
+		return -1;
+
+	if (empty_image(tar_img)) {
+		close_image(tar_img);
+		return -1;
+	}
+
+	err = cr_system(img_raw_fd(tar_img), -1, -1, "tar", tar_argv, 0);
+	close_image(tar_img);
+
+	if (err == 0)
+		return 0;
+
+	pr_err("Can't restore tmpfs content\n");
+	return -1;
+}
+
+/*
+ * Restore a devtmpfs mount: a virtualized instance is restored as a plain
+ * tmpfs, any other result of devtmpfs_virtual() is returned unchanged.
+ */
+static int devtmpfs_restore(struct mount_info *pm)
+{
+	int virt = devtmpfs_virtual(pm);
+
+	if (virt != 1)
+		return virt;
+
+	return tmpfs_restore(pm);
+}
+/*
+ * Ask kerndat whether the binfmt_misc instance behind this mount is
+ * virtualized; the result of kerndat_fs_virtualized() is returned as is.
+ */
+static int binfmt_misc_virtual(struct mount_info *pm)
+{
+ return kerndat_fs_virtualized(KERNDAT_FS_STAT_BINFMT_MISC, pm->s_dev);
+}
+
+/*
+ * Parse one binfmt_misc entry, read line by line from f, into bme.
+ *
+ * Recognized lines are "enabled", "disabled", "offset N" and the string
+ * fields "interpreter ", "flags: ", "extension .", "magic " and "mask ",
+ * whose values are duplicated into bme.
+ *
+ * Returns 0 at the end of input, -1 on a read error, a malformed offset,
+ * an allocation failure or an unrecognized line. Strings that were
+ * already duplicated are left in bme on failure.
+ */
+static int parse_binfmt_misc_entry(struct bfd *f, BinfmtMiscEntry *bme)
+{
+	while (1) {
+		char *str;
+
+		str = breadline(f);
+		if (IS_ERR(str))
+			return -1;
+		if (!str)
+			break;
+
+		if (!strncmp(str, "enabled", 7)) {
+			bme->enabled = true;
+			continue;
+		}
+
+		if (!strncmp(str, "disabled", 8))
+			continue;
+
+		if (!strncmp(str, "offset ", 7)) {
+			if (sscanf(str + 7, "%i", &bme->offset) != 1)
+				return -1;
+			bme->has_offset = true;
+			continue;
+		}
+
+/* Duplicate the rest of the line into bme->member when it starts with key */
+#define DUP_EQUAL_AS(key, member)					\
+		if (!strncmp(str, key, strlen(key))) {			\
+			bme->member = xstrdup(str + strlen(key));	\
+			if (!bme->member)				\
+				return -1;				\
+			continue;					\
+		}
+		DUP_EQUAL_AS("interpreter ", interpreter)
+		DUP_EQUAL_AS("flags: ", flags)
+		DUP_EQUAL_AS("extension .", extension)
+		DUP_EQUAL_AS("magic ", magic)
+		DUP_EQUAL_AS("mask ", mask)
+#undef DUP_EQUAL_AS
+
+		/*
+		 * This is a parse error, not a failed system call, so errno
+		 * carries no meaning here: report it with pr_err().
+		 */
+		pr_err("binfmt_misc: unsupported feature %s\n", str);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Read the binfmt_misc entry called name from the directory dfd and write
+ * it to img as a PB_BINFMT_MISC object.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int dump_binfmt_misc_entry(int dfd, char *name, struct cr_img *img)
+{
+	BinfmtMiscEntry entry = BINFMT_MISC_ENTRY__INIT;
+	struct bfd reader;
+	int exit_code = -1;
+
+	reader.fd = openat(dfd, name, O_RDONLY);
+	if (reader.fd < 0) {
+		pr_perror("binfmt_misc: can't open %s", name);
+		return -1;
+	}
+
+	if (bfdopenr(&reader))
+		return -1;
+
+	if (parse_binfmt_misc_entry(&reader, &entry) == 0) {
+		/* The name is borrowed from the caller and is not freed below */
+		entry.name = name;
+
+		if (pb_write_one(img, &entry, PB_BINFMT_MISC) == 0)
+			exit_code = 0;
+	}
+
+	free(entry.interpreter);
+	free(entry.flags);
+	free(entry.extension);
+	free(entry.magic);
+	free(entry.mask);
+	bclose(&reader);
+	return exit_code;
+}
+
+/*
+ * Dump every entry registered in a virtualized binfmt_misc mount into the
+ * CR_FD_BINFMT_MISC image; the "register" and "status" control files are
+ * skipped.
+ *
+ * Returns the non-positive result of binfmt_misc_virtual() as is,
+ * otherwise 0 on success and -1 on failure.
+ */
+static int binfmt_misc_dump(struct mount_info *pm)
+{
+	struct cr_img *img;
+	struct dirent *ent;
+	DIR *dir;
+	int mnt_fd, rc;
+
+	rc = binfmt_misc_virtual(pm);
+	if (rc <= 0)
+		return rc;
+
+	mnt_fd = open_mountpoint(pm);
+	if (mnt_fd < 0)
+		return -1;
+
+	dir = fdopendir(mnt_fd);
+	if (dir == NULL) {
+		close(mnt_fd);
+		return -1;
+	}
+
+	img = open_image(CR_FD_BINFMT_MISC, O_DUMP, pm->s_dev);
+	if (img == NULL) {
+		closedir(dir);
+		return -1;
+	}
+
+	rc = 0;
+	while (rc == 0 && (ent = readdir(dir)) != NULL) {
+		if (dir_dots(ent) ||
+		    !strcmp(ent->d_name, "register") ||
+		    !strcmp(ent->d_name, "status"))
+			continue;
+
+		if (dump_binfmt_misc_entry(mnt_fd, ent->d_name, img))
+			rc = -1;
+	}
+
+	close_image(img);
+	closedir(dir);
+	return rc;
+}
+
+/*
+ * Register one binfmt_misc entry by writing the registration string buf
+ * to <mp>/register. If the entry was dumped as disabled, "0" is then
+ * written to <mp>/<name>.
+ *
+ * Returns 0 on success, -1 when a file can't be opened or written.
+ */
+static int restore_binfmt_misc_entry(char *mp, char *buf, BinfmtMiscEntry *bme)
+{
+	int fd, len, ret = -1;
+	char path[PATH_MAX+1];
+
+	snprintf(path, PATH_MAX, "%s/register", mp);
+
+	fd = open(path, O_WRONLY);
+	if (fd < 0) {
+		pr_perror("binfmt_misc: can't open %s", path);
+		return -1;
+	}
+
+	len = strlen(buf);
+
+	if (write(fd, buf, len) != len) {
+		pr_perror("binfmt_misc: can't write to %s", path);
+		goto close;
+	}
+
+	if (!bme->enabled) {
+		close(fd);
+		snprintf(path, PATH_MAX, "%s/%s", mp, bme->name);
+
+		fd = open(path, O_WRONLY);
+		/*
+		 * open() signals failure with -1; 0 is a valid descriptor,
+		 * so "!fd" would miss every real error.
+		 */
+		if (fd < 0) {
+			pr_perror("binfmt_misc: can't open %s", path);
+			goto out;
+		}
+		if (write(fd, "0", 1) != 1) {
+			pr_perror("binfmt_misc: can't write to %s", path);
+			goto close;
+		}
+	}
+
+	ret = 0;
+close:
+	close(fd);
+out:
+	return ret;
+}
+
+#define BINFMT_MISC_STR (1920 + 1) /* size of the registration string buffer; the length check below allows BINFMT_MISC_STR - 1 characters */
+static int make_bfmtm_magic_str(char *buf, BinfmtMiscEntry *bme)
+{
+ int i, len;
+
+ /*
+ * Build the registration string of a magic-type entry in buf.
+ * Returns 1 on success, -1 when the computed length doesn't fit
+ * into the buffer or the offset is above 128.
+ *
+ * Format is ":name:type(M):offset:magic:mask:interpreter:flags".
+ * Magic and mask are special fields. Kernel outputs them as
+ * a sequence of hexadecimal numbers (abc -> 616263), and we
+ * dump them without changes. But for registering a new entry
+ * it expects every byte is prepended with \x, i.e. \x61\x62\x63.
+ *
+ * NOTE(review): a negative offset is not rejected although only
+ * 3 characters are reserved for it, and a magic/mask with an odd
+ * number of digits makes the loops below pick up the terminating
+ * NUL as the second digit - confirm the image can't contain these.
+ */
+ len = strlen(bme->name) + 3 /* offset < 128 */ + 2 * strlen(bme->magic)
+ + (bme->mask ? 2 * strlen(bme->mask) : 0) + strlen(bme->interpreter)
+ + (bme->flags ? strlen(bme->flags) : 0) + strlen(":::::::");
+
+ if ((len > BINFMT_MISC_STR - 1) || bme->offset > 128)
+ return -1;
+
+ buf += sprintf(buf, ":%s:M:%d:", bme->name, bme->offset);
+
+ len = strlen(bme->magic);
+ for (i = 0; i < len; i += 2) /* two hex digits become one "\xNN" group */
+ buf += sprintf(buf, "\\x%c%c", bme->magic[i], bme->magic[i + 1]);
+
+ buf += sprintf(buf, ":");
+
+ if (bme->mask) {
+ len = strlen(bme->mask);
+ for (i = 0; i < len; i += 2)
+ buf += sprintf(buf, "\\x%c%c", bme->mask[i], bme->mask[i + 1]);
+ }
+
+ sprintf(buf, ":%s:%s", bme->interpreter, bme->flags ? : "\0");
+
+ return 1;
+}
+
+/*
+ * Re-register every binfmt_misc entry stored in the CR_FD_BINFMT_MISC
+ * image of this mount.
+ *
+ * For each entry a registration string is built (magic or extension
+ * type) and written with restore_binfmt_misc_entry().
+ * Returns 0 when all entries were restored, a negative value otherwise.
+ */
+static int binfmt_misc_restore(struct mount_info *mi)
+{
+	struct cr_img *img;
+	char *buf;
+	int ret = -1;
+
+	buf = xmalloc(BINFMT_MISC_STR);
+	if (!buf)
+		return -1;
+
+	img = open_image(CR_FD_BINFMT_MISC, O_RSTR, mi->s_dev);
+	if (!img)
+		goto free_buf;
+
+	ret = 0;
+	while (ret == 0) {
+		BinfmtMiscEntry *bme;
+
+		ret = pb_read_one_eof(img, &bme, PB_BINFMT_MISC);
+		if (ret <= 0)
+			break;
+
+		/* :name:type:offset:magic/extension:mask:interpreter:flags */
+		if ((!bme->magic && !bme->extension) || !bme->interpreter) {
+			/* Bad image content, errno is meaningless here */
+			pr_err("binfmt_misc: bad dump\n");
+			ret = -1;
+		} else if (bme->magic) {
+			ret = make_bfmtm_magic_str(buf, bme);
+		} else if (bme->extension) {
+			/* :name:E::extension::interpreter:flags */
+			ret = snprintf(buf, BINFMT_MISC_STR, ":%s:E::%s::%s:%s",
+				       bme->name, bme->extension, bme->interpreter,
+				       bme->flags ? : "\0");
+			/*
+			 * snprintf() returns the length it wanted to write;
+			 * don't register a silently truncated pattern.
+			 */
+			if (ret >= BINFMT_MISC_STR) {
+				pr_err("binfmt_misc: entry %s is too long\n", bme->name);
+				ret = -1;
+			}
+		}
+
+		if (ret > 0) {
+			pr_debug("binfmt_misc_pattern=%s\n", buf);
+			ret = restore_binfmt_misc_entry(mi->mountpoint, buf, bme);
+		}
+
+		binfmt_misc_entry__free_unpacked(bme, NULL);
+	}
+
+	close_image(img);
+free_buf:
+	free(buf);
+	return ret;
+}
+
+/*
+ * Check that a fusectl mount can be dumped: every numeric entry found in
+ * it is compared with the minor device number of the known fuse mounts,
+ * and each matching fuse mount has to be external.
+ *
+ * Returns 0 when all entries pass the check, -1 otherwise.
+ */
+static int fusectl_dump(struct mount_info *pm)
+{
+	struct dirent *ent;
+	DIR *dir;
+	int mnt_fd, rc = -1;
+
+	mnt_fd = open_mountpoint(pm);
+	if (mnt_fd < 0)
+		return -1;
+
+	dir = fdopendir(mnt_fd);
+	if (!dir) {
+		close(mnt_fd);
+		return -1;
+	}
+
+	while ((ent = readdir(dir)) != NULL) {
+		struct mount_info *mi;
+		int conn_id;
+
+		if (dir_dots(ent))
+			continue;
+
+		if (sscanf(ent->d_name, "%d", &conn_id) != 1) {
+			pr_err("wrong number of items scanned in fusectl dump\n");
+			goto done;
+		}
+
+		for (mi = mntinfo; mi != NULL; mi = mi->next) {
+			if (mi->fstype->code != FSTYPE__FUSE)
+				continue;
+			if (conn_id != minor(mi->s_dev) || mi->external)
+				continue;
+
+			pr_err("%s is a fuse mount but not external\n", mi->mountpoint);
+			goto done;
+		}
+	}
+
+	rc = 0;
+done:
+	closedir(dir);
+	return rc;
+}
+
+/*
+ * "Dump" a filesystem whose content is not saved: this only succeeds when
+ * the mountpoint directory is empty.
+ *
+ * Returns 0 when the directory is empty, -1 when it is not or when it
+ * can't be examined.
+ */
+static int dump_empty_fs(struct mount_info *pm)
+{
+	int fd, ret;
+
+	fd = open_mountpoint(pm);
+	if (fd < 0)
+		return -1;
+
+	ret = is_empty_dir(fd);
+	/* NOTE(review): if is_empty_dir() already releases fd, this is a second close - confirm */
+	close(fd);
+
+	/* A negative result means the check itself failed ... */
+	if (ret < 0) {
+		pr_err("Can't check whether %s is empty\n", pm->fstype->name);
+		return -1;
+	}
+
+	/* ... while zero means the directory has entries in it */
+	if (ret == 0) {
+		pr_err("%s isn't empty\n", pm->fstype->name);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Some fses (fuse) cannot be dumped, so we should always fail on dump/restore
+ * of these fses.
+ *
+ * Logs the mountpoint and the filesystem name and returns -1. The message
+ * says "dump" even when the function is used as a restore callback.
+ */
+static int always_fail(struct mount_info *pm)
+{
+ pr_err("failed to dump fs %s (%s): always fail\n", pm->mountpoint,
+ pm->fstype->name);
+ return -1;
+}
+/*
+ * Table of the filesystem types known to this file, with their optional
+ * parse/dump/restore callbacks.
+ *
+ * Entry 0 ("unsupported") is what the lookup functions return as the
+ * fallback. The array has 32 slots; the ones without an initializer have
+ * a NULL name, and __find_fstype_by_name() fills such a slot in when it
+ * registers an FSTYPE__AUTO filesystem.
+ */
+static struct fstype fstypes[32] = {
+ {
+ .name = "unsupported",
+ .code = FSTYPE__UNSUPPORTED,
+ }, {
+ .name = "proc",
+ .code = FSTYPE__PROC,
+ }, {
+ .name = "sysfs",
+ .code = FSTYPE__SYSFS,
+ }, {
+ .name = "devtmpfs",
+ .code = FSTYPE__DEVTMPFS,
+ .dump = devtmpfs_dump,
+ .restore = devtmpfs_restore,
+ }, {
+ .name = "binfmt_misc",
+ .code = FSTYPE__BINFMT_MISC,
+ .dump = binfmt_misc_dump,
+ .restore = binfmt_misc_restore,
+ }, {
+ .name = "tmpfs",
+ .code = FSTYPE__TMPFS,
+ .dump = tmpfs_dump,
+ .restore = tmpfs_restore,
+ }, {
+ .name = "devpts",
+ .parse = devpts_parse,
+ .code = FSTYPE__DEVPTS,
+ }, {
+ .name = "simfs",
+ .code = FSTYPE__SIMFS,
+ }, {
+ .name = "btrfs",
+ .code = FSTYPE__UNSUPPORTED, /* known by name, but mapped to the unsupported code */
+ }, {
+ .name = "pstore",
+ .dump = dump_empty_fs,
+ .code = FSTYPE__PSTORE,
+ }, {
+ .name = "mqueue",
+ .dump = dump_empty_fs,
+ .code = FSTYPE__MQUEUE,
+ }, {
+ .name = "securityfs",
+ .code = FSTYPE__SECURITYFS,
+ }, {
+ .name = "fusectl",
+ .dump = fusectl_dump,
+ .code = FSTYPE__FUSECTL,
+ }, {
+ .name = "debugfs",
+ .code = FSTYPE__DEBUGFS,
+ }, {
+ .name = "cgroup",
+ .code = FSTYPE__CGROUP,
+ }, {
+ .name = "aufs",
+ .code = FSTYPE__AUFS,
+ .parse = aufs_parse,
+ }, {
+ .name = "fuse",
+ .code = FSTYPE__FUSE,
+ .dump = always_fail,
+ .restore = always_fail,
+ }, {
+ .name = "overlay",
+ .code = FSTYPE__OVERLAYFS,
+ .parse = overlayfs_parse,
+ },
+};
+
+static char fsauto_all[] = "all";
+static char *fsauto_names;
+
+/*
+ * Tell whether the comma separated string css has an item equal to str.
+ *
+ * An occurrence of str counts only when it spans a whole item, i.e. it
+ * is delimited by commas or by the ends of css. An empty str never
+ * matches.
+ */
+static bool css_contains(const char *css, const char *str)
+{
+	int n = strlen(str);
+	const char *hit;
+
+	if (n == 0)
+		return false;
+
+	hit = strstr(css, str);
+	while (hit != NULL) {
+		bool starts_item = (hit == css) || (hit[-1] == ',');
+		bool ends_item = (hit[n] == '\0') || (hit[n] == ',');
+
+		if (starts_item && ends_item)
+			return true;
+
+		/* Resume the search right behind this occurrence */
+		hit = strstr(hit + n, str);
+	}
+
+	return false;
+}
+
+/*
+ * Tell whether the filesystem called name was enabled for auto handling:
+ * either everything was enabled ("all") or the name is in the list.
+ */
+static bool fsname_is_auto(const char *name)
+{
+	if (fsauto_names == fsauto_all)
+		return true;
+
+	return fsauto_names != NULL && css_contains(fsauto_names, name);
+}
+
+/*
+ * Add the comma separated names to the list of filesystems that are
+ * handled automatically. Once "all" has been seen the list is replaced by
+ * the fsauto_all marker and later calls change nothing.
+ *
+ * Returns true on success, false when the new list can't be allocated.
+ */
+bool add_fsname_auto(const char *names)
+{
+	char *prev = fsauto_names;
+	char *merged;
+
+	if (prev == fsauto_all)
+		return true;
+
+	if (css_contains(names, fsauto_all)) {
+		merged = fsauto_all;
+	} else if (prev == NULL) {
+		merged = xstrdup(names);
+	} else if (asprintf(&merged, "%s,%s", prev, names) < 0) {
+		merged = NULL;
+	}
+
+	fsauto_names = merged;
+	xfree(prev);
+	return merged != NULL;
+}
+
+/*
+ * Look a filesystem type up by its name.
+ *
+ * This fn is required for two things.
+ * 1st -- to check supported filesystems (as just mounting
+ * anything is wrong, almost every fs has its own features)
+ * 2nd -- save some space in the image (since we scan all
+ * names anyway)
+ *
+ * An unknown name is registered in the first free slot as FSTYPE__AUTO
+ * when force_auto is set or the name was enabled for auto handling.
+ * In every other case the "unsupported" entry is returned.
+ */
+static struct fstype *__find_fstype_by_name(char *fst, bool force_auto)
+{
+	struct fstype *slot = NULL;
+	int idx;
+
+	/* Entry 0 is the fallback, so the scan starts at 1 */
+	for (idx = 1; idx < ARRAY_SIZE(fstypes); idx++) {
+		slot = &fstypes[idx];
+
+		if (slot->name == NULL)
+			break;
+
+		if (strcmp(slot->name, fst) == 0)
+			return slot;
+	}
+
+	if (idx == ARRAY_SIZE(fstypes)) {
+		/* ensure we have a room for auto */
+		pr_err_once("fstypes[] overflow!\n");
+		return &fstypes[0];
+	}
+
+	if (force_auto || fsname_is_auto(fst)) {
+		slot->name = xstrdup(fst);
+		slot->code = FSTYPE__AUTO;
+		return slot;
+	}
+
+	return &fstypes[0];
+}
+/*
+ * Public lookup of a filesystem type by name: calls
+ * __find_fstype_by_name() with force_auto set to false.
+ */
+struct fstype *find_fstype_by_name(char *fst)
+{
+ return __find_fstype_by_name(fst, false);
+}
+
+/*
+ * Map a filesystem code (and, for FSTYPE__AUTO, its name) to an entry of
+ * the fstypes table.
+ *
+ * FSTYPE__AUTO is resolved by name with forced registration. Unknown or
+ * unsupported codes yield the "unsupported" entry.
+ */
+static struct fstype *decode_fstype(u32 fst, char *fsname)
+{
+	struct fstype *unsupported = &fstypes[0];
+	int idx;
+
+	if (fst == FSTYPE__AUTO)
+		return __find_fstype_by_name(fsname, true);
+
+	if (fst == FSTYPE__UNSUPPORTED)
+		return unsupported;
+
+	/* Stop at the first unused slot */
+	for (idx = 1; idx < ARRAY_SIZE(fstypes) && fstypes[idx].name; idx++) {
+		if (fstypes[idx].code == fst)
+			return &fstypes[idx];
+	}
+
+	return unsupported;
+}
+/*
+ * Write one mount to the image as a PB_MNT entry and, where applicable,
+ * dump the content of its filesystem first.
+ *
+ * Returns 0 on success, -1 when the fs-specific dump or the image write
+ * fails.
+ */
+static int dump_one_mountpoint(struct mount_info *pm, struct cr_img *img)
+{
+ MntEntry me = MNT_ENTRY__INIT;
+
+ pr_info("\t%d: %x:%s @ %s\n", pm->mnt_id, pm->s_dev,
+ pm->root, pm->mountpoint);
+
+ me.fstype = pm->fstype->code;
+
+ if (me.fstype == FSTYPE__AUTO)
+ me.fsname = pm->fstype->name; /* the name is stored only for auto-detected types */
+
+ if (pm->parent && !pm->dumped && !pm->need_plugin && !pm->external &&
+ pm->fstype->dump && fsroot_mounted(pm)) {
+ struct mount_info *t;
+
+ if (pm->fstype->dump(pm))
+ return -1;
+
+ list_for_each_entry(t, &pm->mnt_bind, mnt_bind)
+ t->dumped = true; /* the content was just dumped once, skip it for the bind-mounts */
+ }
+
+ me.mnt_id = pm->mnt_id;
+ me.root_dev = pm->s_dev;
+ me.parent_mnt_id = pm->parent_mnt_id;
+ me.flags = pm->flags;
+ me.sb_flags = pm->sb_flags;
+ me.has_sb_flags = true;
+ me.mountpoint = pm->mountpoint + 1; /* the first character of the path is not stored */
+ me.source = pm->source;
+ me.options = pm->options;
+ me.shared_id = pm->shared_id;
+ me.has_shared_id = true;
+ me.master_id = pm->master_id;
+ me.has_master_id = true;
+ if (pm->need_plugin) {
+ me.has_with_plugin = true;
+ me.with_plugin = true;
+ }
+ if (pm->deleted) {
+ me.has_deleted = true;
+ me.deleted = true;
+ }
+
+ if (pm->internal_sharing) {
+ me.has_internal_sharing = true;
+ me.internal_sharing = true;
+ }
+
+ if (pm->external) {
+ /*
+ * For external mount points dump the mapping's
+ * value instead of root. See collect_mnt_from_image
+ * for reverse mapping details.
+ */
+ me.root = pm->external->val;
+ me.has_ext_mount = true;
+ me.ext_mount = true;
+ } else
+ me.root = pm->root;
+
+ if (pb_write_one(img, &me, PB_MNT))
+ return -1;
+
+ return 0;
+}
+
+/* Release every entry of a mount list with mnt_entry_free(). */
+static void free_mntinfo(struct mount_info *pms)
+{
+	struct mount_info *following;
+
+	for (; pms != NULL; pms = following) {
+		/* Fetch the link before the entry goes away */
+		following = pms->next;
+		mnt_entry_free(pms);
+	}
+}
+
+/*
+ * Parse the mountinfo of the namespace ns and build the mount tree for
+ * it. On success both the list and the tree are stored in ns->mnt.
+ *
+ * Returns the mount list, or NULL on failure (the parsed list is freed
+ * when the tree can't be built).
+ */
+struct mount_info *collect_mntinfo(struct ns_id *ns, bool for_dump)
+{
+	struct mount_info *list;
+
+	list = parse_mountinfo(ns->ns_pid, ns, for_dump);
+	if (list == NULL) {
+		pr_err("Can't parse %d's mountinfo\n", ns->ns_pid);
+		return NULL;
+	}
+
+	ns->mnt.mntinfo_tree = mnt_build_tree(list, NULL);
+	if (ns->mnt.mntinfo_tree != NULL) {
+		ns->mnt.mntinfo_list = list;
+		return list;
+	}
+
+	free_mntinfo(list);
+	return NULL;
+}
+
+/*
+ * Dump the mounts of one mount namespace into its CR_FD_MNTS image.
+ *
+ * Entries are taken from pms for as long as they belong to ns.
+ * Returns 0 on success, -1 on failure.
+ */
+static int dump_mnt_ns(struct ns_id *ns, struct mount_info *pms)
+{
+	struct mount_info *cur;
+	struct cr_img *img;
+	int ns_id = ns->id;
+	int rc = 0;
+
+	pr_info("Dumping mountpoints\n");
+
+	img = open_image(CR_FD_MNTS, O_DUMP, ns_id);
+	if (img == NULL)
+		return -1;
+
+	for (cur = pms; cur != NULL && cur->nsid == ns; cur = cur->next) {
+		if (dump_one_mountpoint(cur, img)) {
+			rc = -1;
+			break;
+		}
+	}
+
+	close_image(img);
+	return rc;
+}
+
+/*
+ * Walk the mount tree rooted at _r without recursion.
+ *
+ * _el - the list_head member to follow: next or prev
+ * _fn_f - pre-order traversal function
+ * _fn_r - post-order traversal function
+ * _plist - a postpone list. The current mount is added to this list, if
+ * _fn_f returns a positive value, and none of the mounts below it are
+ * enumerated.
+ * _prgs - a counter, incremented for every mount on which _fn_f returned 0.
+ *
+ * A negative result of _fn_f or a non-zero result of _fn_r makes the
+ * enclosing function return -1.
+ */
+#define MNT_TREE_WALK(_r, _el, _fn_f, _fn_r, _plist, _prgs) do { \
+ struct mount_info *_mi = _r; \
+ \
+ while (1) { \
+ int ret; \
+ \
+ list_del_init(&_mi->postpone); \
+ \
+ ret = _fn_f(_mi); \
+ if (ret < 0) \
+ return -1; \
+ else if (ret > 0) { \
+ list_add_tail(&_mi->postpone, _plist); \
+ goto up; \
+ } \
+ \
+ _prgs++; \
+ \
+ if (!list_empty(&_mi->children)) { \
+ _mi = list_entry(_mi->children._el, \
+ struct mount_info, siblings); \
+ continue; \
+ } \
+ up: \
+ if (_fn_r(_mi)) \
+ return -1; \
+ if (_mi == _r) \
+ break; \
+ if (_mi->siblings._el == &_mi->parent->children) { \
+ _mi = _mi->parent; \
+ goto up; \
+ } \
+ _mi = list_entry(_mi->siblings._el, \
+ struct mount_info, siblings); \
+ } \
+ } while (0)
+
+#define MNT_WALK_NONE 0 && /* used in place of a callback: "MNT_WALK_NONE(x)" expands to "0 && (x)", i.e. 0 */
+
+/*
+ * Call fn on every mount of the tree under start, parents before
+ * children. A mount for which fn returns a positive value is postponed
+ * together with its subtree and retried on the next pass.
+ *
+ * Returns 0 when every mount was processed, -1 when fn fails or when a
+ * whole pass makes no progress.
+ */
+static int mnt_tree_for_each(struct mount_info *start,
+ int (*fn)(struct mount_info *))
+{
+ struct mount_info *tmp;
+ LIST_HEAD(postpone);
+ LIST_HEAD(postpone2);
+ int progress;
+
+ pr_debug("Start with %d:%s\n", start->mnt_id, start->mountpoint);
+ list_add(&start->postpone, &postpone);
+
+again:
+ progress = 0;
+
+ list_for_each_entry_safe(start, tmp, &postpone, postpone)
+ MNT_TREE_WALK(start, next, fn, MNT_WALK_NONE, &postpone2, progress); /* may return -1 from this function */
+
+ if (!progress) {
+ struct mount_info *m;
+
+ pr_err("A few mount points can't be mounted\n");
+ list_for_each_entry(m, &postpone2, postpone) {
+ pr_err("%d:%d %s %s %s\n", m->mnt_id,
+ m->parent_mnt_id, m->root,
+ m->mountpoint, m->source);
+ }
+ return -1;
+ }
+
+ list_splice_init(&postpone2, &postpone); /* what was postponed in this pass is the input of the next one */
+
+ if (!list_empty(&postpone))
+ goto again;
+
+ return 0;
+
+}
+/*
+ * Call fn on every mount of the tree under m in post-order, following
+ * the prev links of the sibling lists. Nothing is postponed here, so no
+ * postpone list is passed. Returns 0, or -1 when fn returns non-zero.
+ */
+static int mnt_tree_for_each_reverse(struct mount_info *m,
+ int (*fn)(struct mount_info *))
+{
+ int progress = 0; /* required by the macro, the value is not used here */
+
+ MNT_TREE_WALK(m, prev, MNT_WALK_NONE, fn, (struct list_head *) NULL, progress);
+
+ return 0;
+}
+
+/*
+ * Pick the source argument for mounting mi from scratch.
+ *
+ * The recorded source is returned for mounts on an anonymous device and
+ * for auto-detected filesystems whose source is a block device with the
+ * same major/minor numbers as the mount. Otherwise NULL is returned.
+ */
+static char *resolve_source(struct mount_info *mi)
+{
+	struct stat st;
+
+	/*
+	 * Anonymous block device. Kernel creates them for
+	 * diskless mounts.
+	 */
+	if (kdev_major(mi->s_dev) == 0)
+		return mi->source;
+
+	if (mi->fstype->code != FSTYPE__AUTO)
+		goto no_device;
+
+	if (stat(mi->source, &st) != 0 || !S_ISBLK(st.st_mode))
+		goto no_device;
+
+	if (major(st.st_rdev) != kdev_major(mi->s_dev) ||
+	    minor(st.st_rdev) != kdev_minor(mi->s_dev))
+		goto no_device;
+
+	return mi->source;
+
+no_device:
+	pr_err("No device for %s mount\n", mi->mountpoint);
+	return NULL;
+}
+
+/*
+ * Apply the requested propagation type(s) to the mountpoint of mi.
+ *
+ * An unbindable mount with neither sharing nor slavery requested is made
+ * unbindable and nothing else is done. If sharing or slavery is requested
+ * as well, unbindable is ignored with a warning. Otherwise private, slave
+ * and shared are applied in this order.
+ *
+ * Returns 0 on success, -1 on failure (or the plain mount() result in
+ * the unbindable case).
+ */
+static int restore_shared_options(struct mount_info *mi, bool private, bool shared, bool slave)
+{
+	bool unbindable = (mi->flags & MS_UNBINDABLE) != 0;
+
+	pr_debug("%d:%s private %d shared %d slave %d\n",
+		 mi->mnt_id, mi->mountpoint, private, shared, slave);
+
+	if (unbindable && !(shared || slave))
+		return mount(NULL, mi->mountpoint, NULL, MS_UNBINDABLE, NULL);
+
+	if (unbindable)
+		pr_warn("%s has both unbindable and sharing, ignoring unbindable\n", mi->mountpoint);
+
+	if (private) {
+		if (mount(NULL, mi->mountpoint, NULL, MS_PRIVATE, NULL)) {
+			pr_perror("Unable to make %s private", mi->mountpoint);
+			return -1;
+		}
+	}
+
+	if (slave) {
+		if (mount(NULL, mi->mountpoint, NULL, MS_SLAVE, NULL)) {
+			pr_perror("Unable to make %s slave", mi->mountpoint);
+			return -1;
+		}
+	}
+
+	if (shared) {
+		if (mount(NULL, mi->mountpoint, NULL, MS_SHARED, NULL)) {
+			pr_perror("Unable to make %s shared", mi->mountpoint);
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Umount the copies of mi that were propagated into the already mounted
+ * slaves of its parent, because we can't be sure that they were
+ * inherited in real life.
+ *
+ * Returns 0 on success, -1 as soon as one umount fails.
+ */
+static int umount_from_slaves(struct mount_info *mi)
+{
+	char path[PATH_MAX];
+	struct mount_info *slave;
+
+	list_for_each_entry(slave, &mi->parent->mnt_slave_list, mnt_slave) {
+		if (slave->mounted) {
+			/* Same last path component, but under the slave */
+			snprintf(path, sizeof(path), "%s/%s",
+				 slave->mountpoint, basename(mi->mountpoint));
+			pr_debug("\t\tUmount slave %s\n", path);
+
+			if (umount(path) == -1) {
+				pr_perror("Can't umount slave %s", path);
+				return -1;
+			}
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * If something is mounted in one shared point, it will be spread in
+ * all other points from this shared group.
+ *
+ * Look at Documentation/filesystems/sharedsubtree.txt for more details
+ *
+ * Every not yet mounted member of the shared group of mi and every not
+ * yet mounted slave of mi is marked to be bind-mounted from mi, so that
+ * it inherits the shared group or the master id. Always returns 0.
+ */
+static int propagate_siblings(struct mount_info *mi)
+{
+	struct mount_info *peer;
+
+	list_for_each_entry(peer, &mi->mnt_share, mnt_share) {
+		if (!peer->mounted) {
+			pr_debug("\t\tBind share %s\n", peer->mountpoint);
+			peer->bind = mi;
+			peer->s_dev_rt = mi->s_dev_rt;
+		}
+	}
+
+	list_for_each_entry(peer, &mi->mnt_slave_list, mnt_slave) {
+		if (!peer->mounted) {
+			pr_debug("\t\tBind slave %s\n", peer->mountpoint);
+			peer->bind = mi;
+			peer->s_dev_rt = mi->s_dev_rt;
+		}
+	}
+
+	return 0;
+}
+/*
+ * Update the bookkeeping after mi has been mounted: mark the mounts that
+ * have to be bind-mounted from it, mark the copies that appeared in the
+ * shared group of its parent as mounted, and umount the copies that
+ * appeared in slaves. Always returns 0.
+ */
+static int propagate_mount(struct mount_info *mi)
+{
+ struct mount_info *t;
+
+ propagate_siblings(mi);
+
+ if (!mi->parent)
+ goto skip_parent;
+
+ umount_from_slaves(mi); /* NOTE(review): a failure reported here is ignored - confirm this is intended */
+
+ /* Propagate this mount to everyone from a parent group */
+
+ list_for_each_entry(t, &mi->parent->mnt_share, mnt_share) {
+ struct mount_info *c;
+
+ list_for_each_entry(c, &t->children, siblings) {
+ if (mounts_equal(mi, c)) {
+ pr_debug("\t\tPropagate %s\n", c->mountpoint);
+ c->mounted = true;
+ propagate_siblings(c);
+ umount_from_slaves(c);
+ }
+ }
+ }
+
+skip_parent:
+ /*
+ * FIXME Currently non-root mounts can be restored
+ * only if a proper root mount exists
+ */
+ if (fsroot_mounted(mi) || mi->parent == NULL) {
+ list_for_each_entry(t, &mi->mnt_bind, mnt_bind) {
+ if (t->mounted)
+ continue;
+ if (t->bind) /* a bind source was already chosen */
+ continue;
+ if (t->master_id > 0)
+ continue;
+ t->bind = mi;
+ t->s_dev_rt = mi->s_dev_rt;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Record the run-time device id of a mount: the device reported by
+ * stat() on the path where is stored in m->s_dev_rt.
+ *
+ * Returns 0 on success, -1 when stat() fails.
+ */
+static int fetch_rt_stat(struct mount_info *m, const char *where)
+{
+	struct stat st;
+
+	if (stat(where, &st)) {
+		/*
+		 * No "\n" in the format: like the other pr_perror() calls
+		 * in this file, the errno text is expected to follow the
+		 * message on the same line.
+		 */
+		pr_perror("Can't stat on %s", where);
+		return -1;
+	}
+
+	m->s_dev_rt = MKKDEV(major(st.st_dev), minor(st.st_dev));
+	return 0;
+}
+
+/*
+ * Here is the set of flags which we know how to handle with a single mount call.
+ * All of them except MS_RDONLY are set only as mnt flags.
+ * MS_RDONLY is set in both mnt and sb flags, so we can restore it with a single
+ * mount call only if it is set in both masks.
+ */
+#define MS_MNT_KNOWN_FLAGS (MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_NOATIME | \
+ MS_NODIRATIME | MS_RELATIME | MS_RDONLY)
+/*
+ * Default mount callback: a plain mount() of src on mi->mountpoint with
+ * the options string of mi as the data argument. Returns what mount()
+ * returns.
+ */
+static int do_simple_mount(struct mount_info *mi, const char *src, const
+ char *fstype, unsigned long mountflags)
+{
+ return mount(src, mi->mountpoint, fstype, mountflags, mi->options);
+}
+
+/*
+ * Mount a filesystem from scratch (i.e. not as a bind-mount), restore its
+ * content if the fs type has a restore callback, and then apply the mount
+ * flags and the propagation options.
+ *
+ * Returns 0 on success (mi->mounted is set), -1 on failure.
+ */
+static int do_new_mount(struct mount_info *mi)
+{
+	unsigned long sflags = mi->sb_flags;
+	unsigned long mflags = mi->flags & (~MS_PROPAGATE);
+	char *src;
+	struct fstype *tp = mi->fstype;
+	bool remount_ro = (tp->restore && mi->sb_flags & MS_RDONLY);
+	mount_fn_t do_mount = (tp->mount) ? tp->mount : do_simple_mount;
+
+	src = resolve_source(mi);
+	if (!src)
+		return -1;
+
+	/* Merge superblock and mount flags if it's possible */
+	if (!(mflags & ~MS_MNT_KNOWN_FLAGS) && !((sflags ^ mflags) & MS_RDONLY)) {
+		sflags |= mflags;
+		mflags = 0;
+	}
+
+	/*
+	 * The content can only be restored into a writable fs, so a
+	 * read-only superblock is mounted read-write first and remounted
+	 * read-only once the restore callback is done.
+	 */
+	if (remount_ro)
+		sflags &= ~MS_RDONLY;
+
+	if (do_mount(mi, src, tp->name, sflags) < 0) {
+		pr_perror("Can't mount at %s", mi->mountpoint);
+		return -1;
+	}
+
+	if (tp->restore && tp->restore(mi))
+		return -1;
+
+	/*
+	 * Don't return right after the remount: the remaining mount
+	 * flags, the propagation options and the mounted mark are needed
+	 * for read-only filesystems as well.
+	 */
+	if (remount_ro && mount(NULL, mi->mountpoint, tp->name,
+				MS_REMOUNT | MS_RDONLY, NULL)) {
+		pr_perror("Unable to remount %s read-only", mi->mountpoint);
+		return -1;
+	}
+
+	if (mflags && mount(NULL, mi->mountpoint, NULL,
+				MS_REMOUNT | MS_BIND | mflags, NULL)) {
+		pr_perror("Unable to apply bind-mount options");
+		return -1;
+	}
+
+	if (restore_shared_options(mi, !mi->shared_id && !mi->master_id,
+					mi->shared_id,
+					mi->master_id))
+		return -1;
+
+	mi->mounted = true;
+
+	return 0;
+}
+
+/*
+ * Let the plugins restore an external bind mount.
+ *
+ * Returns the result of run_plugins(); a non-zero value is logged as an
+ * error.
+ */
+static int restore_ext_mount(struct mount_info *mi)
+{
+	int err;
+
+	pr_debug("Restoring external bind mount %s\n", mi->mountpoint);
+
+	err = run_plugins(RESTORE_EXT_MOUNT, mi->mnt_id, mi->mountpoint, "/", NULL);
+	if (err != 0)
+		pr_err("Can't restore ext mount (%d)\n", err);
+
+	return err;
+}
+
+/*
+ * Restore @mi as a bind mount. Three sources are possible:
+ *  - a plugin (mi->need_plugin), via restore_ext_mount();
+ *  - an external path (mi->external), taken from the mapping's value;
+ *  - another mount of the tree (mi->bind). If that mount already has
+ *    children, the source path is obtained through get_clean_mnt() and
+ *    that temporary mount point is torn down before returning.
+ *
+ * A mount marked as deleted gets its source file/directory re-created for
+ * the duration of the bind and removed again afterwards.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+static int do_bind_mount(struct mount_info *mi)
+{
+	char mnt_path_tmp[] = "/tmp/cr-tmpfs.XXXXXX";
+	char mnt_path_root[] = "/cr-tmpfs.XXXXXX";
+	char *root, *cut_root, rpath[PATH_MAX];
+	unsigned long mflags;
+	int exit_code = -1;
+	bool shared = false;
+	bool master = false;
+	bool private = false;
+	char *mnt_path = NULL;
+	struct stat st;
+	bool umount_mnt_path = false;
+
+	if (mi->need_plugin) {
+		if (restore_ext_mount(mi))
+			return -1;
+		goto out;
+	}
+
+	if (mi->external) {
+		/*
+		 * We have / pointing to criu's ns root still,
+		 * so just use the mapping's path. The mountpoint
+		 * is tuned in collect_mnt_from_image to refer
+		 * to proper location in the namespace we restore.
+		 */
+		root = mi->external->val;
+		private = !mi->master_id && (mi->internal_sharing || !mi->shared_id);
+		goto do_bind;
+	}
+
+	shared = mi->shared_id && mi->shared_id == mi->bind->shared_id;
+	master = mi->master_id && mi->master_id == mi->bind->master_id;
+	private = !mi->master_id && !shared;
+	cut_root = cut_root_for_bind(mi->root, mi->bind->root);
+
+	if (list_empty(&mi->bind->children))
+		mnt_path = mi->bind->mountpoint;
+	else {
+		mnt_path = get_clean_mnt(mi->bind, mnt_path_tmp, mnt_path_root);
+		umount_mnt_path = true;
+	}
+	if (mnt_path == NULL)
+		return -1;
+
+	snprintf(rpath, sizeof(rpath), "%s/%s",
+			mnt_path, cut_root);
+	root = rpath;
+do_bind:
+	pr_info("\tBind %s to %s\n", root, mi->mountpoint);
+
+	if (unlikely(mi->deleted)) {
+		/* Re-create the deleted source with the mode of the target */
+		if (stat(mi->mountpoint, &st)) {
+			pr_perror("Can't fetch stat on %s", mi->mountpoint);
+			goto err;
+		}
+
+		if (S_ISDIR(st.st_mode)) {
+			if (mkdir(root, (st.st_mode & ~S_IFMT))) {
+				pr_perror("Can't re-create deleted directory %s", root);
+				goto err;
+			}
+		} else if (S_ISREG(st.st_mode)) {
+			int fd = open(root, O_WRONLY | O_CREAT | O_EXCL,
+				      st.st_mode & ~S_IFMT);
+			if (fd < 0) {
+				pr_perror("Can't re-create deleted file %s", root);
+				goto err;
+			}
+			close(fd);
+		} else {
+			pr_err("Unsupported st_mode 0%o deleted root %s\n",
+			       (int)st.st_mode, root);
+			goto err;
+		}
+	}
+
+	if (mount(root, mi->mountpoint, NULL, MS_BIND, NULL) < 0) {
+		pr_perror("Can't mount at %s", mi->mountpoint);
+		goto err;
+	}
+
+	/* Remount only if the flags differ from those of the bind source */
+	mflags = mi->flags & (~MS_PROPAGATE);
+	if (!mi->bind || mflags != (mi->bind->flags & (~MS_PROPAGATE)))
+		if (mount(NULL, mi->mountpoint, NULL, MS_BIND | MS_REMOUNT | mflags, NULL)) {
+			pr_perror("Can't mount at %s", mi->mountpoint);
+			goto err;
+		}
+
+	if (unlikely(mi->deleted)) {
+		/* The source was only needed for the bind, drop it again */
+		if (S_ISDIR(st.st_mode)) {
+			if (rmdir(root)) {
+				pr_perror("Can't remove deleted directory %s", root);
+				goto err;
+			}
+		} else if (S_ISREG(st.st_mode)) {
+			if (unlink(root)) {
+				pr_perror("Can't unlink deleted file %s", root);
+				goto err;
+			}
+		}
+	}
+out:
+	/*
+	 * shared - the mount is in the same shared group with mi->bind
+	 * mi->shared_id && !shared - create a new shared group
+	 *
+	 * On failure go through the cleanup below, so that a temporary
+	 * mount point is not left behind.
+	 */
+	if (restore_shared_options(mi, private,
+				   mi->shared_id && !shared,
+				   mi->master_id && !master))
+		goto err;
+
+	mi->mounted = true;
+	exit_code = 0;
+err:
+	if (umount_mnt_path) {
+		/*
+		 * If mnt_path was shared, a new mount may be propagated
+		 * into it.
+		 */
+		if (mount(NULL, mnt_path, NULL, MS_PRIVATE, NULL)) {
+			pr_perror("Unable to make %s private", mnt_path);
+			return -1;
+		}
+		if (umount2(mnt_path, MNT_DETACH)) {
+			pr_perror("Unable to umount %s", mnt_path);
+			return -1;
+		}
+		if (rmdir(mnt_path)) {
+			pr_perror("Unable to remove %s", mnt_path);
+			return -1;
+		}
+	}
+	return exit_code;
+}
+
+/*
+ * Tell whether @mi can be mounted right away or has to be postponed until
+ * the mounts it depends on are in place.
+ */
+static bool can_mount_now(struct mount_info *mi)
+{
+	struct mount_info *parent = mi->parent;
+
+	/* The root mount and external mounts never wait */
+	if (!parent || mi->external)
+		return true;
+
+	/*
+	 * We're the slave peer:
+	 *   - Make sure the master peer is already mounted
+	 *   - Make sure all children are mounted as well to
+	 *     eliminate mount duplications
+	 */
+	if (mi->master_id > 0) {
+		struct mount_info *child;
+
+		if (!mi->bind)
+			return false;
+
+		list_for_each_entry(child, &mi->bind->children, siblings)
+			if (!child->mounted)
+				return false;
+	}
+
+	/* mi->external is known to be unset at this point */
+	if (!fsroot_mounted(mi) && !mi->bind && !mi->need_plugin)
+		return false;
+
+	if (parent->shared_id) {
+		struct mount_info *peer;
+		bool same_group = (parent->shared_id == mi->shared_id);
+		int rlen = same_group ? strlen(mi->root) : 0;
+
+		list_for_each_entry(peer, &parent->mnt_share, mnt_share) {
+			if (peer->mounted)
+				continue;
+			/*
+			 * Within our own group only peers with a shorter
+			 * root hold us back; in a foreign group every
+			 * unmounted peer does.
+			 */
+			if (!same_group || strlen(peer->root) < rlen)
+				return false;
+		}
+	}
+
+	return true;
+}
+
+/*
+ * Apply the propagation options to the root mount and fetch its
+ * run-time stat. Returns non-zero on failure.
+ */
+static int do_mount_root(struct mount_info *mi)
+{
+	bool private = !mi->shared_id && !mi->master_id;
+
+	if (restore_shared_options(mi, private, mi->shared_id, mi->master_id))
+		return -1;
+
+	return fetch_rt_stat(mi, mi->mountpoint);
+}
+
+/*
+ * Mount a single entry of the tree.
+ *
+ * Returns 0 when the mount is in place (or was already), 1 when it has to
+ * be postponed because can_mount_now() says so, and a negative value on
+ * error.
+ */
+static int do_mount_one(struct mount_info *mi)
+{
+	int ret;
+
+	if (mi->mounted)
+		return 0;
+
+	if (!can_mount_now(mi)) {
+		pr_debug("Postpone slave %s\n", mi->mountpoint);
+		return 1;
+	}
+
+	pr_debug("\tMounting %s @%s (%d)\n", mi->fstype->name, mi->mountpoint, mi->need_plugin);
+
+	if (!mi->parent) {
+		/* do_mount_root() is called from populate_mnt_ns() */
+		mi->mounted = true;
+		ret = 0;
+	} else if (!mi->bind && !mi->need_plugin && !mi->external)
+		ret = do_new_mount(mi);
+	else
+		ret = do_bind_mount(mi);
+
+	/*
+	 * Nothing below is meaningful for a mount that failed: statfs() on
+	 * the mount point would describe the underlying filesystem instead.
+	 */
+	if (ret)
+		return ret;
+
+	if (fetch_rt_stat(mi, mi->mountpoint))
+		return -1;
+
+	if (propagate_mount(mi))
+		return -1;
+
+	if (mi->fstype->code == FSTYPE__UNSUPPORTED) {
+		struct statfs st;
+
+		if (statfs(mi->mountpoint, &st)) {
+			pr_perror("Unable to statfs %s", mi->mountpoint);
+			return -1;
+		}
+		/* An "unsupported" entry that turns out to be btrfs gets its real type */
+		if (st.f_type == BTRFS_SUPER_MAGIC)
+			mi->fstype = find_fstype_by_name("btrfs");
+	}
+
+	return 0;
+}
+
+/*
+ * Unmount one mount of the tree. The parent mount is made recursively
+ * private first. A mount without a parent (the root) is left alone.
+ */
+static int do_umount_one(struct mount_info *mi)
+{
+	struct mount_info *parent = mi->parent;
+
+	if (parent == NULL)
+		return 0;
+
+	if (mount("none", parent->mountpoint, "none", MS_REC|MS_PRIVATE, NULL) != 0) {
+		pr_perror("Can't mark %s as private", parent->mountpoint);
+		return -1;
+	}
+
+	if (umount(mi->mountpoint) != 0) {
+		pr_perror("Can't umount at %s", mi->mountpoint);
+		return -1;
+	}
+
+	pr_info("Umounted at %s\n", mi->mountpoint);
+	return 0;
+}
+
+/*
+ * Make @root (or the current directory when @root is NULL) the new root
+ * of the mount namespace with pivot_root(2) and detach the old root.
+ *
+ * A temporary directory, bind-mounted onto itself and made private, is
+ * used as the put_old location; it is unmounted and removed again before
+ * returning. Returns 0 on success, -1 on error.
+ */
+static int cr_pivot_root(char *root)
+{
+ char put_root[] = "crtools-put-root.XXXXXX"; /* relative: created in the cwd */
+ int exit_code = -1;
+
+ pr_info("Move the root to %s\n", root ? : ".");
+
+ if (root) {
+ if (chdir(root)) {
+ pr_perror("chdir(%s) failed", root);
+ return -1;
+ }
+ }
+
+ if (mkdtemp(put_root) == NULL) {
+ pr_perror("Can't create a temporary directory");
+ return -1;
+ }
+
+ if (mount(put_root, put_root, NULL, MS_BIND, NULL)) { /* a self bind mount, despite the message */
+ pr_perror("Unable to mount tmpfs in %s", put_root);
+ goto err_root;
+ }
+
+ if (mount(NULL, put_root, NULL, MS_PRIVATE, NULL)) {
+ pr_perror("Can't remount %s with MS_PRIVATE", put_root);
+ goto err_tmpfs;
+ }
+
+ if (pivot_root(".", put_root)) {
+ pr_perror("pivot_root(., %s) failed", put_root);
+ goto err_tmpfs;
+ }
+
+ if (mount("none", put_root, "none", MS_REC|MS_PRIVATE, NULL)) {
+ pr_perror("Can't remount root with MS_PRIVATE");
+ return -1;
+ }
+
+ exit_code = 0;
+
+ if (umount2(put_root, MNT_DETACH)) { /* presumably detaches the old root that pivot_root() put here */
+ pr_perror("Can't umount %s", put_root);
+ return -1;
+ }
+
+err_tmpfs: /* also reached on success: removes the self bind mount created above */
+ if (umount2(put_root, MNT_DETACH)) {
+ pr_perror("Can't umount %s", put_root);
+ return -1;
+ }
+
+err_root:
+ if (rmdir(put_root)) {
+ pr_perror("Can't remove the directory %s", put_root);
+ return -1;
+ }
+
+ return exit_code;
+}
+
+/*
+ * Allocate a zeroed mount_info with all of its list heads initialized.
+ * Returns NULL when the allocation fails.
+ */
+struct mount_info *mnt_entry_alloc()
+{
+	struct mount_info *mi;
+
+	/*
+	 * The zero-filled allocation doubles as the MOUNT_INVALID_DEV
+	 * initializer, so that constant has to be zero.
+	 */
+	BUILD_BUG_ON(MOUNT_INVALID_DEV);
+
+	mi = xzalloc(sizeof(struct mount_info));
+	if (!mi)
+		return mi;
+
+	INIT_LIST_HEAD(&mi->children);
+	INIT_LIST_HEAD(&mi->siblings);
+	INIT_LIST_HEAD(&mi->mnt_slave_list);
+	INIT_LIST_HEAD(&mi->mnt_share);
+	INIT_LIST_HEAD(&mi->mnt_bind);
+	INIT_LIST_HEAD(&mi->postpone);
+
+	return mi;
+}
+
+/* Release a mount_info together with the strings it owns; NULL is accepted. */
+void mnt_entry_free(struct mount_info *mi)
+{
+	if (!mi)
+		return;
+
+	xfree(mi->root);
+	xfree(mi->mountpoint);
+	xfree(mi->source);
+	xfree(mi->options);
+	xfree(mi);
+}
+
+/*
+ * Format the path where the root of namespace @ns is re-constructed,
+ * i.e. "<mnt_roots>/<ns id>", into @buf of size @bs.
+ * Returns the snprintf() result.
+ */
+static inline int print_ns_root(struct ns_id *ns, char *buf, int bs)
+{
+	int written;
+
+	written = snprintf(buf, bs, "%s/%d", mnt_roots, ns->id);
+	return written;
+}
+
+/*
+ * Create the temporary "roots yard" directory (.criu.mntns.XXXXXX) inside
+ * opts.root (or "/" when no root is given) and remember its name in
+ * mnt_roots. The working directory is restored before returning.
+ *
+ * Returns 0 on success or when the directory already exists, -1 on error.
+ */
+static int create_mnt_roots(void)
+{
+	int exit_code = -1, cwd_fd;
+	const char *root_dir = opts.root ? : "/";
+
+	if (mnt_roots)
+		return 0;
+
+	/* Remember where we are, the yard name below is relative */
+	cwd_fd = open(".", O_DIRECTORY);
+	if (cwd_fd < 0) {
+		pr_perror("Unable to open cwd");
+		return -1;
+	}
+
+	if (chdir(root_dir)) {
+		pr_perror("Unable to change working directory on %s", root_dir);
+		goto out;
+	}
+
+	mnt_roots = strdup(".criu.mntns.XXXXXX");
+	if (mnt_roots == NULL) {
+		pr_perror("Can't allocate memory");
+		goto out;
+	}
+
+	if (mkdtemp(mnt_roots) == NULL) {
+		pr_perror("Unable to create a temporary directory");
+		/* Don't leak the template when the directory wasn't created */
+		xfree(mnt_roots);
+		mnt_roots = NULL;
+		goto out;
+	}
+
+	exit_code = 0;
+out:
+	if (fchdir(cwd_fd)) {
+		pr_perror("Unable to restore cwd");
+		exit_code = -1;
+	}
+	close(cwd_fd);
+
+	return exit_code;
+}
+
+/*
+ * Register the current process' mount namespace as an ns_id of type @typ,
+ * collect its mounts into mntinfo and flag the namespace as populated.
+ */
+static int rst_collect_local_mntns(enum ns_type typ)
+{
+	struct ns_id *local_ns = rst_new_ns_id(0, getpid(), &mnt_ns_desc, typ);
+
+	if (local_ns == NULL)
+		return -1;
+
+	mntinfo = collect_mntinfo(local_ns, false);
+	if (mntinfo == NULL)
+		return -1;
+
+	futex_set(&local_ns->ns_populated, 1);
+	return 0;
+}
+
+/*
+ * Fill in mi->root from the image entry and, for external mounts, attach
+ * the matching ext_mount mapping to mi->external. Returns 0 or -1.
+ */
+static int get_mp_root(MntEntry *me, struct mount_info *mi)
+{
+	struct ext_mount *em = NULL;
+
+	mi->root = xstrdup(me->root);
+	if (!mi->root)
+		return -1;
+
+	if (me->ext_mount) {
+		/*
+		 * External mount point -- get the reverse mapping
+		 * from the command line and put into root's place
+		 */
+		em = ext_mount_lookup(me->root);
+		if (!em) {
+			if (!opts.autodetect_ext_mounts) {
+				pr_err("No mapping for %s mountpoint\n", me->mountpoint);
+				return -1;
+			}
+
+			/*
+			 * No user supplied mapping was found, so make
+			 * one up for this mount point.
+			 */
+			em = xmalloc(sizeof(struct ext_mount));
+			if (!em)
+				return -1;
+
+			/*
+			 * The key contains a ':', which is invalid on
+			 * the cli, so an autogenerated entry is easy to
+			 * recognize when debugging.
+			 */
+			em->key = AUTODETECTED_MOUNT;
+			em->val = mi->source;
+		}
+
+		mi->external = em;
+	}
+
+	pr_debug("\t\tWill mount %d from %s%s\n",
+			mi->mnt_id, em ? em->val : mi->root, em ? " (E)" : "");
+	return 0;
+}
+
+/*
+ * Build mi->mountpoint as @root followed by the mountpoint from the image.
+ * mi->ns_mountpoint is pointed at the part that follows the root prefix.
+ */
+static int get_mp_mountpoint(MntEntry *me, struct mount_info *mi, char *root, int root_len)
+{
+	int total = strlen(me->mountpoint) + root_len + 1;
+	char *tail;
+
+	mi->mountpoint = xmalloc(total);
+	if (!mi->mountpoint)
+		return -1;
+
+	/*
+	 * The root of a bind-mount could be fixed up here as well,
+	 * but restoring bind-mounts merges the mountpoint and root
+	 * paths together, so that is not needed.
+	 */
+
+	tail = mi->mountpoint + root_len;
+	strcpy(mi->mountpoint, root);
+	strcpy(tail, me->mountpoint);
+
+	mi->ns_mountpoint = tail;
+
+	pr_debug("\t\tWill mount %d @ %s\n", mi->mnt_id, mi->mountpoint);
+	return 0;
+}
+
+/*
+ * Read all mount entries of namespace @nsid from its CR_FD_MNTS image and
+ * push a mount_info for each of them onto the list at @pms.
+ *
+ * Mountpoints are prefixed with "." for the root namespace and with the
+ * namespace's directory in the roots yard for NS_OTHER namespaces.
+ *
+ * Returns 0 on success, -1 on error (including a failure to read the image).
+ */
+static int collect_mnt_from_image(struct mount_info **pms, struct ns_id *nsid)
+{
+	MntEntry *me = NULL;
+	int ret, root_len = 1;
+	struct cr_img *img;
+	char root[PATH_MAX] = ".";
+
+	img = open_image(CR_FD_MNTS, O_RSTR, nsid->id);
+	if (!img)
+		return -1;
+
+	if (nsid->type == NS_OTHER)
+		root_len = print_ns_root(nsid, root, sizeof(root));
+
+	pr_debug("Reading mountpoint images (id %d pid %d)\n",
+		 nsid->id, (int)nsid->ns_pid);
+
+	while (1) {
+		struct mount_info *pm;
+
+		/* NOTE(review): the previous entry is not released here -- confirm ownership */
+		ret = pb_read_one_eof(img, &me, PB_MNT);
+		if (ret <= 0)
+			break;
+
+		pm = mnt_entry_alloc();
+		if (!pm)
+			goto err;
+
+		/* Link the entry in first, so it stays reachable from *pms on error */
+		pm->nsid = nsid;
+		pm->next = *pms;
+		*pms = pm;
+
+		pm->mnt_id		= me->mnt_id;
+		pm->parent_mnt_id	= me->parent_mnt_id;
+		pm->s_dev		= me->root_dev;
+		pm->flags		= me->flags;
+		pm->sb_flags		= me->sb_flags;
+		if (!me->has_sb_flags) {
+			const unsigned int mflags = MS_SHARED | MS_PRIVATE |
+						MS_SLAVE | MS_UNBINDABLE |
+						MS_NOSUID | MS_NODEV | MS_NOEXEC |
+						MS_NOATIME | MS_NODIRATIME | MS_RELATIME;
+
+			/*
+			 * In old images mnt and sb flags are saved together.
+			 * Here we separate them and save the old logic about MS_RDONLY.
+			 */
+
+			pm->sb_flags = pm->flags & ~mflags;
+			pm->flags = pm->flags & mflags;
+		}
+		pm->shared_id		= me->shared_id;
+		pm->master_id		= me->master_id;
+		pm->need_plugin		= me->with_plugin;
+		pm->deleted		= me->deleted;
+		pm->is_ns_root		= is_root(me->mountpoint);
+		if (me->has_internal_sharing)
+			pm->internal_sharing = me->internal_sharing;
+
+		pm->source = xstrdup(me->source);
+		if (!pm->source)
+			goto err;
+
+		pm->options = xstrdup(me->options);
+		if (!pm->options)
+			goto err;
+
+		/* FIXME: abort unsupported early */
+		pm->fstype = decode_fstype(me->fstype, me->fsname);
+
+		if (get_mp_root(me, pm))
+			goto err;
+
+		if (get_mp_mountpoint(me, pm, root, root_len))
+			goto err;
+
+		pr_debug("\tRead %d mp @ %s\n", pm->mnt_id, pm->mountpoint);
+	}
+
+	if (me)
+		mnt_entry__free_unpacked(me, NULL);
+
+	close_image(img);
+
+	/* The loop ends on both EOF (0) and a read failure (< 0) */
+	return ret < 0 ? -1 : 0;
+err:
+	close_image(img);
+	return -1;
+}
+
+/*
+ * Read the mount images of every mount namespace in ns_ids and publish
+ * the resulting list as mntinfo. Returns 0 on success, -1 on error.
+ */
+int read_mnt_ns_img(void)
+{
+	struct mount_info *collected = NULL;
+	struct ns_id *ns = ns_ids;
+
+	while (ns != NULL) {
+		if (ns->nd == &mnt_ns_desc &&
+		    collect_mnt_from_image(&collected, ns))
+			return -1;
+
+		ns = ns->next;
+	}
+
+	mntinfo = collected;
+	return 0;
+}
+
+/*
+ * Write into @path the root under which mount @mnt_id is restored: the
+ * namespace's directory in the roots yard for NS_OTHER namespaces, "/"
+ * otherwise. Returns the length of the path, or -1 for an unknown mnt_id.
+ */
+int rst_get_mnt_root(int mnt_id, char *path, int plen)
+{
+	if ((root_ns_mask & CLONE_NEWNS) && mnt_id != -1) {
+		struct mount_info *m = lookup_mnt_id(mnt_id);
+
+		if (m == NULL)
+			return -1;
+
+		if (m->nsid->type == NS_OTHER)
+			return print_ns_root(m->nsid, path, plen);
+	}
+
+	/* No mntns restore, no mnt_id, or a mount of a non-"other" namespace */
+	path[0] = '/';
+	path[1] = '\0';
+	return 1;
+}
+
+/*
+ * Create the roots yard if at least one mount namespace other than the
+ * root one is going to be restored.
+ */
+int mntns_maybe_create_roots(void)
+{
+	struct ns_id *ns;
+
+	if (!(root_ns_mask & CLONE_NEWNS))
+		return 0;
+
+	for (ns = ns_ids; ns != NULL; ns = ns->next) {
+		if (ns->nd != &mnt_ns_desc || ns->type == NS_ROOT)
+			continue;
+
+		BUG_ON(ns->type == NS_CRIU);
+
+		/*
+		 * There is more than one (root) namespace,
+		 * so the roots yard is needed.
+		 */
+		return create_mnt_roots();
+	}
+
+	/* No "other" mntns found, just go ahead, we don't need roots yard. */
+	return 0;
+}
+
+/*
+ * Enter mount namespace @nsid through the descriptor the root task holds
+ * for it. The task owning the namespace (nsid->ns_pid) also marks it as
+ * populated and wakes the waiters.
+ */
+static int do_restore_task_mnt_ns(struct ns_id *nsid, struct pstree_item *current)
+{
+	bool failed;
+	int fd;
+
+	fd = open_proc(root_item->pid.virt, "fd/%d", nsid->mnt.ns_fd);
+	if (fd < 0)
+		return -1;
+
+	failed = setns(fd, CLONE_NEWNS) != 0;
+	if (failed)
+		pr_perror("Can't restore mntns");
+	close(fd);
+
+	if (failed)
+		return -1;
+
+	if (nsid->ns_pid == current->pid.virt)
+		futex_set_and_wake(&nsid->ns_populated, 1);
+
+	return 0;
+}
+
+/*
+ * Move @current into its target mount namespace if that differs from
+ * its parent's. Returns 0 on success, -1 on error.
+ */
+int restore_task_mnt_ns(struct pstree_item *current)
+{
+	unsigned int id;
+	struct ns_id *nsid;
+
+	if (!current->ids || !current->ids->has_mnt_ns_id)
+		return 0;
+
+	id = current->ids->mnt_ns_id;
+
+	/*
+	 * Regardless of the namespace a task wants to
+	 * live in, by that point they all will live in
+	 * root's one (see prepare_pstree_kobj_ids() +
+	 * get_clone_mask()). So if the current task's
+	 * target namespace is the root's one -- it's
+	 * already there, otherwise it will have to do
+	 * setns().
+	 */
+	if (!current->parent || id == current->parent->ids->mnt_ns_id)
+		return 0;
+
+	nsid = lookup_ns_by_id(id, &mnt_ns_desc);
+	if (nsid == NULL) {
+		pr_err("Can't find mount namespace %d\n", id);
+		return -1;
+	}
+
+	BUG_ON(nsid->type == NS_CRIU);
+
+	return do_restore_task_mnt_ns(nsid, current) ? -1 : 0;
+}
+
+/*
+ * Close the descriptors kept for every mount namespace: ns_fd always,
+ * root_fd for all but the root namespace.
+ */
+void fini_restore_mntns(void)
+{
+	struct ns_id *ns;
+
+	if (!(root_ns_mask & CLONE_NEWNS))
+		return;
+
+	for (ns = ns_ids; ns != NULL; ns = ns->next) {
+		if (ns->nd == &mnt_ns_desc) {
+			close(ns->mnt.ns_fd);
+			if (ns->type != NS_ROOT)
+				close(ns->mnt.root_fd);
+		}
+	}
+}
+
+/*
+ * All nested mount namespaces are restored as sub-trees of the root
+ * namespace: set up the yard and create one directory per mount namespace
+ * in it.
+ */
+static int populate_roots_yard(void)
+{
+	struct ns_id *ns;
+	char ns_root[PATH_MAX];
+
+	if (!mnt_roots)
+		return 0;
+
+	if (make_yard(mnt_roots))
+		return -1;
+
+	for (ns = ns_ids; ns; ns = ns->next) {
+		if (ns->nd != &mnt_ns_desc)
+			continue;
+
+		print_ns_root(ns, ns_root, sizeof(ns_root));
+		if (mkdir(ns_root, 0600) != 0) {
+			pr_perror("Unable to create %s", ns_root);
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Build the mount tree from mntinfo, resolve and validate it, then mount
+ * everything: root properties first, the roots yard next, the rest of the
+ * tree last.
+ */
+static int populate_mnt_ns(void)
+{
+	struct mount_info *yard = NULL;
+	struct mount_info *tree;
+	struct ns_id *ns;
+
+	if (mnt_roots) {
+		/* mnt_roots is a tmpfs mount and it's private */
+		yard = mnt_entry_alloc();
+		if (yard == NULL)
+			return -1;
+
+		yard->mountpoint = mnt_roots;
+		yard->mounted = true;
+	}
+
+	tree = mnt_build_tree(mntinfo, yard);
+	if (tree == NULL)
+		return -1;
+
+	if (resolve_shared_mounts(mntinfo, tree->master_id))
+		return -1;
+
+	/*
+	 * Every namespace gets the very same tree, so that
+	 * manual paths resolution works on all of them.
+	 */
+	for (ns = ns_ids; ns != NULL; ns = ns->next)
+		if (ns->nd == &mnt_ns_desc)
+			ns->mnt.mntinfo_tree = tree;
+
+	if (validate_mounts(mntinfo, false))
+		return -1;
+
+	/*
+	 * Set properties for the root before mounting a root yard,
+	 * otherwise the root yard can be propagated into the host
+	 * mntns and remain there.
+	 */
+	if (do_mount_root(tree))
+		return -1;
+
+	if (populate_roots_yard())
+		return -1;
+
+	return mnt_tree_for_each(tree, do_mount_one);
+}
+
+/*
+ * Make the roots yard private and lazily unmount it.
+ * Returns 0 on success and 1 if any of the steps failed.
+ */
+int depopulate_roots_yard(void)
+{
+	int failed = 0;
+
+	if (!mnt_roots)
+		return 0;
+
+	if (mount("none", mnt_roots, "none", MS_REC|MS_PRIVATE, NULL) != 0) {
+		pr_perror("Can't remount root with MS_PRIVATE");
+		failed = 1;
+	}
+
+	/*
+	 * Keep going after the first error, because this function
+	 * can be used to roll back in an error case.
+	 * MNT_DETACH is fine here: files are restored after this point,
+	 * so nothing gets restored from a wrong mount namespace.
+	 */
+	if (umount2(mnt_roots, MNT_DETACH) != 0) {
+		pr_perror("Can't unmount %s", mnt_roots);
+		failed = 1;
+	}
+
+	return failed;
+}
+
+/* Remove the (by now empty) roots yard directory from under the root. */
+void cleanup_mnt_ns(void)
+{
+	char path[PATH_MAX];
+	char *root;
+
+	if (!mnt_roots)
+		return;
+
+	root = opts.root ? : "/";
+	snprintf(path, sizeof(path), "%s/%s", root, mnt_roots);
+
+	if (rmdir(path) != 0)
+		pr_perror("Can't remove the directory %s", mnt_roots);
+}
+
+/*
+ * Restore the mount namespaces.
+ *
+ * Without CLONE_NEWNS in root_ns_mask only the local mounts are collected.
+ * Otherwise the inherited mounts are either unmounted one by one (no
+ * opts.root) or left to go away with the later pivot_root (opts.root set),
+ * the mount tree is populated, and then for every non-root mount namespace
+ * a new namespace is unshared, pivoted into its directory in the roots
+ * yard and pinned with file descriptors.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+int prepare_mnt_ns(void)
+{
+ int ret = -1, rst = -1;
+ struct mount_info *old;
+ struct ns_id ns = { .type = NS_CRIU, .ns_pid = PROC_SELF, .nd = &mnt_ns_desc };
+ struct ns_id *nsid;
+
+ if (!(root_ns_mask & CLONE_NEWNS))
+ return rst_collect_local_mntns(NS_CRIU);
+
+ pr_info("Restoring mount namespace\n");
+
+ old = collect_mntinfo(&ns, false);
+ if (old == NULL)
+ return -1;
+
+ if (!opts.root) {
+ if (chdir("/")) {
+ pr_perror("chdir(\"/\") failed");
+ return -1;
+ }
+
+ /*
+ * The new mount namespace is filled with the mountpoint
+ * clones from the original one. We have to umount them
+ * prior to recreating new ones.
+ */
+ pr_info("Cleaning mount namespace\n");
+ if (mnt_tree_for_each_reverse(ns.mnt.mntinfo_tree, do_umount_one))
+ return -1;
+ } else {
+ struct mount_info *mi;
+
+ /*
+ * The whole tree of mountpoints is to be moved into one
+ * place with the pivot_root() call. Don't do manual
+ * umount (as we do above), all this stuff will go away
+ * with a single umount call later.
+ */
+
+ /* moving a mount residing under a shared mount is invalid. */
+ mi = mount_resolve_path(ns.mnt.mntinfo_tree, opts.root);
+ if (mi == NULL) {
+ pr_err("Unable to find mount point for %s\n", opts.root);
+ return -1;
+ }
+ if (mi->parent == NULL) {
+ pr_err("New root and old root are the same\n");
+ return -1;
+ }
+
+ /* Our root is mounted over the parent (in the same directory) */
+ if (!strcmp(mi->parent->mountpoint, mi->mountpoint)) {
+ pr_err("The parent of the new root is unreachable\n");
+ return -1;
+ }
+
+ if (mount("none", mi->parent->mountpoint + 1, "none", MS_SLAVE, NULL)) { /* + 1 skips the first character of the path */
+ pr_perror("Can't remount the parent of the new root with MS_SLAVE");
+ return -1;
+ }
+
+ /* Unprivileged users can't reveal what is under a mount */
+ if (root_ns_mask & CLONE_NEWUSER) {
+ if (mount(opts.root, opts.root, NULL, MS_BIND | MS_REC, NULL)) {
+ pr_perror("Can't remount bind-mount %s into itself", opts.root);
+ return -1;
+ }
+ }
+ if (chdir(opts.root)) {
+ pr_perror("chdir(%s) failed", opts.root ? : "/");
+ return -1;
+ }
+ }
+
+ free_mntinfo(old);
+
+ ret = populate_mnt_ns();
+ if (!ret && opts.root)
+ ret = cr_pivot_root(NULL);
+ if (ret)
+ return -1;
+
+ rst = open_proc(PROC_SELF, "ns/mnt"); /* used to get back here after each unshare() below */
+ if (rst < 0)
+ return -1;
+
+ /* restore non-root namespaces */
+ for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) {
+ char path[PATH_MAX];
+
+ if (nsid->nd != &mnt_ns_desc)
+ continue;
+ if (nsid->type == NS_ROOT) {
+ /* Pin one with a file descriptor */
+ nsid->mnt.ns_fd = open_proc(PROC_SELF, "ns/mnt");
+ if (nsid->mnt.ns_fd < 0)
+ goto err;
+ /* we set ns_populated so we don't need to open root_fd */
+ futex_set(&nsid->ns_populated, 1);
+ continue;
+ }
+
+ /* Create the new mount namespace */
+ if (unshare(CLONE_NEWNS)) {
+ pr_perror("Unable to create a new mntns");
+ goto err;
+ }
+
+ /* Set its root */
+ path[0] = '/';
+ print_ns_root(nsid, path + 1, sizeof(path) - 1);
+ if (cr_pivot_root(path))
+ goto err;
+
+ /* Pin one with a file descriptor */
+ nsid->mnt.ns_fd = open_proc(PROC_SELF, "ns/mnt");
+ if (nsid->mnt.ns_fd < 0)
+ goto err;
+
+ /* root_fd is used to restore file mappings */
+ nsid->mnt.root_fd = open_proc(PROC_SELF, "root");
+ if (nsid->mnt.root_fd < 0)
+ goto err;
+
+ /* And return back to regain the access to the roots yard */
+ if (setns(rst, CLONE_NEWNS)) {
+ pr_perror("Can't restore mntns back");
+ goto err;
+ }
+ }
+ close(rst);
+
+ return ret;
+err:
+ if (rst >= 0)
+ restore_ns(rst, &mnt_ns_desc); /* switches back and closes rst */
+ return -1;
+}
+
+/* Pid whose root is currently installed as the ROOT_FD_OFF service fd */
+static int mntns_root_pid = -1;
+/*
+ * Install @fd as the ROOT_FD_OFF service descriptor on behalf of @pid.
+ * @fd is closed in any case; the result of install_service_fd() is returned.
+ */
+static int mntns_set_root_fd(pid_t pid, int fd)
+{
+	int sfd = install_service_fd(ROOT_FD_OFF, fd);
+
+	if (sfd >= 0)
+		mntns_root_pid = pid;
+
+	close(fd);
+	return sfd;
+}
+
+/*
+ * Get the service descriptor that refers to the root of the mount
+ * namespace of task @pid, opening and installing it if it is not the one
+ * currently cached.
+ *
+ * Returns the service fd, or a negative value on error.
+ */
+int __mntns_get_root_fd(pid_t pid)
+{
+
+	int fd, pfd;
+	int ret;
+	char path[PATH_MAX + 1];
+
+	if (mntns_root_pid == pid) /* The required root is already opened */
+		return get_service_fd(ROOT_FD_OFF);
+
+	close_service_fd(ROOT_FD_OFF);
+
+	if (!(root_ns_mask & CLONE_NEWNS)) {
+		/*
+		 * If criu and tasks we dump live in the same mount
+		 * namespace, we can just open the root directory.
+		 * All paths resolution would occur relative to criu's
+		 * root. Even if it is not namespace's root, provided
+		 * file paths are resolved, we'd get consistent dump.
+		 */
+		fd = open("/", O_RDONLY | O_DIRECTORY);
+		if (fd < 0) {
+			pr_perror("Can't open root");
+			return -1;
+		}
+
+		goto set_root;
+	}
+
+	/*
+	 * If /proc/pid/root links to '/', it means that the root of the task
+	 * and the root of the mntns are the same.
+	 */
+
+	pfd = open_pid_proc(pid);
+	if (pfd < 0)
+		return -1;
+
+	ret = readlinkat(pfd, "root", path, sizeof(path) - 1);
+	if (ret < 0) {
+		pr_perror("Can't readlink the root of %d", pid);
+		close_pid_proc();
+		return ret;
+	}
+
+	path[ret] = '\0';
+
+	if (ret != 1 || path[0] != '/') {
+		pr_err("The root task has another root than mntns: %s\n", path);
+		close_pid_proc();
+		return -1;
+	}
+
+	fd = openat(pfd, "root", O_RDONLY | O_DIRECTORY, 0);
+	close_pid_proc();
+	if (fd < 0) {
+		pr_perror("Can't open the task root");
+		return -1;
+	}
+
+set_root:
+	return mntns_set_root_fd(pid, fd);
+}
+
+/*
+ * Get a descriptor for the root of mount namespace @mntns.
+ *
+ * All namespaces are restored from the root task, and during the
+ * CR_STATE_FORKING stage the root task holds two file descriptors per
+ * mntns: one for the namespace itself and one for its root.
+ *
+ * A forked non-root task enters its mount namespace, restores private
+ * mappings and forks children. Some of those mappings may refer to files
+ * from other namespaces.
+ *
+ * After CR_STATE_FORKING the root task has to close all the mntns
+ * descriptors to restore its own ones; by then all tasks live in their
+ * mount namespaces.
+ *
+ * Hence a namespace that is not populated yet gets its root from the
+ * root task.
+ */
+int mntns_get_root_fd(struct ns_id *mntns) {
+	int fd;
+
+	if (futex_get(&mntns->ns_populated))
+		return __mntns_get_root_fd(mntns->ns_pid);
+
+	fd = open_proc(root_item->pid.virt, "fd/%d", mntns->mnt.root_fd);
+	if (fd < 0)
+		return -1;
+
+	return mntns_set_root_fd(mntns->ns_pid, fd);
+}
+
+/*
+ * Find the mount namespace that mount @mnt_id belongs to.
+ * Returns NULL when no such mount is known.
+ */
+struct ns_id *lookup_nsid_by_mnt_id(int mnt_id)
+{
+	struct mount_info *mi;
+
+	/*
+	 * Kernels before 3.15 don't show mnt_id for file descriptors,
+	 * and mnt_id isn't saved for files if the mntns isn't dumped.
+	 * In both cases there is only one root, so it does not
+	 * matter which mount is picked.
+	 */
+	mi = (mnt_id == -1) ? mntinfo : lookup_mnt_id(mnt_id);
+	if (!mi)
+		return NULL;
+
+	return mi->nsid;
+}
+
+/* Root descriptor of the namespace owning mount @mnt_id; the mount must exist. */
+int mntns_get_root_by_mnt_id(int mnt_id)
+{
+	struct ns_id *owner = lookup_nsid_by_mnt_id(mnt_id);
+
+	BUG_ON(owner == NULL);
+	return mntns_get_root_fd(owner);
+}
+
+struct collect_mntns_arg { /* state passed to the collect_mntns() callback */
+ bool need_to_validate; /* set once a non-criu namespace was collected for dump */
+ bool for_dump; /* forwarded to collect_mntinfo() */
+ int root_master_id; /* master_id of the root task's mount tree */
+};
+
+/*
+ * walk_namespaces() callback: collect the mounts of @ns, add them to the
+ * global list and keep the validation state in @__arg up to date.
+ */
+static int collect_mntns(struct ns_id *ns, void *__arg)
+{
+	struct collect_mntns_arg *arg = __arg;
+	struct mount_info *mounts = collect_mntinfo(ns, arg->for_dump);
+
+	if (mounts == NULL)
+		return -1;
+
+	if (arg->for_dump && ns->type != NS_CRIU)
+		arg->need_to_validate = true;
+
+	mntinfo_add_list(mounts);
+
+	if (!arg->need_to_validate)
+		return 0;
+
+	/* Remember the master_id of the root task's tree */
+	if (ns->id == root_item->ids->mnt_ns_id)
+		arg->root_master_id = ns->mnt.mntinfo_tree->master_id;
+
+	return 0;
+}
+
+/*
+ * Collect the mounts of all mount namespaces, resolve external mounts
+ * and, if any namespace requires it, resolve shared mounts and validate
+ * the result.
+ *
+ * Returns 0 on success, non-zero on error.
+ */
+int collect_mnt_namespaces(bool for_dump)
+{
+	/*
+	 * root_master_id is only filled in by collect_mntns() when it meets
+	 * the root task's namespace, so give it a defined value up front.
+	 */
+	struct collect_mntns_arg arg = {
+		.for_dump		= for_dump,
+		.need_to_validate	= false,
+		.root_master_id		= 0,
+	};
+	int ret;
+
+	ret = walk_namespaces(&mnt_ns_desc, collect_mntns, &arg);
+	if (ret)
+		goto err;
+
+	ret = resolve_external_mounts(mntinfo);
+	if (ret)
+		goto err;
+
+	if (arg.need_to_validate) {
+		ret = -1;
+
+		if (resolve_shared_mounts(mntinfo, arg.root_master_id))
+			goto err;
+		if (validate_mounts(mntinfo, true))
+			goto err;
+	}
+
+	ret = 0;
+err:
+	return ret;
+}
+
+/*
+ * Dump every mount namespace except criu's own one.
+ * Nested namespaces are refused when check_mnt_id() reports a problem.
+ */
+int dump_mnt_namespaces(void)
+{
+	struct ns_id *ns;
+
+	if (!(root_ns_mask & CLONE_NEWNS))
+		return 0;
+
+	for (ns = ns_ids; ns; ns = ns->next) {
+		if (ns->nd != &mnt_ns_desc)
+			continue;
+		if (ns->type == NS_CRIU)
+			continue;
+
+		if (ns->type == NS_OTHER && check_mnt_id()) {
+			pr_err("Nested mount namespaces are not supported "
+				"without mnt_id in fdinfo\n");
+			return -1;
+		}
+
+		if (dump_mnt_ns(ns, ns->mnt.mntinfo_list))
+			return -1;
+	}
+
+	return 0;
+}
+
+struct ns_desc mnt_ns_desc = NS_DESC_ENTRY(CLONE_NEWNS, "mnt");
diff --git a/criu/namespaces.c b/criu/namespaces.c
new file mode 100644
index 000000000000..9a7836bcad89
--- /dev/null
+++ b/criu/namespaces.c
@@ -0,0 +1,1403 @@
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/wait.h>
+#include <stdlib.h>
+#include <sys/prctl.h>
+#include <grp.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <stdarg.h>
+#include <signal.h>
+#include <sched.h>
+
+#include "cr-show.h"
+#include "util.h"
+#include "imgset.h"
+#include "uts_ns.h"
+#include "ipc_ns.h"
+#include "mount.h"
+#include "pstree.h"
+#include "namespaces.h"
+#include "net.h"
+
+#include "protobuf.h"
+#include "protobuf/ns.pb-c.h"
+#include "protobuf/userns.pb-c.h"
+
+static struct ns_desc *ns_desc_array[] = { /* descriptors check_ns_proc() tries in turn */
+ &net_ns_desc,
+ &uts_ns_desc,
+ &ipc_ns_desc,
+ &pid_ns_desc,
+ &user_ns_desc,
+ &mnt_ns_desc,
+};
+
+/*
+ * Parse a namespace link of the form "<name>:[<id>]" for descriptor @d.
+ *
+ * @link is the link text and @len its length. Returns the numeric id, or
+ * 0 when the text is not a link of that namespace type.
+ */
+static unsigned int parse_ns_link(char *link, size_t len, struct ns_desc *d)
+{
+	unsigned long kid;
+	char *end;
+
+	/* There has to be room for at least "<name>:[" */
+	if (len < d->len + 2)
+		return 0;
+
+	if (link[d->len] != ':' || memcmp(link, d->str, d->len))
+		return 0;
+
+	/* Don't skip over an arbitrary character, insist on the bracket */
+	if (link[d->len + 1] != '[')
+		return 0;
+
+	kid = strtoul(&link[d->len + 2], &end, 10);
+	if (*end != ']')
+		return 0;
+
+	BUG_ON(kid > UINT_MAX);
+
+	return (unsigned int)kid;
+}
+
+/*
+ * Check whether @link refers to a namespace file. On a match the
+ * descriptor and the id are stored in @link and true is returned.
+ */
+bool check_ns_proc(struct fd_link *link)
+{
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(ns_desc_array); i++) {
+		struct ns_desc *desc = ns_desc_array[i];
+		unsigned int kid;
+
+		/* The first character of the name is skipped */
+		kid = parse_ns_link(link->name + 1, link->len - 1, desc);
+		if (kid != 0) {
+			link->ns_d = desc;
+			link->ns_kid = kid;
+			return true;
+		}
+	}
+
+	return false;
+}
+
+/*
+ * Switch into the namespace of type @nd that task @pid lives in.
+ *
+ * When @rst is not NULL, a descriptor of the current namespace is stored
+ * there first so that the caller can get back with restore_ns().
+ *
+ * Returns 0 on success, -1 on error.
+ */
+int switch_ns(int pid, struct ns_desc *nd, int *rst)
+{
+	char buf[32];
+	int nsfd;
+	int ret = -1;
+
+	nsfd = open_proc(pid, "ns/%s", nd->str);
+	if (nsfd < 0) {
+		pr_perror("Can't open %s ns file", nd->str);
+		goto err_ns;
+	}
+
+	if (rst) {
+		snprintf(buf, sizeof(buf), "/proc/self/ns/%s", nd->str);
+		*rst = open(buf, O_RDONLY);
+		if (*rst < 0) {
+			pr_perror("Can't open ns file");
+			goto err_rst;
+		}
+	}
+
+	ret = setns(nsfd, nd->cflag);
+	if (ret < 0) {
+		pr_perror("Can't setns %d/%s", pid, nd->str);
+		goto err_set;
+	}
+
+	close(nsfd);
+	return 0;
+
+err_set:
+	if (rst)
+		close(*rst);
+err_rst:
+	close(nsfd);
+err_ns:
+	return -1;
+}
+
+/*
+ * Switch back into the namespace referred to by @rst and close @rst.
+ * Returns the setns() result.
+ */
+int restore_ns(int rst, struct ns_desc *nd)
+{
+	int err = setns(rst, nd->cflag);
+
+	if (err < 0)
+		pr_perror("Can't restore ns back");
+
+	/* The descriptor is consumed whether or not the switch worked */
+	close(rst);
+	return err;
+}
+
+struct ns_id *ns_ids = NULL;
+static unsigned int ns_next_id = 1;
+unsigned long root_ns_mask = 0;
+
+/* Fill in the identity of @ns and push it onto the head of ns_ids. */
+static void nsid_add(struct ns_id *ns, struct ns_desc *nd, unsigned int id, pid_t pid)
+{
+	ns->id = id;
+	ns->ns_pid = pid;
+	ns->nd = nd;
+
+	ns->next = ns_ids;
+	ns_ids = ns;
+
+	pr_info("Add %s ns %d pid %d\n", nd->str, ns->id, ns->ns_pid);
+}
+
+/*
+ * Allocate (with shmalloc()) and register a not yet populated ns_id for
+ * the restore stage. Returns NULL when the allocation fails.
+ */
+struct ns_id *rst_new_ns_id(unsigned int id, pid_t pid,
+		struct ns_desc *nd, enum ns_type type)
+{
+	struct ns_id *fresh = shmalloc(sizeof(*fresh));
+
+	if (!fresh)
+		return NULL;
+
+	fresh->type = type;
+	nsid_add(fresh, nd, id, pid);
+	futex_set(&fresh->ns_populated, 0);
+
+	return fresh;
+}
+
+/*
+ * Make sure there is an ns_id for namespace @id of type @nd. An existing
+ * one may get its ns_pid replaced by the pid of @i, as decided by
+ * pid_rst_prio(); a new one is typed NS_ROOT for the root item and
+ * NS_OTHER otherwise.
+ */
+int rst_add_ns_id(unsigned int id, struct pstree_item *i, struct ns_desc *nd)
+{
+	pid_t pid = i->pid.virt;
+	struct ns_id *known = lookup_ns_by_id(id, nd);
+	enum ns_type type;
+
+	if (known) {
+		if (pid_rst_prio(pid, known->ns_pid))
+			known->ns_pid = pid;
+		return 0;
+	}
+
+	type = (i == root_item) ? NS_ROOT : NS_OTHER;
+
+	return rst_new_ns_id(id, pid, nd, type) ? 0 : -1;
+}
+
+/* Find the ns_id of type @nd with the given kid; NULL if there is none. */
+static struct ns_id *lookup_ns_by_kid(unsigned int kid, struct ns_desc *nd)
+{
+	struct ns_id *cur = ns_ids;
+
+	while (cur != NULL) {
+		if (cur->kid == kid && cur->nd == nd)
+			break;
+		cur = cur->next;
+	}
+
+	return cur;
+}
+
+/* Find the ns_id of type @nd with the given id; NULL if there is none. */
+struct ns_id *lookup_ns_by_id(unsigned int id, struct ns_desc *nd)
+{
+	struct ns_id *cur = ns_ids;
+
+	while (cur != NULL) {
+		if (cur->id == id && cur->nd == nd)
+			break;
+		cur = cur->next;
+	}
+
+	return cur;
+}
+
+/*
+ * For every namespace type we support there are two supported
+ * tasks-to-namespaces layouts.
+ *
+ * If the root task lives in the same namespace as criu does,
+ * all the other tasks should live in it too and we do NOT dump
+ * this namespace. On restore the tasks inherit the respective
+ * namespace from criu.
+ *
+ * If the root task lives in its own namespace, then all the other
+ * tasks may live in it. Sometimes (CLONE_SUBNS) there can
+ * be more than one namespace of that type. In this case
+ * we dump the info of all the namespaces and recreate them on restore.
+ *
+ * walk_namespaces() calls @cb for the namespaces of type @nd accordingly:
+ * criu's own namespace is skipped when the type is in root_ns_mask, and
+ * ends the walk otherwise. A non-zero result of @cb stops the walk and
+ * is returned.
+ */
+
+int walk_namespaces(struct ns_desc *nd, int (*cb)(struct ns_id *, void *), void *oarg)
+{
+	struct ns_id *ns;
+	int ret = 0;
+
+	for (ns = ns_ids; ns != NULL; ns = ns->next) {
+		bool criu_ns;
+
+		if (ns->nd != nd)
+			continue;
+
+		criu_ns = (ns->type == NS_CRIU);
+		if (criu_ns && (root_ns_mask & nd->cflag))
+			continue;
+
+		ret = cb(ns, oarg);
+		if (ret || criu_ns)
+			break;
+	}
+
+	return ret;
+}
+
+/*
+ * Return the id of the namespace with kernel id @kid, registering a new
+ * ns_id for it when it is seen for the first time. The ns_id is also
+ * stored in @ns_ret when that is not NULL. Returns 0 on error.
+ */
+static unsigned int generate_ns_id(int pid, unsigned int kid, struct ns_desc *nd,
+		struct ns_id **ns_ret)
+{
+	struct ns_id *nsid = lookup_ns_by_kid(kid, nd);
+
+	if (!nsid) {
+		enum ns_type type;
+
+		if (pid == getpid()) {
+			type = NS_CRIU;
+		} else if (pid == root_item->pid.real) {
+			/* The root task lives apart from criu: take the ns into the image */
+			BUG_ON(root_ns_mask & nd->cflag);
+			pr_info("Will take %s namespace in the image\n", nd->str);
+			root_ns_mask |= nd->cflag;
+			type = NS_ROOT;
+		} else if (nd->cflag & ~CLONE_SUBNS) {
+			pr_err("Can't dump nested %s namespace for %d\n",
+					nd->str, pid);
+			return 0;
+		} else {
+			type = NS_OTHER;
+		}
+
+		nsid = xmalloc(sizeof(*nsid));
+		if (!nsid)
+			return 0;
+
+		nsid->type = type;
+		nsid->kid = kid;
+		futex_set(&nsid->ns_populated, 1);
+		nsid_add(nsid, nd, ns_next_id++, pid);
+	}
+
+	if (ns_ret)
+		*ns_ret = nsid;
+	return nsid->id;
+}
+
+/*
+ * Read the ns/<type> link of task @pid, extract the kernel id of the
+ * namespace and turn it into an id with generate_ns_id(). A missing link
+ * (ENOENT) is treated as an unsupported namespace with kernel id 0.
+ *
+ * Returns the id, or 0 on error.
+ */
+static unsigned int __get_ns_id(int pid, struct ns_desc *nd, struct ns_id **ns)
+{
+	int proc_dir, ret;
+	unsigned int kid;
+	char ns_path[32], ns_id[32];
+
+	proc_dir = open_pid_proc(pid);
+	if (proc_dir < 0)
+		return 0;
+
+	/* A truncated name would silently look like an unsupported namespace */
+	ret = snprintf(ns_path, sizeof(ns_path), "ns/%s", nd->str);
+	if (ret < 0 || (size_t)ret >= sizeof(ns_path)) {
+		pr_err("Too long namespace name %s\n", nd->str);
+		return 0;
+	}
+
+	ret = readlinkat(proc_dir, ns_path, ns_id, sizeof(ns_id) - 1);
+	if (ret < 0) {
+		if (errno == ENOENT) {
+			/* The namespace is unsupported */
+			kid = 0;
+			goto out;
+		}
+		pr_perror("Can't readlink ns link");
+		return 0;
+	}
+	ns_id[ret] = '\0';
+
+	kid = parse_ns_link(ns_id, ret, nd);
+	BUG_ON(!kid);
+
+out:
+	return generate_ns_id(pid, kid, nd, ns);
+}
+
+/* Same as __get_ns_id() for callers that don't need the ns_id object itself */
+static unsigned int get_ns_id(int pid, struct ns_desc *nd)
+{
+ return __get_ns_id(pid, nd, NULL);
+}
+
+/*
+ * Write an NsFileEntry for a descriptor that points at a namespace file.
+ *
+ * The namespace is looked up by the kernel ID and descriptor stored in
+ * p->link; the entry records the internal namespace ID, its clone flag
+ * and the file flags.
+ *
+ * Returns the pb_write_one() result, or -1 if the namespace is unknown.
+ */
+int dump_one_ns_file(int lfd, u32 id, const struct fd_parms *p)
+{
+	struct cr_img *img = img_from_set(glob_imgset, CR_FD_NS_FILES);
+	NsFileEntry entry = NS_FILE_ENTRY__INIT;
+	struct fd_link *lnk = p->link;
+	struct ns_id *found = lookup_ns_by_kid(lnk->ns_kid, lnk->ns_d);
+
+	if (found == NULL) {
+		pr_err("No NS ID with kid %u\n", lnk->ns_kid);
+		return -1;
+	}
+
+	entry.id = id;
+	entry.ns_id = found->id;
+	entry.ns_cflag = lnk->ns_d->cflag;
+	entry.flags = p->flags;
+
+	return pb_write_one(img, &entry, PB_NS_FILE);
+}
+
+/* Dump-side ops for FD_TYPES__NS descriptors; entries are written by dump_one_ns_file() */
+const struct fdtype_ops nsfile_dump_ops = {
+ .type = FD_TYPES__NS,
+ .dump = dump_one_ns_file,
+};
+
+/* Restore-side record of one namespace file, filled in by collect_one_nsfile() */
+struct ns_file_info {
+ struct file_desc d; /* generic file descriptor part, registered with file_desc_add() */
+ NsFileEntry *nfe; /* image entry this file was collected from */
+};
+
+/*
+ * Open a namespace file on restore.
+ *
+ * Scans the process tree for the first task one of whose namespace IDs
+ * equals the one recorded in the image entry, checks that the namespace
+ * kind agrees with the recorded clone flag and opens
+ * /proc/<virt pid>/ns/<name> with the recorded flags.
+ *
+ * Returns the new descriptor, or a negative value on failure.
+ */
+static int open_ns_fd(struct file_desc *d)
+{
+	struct ns_file_info *nfi = container_of(d, struct ns_file_info, d);
+	NsFileEntry *nfe = nfi->nfe;
+	struct pstree_item *item = NULL, *t;
+	struct ns_desc *nd = NULL;
+	char path[64];
+	int fd;
+
+	/*
+	 * Find out who can open us.
+	 *
+	 * FIXME I need a hash or RBtree here.
+	 */
+	for_each_pstree_item(t) {
+		TaskKobjIdsEntry *ids = t->ids;
+
+		if (ids->pid_ns_id == nfe->ns_id)
+			nd = &pid_ns_desc;
+		else if (ids->net_ns_id == nfe->ns_id)
+			nd = &net_ns_desc;
+		else if (ids->ipc_ns_id == nfe->ns_id)
+			nd = &ipc_ns_desc;
+		else if (ids->uts_ns_id == nfe->ns_id)
+			nd = &uts_ns_desc;
+		else if (ids->mnt_ns_id == nfe->ns_id)
+			nd = &mnt_ns_desc;
+		else
+			continue;
+
+		item = t;
+		break;
+	}
+
+	if (!nd || !item) {
+		pr_err("Can't find suitable NS ID for %#x\n", nfe->ns_id);
+		return -1;
+	}
+
+	if (nd->cflag != nfe->ns_cflag) {
+		pr_err("Clone flag mismatch for %#x\n", nfe->ns_id);
+		return -1;
+	}
+
+	snprintf(path, sizeof(path) - 1, "/proc/%d/ns/%s", item->pid.virt, nd->str);
+	path[sizeof(path) - 1] = '\0';
+
+	fd = open(path, nfe->flags);
+	if (fd < 0)
+		pr_perror("Can't open file %s on restore", path);
+
+	return fd;
+}
+
+/* Restore-side ops for FD_TYPES__NS descriptors; files are opened by open_ns_fd() */
+static struct file_desc_ops ns_desc_ops = {
+ .type = FD_TYPES__NS,
+ .open = open_ns_fd,
+};
+
+/*
+ * Image collect callback: attach the decoded NsFileEntry to the
+ * ns_file_info in @o and register it as a file descriptor under the
+ * entry's ID.
+ */
+static int collect_one_nsfile(void *o, ProtobufCMessage *base)
+{
+	struct ns_file_info *info = o;
+	NsFileEntry *entry = pb_msg(base, NsFileEntry);
+
+	info->nfe = entry;
+	pr_info("Collected ns file ID %#x NS-ID %#x\n", entry->id, entry->ns_id);
+
+	return file_desc_add(&info->d, entry->id, &ns_desc_ops);
+}
+
+/* Describes how CR_FD_NS_FILES entries are collected: one ns_file_info per PB_NS_FILE record */
+struct collect_image_info nsfile_cinfo = {
+ .fd_type = CR_FD_NS_FILES,
+ .pb_type = PB_NS_FILE,
+ .priv_size = sizeof(struct ns_file_info),
+ .collect = collect_one_nsfile,
+};
+
+/*
+ * Same as dump_task_ns_ids(), but
+ * a) doesn't keep IDs (don't need them)
+ * b) generates them for mount and netns only
+ *    mnt ones are needed for open_mount() in
+ *    inotify pre-dump
+ *    net ones are needed for parasite socket
+ */
+
+/*
+ * Generate the net and mount namespace IDs for @item on pre-dump.
+ * The net namespace object is remembered in dmpi(item)->netns.
+ *
+ * Returns 0 on success, -1 if either ID could not be generated.
+ */
+int predump_task_ns_ids(struct pstree_item *item)
+{
+	int pid = item->pid.real;
+
+	if (!__get_ns_id(pid, &net_ns_desc, &dmpi(item)->netns) ||
+	    !get_ns_id(pid, &mnt_ns_desc))
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Fill item->ids with the IDs of all namespaces the task lives in
+ * (pid, net, ipc, uts, mnt, user -- in that order). The net namespace
+ * object is also remembered in dmpi(item)->netns.
+ *
+ * Returns 0 on success, -1 as soon as one ID can't be generated.
+ */
+int dump_task_ns_ids(struct pstree_item *item)
+{
+	int pid = item->pid.real;
+	TaskKobjIdsEntry *ids = item->ids;
+
+/* Mark the field present, compute it and bail out on a zero ID */
+#define FILL_NS_ID(fld, expr, tag)				\
+	do {							\
+		ids->has_##fld = true;				\
+		ids->fld = (expr);				\
+		if (!ids->fld) {				\
+			pr_err("Can't make " tag " id\n");	\
+			return -1;				\
+		}						\
+	} while (0)
+
+	FILL_NS_ID(pid_ns_id, get_ns_id(pid, &pid_ns_desc), "pidns");
+	FILL_NS_ID(net_ns_id, __get_ns_id(pid, &net_ns_desc, &dmpi(item)->netns), "netns");
+	FILL_NS_ID(ipc_ns_id, get_ns_id(pid, &ipc_ns_desc), "ipcns");
+	FILL_NS_ID(uts_ns_id, get_ns_id(pid, &uts_ns_desc), "utsns");
+	FILL_NS_ID(mnt_ns_id, get_ns_id(pid, &mnt_ns_desc), "mntns");
+	FILL_NS_ID(user_ns_id, get_ns_id(pid, &user_ns_desc), "userns");
+
+#undef FILL_NS_ID
+
+	return 0;
+}
+
+/* uid/gid maps of the dumped user namespace; filled by dump_user_ns(), read by userns_uid()/userns_gid() */
+static UsernsEntry userns_entry = USERNS_ENTRY__INIT;
+
+/*
+ * Translate @id through the @n extents of @map: an id that falls into
+ * [lower_first, lower_first + count) of an extent is mapped to
+ * first + (id - lower_first).
+ *
+ * When the root task has no user namespace of its own (CLONE_NEWUSER is
+ * not in root_ns_mask) the id is returned untouched. Returns -1 if no
+ * extent covers @id.
+ */
+static int userns_id(int id, UidGidExtent **map, int n)
+{
+	int idx = 0;
+
+	if (!(root_ns_mask & CLONE_NEWUSER))
+		return id;
+
+	while (idx < n) {
+		UidGidExtent *ext = map[idx++];
+
+		if (ext->lower_first > id)
+			continue;
+		if (ext->lower_first + ext->count <= id)
+			continue;
+
+		return ext->first + (id - ext->lower_first);
+	}
+
+	return -1;
+}
+
+/* Map @uid through the dumped user namespace's uid_map (see userns_id()) */
+int userns_uid(int uid)
+{
+	return userns_id(uid, userns_entry.uid_map, userns_entry.n_uid_map);
+}
+
+/* Map @gid through the dumped user namespace's gid_map (see userns_id()) */
+int userns_gid(int gid)
+{
+	return userns_id(gid, userns_entry.gid_map, userns_entry.n_gid_map);
+}
+
+/*
+ * Parse /proc/<pid>/<name> (a uid_map or gid_map file) into extents.
+ *
+ * On success *pb_exts points to a freshly allocated array of pointers,
+ * all of which point into ONE contiguous allocation of extents. The
+ * caller releases the result with xfree((*pb_exts)[0]) followed by
+ * xfree(*pb_exts). If the file holds no extents, *pb_exts is set to NULL.
+ *
+ * Returns the number of extents parsed, or -1 on error.
+ */
+static int parse_id_map(pid_t pid, char *name, UidGidExtent ***pb_exts)
+{
+	UidGidExtent *extents = NULL;
+	int len = 0, size = 0, ret, i;
+	FILE *f;
+
+	f = fopen_proc(pid, "%s", name);
+	if (f == NULL)
+		return -1;
+
+	while (1) {
+		UidGidExtent *ext;
+
+		/* Grow the array geometrically when it is full */
+		if (len == size) {
+			UidGidExtent *t;
+
+			size = size * 2 + 1;
+			t = xrealloc(extents, size * sizeof(UidGidExtent));
+			if (t == NULL) {
+				ret = -1;
+				break;
+			}
+			extents = t;
+		}
+
+		ext = &extents[len];
+
+		uid_gid_extent__init(ext);
+
+		/*
+		 * fscanf() doesn't touch errno on plain EOF, so clear it
+		 * first: otherwise a stale value left by an earlier call
+		 * would turn the normal end of file into a parse error.
+		 * The fields are unsigned 32-bit, hence %u.
+		 */
+		errno = 0;
+		ret = fscanf(f, "%u %u %u", &ext->first,
+				&ext->lower_first, &ext->count);
+		if (ret != 3) {
+			if (errno != 0) {
+				pr_perror("Unable to parse extents");
+				ret = -1;
+			} else
+				ret = 0;
+			break;
+		}
+		pr_info("id_map: %u %u %u\n", ext->first, ext->lower_first, ext->count);
+		len++;
+	}
+
+	fclose(f);
+
+	if (ret)
+		goto err;
+
+	if (len) {
+		*pb_exts = xmalloc(sizeof(UidGidExtent *) * len);
+		if (*pb_exts == NULL)
+			goto err;
+
+		for (i = 0; i < len; i++)
+			(*pb_exts)[i] = &extents[i];
+	} else {
+		xfree(extents);
+		*pb_exts = NULL;
+	}
+
+	return len;
+err:
+	xfree(extents);
+	return -1;
+}
+
+/*
+ * walk_namespaces() callback that dumps the root task's user namespace.
+ * Neither @ns nor @oarg is used.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int collect_user_ns(struct ns_id *ns, void *oarg)
+{
+	/*
+	 * User namespace is dumped before files to get uid and gid
+	 * mappings, which are used for converting local id-s to
+	 * userns id-s (userns_uid(), userns_gid())
+	 */
+	return dump_user_ns(root_item->pid.real, root_item->ids->user_ns_id) ? -1 : 0;
+}
+
+/*
+ * Collect (i.e. dump) user namespaces. Only done on dump, and only when
+ * the root task lives in a user namespace of its own.
+ *
+ * Returns 0 if there is nothing to do, otherwise the walk_namespaces()
+ * result.
+ */
+int collect_user_namespaces(bool for_dump)
+{
+	if (!for_dump)
+		return 0;
+
+	if (!(root_ns_mask & CLONE_NEWUSER))
+		return 0;
+
+	/* Walk the user namespaces themselves, not the net ones */
+	return walk_namespaces(&user_ns_desc, collect_user_ns, NULL);
+}
+
+/*
+ * Verify, in a forked child, that after entering the user namespace of
+ * task @pid we can also enter every other namespace of that task which
+ * is listed in root_ns_mask (net, uts, ipc, mnt).
+ *
+ * Returns 0 if the child exited with status 0, -1 otherwise.
+ */
+static int check_user_ns(int pid)
+{
+	int status;
+	pid_t chld;
+
+	chld = fork();
+	if (chld == -1) {
+		pr_perror("Unable to fork a process");
+		return -1;
+	}
+
+	if (chld == 0) {
+		/*
+		 * Check that we are able to enter into other namespaces
+		 * from the target userns namespace. This signs that these
+		 * namespaces were created from the target userns.
+		 */
+
+		if (switch_ns(pid, &user_ns_desc, NULL))
+			exit(-1);
+
+		if ((root_ns_mask & CLONE_NEWNET) &&
+		    switch_ns(pid, &net_ns_desc, NULL))
+			exit(-1);
+		if ((root_ns_mask & CLONE_NEWUTS) &&
+		    switch_ns(pid, &uts_ns_desc, NULL))
+			exit(-1);
+		if ((root_ns_mask & CLONE_NEWIPC) &&
+		    switch_ns(pid, &ipc_ns_desc, NULL))
+			exit(-1);
+		if ((root_ns_mask & CLONE_NEWNS) &&
+		    switch_ns(pid, &mnt_ns_desc, NULL))
+			exit(-1);
+		exit(0);
+	}
+
+	if (waitpid(chld, &status, 0) != chld) {
+		/* Report the child we failed to reap, not the target task */
+		pr_perror("Unable to wait the %d process", chld);
+		return -1;
+	}
+
+	if (status) {
+		pr_err("One or more namespaces doesn't belong to the target user namespace\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Dump the user namespace of task @pid into the CR_FD_USERNS image
+ * identified by @ns_id.
+ *
+ * The uid and gid maps are parsed into the file-scope userns_entry and
+ * stay there on success (they are used by userns_uid()/userns_gid() and
+ * released by free_userns_maps()). On failure everything parsed so far
+ * is released and the entry is reset, so that a later
+ * free_userns_maps() doesn't free the same memory again.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int dump_user_ns(pid_t pid, int ns_id)
+{
+	UsernsEntry *e = &userns_entry;
+	struct cr_img *img;
+	int ret;
+
+	if (check_user_ns(pid))
+		return -1;
+
+	ret = parse_id_map(pid, "uid_map", &e->uid_map);
+	if (ret < 0)
+		goto err;
+	e->n_uid_map = ret;
+
+	ret = parse_id_map(pid, "gid_map", &e->gid_map);
+	if (ret < 0)
+		goto err;
+	e->n_gid_map = ret;
+
+	img = open_image(CR_FD_USERNS, O_DUMP, ns_id);
+	if (!img)
+		goto err;
+	ret = pb_write_one(img, e, PB_USERNS);
+	close_image(img);
+	if (ret < 0)
+		goto err;
+
+	return 0;
+err:
+	/* All extents of a map live in one allocation, reachable via [0] */
+	if (e->uid_map) {
+		xfree(e->uid_map[0]);
+		xfree(e->uid_map);
+		e->uid_map = NULL;
+	}
+	e->n_uid_map = 0;
+
+	if (e->gid_map) {
+		xfree(e->gid_map[0]);
+		xfree(e->gid_map);
+		e->gid_map = NULL;
+	}
+	e->n_gid_map = 0;
+
+	return -1;
+}
+
+/*
+ * Release the uid/gid maps kept in userns_entry.
+ *
+ * Each map is an array of pointers into a single allocation of extents,
+ * hence element [0] is freed before the array itself. The entry is reset
+ * afterwards, so calling this more than once is harmless.
+ */
+void free_userns_maps()
+{
+	if (userns_entry.n_uid_map > 0) {
+		xfree(userns_entry.uid_map[0]);
+		xfree(userns_entry.uid_map);
+		userns_entry.uid_map = NULL;
+		userns_entry.n_uid_map = 0;
+	}
+	if (userns_entry.n_gid_map > 0) {
+		xfree(userns_entry.gid_map[0]);
+		xfree(userns_entry.gid_map);
+		userns_entry.gid_map = NULL;
+		userns_entry.n_gid_map = 0;
+	}
+}
+
+/*
+ * Switch into namespace @ns (via the task ns->ns_pid) and dump its
+ * contents. Only UTS, IPC and NET namespaces are handled here; any other
+ * kind is reported as an error.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+static int do_dump_namespaces(struct ns_id *ns)
+{
+	int ret;
+
+	ret = switch_ns(ns->ns_pid, ns->nd, NULL);
+	if (ret)
+		return ret;
+
+	switch (ns->nd->cflag) {
+	case CLONE_NEWUTS:
+		pr_info("Dump UTS namespace %d via %d\n",
+				ns->id, ns->ns_pid);
+		ret = dump_uts_ns(ns->id);
+		break;
+	case CLONE_NEWIPC:
+		pr_info("Dump IPC namespace %d via %d\n",
+				ns->id, ns->ns_pid);
+		ret = dump_ipc_ns(ns->id);
+		break;
+	case CLONE_NEWNET:
+		pr_info("Dump NET namespace info %d via %d\n",
+				ns->id, ns->ns_pid);
+		ret = dump_net_ns(ns->id);
+		break;
+	default:
+		pr_err("Unknown namespace flag %x\n", ns->nd->cflag);
+		/* Nothing was dumped: don't report success */
+		ret = -1;
+		break;
+	}
+
+	return ret;
+}
+
+/*
+ * Dump the contents of all collected namespaces, one forked helper per
+ * namespace, then wait for every helper.
+ *
+ * CRIU's own namespaces are skipped, as are pid, mount and user
+ * namespaces (see the comments in the loop). Dumping a pid namespace is
+ * refused unless @item is its init (virtual pid 1).
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int dump_namespaces(struct pstree_item *item, unsigned int ns_flags)
+{
+	struct pid *ns_pid = &item->pid;
+	struct ns_id *ns;
+	int pid, nr = 0;
+	int ret = 0;
+
+	/*
+	 * The setns syscall is cool, we can switch to the other
+	 * namespace and then return back to our initial one, but
+	 * for me it's much easier just to fork another task and
+	 * let it do the job, all the more so it can be done in
+	 * parallel with task dumping routine.
+	 *
+	 * However, the question how to dump sockets from the target
+	 * net namespace with this is still open
+	 */
+
+	pr_info("Dumping %d(%d)'s namespaces\n", ns_pid->virt, ns_pid->real);
+
+	if ((ns_flags & CLONE_NEWPID) && ns_pid->virt != 1) {
+		pr_err("Can't dump a pid namespace without the process init\n");
+		return -1;
+	}
+
+	for (ns = ns_ids; ns; ns = ns->next) {
+		unsigned int kind;
+
+		/* Skip current namespaces, which are in the list too */
+		if (ns->type == NS_CRIU)
+			continue;
+
+		/*
+		 * - no data for pid namespaces to dump
+		 * - mount namespaces are dumped explicitly with
+		 *   dump_mnt_namespaces()
+		 * - userns is dumped before dumping tasks
+		 */
+		kind = ns->nd->cflag;
+		if (kind == CLONE_NEWPID || kind == CLONE_NEWNS ||
+		    kind == CLONE_NEWUSER)
+			continue;
+
+		pid = fork();
+		if (pid < 0) {
+			pr_perror("Can't fork ns dumper");
+			return -1;
+		}
+
+		if (pid == 0)
+			exit(do_dump_namespaces(ns));
+
+		nr++;
+	}
+
+	for (; nr > 0; nr--) {
+		int status;
+
+		ret = waitpid(-1, &status, 0);
+		if (ret < 0) {
+			pr_perror("Can't wait ns dumper");
+			return -1;
+		}
+
+		if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
+			pr_err("Namespaces dumping finished with error %d\n", status);
+			return -1;
+		}
+	}
+
+	pr_info("Namespaces dump complete\n");
+	return 0;
+}
+
+/*
+ * Write @n extents into /proc/<pid>/<id_map> (uid_map or gid_map).
+ *
+ * All records are formatted into one buffer and submitted with a single
+ * write(). If the records don't fit into the buffer nothing is written.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int write_id_map(pid_t pid, UidGidExtent **extents, int n, char *id_map)
+{
+	char buf[PAGE_SIZE];
+	int off = 0, i;
+	int fd;
+
+	/*
+	 * We can perform only a single write (that may contain multiple
+	 * newline-delimited records) to a uid_map and a gid_map files.
+	 */
+	for (i = 0; i < n; i++) {
+		int room = sizeof(buf) - off;
+		int len;
+
+		len = snprintf(buf + off, room, "%u %u %u\n",
+				extents[i]->first,
+				extents[i]->lower_first,
+				extents[i]->count);
+		/*
+		 * snprintf() returns the length it WANTED to write. Adding
+		 * that blindly on truncation would push off past the buffer
+		 * and wrap the remaining-size computation.
+		 */
+		if (len < 0 || len >= room) {
+			pr_err("Too many extents for %s\n", id_map);
+			return -1;
+		}
+		off += len;
+	}
+
+	fd = open_proc_rw(pid, "%s", id_map);
+	if (fd < 0)
+		return -1;
+	if (write(fd, buf, off) != off) {
+		pr_perror("Unable to write into %s", id_map);
+		close(fd);
+		return -1;
+	}
+	close(fd);
+
+	return 0;
+}
+
+/* One usernsd request or response: message header, its iovecs and the control-message buffer */
+struct unsc_msg {
+ struct msghdr h;
+ /*
+ * 0th is the call address
+ * 1st is the flags
+ * 2nd is the optional (NULL in response) arguments
+ */
+ struct iovec iov[3];
+ /* room for one SCM_CREDENTIALS and one SCM_RIGHTS (single fd) control message */
+ char c[CMSG_SPACE(sizeof(struct ucred)) + CMSG_SPACE(sizeof(int))];
+};
+
+/* Pid of the forked usernsd daemon; 0 means no daemon and __userns_call() runs calls directly */
+static int usernsd_pid;
+
+/*
+ * Prepare @m for sendmsg()/recvmsg().
+ *
+ * @c:     call address, goes into iov[0]
+ * @x:     int payload (flags in a request, result in a response), iov[1]
+ * @arg:   optional argument buffer of @asize bytes, iov[2]; skipped if NULL
+ * @fd:    if >= 0, an SCM_RIGHTS control message carrying it is added
+ *
+ * An SCM_CREDENTIALS control message with the current pid/uid/gid is
+ * always set up.
+ */
+static inline void unsc_msg_init(struct unsc_msg *m, uns_call_t *c,
+ int *x, void *arg, size_t asize, int fd)
+{
+ struct cmsghdr *ch;
+ struct ucred *ucred;
+
+ m->h.msg_iov = m->iov;
+ m->h.msg_iovlen = 2;
+
+ m->iov[0].iov_base = c;
+ m->iov[0].iov_len = sizeof(*c);
+ m->iov[1].iov_base = x;
+ m->iov[1].iov_len = sizeof(*x);
+
+ if (arg) {
+ m->iov[2].iov_base = arg;
+ m->iov[2].iov_len = asize;
+ m->h.msg_iovlen++;
+ }
+
+ m->h.msg_name = NULL;
+ m->h.msg_namelen = 0;
+ m->h.msg_flags = 0;
+
+ m->h.msg_control = &m->c;
+
+ /* Need to memzero because of:
+ * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=514917
+ */
+ memzero(&m->c, sizeof(m->c));
+
+ /* Start with room for the credentials only; grown below if an fd is attached */
+ m->h.msg_controllen = CMSG_SPACE(sizeof(struct ucred));
+
+ ch = CMSG_FIRSTHDR(&m->h);
+ ch->cmsg_len = CMSG_LEN(sizeof(struct ucred));
+ ch->cmsg_level = SOL_SOCKET;
+ ch->cmsg_type = SCM_CREDENTIALS;
+
+ ucred = (struct ucred *) CMSG_DATA(ch);
+ ucred->pid = getpid();
+ ucred->uid = getuid();
+ ucred->gid = getgid();
+
+ if (fd >= 0) {
+ /* msg_controllen must be extended first, or CMSG_NXTHDR() finds no room */
+ m->h.msg_controllen += CMSG_SPACE(sizeof(int));
+ ch = CMSG_NXTHDR(&m->h, ch);
+ BUG_ON(!ch);
+ ch->cmsg_len = CMSG_LEN(sizeof(int));
+ ch->cmsg_level = SOL_SOCKET;
+ ch->cmsg_type = SCM_RIGHTS;
+ *((int *)CMSG_DATA(ch)) = fd;
+ }
+}
+
+/*
+ * Decode the control messages of a received @um.
+ *
+ * The first one must be SCM_CREDENTIALS; the sender's pid is stored in
+ * @pid when that is not NULL. If a second control message carrying a
+ * single int follows, it must be SCM_RIGHTS and its descriptor is
+ * stored in @fd; otherwise @fd is set to -1.
+ */
+static void unsc_msg_pid_fd(struct unsc_msg *um, pid_t *pid, int *fd)
+{
+	struct cmsghdr *cm = CMSG_FIRSTHDR(&um->h);
+
+	BUG_ON(!cm);
+	BUG_ON(cm->cmsg_len != CMSG_LEN(sizeof(struct ucred)));
+	BUG_ON(cm->cmsg_level != SOL_SOCKET);
+	BUG_ON(cm->cmsg_type != SCM_CREDENTIALS);
+
+	if (pid)
+		*pid = ((struct ucred *) CMSG_DATA(cm))->pid;
+
+	cm = CMSG_NXTHDR(&um->h, cm);
+	if (!cm || cm->cmsg_len != CMSG_LEN(sizeof(int))) {
+		*fd = -1;
+		return;
+	}
+
+	BUG_ON(cm->cmsg_level != SOL_SOCKET);
+	BUG_ON(cm->cmsg_type != SCM_RIGHTS);
+	*fd = *((int *)CMSG_DATA(cm));
+}
+
+/*
+ * Main loop of the usernsd daemon: receive a request from @sk, run the
+ * requested routine and, unless the request is asynchronous, send the
+ * result back.
+ *
+ * Returns -1 on a socket error or a failed asynchronous call; it does
+ * not return otherwise.
+ */
+static int usernsd(int sk)
+{
+ pr_info("uns: Daemon started\n");
+
+ while (1) {
+ struct unsc_msg um;
+ static char msg[MAX_UNSFD_MSG_SIZE];
+ uns_call_t call;
+ int flags, fd, ret;
+ pid_t pid;
+
+ /* fd 0 here only makes unsc_msg_init() reserve control space for an incoming descriptor */
+ unsc_msg_init(&um, &call, &flags, msg, sizeof(msg), 0);
+ if (recvmsg(sk, &um.h, 0) <= 0) {
+ pr_perror("uns: recv req error");
+ return -1;
+ }
+
+ unsc_msg_pid_fd(&um, &pid, &fd);
+ pr_debug("uns: daemon calls %p (%d, %d, %x)\n", call, pid, fd, flags);
+
+ BUG_ON(fd < 0 && flags & UNS_FDOUT);
+
+ /*
+ * Caller has sent us bare address of the routine it
+ * wants to call. Since the caller is fork()-ed from the
+ * same process as the daemon is, the latter has exactly
+ * the same code at exactly the same address as the
+ * former guy has. So go ahead and just call one!
+ */
+
+ ret = call(msg, fd, pid);
+
+ if (fd >= 0)
+ close(fd);
+
+ if (flags & UNS_ASYNC) {
+ /*
+ * Async call failed and the caller doesn't know
+ * about it. Exit now and let the stop_usernsd()
+ * check the exit code and abort the restoration.
+ *
+ * We'd get there either by the end of restore or
+ * from the next userns_call() due to failed
+ * sendmsg() in there.
+ */
+ if (ret < 0) {
+ pr_err("uns: Async call failed. Exiting\n");
+ return -1;
+ }
+
+ continue;
+ }
+
+ /* With UNS_FDOUT the routine's return value is a descriptor to pass back */
+ if (flags & UNS_FDOUT)
+ fd = ret;
+ else
+ fd = -1;
+
+ unsc_msg_init(&um, &call, &ret, NULL, 0, fd);
+ if (sendmsg(sk, &um.h, 0) <= 0) {
+ pr_perror("uns: send resp error");
+ return -1;
+ }
+
+ if (fd >= 0)
+ close(fd);
+ }
+}
+
+/*
+ * Run @call through the usernsd daemon, or directly when no daemon is
+ * running (usernsd_pid is 0).
+ *
+ * @func_name: name of the routine, used for logging only
+ * @flags:     UNS_ASYNC -- don't wait for the result;
+ *             UNS_FDOUT -- the result is a file descriptor
+ * @arg:       argument buffer of @arg_size bytes (at most MAX_UNSFD_MSG_SIZE)
+ * @fd:        descriptor passed along with the request
+ *
+ * Returns the routine's result (or the received descriptor with
+ * UNS_FDOUT), 0 for a queued asynchronous call, -1 on error.
+ */
+int __userns_call(const char *func_name, uns_call_t call, int flags,
+ void *arg, size_t arg_size, int fd)
+{
+ int ret, res, sk;
+ bool async = flags & UNS_ASYNC;
+ struct unsc_msg um;
+
+ if (unlikely(arg_size > MAX_UNSFD_MSG_SIZE)) {
+ pr_err("uns: message size exceeded\n");
+ return -1;
+ }
+
+ if (!usernsd_pid)
+ return call(arg, fd, getpid());
+
+ sk = get_service_fd(USERNSD_SK);
+ pr_debug("uns: calling %s (%d, %x)\n", func_name, fd, flags);
+
+ if (!async)
+ /*
+ * Why don't we lock for async requests? Because
+ * they just put the request in the daemon's
+ * queue and do not wait for the response. Thus
+ * when the daemon responds there's only one client
+ * waiting for it in recvmsg below, so it
+ * responds to the proper caller.
+ */
+ mutex_lock(&task_entries->userns_sync_lock);
+ else
+ /*
+ * If we want the callback to give us an FD then
+ * we should NOT do the asynchronous call.
+ */
+ BUG_ON(flags & UNS_FDOUT);
+
+ /* Send the request */
+
+ unsc_msg_init(&um, &call, &flags, arg, arg_size, fd);
+ ret = sendmsg(sk, &um.h, 0);
+ if (ret <= 0) {
+ pr_perror("uns: send req error");
+ ret = -1;
+ goto out;
+ }
+
+ if (async) {
+ ret = 0;
+ goto out;
+ }
+
+ /* Get the response back */
+
+ unsc_msg_init(&um, &call, &res, NULL, 0, 0);
+ ret = recvmsg(sk, &um.h, 0);
+ if (ret <= 0) {
+ pr_perror("uns: recv resp error");
+ ret = -1;
+ goto out;
+ }
+
+ /* Decode the result and return */
+
+ if (flags & UNS_FDOUT)
+ unsc_msg_pid_fd(&um, NULL, &ret);
+ else
+ ret = res;
+out:
+ if (!async)
+ mutex_unlock(&task_entries->userns_sync_lock);
+
+ return ret;
+}
+
+/*
+ * Fork the usernsd daemon and install our end of its socket as the
+ * USERNSD_SK service descriptor. Does nothing when the root task has no
+ * user namespace of its own.
+ *
+ * On any failure both sockets are closed and usernsd_pid is left at 0,
+ * so that __userns_call() and stop_usernsd() don't try to talk to a
+ * daemon that isn't there.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int start_usernsd(void)
+{
+	int sk[2];
+	int one = 1;
+
+	if (!(root_ns_mask & CLONE_NEWUSER))
+		return 0;
+
+	/*
+	 * Seqpacket to
+	 *
+	 * a) Help daemon distinguish individual requests from
+	 *    each other easily. Stream socket require manual
+	 *    messages boundaries.
+	 *
+	 * b) Make callers note the daemon death by seeing the
+	 *    disconnected socket. In case of dgram socket
+	 *    callers would just get stuck in receiving the
+	 *    response.
+	 */
+
+	if (socketpair(PF_UNIX, SOCK_SEQPACKET, 0, sk)) {
+		pr_perror("Can't make usernsd socket");
+		return -1;
+	}
+
+	if (setsockopt(sk[0], SOL_SOCKET, SO_PASSCRED, &one, sizeof(one)) < 0 ||
+	    setsockopt(sk[1], SOL_SOCKET, SO_PASSCRED, &one, sizeof(one)) < 0) {
+		pr_perror("failed to setsockopt");
+		goto err_close;
+	}
+
+	usernsd_pid = fork();
+	if (usernsd_pid < 0) {
+		pr_perror("Can't fork usernsd");
+		usernsd_pid = 0;
+		goto err_close;
+	}
+
+	if (usernsd_pid == 0) {
+		int ret;
+
+		close(sk[0]);
+		ret = usernsd(sk[1]);
+		exit(ret);
+	}
+
+	close(sk[1]);
+	if (install_service_fd(USERNSD_SK, sk[0]) < 0) {
+		kill(usernsd_pid, SIGKILL);
+		waitpid(usernsd_pid, NULL, 0);
+		usernsd_pid = 0;
+		close(sk[0]);
+		return -1;
+	}
+
+	close(sk[0]);
+	return 0;
+
+err_close:
+	close(sk[0]);
+	close(sk[1]);
+	return -1;
+}
+
+/* usernsd routine: terminate the daemon with the exit code passed in @arg (never returns) */
+static int exit_usernsd(void *arg, int fd, pid_t pid)
+{
+ int code = *(int *)arg;
+ pr_info("uns: `- daemon exits w/ %d\n", code);
+ exit(code);
+}
+
+/*
+ * Ask the usernsd daemon (if any) to exit and collect its exit status.
+ *
+ * Returns 0 if there was no daemon or it exited with status 0, the
+ * daemon's non-zero exit status, or -1 if it didn't exit normally.
+ */
+int stop_usernsd(void)
+{
+	sigset_t blockmask, oldmask;
+	int status = -1;
+	int ret = 0;
+
+	if (!usernsd_pid)
+		return ret;
+
+	/*
+	 * Don't let the sigchld_handler() mess with us
+	 * calling waitpid() on the exited daemon. The
+	 * same is done in cr_system().
+	 */
+
+	sigemptyset(&blockmask);
+	sigaddset(&blockmask, SIGCHLD);
+	sigprocmask(SIG_BLOCK, &blockmask, &oldmask);
+
+	/*
+	 * Send a message to make sure the daemon _has_
+	 * processed its whole queue of asynchronous requests.
+	 *
+	 * All the restoring processes might have already
+	 * closed their USERNSD_SK descriptors, but daemon
+	 * still has its in connected state -- this is us
+	 * who hold the last reference on the peer.
+	 *
+	 * If daemon has exited "in advance" due to async
+	 * call or socket error, the userns_call() and the
+	 * waitpid() below would both fail and we'll see
+	 * bad exit status.
+	 */
+
+	userns_call(exit_usernsd, UNS_ASYNC, &ret, sizeof(ret), -1);
+	waitpid(usernsd_pid, &status, 0);
+
+	ret = WIFEXITED(status) ? WEXITSTATUS(status) : -1;
+
+	usernsd_pid = 0;
+	sigprocmask(SIG_SETMASK, &oldmask, NULL);
+
+	if (ret != 0)
+		pr_err("uns: daemon exited abnormally\n");
+	else
+		pr_info("uns: daemon stopped\n");
+
+	return ret;
+}
+
+/*
+ * Restore the user namespace mappings of @item: read the UsernsEntry
+ * from the CR_FD_USERNS image of its user namespace and write the uid
+ * and gid maps (in that order) for the task's real pid.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int prepare_userns(struct pstree_item *item)
+{
+	UsernsEntry *e;
+	struct cr_img *img;
+	int ret;
+
+	img = open_image(CR_FD_USERNS, O_RSTR, item->ids->user_ns_id);
+	if (img == NULL)
+		return -1;
+
+	ret = pb_read_one(img, &e, PB_USERNS);
+	close_image(img);
+	if (ret < 0)
+		return -1;
+
+	if (write_id_map(item->pid.real, e->uid_map, e->n_uid_map, "uid_map") ||
+	    write_id_map(item->pid.real, e->gid_map, e->n_gid_map, "gid_map"))
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Collect user, mount and net namespaces, in that order, stopping at the
+ * first collector that fails.
+ *
+ * Returns 0 on success or the failing collector's negative result.
+ */
+int collect_namespaces(bool for_dump)
+{
+	int ret = collect_user_namespaces(for_dump);
+
+	if (ret >= 0)
+		ret = collect_mnt_namespaces(for_dump);
+	if (ret >= 0)
+		ret = collect_net_namespaces(for_dump);
+
+	return ret < 0 ? ret : 0;
+}
+
+/*
+ * Become uid 0 / gid 0 with no supplementary groups and make the task
+ * dumpable again.
+ *
+ * Returns 0 on success, -1 if the ids can't be set. A failure to set
+ * the dumpable flag terminates the process with exit(1).
+ */
+static int prepare_userns_creds()
+{
+	int failed;
+
+	/* UID and GID must be set after restoring /proc/PID/{uid,gid}_maps */
+	failed = setuid(0) || setgid(0) || setgroups(0, NULL);
+	if (failed) {
+		pr_perror("Unable to initialize id-s");
+		return -1;
+	}
+
+	/*
+	 * This flag is dropped after entering userns, but is
+	 * required to access files in /proc, so put one here
+	 * temporarily. It will be set to proper value at the
+	 * very end.
+	 */
+	if (prctl(PR_SET_DUMPABLE, 1, 0) != 0) {
+		pr_perror("Unable to set PR_SET_DUMPABLE");
+		exit(1);
+	}
+
+	return 0;
+}
+
+/*
+ * Restore the namespaces selected by @clone_flags for task @item:
+ * userns credentials first, then net, uts and ipc, and finally the
+ * mount namespaces (always).
+ *
+ * Images are looked up by namespace ID when ns_per_id is set and by the
+ * task's virtual pid otherwise.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int prepare_namespace(struct pstree_item *item, unsigned long clone_flags)
+{
+	pid_t pid = item->pid.virt;
+	int id;
+
+	pr_info("Restoring namespaces %d flags 0x%lx\n",
+			item->pid.virt, clone_flags);
+
+	if (clone_flags & CLONE_NEWUSER) {
+		if (prepare_userns_creds())
+			return -1;
+	}
+
+	/*
+	 * On netns restore we launch an IP tool, thus we
+	 * have to restore it _before_ altering the mount
+	 * tree (i.e. -- mnt_ns restoring)
+	 */
+
+	if (ns_per_id)
+		id = item->ids->net_ns_id;
+	else
+		id = pid;
+	if ((clone_flags & CLONE_NEWNET) && prepare_net_ns(id))
+		return -1;
+
+	if (ns_per_id)
+		id = item->ids->uts_ns_id;
+	else
+		id = pid;
+	if ((clone_flags & CLONE_NEWUTS) && prepare_utsns(id))
+		return -1;
+
+	if (ns_per_id)
+		id = item->ids->ipc_ns_id;
+	else
+		id = pid;
+	if ((clone_flags & CLONE_NEWIPC) && prepare_ipc_ns(id))
+		return -1;
+
+	/*
+	 * This one is special -- there can be several mount
+	 * namespaces and prepare_mnt_ns handles them itself.
+	 */
+	return prepare_mnt_ns() ? -1 : 0;
+}
+
+/*
+ * Namespace-related preparations done before any task is restored:
+ * start usernsd, keep the netns descriptor, create mount namespace
+ * roots if needed and read the mount namespace images.
+ *
+ * On failure whatever was set up is undone: cleanup_mnt_ns() if reading
+ * the images failed, and stop_usernsd() for every step after the daemon
+ * start.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int prepare_namespace_before_tasks(void)
+{
+	if (start_usernsd())
+		return -1;
+
+	if (netns_keep_nsfd())
+		goto stop_daemon;
+
+	/*
+	 * Nothing to undo for netns from here on, its
+	 * descriptor will be closed on criu exit
+	 */
+	if (mntns_maybe_create_roots())
+		goto stop_daemon;
+
+	if (read_mnt_ns_img()) {
+		cleanup_mnt_ns();
+		goto stop_daemon;
+	}
+
+	return 0;
+
+stop_daemon:
+	stop_usernsd();
+	return -1;
+}
+
+/*
+ * Print the namespace images that belong to task @ns_pid: the net and
+ * ipc image sets, then the uts and mount images. Images that can't be
+ * opened are silently skipped.
+ *
+ * Returns 0 on success, -1 if the task's ids image can't be read.
+ */
+int try_show_namespaces(int ns_pid)
+{
+	TaskKobjIdsEntry *ids;
+	struct cr_imgset *imgset;
+	struct cr_img *img;
+	int type, ret;
+
+	pr_msg("Namespaces for %d:\n", ns_pid);
+
+	img = open_image(CR_FD_IDS, O_RSTR, ns_pid);
+	if (img == NULL)
+		return -1;
+	ret = pb_read_one(img, &ids, PB_IDS);
+	close_image(img);
+	if (ret < 0)
+		return -1;
+
+	imgset = cr_imgset_open(ids->net_ns_id, NETNS, O_SHOW);
+	if (imgset != NULL) {
+		pr_msg("-------------------NETNS---------------------\n");
+		type = _CR_FD_NETNS_FROM + 1;
+		while (type < _CR_FD_NETNS_TO) {
+			img = img_from_set(imgset, type);
+			if (img != NULL)
+				cr_parse_fd(img, imgset_template[type].magic);
+			type++;
+		}
+		close_cr_imgset(&imgset);
+	}
+
+	imgset = cr_imgset_open(ids->ipc_ns_id, IPCNS, O_SHOW);
+	if (imgset != NULL) {
+		pr_msg("-------------------IPCNS---------------------\n");
+		type = _CR_FD_IPCNS_FROM + 1;
+		while (type < _CR_FD_IPCNS_TO) {
+			img = img_from_set(imgset, type);
+			if (img != NULL)
+				cr_parse_fd(img, imgset_template[type].magic);
+			type++;
+		}
+		close_cr_imgset(&imgset);
+	}
+
+	img = open_image(CR_FD_UTSNS, O_SHOW, ids->uts_ns_id);
+	if (img != NULL) {
+		pr_msg("-------------------UTSNS---------------------\n");
+		cr_parse_fd(img, imgset_template[CR_FD_UTSNS].magic);
+		close_image(img);
+	}
+
+	img = open_image(CR_FD_MNTS, O_SHOW, ids->mnt_ns_id);
+	if (img != NULL) {
+		pr_msg("-------------------MNTNS---------------------\n");
+		cr_parse_fd(img, imgset_template[CR_FD_MNTS].magic);
+		close_image(img);
+	}
+
+	pr_msg("---[ end of %d namespaces ]---\n", ns_pid);
+	return 0;
+}
+
+/* Descriptors (clone flag + /proc/<pid>/ns/ link name) of the pid and user namespaces */
+struct ns_desc pid_ns_desc = NS_DESC_ENTRY(CLONE_NEWPID, "pid");
+struct ns_desc user_ns_desc = NS_DESC_ENTRY(CLONE_NEWUSER, "user");
diff --git a/criu/net.c b/criu/net.c
new file mode 100644
index 000000000000..9f62cd0c6ba2
--- /dev/null
+++ b/criu/net.c
@@ -0,0 +1,1429 @@
+#include <unistd.h>
+#include <sys/socket.h>
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nfnetlink_conntrack.h>
+#include <linux/netfilter/nf_conntrack_tcp.h>
+#include <string.h>
+#include <net/if_arp.h>
+#include <sys/wait.h>
+#include <sched.h>
+#include <sys/mount.h>
+#include <net/if.h>
+#include <linux/sockios.h>
+#include <libnl3/netlink/msg.h>
+
+#include "imgset.h"
+#include "namespaces.h"
+#include "net.h"
+#include "libnetlink.h"
+#include "cr_options.h"
+#include "sk-inet.h"
+#include "tun.h"
+#include "util-pie.h"
+#include "plugin.h"
+#include "action-scripts.h"
+#include "sockets.h"
+#include "pstree.h"
+#include "string.h"
+#include "sysctl.h"
+#include "kerndat.h"
+
+#include "protobuf.h"
+#include "protobuf/netdev.pb-c.h"
+
+/* Directory fd that read_ns_sys_file() and dump_bridge() hand to openat(); -1 until it is set up */
+static int ns_sysfs_fd = -1;
+
+/*
+ * Read the file @path, looked up relative to ns_sysfs_fd, into @buf.
+ *
+ * @len is the size of @buf and must be strictly larger than the file
+ * contents: a read that fills the whole buffer is treated as an error.
+ * The last byte read is replaced with '\0'; if nothing was read, @buf is
+ * set to the empty string.
+ *
+ * Returns the number of bytes read, or -1 on error.
+ */
+int read_ns_sys_file(char *path, char *buf, int len)
+{
+	int fd, rlen;
+
+	BUG_ON(ns_sysfs_fd == -1);
+
+	fd = openat(ns_sysfs_fd, path, O_RDONLY, 0);
+	if (fd < 0) {
+		pr_perror("Can't open ns' %s", path);
+		return -1;
+	}
+
+	rlen = read(fd, buf, len);
+	/* Report before close() gets a chance to clobber errno */
+	if (rlen < 0)
+		pr_perror("Can't read ns' %s", path);
+	close(fd);
+
+	if (rlen < 0)
+		return -1;
+
+	if (rlen == len) {
+		pr_err("Too small buffer to read ns sys file %s\n", path);
+		return -1;
+	}
+
+	if (rlen > 0)
+		buf[rlen - 1] = '\0';
+	else
+		/* rlen == 0 < len here, so there is room for the terminator */
+		buf[0] = '\0';
+
+	return rlen;
+}
+
+/*
+ * Names of the per-device net/ipv4/conf/<dev>/ sysctls handled by
+ * ipv4_conf_op(). Values are addressed by their index in this array.
+ */
+static char *devconfs[] = {
+ "accept_local",
+ "accept_redirects",
+ "accept_source_route",
+ "arp_accept",
+ "arp_announce",
+ "arp_filter",
+ "arp_ignore",
+ "arp_notify",
+ "bootp_relay",
+ "disable_policy",
+ "disable_xfrm",
+ "force_igmp_version",
+ "forwarding",
+ "igmpv2_unsolicited_report_interval",
+ "igmpv3_unsolicited_report_interval",
+ "log_martians",
+ "medium_id",
+ "promote_secondaries",
+ "proxy_arp",
+ "proxy_arp_pvlan",
+ "route_localnet",
+ "rp_filter",
+ "secure_redirects",
+ "send_redirects",
+ "shared_media",
+ "src_valid_mark",
+ "tag",
+ "ignore_routes_with_linkdown",
+};
+
+/*
+ * In case some entry is missing in
+ * the kernel, simply write DEVCONFS_UNUSED
+ * into the image so we would skip it.
+ */
+#define DEVCONFS_UNUSED (-1u)
+
+#define NET_CONF_PATH "net/ipv4/conf"
+/* Parenthesized so the macro stays one operand wherever it is expanded */
+#define MAX_CONF_OPT_PATH (IFNAMSIZ + 50)
+
+/*
+ * Read or write the net/ipv4/conf/<tgt>/ sysctls listed in devconfs[].
+ *
+ * @tgt:   name of the directory under net/ipv4/conf
+ * @conf:  array of @n values, indexed like devconfs[]
+ * @op:    CTL_READ or CTL_WRITE
+ * @netns: if not NULL, entries whose value equals (*netns)->def_conf[i]
+ *         are skipped
+ *
+ * devconfs[] entries at index >= @n are skipped with a warning. On write,
+ * entries holding DEVCONFS_UNUSED are skipped. On read every requested
+ * entry is preset to DEVCONFS_UNUSED and requested with
+ * CTL_FLAGS_OPTIONAL.
+ *
+ * Returns 0 on success, -1 if sysctl_op() fails.
+ */
+static int ipv4_conf_op(char *tgt, int *conf, int n, int op, NetnsEntry **netns)
+{
+ int i, ri;
+ int ret, flags = op == CTL_READ ? CTL_FLAGS_OPTIONAL : 0;
+ struct sysctl_req req[ARRAY_SIZE(devconfs)];
+ char path[ARRAY_SIZE(devconfs)][MAX_CONF_OPT_PATH];
+
+ if (n > ARRAY_SIZE(devconfs))
+ pr_warn("The image contains unknown sysctl-s\n");
+
+ /* i walks devconfs[], ri counts the requests actually queued */
+ for (i = 0, ri = 0; i < ARRAY_SIZE(devconfs); i++) {
+ if (i >= n) {
+ pr_warn("Skip %s/%s\n", tgt, devconfs[i]);
+ continue;
+ }
+ /*
+ * If dev conf value is the same as default skip restoring it
+ */
+ if (netns && conf[i] == (*netns)->def_conf[i]) {
+ pr_debug("DEBUG Skip %s/%s, val =%d\n", tgt, devconfs[i], conf[i]);
+ continue;
+ }
+
+ if (op == CTL_WRITE && conf[i] == DEVCONFS_UNUSED)
+ continue;
+ else if (op == CTL_READ)
+ conf[i] = DEVCONFS_UNUSED;
+
+ snprintf(path[i], MAX_CONF_OPT_PATH, "%s/%s/%s", NET_CONF_PATH, tgt, devconfs[i]);
+ req[ri].name = path[i];
+ req[ri].arg = &conf[i];
+ req[ri].type = CTL_32;
+ req[ri].flags = flags;
+ ri++;
+ }
+
+ ret = sysctl_op(req, ri, op, CLONE_NEWNET);
+ if (ret < 0) {
+ pr_err("Failed to %s %s/<confs>\n", (op == CTL_READ)?"read":"write", tgt);
+ return -1;
+ }
+ return 0;
+}
+
+/* Write @nde into the CR_FD_NETDEV image of the set @fds; returns the pb_write_one() result */
+int write_netdev_img(NetDeviceEntry *nde, struct cr_imgset *fds)
+{
+	struct cr_img *img = img_from_set(fds, CR_FD_NETDEV);
+
+	return pb_write_one(img, nde, PB_NETDEV);
+}
+
+/*
+ * Build a NetDeviceEntry for one link and write it out.
+ *
+ * @type: ND_TYPE__* value stored in the entry
+ * @ifi:  link info message (index and flags are taken from it)
+ * @tb:   parsed link attributes; IFLA_IFNAME and IFLA_MTU are required,
+ *        IFLA_ADDRESS is recorded for non-loopback links when present
+ * @dump: routine that writes the entry; write_netdev_img() if NULL
+ *
+ * The device's ipv4 conf values are read with ipv4_conf_op() and stored
+ * in the entry.
+ *
+ * Returns the @dump result, or -1 on failure.
+ */
+static int dump_one_netdev(int type, struct ifinfomsg *ifi,
+		struct rtattr **tb, struct cr_imgset *fds,
+		int (*dump)(NetDeviceEntry *, struct cr_imgset *))
+{
+	int ret;
+	NetDeviceEntry netdev = NET_DEVICE_ENTRY__INIT;
+
+	if (!tb[IFLA_IFNAME]) {
+		pr_err("No name for link %d\n", ifi->ifi_index);
+		return -1;
+	}
+
+	/* Don't dereference a missing attribute */
+	if (!tb[IFLA_MTU]) {
+		pr_err("No mtu for link %d\n", ifi->ifi_index);
+		return -1;
+	}
+
+	netdev.type = type;
+	netdev.ifindex = ifi->ifi_index;
+	netdev.mtu = *(int *)RTA_DATA(tb[IFLA_MTU]);
+	netdev.flags = ifi->ifi_flags;
+	netdev.name = RTA_DATA(tb[IFLA_IFNAME]);
+
+	if (tb[IFLA_ADDRESS] && (type != ND_TYPE__LOOPBACK)) {
+		netdev.has_address = true;
+		netdev.address.data = RTA_DATA(tb[IFLA_ADDRESS]);
+		netdev.address.len = RTA_PAYLOAD(tb[IFLA_ADDRESS]);
+		pr_info("Found ll addr (%02x:../%d) for %s\n",
+				(int)netdev.address.data[0],
+				(int)netdev.address.len, netdev.name);
+	}
+
+	netdev.n_conf = ARRAY_SIZE(devconfs);
+	netdev.conf = xmalloc(sizeof(int) * netdev.n_conf);
+	if (!netdev.conf)
+		return -1;
+
+	ret = ipv4_conf_op(netdev.name, netdev.conf, netdev.n_conf, CTL_READ, NULL);
+	if (ret < 0)
+		goto err_free;
+
+	if (!dump)
+		dump = write_netdev_img;
+
+	ret = dump(&netdev, fds);
+err_free:
+	xfree(netdev.conf);
+	return ret;
+}
+
+/*
+ * Return the IFLA_INFO_KIND string nested in the link's IFLA_LINKINFO
+ * attribute, or NULL (with an error logged) if either is missing.
+ */
+static char *link_kind(struct ifinfomsg *ifi, struct rtattr **tb)
+{
+	struct rtattr *info[IFLA_INFO_MAX + 1];
+	struct rtattr *nest = tb[IFLA_LINKINFO];
+
+	if (nest == NULL) {
+		pr_err("No linkinfo for eth link %d\n", ifi->ifi_index);
+		return NULL;
+	}
+
+	parse_rtattr_nested(info, IFLA_INFO_MAX, nest);
+	if (info[IFLA_INFO_KIND] == NULL) {
+		pr_err("No kind for eth link %d\n", ifi->ifi_index);
+		return NULL;
+	}
+
+	return RTA_DATA(info[IFLA_INFO_KIND]);
+}
+
+/*
+ * Offer a link of a kind we don't handle to the DUMP_EXT_LINK plugins.
+ * If a plugin accepts it (returns 0) the link is dumped as
+ * ND_TYPE__EXTLINK; any other plugin result is a failure, with -ENOTSUP
+ * additionally logged as an unsupported link.
+ *
+ * Returns the dump result, or -1 on failure.
+ */
+static int dump_unknown_device(struct ifinfomsg *ifi, char *kind,
+		struct rtattr **tb, struct cr_imgset *fds)
+{
+	int res = run_plugins(DUMP_EXT_LINK, ifi->ifi_index, ifi->ifi_type, kind);
+
+	switch (res) {
+	case 0:
+		return dump_one_netdev(ND_TYPE__EXTLINK, ifi, tb, fds, NULL);
+	case -ENOTSUP:
+		pr_err("Unsupported link %d (type %d kind %s)\n",
+				ifi->ifi_index, ifi->ifi_type, kind);
+		break;
+	default:
+		break;
+	}
+
+	return -1;
+}
+
+/*
+ * Dump a bridge device. Only bridges whose class/net/<name>/brif
+ * directory (looked up relative to ns_sysfs_fd) is empty are accepted.
+ *
+ * Returns the write_netdev_img() result, or -1 on failure.
+ */
+static int dump_bridge(NetDeviceEntry *nde, struct cr_imgset *imgset)
+{
+	char spath[IFNAMSIZ + 16]; /* len("class/net//brif") + 1 for null */
+	int dirfd, n, empty;
+
+	n = snprintf(spath, sizeof(spath), "class/net/%s/brif", nde->name);
+	if (n < 0 || n >= sizeof(spath))
+		return -1;
+
+	/* Let's only allow dumping empty bridges for now. To do a full bridge
+	 * restore, we need to make sure the bridge and slaves are restored in
+	 * the right order and attached correctly. It looks like the veth code
+	 * supports this, but we need some way to do ordering.
+	 */
+	dirfd = openat(ns_sysfs_fd, spath, O_DIRECTORY, 0);
+	if (dirfd < 0) {
+		pr_perror("opening %s failed", spath);
+		return -1;
+	}
+
+	empty = is_empty_dir(dirfd);
+	close(dirfd);
+
+	if (empty < 0) {
+		pr_perror("problem testing %s for emptiness", spath);
+		return -1;
+	}
+
+	if (empty == 0) {
+		pr_err("dumping bridges with attached slaves not supported currently\n");
+		return -1;
+	}
+
+	return write_netdev_img(nde, imgset);
+}
+
+/*
+ * Dump an ARPHRD_ETHER link according to its @kind: veth, tun and
+ * bridge are handled here, anything else goes to dump_unknown_device().
+ */
+static int dump_one_ethernet(struct ifinfomsg *ifi, char *kind,
+		struct rtattr **tb, struct cr_imgset *fds)
+{
+	if (strcmp(kind, "veth") == 0) {
+		/*
+		 * This is not correct. The peer of the veth device may
+		 * be either outside or inside the netns we're working
+		 * on, but there's currently no way of finding this out.
+		 *
+		 * Sigh... we have to assume, that the veth device is a
+		 * connection to the outer world and just dump this end :(
+		 */
+		return dump_one_netdev(ND_TYPE__VETH, ifi, tb, fds, NULL);
+	}
+
+	if (strcmp(kind, "tun") == 0)
+		return dump_one_netdev(ND_TYPE__TUN, ifi, tb, fds, dump_tun_link);
+
+	if (strcmp(kind, "bridge") == 0)
+		return dump_one_netdev(ND_TYPE__BRIDGE, ifi, tb, fds, dump_bridge);
+
+	return dump_unknown_device(ifi, kind, tb, fds);
+}
+
+/*
+ * Dump an ARPHRD_NONE link: tun devices are handled here, anything
+ * else goes to dump_unknown_device().
+ */
+static int dump_one_gendev(struct ifinfomsg *ifi, char *kind,
+		struct rtattr **tb, struct cr_imgset *fds)
+{
+	if (strcmp(kind, "tun") != 0)
+		return dump_unknown_device(ifi, kind, tb, fds);
+
+	return dump_one_netdev(ND_TYPE__TUN, ifi, tb, fds, dump_tun_link);
+}
+
+/*
+ * Dump an ARPHRD_VOID link: venet devices are handled here, anything
+ * else goes to dump_unknown_device().
+ */
+static int dump_one_voiddev(struct ifinfomsg *ifi, char *kind,
+		struct rtattr **tb, struct cr_imgset *fds)
+{
+	if (strcmp(kind, "venet") != 0)
+		return dump_unknown_device(ifi, kind, tb, fds);
+
+	return dump_one_netdev(ND_TYPE__VENET, ifi, tb, fds, NULL);
+}
+
+/*
+ * Netlink callback: dump the link described by message @hdr into the
+ * image set passed in @arg.
+ *
+ * Loopback links are dumped right away. Other links are dispatched on
+ * their hardware type (ether, none, void) together with their kind
+ * string; everything else goes to dump_unknown_device().
+ *
+ * Returns the result of the chosen dump routine, or -1 if the message
+ * carries no attributes.
+ */
+static int dump_one_link(struct nlmsghdr *hdr, void *arg)
+{
+ struct cr_imgset *fds = arg;
+ struct ifinfomsg *ifi;
+ int ret = 0, len = hdr->nlmsg_len - NLMSG_LENGTH(sizeof(*ifi));
+ struct rtattr *tb[IFLA_MAX + 1];
+ char *kind;
+
+ ifi = NLMSG_DATA(hdr);
+
+ if (len < 0) {
+ pr_err("No iflas for link %d\n", ifi->ifi_index);
+ return -1;
+ }
+
+ parse_rtattr(tb, IFLA_MAX, IFLA_RTA(ifi), len);
+ pr_info("\tLD: Got link %d, type %d\n", ifi->ifi_index, ifi->ifi_type);
+
+ if (ifi->ifi_type == ARPHRD_LOOPBACK)
+ return dump_one_netdev(ND_TYPE__LOOPBACK, ifi, tb, fds, NULL);
+
+ /* A link without a kind jumps straight to the default case below, with kind == NULL */
+ kind = link_kind(ifi, tb);
+ if (!kind)
+ goto unk;
+
+ switch (ifi->ifi_type) {
+ case ARPHRD_ETHER:
+ ret = dump_one_ethernet(ifi, kind, tb, fds);
+ break;
+ case ARPHRD_NONE:
+ ret = dump_one_gendev(ifi, kind, tb, fds);
+ break;
+ case ARPHRD_VOID:
+ ret = dump_one_voiddev(ifi, kind, tb, fds);
+ break;
+ default:
+unk:
+ ret = dump_unknown_device(ifi, kind, tb, fds);
+ break;
+ }
+
+ return ret;
+}
+
+ /*
+  * Netlink callback that appends one raw netfilter message to the image
+  * passed in @arg. A lazy image is opened first. Returns 0 or -1.
+  */
+ static int dump_one_nf(struct nlmsghdr *hdr, void *arg)
+ {
+ 	struct cr_img *image = arg;
+
+ 	if (lazy_image(image)) {
+ 		if (open_image_lazy(image))
+ 			return -1;
+ 	}
+
+ 	return write_img_buf(image, hdr, hdr->nlmsg_len) ? -1 : 0;
+ }
+
+ /*
+  * Patch a saved conntrack message before it is sent back to the kernel:
+  * for IPv4/IPv6 entries that carry TCP protocol info, set
+  * IP_CT_TCP_FLAG_BE_LIBERAL in both the flags and the mask of the
+  * original and the reply direction.
+  *
+  * Returns 0 when the message was patched or needs no patching, -1 when
+  * attribute parsing fails.
+  */
+ static int ct_restore_callback(struct nlmsghdr *nlh)
+ {
+ 	static const int flag_attrs[] = {
+ 		CTA_PROTOINFO_TCP_FLAGS_ORIGINAL,
+ 		CTA_PROTOINFO_TCP_FLAGS_REPLY,
+ 	};
+ 	struct nlattr *tb[CTA_MAX+1], *tbp[CTA_PROTOINFO_MAX + 1], *tb_tcp[CTA_PROTOINFO_TCP_MAX+1];
+ 	struct nfgenmsg *msg = NLMSG_DATA(nlh);
+ 	unsigned int i;
+
+ 	if (msg->nfgen_family != AF_INET && msg->nfgen_family != AF_INET6)
+ 		return 0;
+
+ 	if (nlmsg_parse(nlh, sizeof(struct nfgenmsg), tb, CTA_MAX, NULL) < 0)
+ 		return -1;
+ 	if (!tb[CTA_PROTOINFO])
+ 		return 0;
+
+ 	if (nla_parse_nested(tbp, CTA_PROTOINFO_MAX, tb[CTA_PROTOINFO], NULL) < 0)
+ 		return -1;
+ 	if (!tbp[CTA_PROTOINFO_TCP])
+ 		return 0;
+
+ 	if (nla_parse_nested(tb_tcp, CTA_PROTOINFO_TCP_MAX, tbp[CTA_PROTOINFO_TCP], NULL) < 0)
+ 		return -1;
+
+ 	/* Same treatment for both directions, original first */
+ 	for (i = 0; i < ARRAY_SIZE(flag_attrs); i++) {
+ 		struct nf_ct_tcp_flags *flags;
+
+ 		if (!tb_tcp[flag_attrs[i]])
+ 			continue;
+
+ 		flags = nla_data(tb_tcp[flag_attrs[i]]);
+ 		flags->flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
+ 		flags->mask |= IP_CT_TCP_FLAG_BE_LIBERAL;
+ 	}
+
+ 	return 0;
+ }
+
+ /*
+  * Replay the netfilter messages stored in image @type into the kernel
+  * over a NETLINK_NETFILTER socket.
+  *
+  * The image is a plain sequence of netlink messages. Each one is read
+  * header first, then payload; CR_FD_NETNF_CT messages are patched by
+  * ct_restore_callback(); every message is then sent with
+  * NLM_F_REQUEST|NLM_F_ACK|NLM_F_CREATE.
+  *
+  * Returns 0 on success or when the image is empty, -1 on any error.
+  */
+ static int restore_nf_ct(int pid, int type)
+ {
+ 	struct nlmsghdr *nlh = NULL;
+ 	int exit_code = -1, sk;
+ 	struct cr_img *img;
+
+ 	img = open_image(type, O_RSTR, pid);
+ 	/* Without an image there is nothing the read loop below could use */
+ 	if (img == NULL)
+ 		return -1;
+ 	if (empty_image(img)) {
+ 		close_image(img);
+ 		return 0;
+ 	}
+
+ 	sk = socket(AF_NETLINK, SOCK_RAW, NETLINK_NETFILTER);
+ 	if (sk < 0) {
+ 		pr_perror("Can't open rtnl sock for net dump");
+ 		goto out_img;
+ 	}
+
+ 	nlh = xmalloc(sizeof(struct nlmsghdr));
+ 	if (nlh == NULL)
+ 		goto out;
+
+ 	while (1) {
+ 		struct nlmsghdr *p;
+ 		int ret;
+
+ 		ret = read_img_buf_eof(img, nlh, sizeof(struct nlmsghdr));
+ 		if (ret < 0)
+ 			goto out;
+ 		if (ret == 0)
+ 			break;
+
+ 		/*
+ 		 * A length below the header size would shrink the buffer and
+ 		 * make the payload size below wrap around, so the payload
+ 		 * read would run past the allocation.
+ 		 */
+ 		if (nlh->nlmsg_len < sizeof(struct nlmsghdr)) {
+ 			pr_err("Bad netlink message length %u in the image\n",
+ 					nlh->nlmsg_len);
+ 			goto out;
+ 		}
+
+ 		/* Keep the old pointer until the resize is known to succeed */
+ 		p = xrealloc(nlh, nlh->nlmsg_len);
+ 		if (p == NULL)
+ 			goto out;
+ 		nlh = p;
+
+ 		ret = read_img_buf_eof(img, nlh + 1, nlh->nlmsg_len - sizeof(struct nlmsghdr));
+ 		if (ret < 0)
+ 			goto out;
+ 		if (ret == 0) {
+ 			pr_err("The image file was truncated\n");
+ 			goto out;
+ 		}
+
+ 		if (type == CR_FD_NETNF_CT)
+ 			if (ct_restore_callback(nlh))
+ 				goto out;
+
+ 		nlh->nlmsg_flags = NLM_F_REQUEST|NLM_F_ACK|NLM_F_CREATE;
+ 		ret = do_rtnl_req(sk, nlh, nlh->nlmsg_len, NULL, NULL, NULL);
+ 		if (ret)
+ 			goto out;
+ 	}
+
+ 	exit_code = 0;
+ out:
+ 	xfree(nlh);
+ 	close(sk);
+ out_img:
+ 	close_image(img);
+ 	return exit_code;
+ }
+
+ /*
+  * Dump netfilter connection tracking state into image @type of @fds.
+  *
+  * @type selects the request: CR_FD_NETNF_CT issues IPCTNL_MSG_CT_GET,
+  * CR_FD_NETNF_EXP issues IPCTNL_MSG_EXP_GET; any other value is a BUG().
+  * Every reply message is written to the image by dump_one_nf().
+  *
+  * Returns the do_rtnl_req() result, or the negative socket() result if
+  * the netlink socket can't be created.
+  */
+ static int dump_nf_ct(struct cr_imgset *fds, int type)
+ {
+ 	struct cr_img *img;
+ 	struct {
+ 		struct nlmsghdr nlh;
+ 		struct nfgenmsg g;
+ 	} req;
+ 	int sk, ret;
+
+ 	/* Used to say "links", which made this indistinguishable from dump_links() */
+ 	pr_info("Dumping netns conntrack (image type %d)\n", type);
+
+ 	ret = sk = socket(AF_NETLINK, SOCK_RAW, NETLINK_NETFILTER);
+ 	if (sk < 0) {
+ 		pr_perror("Can't open rtnl sock for net dump");
+ 		goto out;
+ 	}
+
+ 	memset(&req, 0, sizeof(req));
+ 	req.nlh.nlmsg_len = sizeof(req);
+ 	req.nlh.nlmsg_type = (NFNL_SUBSYS_CTNETLINK << 8);
+
+ 	if (type == CR_FD_NETNF_CT)
+ 		req.nlh.nlmsg_type |= IPCTNL_MSG_CT_GET;
+ 	else if (type == CR_FD_NETNF_EXP)
+ 		req.nlh.nlmsg_type |= IPCTNL_MSG_EXP_GET;
+ 	else
+ 		BUG();
+
+ 	req.nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST;
+ 	req.nlh.nlmsg_pid = 0;
+ 	req.nlh.nlmsg_seq = CR_NLMSG_SEQ;
+ 	req.g.nfgen_family = AF_UNSPEC;
+
+ 	img = img_from_set(fds, type);
+
+ 	ret = do_rtnl_req(sk, &req, sizeof(req), dump_one_nf, NULL, img);
+ 	close(sk);
+ out:
+ 	return ret;
+ }
+
+ /*
+  * Request the list of links over an rtnetlink socket and dump each
+  * reply through dump_one_link() into @fds.
+  *
+  * Returns the do_rtnl_req() result, or the negative socket() result if
+  * the socket can't be created.
+  */
+ static int dump_links(struct cr_imgset *fds)
+ {
+ 	struct {
+ 		struct nlmsghdr nlh;
+ 		struct rtgenmsg g;
+ 	} req;
+ 	int sk, ret;
+
+ 	pr_info("Dumping netns links\n");
+
+ 	sk = socket(PF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+ 	if (sk < 0) {
+ 		pr_perror("Can't open rtnl sock for net dump");
+ 		return sk;
+ 	}
+
+ 	/* memset() rather than an initializer: the whole struct goes on the wire */
+ 	memset(&req, 0, sizeof(req));
+ 	req.g.rtgen_family = AF_PACKET;
+ 	req.nlh.nlmsg_type = RTM_GETLINK;
+ 	req.nlh.nlmsg_len = sizeof(req);
+ 	req.nlh.nlmsg_flags = NLM_F_ROOT|NLM_F_MATCH|NLM_F_REQUEST;
+ 	req.nlh.nlmsg_seq = CR_NLMSG_SEQ;
+ 	req.nlh.nlmsg_pid = 0;
+
+ 	ret = do_rtnl_req(sk, &req, sizeof(req), dump_one_link, NULL, fds);
+ 	close(sk);
+
+ 	return ret;
+ }
+
+ /*
+  * Receive callback for link requests: only logs that an answer came.
+  * Neither argument is looked at. Always returns 0.
+  */
+ static int restore_link_cb(struct nlmsghdr *hdr, void *arg)
+ {
+ 	int status = 0;
+
+ 	pr_info("Got response on SETLINK =)\n");
+
+ 	return status;
+ }
+
+ /*
+ * Buffer for one rtnetlink link request: the netlink header, the
+ * ifinfomsg and room for the attributes appended with addattr_l().
+ */
+ struct newlink_req {
+ struct nlmsghdr h;
+ struct ifinfomsg i;
+ char buf[1024]; /* space for the attributes that follow the ifinfomsg */
+ };
+
+ /*
+ * Build a link request of type @msg_type for device @nde and send it
+ * over the netlink socket @nlsk.
+ *
+ * The request always carries the name and MTU, and the link-layer
+ * address when the entry has one. When @link_info is given, it is
+ * called to fill in the contents of an IFLA_LINKINFO attribute.
+ *
+ * Returns the negative @link_info result if that callback fails,
+ * otherwise the do_rtnl_req() result.
+ */
+ static int do_rtm_link_req(int msg_type, NetDeviceEntry *nde, int nlsk,
+ int (*link_info)(NetDeviceEntry *, struct newlink_req *))
+ {
+ struct newlink_req req;
+
+ memset(&req, 0, sizeof(req));
+
+ req.h.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
+ req.h.nlmsg_flags = NLM_F_REQUEST|NLM_F_ACK|NLM_F_CREATE;
+ req.h.nlmsg_type = msg_type;
+ req.h.nlmsg_seq = CR_NLMSG_SEQ;
+ req.i.ifi_family = AF_PACKET;
+ /*
+ * SETLINK is called for external devices which may
+ * have ifindex changed. Thus configure them by their
+ * name only.
+ */
+ if (msg_type == RTM_NEWLINK)
+ req.i.ifi_index = nde->ifindex;
+ req.i.ifi_flags = nde->flags;
+
+ addattr_l(&req.h, sizeof(req), IFLA_IFNAME, nde->name, strlen(nde->name));
+ addattr_l(&req.h, sizeof(req), IFLA_MTU, &nde->mtu, sizeof(nde->mtu));
+
+ if (nde->has_address) {
+ pr_debug("Restore ll addr (%02x:../%d) for device\n",
+ (int)nde->address.data[0], (int)nde->address.len);
+ addattr_l(&req.h, sizeof(req), IFLA_ADDRESS,
+ nde->address.data, nde->address.len);
+ }
+
+ if (link_info) {
+ struct rtattr *linkinfo;
+ int ret;
+
+ /* Remember where IFLA_LINKINFO starts so its length can be fixed up */
+ linkinfo = NLMSG_TAIL(&req.h);
+ addattr_l(&req.h, sizeof(req), IFLA_LINKINFO, NULL, 0);
+
+ ret = link_info(nde, &req);
+ if (ret < 0)
+ return ret;
+
+ /* Stretch IFLA_LINKINFO over everything the callback appended */
+ linkinfo->rta_len = (void *)NLMSG_TAIL(&req.h) - (void *)linkinfo;
+ }
+
+ return do_rtnl_req(nlsk, &req, req.h.nlmsg_len, restore_link_cb, NULL, NULL);
+ }
+
+ /*
+  * Apply the saved parameters of @nde to an existing link by sending an
+  * RTM_SETLINK request without any link-info payload.
+  */
+ int restore_link_parms(NetDeviceEntry *nde, int nlsk)
+ {
+ 	int ret;
+
+ 	ret = do_rtm_link_req(RTM_SETLINK, nde, nlsk, NULL);
+
+ 	return ret;
+ }
+
+ /*
+  * Create the link described by @nde with an RTM_NEWLINK request;
+  * @link_info supplies the kind-specific part of the request.
+  */
+ static int restore_one_link(NetDeviceEntry *nde, int nlsk,
+ 		int (*link_info)(NetDeviceEntry *, struct newlink_req *))
+ {
+ 	int ret;
+
+ 	pr_info("Restoring netdev %s idx %d\n", nde->name, nde->ifindex);
+ 	ret = do_rtm_link_req(RTM_NEWLINK, nde, nlsk, link_info);
+
+ 	return ret;
+ }
+
+ /* Fallback veth attribute ids, used only when VETH_INFO_MAX is not already defined */
+ #ifndef VETH_INFO_MAX
+ enum {
+ VETH_INFO_UNSPEC,
+ VETH_INFO_PEER,
+
+ __VETH_INFO_MAX
+ #define VETH_INFO_MAX (__VETH_INFO_MAX - 1)
+ };
+ #endif
+
+ /* Fallback value for IFLA_NET_NS_FD, defined only when IFLA_MAX is 28 or less */
+ #if IFLA_MAX <= 28
+ #define IFLA_NET_NS_FD 28
+ #endif
+
+ /*
+ * Link-info callback for veth devices: appends IFLA_INFO_KIND "veth"
+ * and an IFLA_INFO_DATA attribute that nests VETH_INFO_PEER.
+ *
+ * The peer section carries a zeroed ifinfomsg, the outside name taken
+ * from opts.veth_pairs when an entry matches nde->name, and the saved
+ * NS_FD_OFF service fd as IFLA_NET_NS_FD. Always returns 0.
+ */
+ static int veth_link_info(NetDeviceEntry *nde, struct newlink_req *req)
+ {
+ int ns_fd = get_service_fd(NS_FD_OFF);
+ struct rtattr *veth_data, *peer_data;
+ struct ifinfomsg ifm;
+ struct veth_pair *n;
+
+ BUG_ON(ns_fd < 0);
+
+ addattr_l(&req->h, sizeof(*req), IFLA_INFO_KIND, "veth", 4);
+
+ /* Remember both nesting points; their lengths are patched at the end */
+ veth_data = NLMSG_TAIL(&req->h);
+ addattr_l(&req->h, sizeof(*req), IFLA_INFO_DATA, NULL, 0);
+ peer_data = NLMSG_TAIL(&req->h);
+ memset(&ifm, 0, sizeof(ifm));
+ addattr_l(&req->h, sizeof(*req), VETH_INFO_PEER, &ifm, sizeof(ifm));
+ list_for_each_entry(n, &opts.veth_pairs, node) {
+ if (!strcmp(nde->name, n->inside))
+ break;
+ }
+ /* The cursor points at the list head when no pair matched */
+ if (&n->node != &opts.veth_pairs)
+ addattr_l(&req->h, sizeof(*req), IFLA_IFNAME, n->outside, strlen(n->outside));
+ addattr_l(&req->h, sizeof(*req), IFLA_NET_NS_FD, &ns_fd, sizeof(ns_fd));
+ peer_data->rta_len = (void *)NLMSG_TAIL(&req->h) - (void *)peer_data;
+ veth_data->rta_len = (void *)NLMSG_TAIL(&req->h) - (void *)veth_data;
+
+ return 0;
+ }
+
+ /*
+ * Link-info callback for venet devices: appends IFLA_INFO_KIND "venet",
+ * an empty IFLA_INFO_DATA and the saved NS_FD_OFF service fd as
+ * IFLA_NET_NS_FD. Always returns 0.
+ */
+ static int venet_link_info(NetDeviceEntry *nde, struct newlink_req *req)
+ {
+ int ns_fd = get_service_fd(NS_FD_OFF);
+ struct rtattr *venet_data;
+
+ BUG_ON(ns_fd < 0);
+
+ venet_data = NLMSG_TAIL(&req->h);
+ addattr_l(&req->h, sizeof(*req), IFLA_INFO_KIND, "venet", 5);
+ addattr_l(&req->h, sizeof(*req), IFLA_INFO_DATA, NULL, 0);
+ addattr_l(&req->h, sizeof(*req), IFLA_NET_NS_FD, &ns_fd, sizeof(ns_fd));
+ /*
+ * NOTE(review): venet_data was taken before IFLA_INFO_KIND was added,
+ * so this length fix-up lands on the KIND attribute and makes it span
+ * the two attributes after it. veth_link_info() patches the
+ * IFLA_INFO_DATA attribute instead -- confirm this is intended.
+ */
+ venet_data->rta_len = (void *)NLMSG_TAIL(&req->h) - (void *)venet_data;
+
+ return 0;
+ }
+
+ /*
+ * Link-info callback for bridge devices: appends IFLA_INFO_KIND
+ * "bridge" and nothing else. Always returns 0.
+ */
+ static int bridge_link_info(NetDeviceEntry *nde, struct newlink_req *req)
+ {
+ struct rtattr *bridge_data;
+
+ bridge_data = NLMSG_TAIL(&req->h);
+ /* sizeof() counts the terminating NUL, unlike the veth/venet callbacks */
+ addattr_l(&req->h, sizeof(*req), IFLA_INFO_KIND, "bridge", sizeof("bridge"));
+ bridge_data->rta_len = (void *)NLMSG_TAIL(&req->h) - (void *)bridge_data;
+
+ return 0;
+ }
+
+ /*
+  * Restore one network device from its image entry, choosing the
+  * restore routine by nde->type. Returns the routine's result, or -1
+  * for a type that is not supported.
+  */
+ static int restore_link(NetDeviceEntry *nde, int nlsk)
+ {
+ 	int ret = -1;
+
+ 	pr_info("Restoring link %s type %d\n", nde->name, nde->type);
+
+ 	switch (nde->type) {
+ 	case ND_TYPE__LOOPBACK:
+ 	case ND_TYPE__EXTLINK:
+ 		/* For external links see the comment in protobuf/netdev.proto */
+ 		ret = restore_link_parms(nde, nlsk);
+ 		break;
+ 	case ND_TYPE__VENET:
+ 		ret = restore_one_link(nde, nlsk, venet_link_info);
+ 		break;
+ 	case ND_TYPE__VETH:
+ 		ret = restore_one_link(nde, nlsk, veth_link_info);
+ 		break;
+ 	case ND_TYPE__TUN:
+ 		ret = restore_one_tun(nde, nlsk);
+ 		break;
+ 	case ND_TYPE__BRIDGE:
+ 		ret = restore_one_link(nde, nlsk, bridge_link_info);
+ 		break;
+ 	default:
+ 		pr_err("Unsupported link type %d\n", nde->type);
+ 		break;
+ 	}
+
+ 	return ret;
+ }
+
+ /*
+  * Restore every device stored in the CR_FD_NETDEV image of @pid and
+  * write back its per-device ipv4 configuration when the entry has one.
+  *
+  * @netns is passed to ipv4_conf_op() for every device except loopback.
+  *
+  * Returns 0 once the image is exhausted, non-zero on the first failure.
+  */
+ static int restore_links(int pid, NetnsEntry **netns)
+ {
+ 	NetDeviceEntry *nde;
+ 	struct cr_img *img;
+ 	int nlsk, ret;
+
+ 	img = open_image(CR_FD_NETDEV, O_RSTR, pid);
+ 	if (!img)
+ 		return -1;
+
+ 	nlsk = socket(PF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+ 	if (nlsk < 0) {
+ 		pr_perror("Can't create nlk socket");
+ 		close_image(img);
+ 		return -1;
+ 	}
+
+ 	for (;;) {
+ 		ret = pb_read_one_eof(img, &nde, PB_NETDEV);
+ 		if (ret <= 0)
+ 			break;
+
+ 		ret = restore_link(nde, nlsk);
+ 		if (ret) {
+ 			pr_err("Can't restore link\n");
+ 		} else if (nde->conf) {
+ 			/*
+ 			 * Device configuration restore is optimized using the
+ 			 * netns defaults, except for lo: it is created together
+ 			 * with the namespace, before the defaults are set, so
+ 			 * the optimization can't be applied to it.
+ 			 */
+ 			NetnsEntry **def_netns;
+
+ 			def_netns = (nde->type == ND_TYPE__LOOPBACK) ? NULL : netns;
+ 			ret = ipv4_conf_op(nde->name, nde->conf, nde->n_conf, CTL_WRITE, def_netns);
+ 		}
+
+ 		net_device_entry__free_unpacked(nde, NULL);
+ 		if (ret)
+ 			break;
+ 	}
+
+ 	close(nlsk);
+ 	close_image(img);
+ 	return ret;
+ }
+
+ /*
+  * Run the ip tool as "ip <arg1> <arg2> <arg3>" with @fdin / @fdout
+  * handed to cr_system() as its input and output descriptors.
+  *
+  * The binary comes from the CR_IP_TOOL environment variable and
+  * defaults to "ip". A failure is logged unless @flags contains
+  * CRS_CAN_FAIL. Returns 0 on success, -1 on failure.
+  */
+ static int run_ip_tool(char *arg1, char *arg2, char *arg3, int fdin, int fdout, unsigned flags)
+ {
+ 	char *argv[] = { "ip", arg1, arg2, arg3, NULL };
+ 	char *tool;
+
+ 	pr_debug("\tRunning ip %s %s\n", arg1, arg2);
+
+ 	tool = getenv("CR_IP_TOOL");
+ 	if (tool == NULL)
+ 		tool = "ip";
+
+ 	if (cr_system(fdin, fdout, -1, tool, argv, flags) == 0)
+ 		return 0;
+
+ 	if (!(flags & CRS_CAN_FAIL))
+ 		pr_err("IP tool failed on %s %s\n", arg1, arg2);
+
+ 	return -1;
+ }
+
+ /*
+  * Run an iptables save/restore command through "sh -c".
+  *
+  * The command line is taken from the CR_IPTABLES environment variable
+  * when set, otherwise @def_cmd is used. Returns the cr_system() result.
+  */
+ static int run_iptables_tool(char *def_cmd, int fdin, int fdout)
+ {
+ 	char *cmdline = getenv("CR_IPTABLES");
+ 	int status;
+
+ 	if (cmdline == NULL)
+ 		cmdline = def_cmd;
+
+ 	pr_debug("\tRunning %s for %s\n", cmdline, def_cmd);
+
+ 	status = cr_system(fdin, fdout, -1, "sh", (char *[]) { "sh", "-c", cmdline, NULL }, 0);
+ 	if (status != 0)
+ 		pr_err("%s failed\n", def_cmd);
+
+ 	return status;
+ }
+
+ /* Save interface addresses with "ip addr save" into the CR_FD_IFADDR image. */
+ static inline int dump_ifaddr(struct cr_imgset *fds)
+ {
+ 	int out_fd = img_raw_fd(img_from_set(fds, CR_FD_IFADDR));
+
+ 	return run_ip_tool("addr", "save", NULL, -1, out_fd, 0);
+ }
+
+ /*
+  * Save routes with "ip route save" into CR_FD_ROUTE and, when kdat.ipv6
+  * is set, "ip -6 route save" into CR_FD_ROUTE6. Returns 0 or -1.
+  */
+ static inline int dump_route(struct cr_imgset *fds)
+ {
+ 	int out_fd;
+
+ 	out_fd = img_raw_fd(img_from_set(fds, CR_FD_ROUTE));
+ 	if (run_ip_tool("route", "save", NULL, -1, out_fd, 0))
+ 		return -1;
+
+ 	/* If ipv6 is disabled, "ip -6 route dump" dumps all routes */
+ 	if (!kdat.ipv6)
+ 		return 0;
+
+ 	out_fd = img_raw_fd(img_from_set(fds, CR_FD_ROUTE6));
+
+ 	return run_ip_tool("-6", "route", "save", -1, out_fd, 0) ? -1 : 0;
+ }
+
+ /*
+  * Save routing rules with "ip rule save" into the CR_FD_RULE image.
+  *
+  * A failing ip tool is tolerated: a warning is printed and the image
+  * file is unlinked from the images directory. Returns -1 only when the
+  * image path can't be duplicated, 0 otherwise.
+  */
+ static inline int dump_rule(struct cr_imgset *fds)
+ {
+ 	struct cr_img *img = img_from_set(fds, CR_FD_RULE);
+ 	char *img_path = xstrdup(img->path);
+ 	int failed;
+
+ 	if (img_path == NULL)
+ 		return -1;
+
+ 	failed = run_ip_tool("rule", "save", NULL, -1, img_raw_fd(img), CRS_CAN_FAIL);
+ 	if (failed) {
+ 		pr_warn("Check if \"ip rule save\" is supported!\n");
+ 		unlinkat(get_service_fd(IMG_FD_OFF), img_path, 0);
+ 	}
+
+ 	free(img_path);
+ 	return 0;
+ }
+
+ /*
+  * Save iptables rules into CR_FD_IPTABLES and, when kdat.ipv6 is set,
+  * ip6tables rules into CR_FD_IP6TABLES. Returns 0 or -1.
+  */
+ static inline int dump_iptables(struct cr_imgset *fds)
+ {
+ 	int out_fd;
+
+ 	out_fd = img_raw_fd(img_from_set(fds, CR_FD_IPTABLES));
+ 	if (run_iptables_tool("iptables-save", -1, out_fd))
+ 		return -1;
+
+ 	if (!kdat.ipv6)
+ 		return 0;
+
+ 	out_fd = img_raw_fd(img_from_set(fds, CR_FD_IP6TABLES));
+ 	if (run_iptables_tool("ip6tables-save", -1, out_fd))
+ 		return -1;
+
+ 	return 0;
+ }
+
+ /*
+  * Read the "default" and "all" ipv4 configuration through
+  * ipv4_conf_op() and write both as a NetnsEntry into CR_FD_NETNS.
+  *
+  * Returns the pb_write_one() result, a negative ipv4_conf_op() result,
+  * or -1 when memory can't be allocated.
+  */
+ static int dump_netns_conf(struct cr_imgset *fds)
+ {
+ 	NetnsEntry netns = NETNS_ENTRY__INIT;
+ 	int ret = -1, n;
+
+ 	netns.n_def_conf = ARRAY_SIZE(devconfs);
+ 	netns.n_all_conf = ARRAY_SIZE(devconfs);
+
+ 	netns.def_conf = xmalloc(sizeof(int) * netns.n_def_conf);
+ 	if (!netns.def_conf)
+ 		return ret;
+
+ 	netns.all_conf = xmalloc(sizeof(int) * netns.n_all_conf);
+ 	if (!netns.all_conf)
+ 		goto free_def;
+
+ 	/* Both arrays have the same number of entries */
+ 	n = netns.n_def_conf;
+
+ 	ret = ipv4_conf_op("default", netns.def_conf, n, CTL_READ, NULL);
+ 	if (ret < 0)
+ 		goto free_all;
+
+ 	ret = ipv4_conf_op("all", netns.all_conf, n, CTL_READ, NULL);
+ 	if (ret < 0)
+ 		goto free_all;
+
+ 	ret = pb_write_one(img_from_set(fds, CR_FD_NETNS), &netns, PB_NETNS);
+
+ free_all:
+ 	xfree(netns.all_conf);
+ free_def:
+ 	xfree(netns.def_conf);
+ 	return ret;
+ }
+
+ /*
+  * Feed image @type of @pid to "ip <cmd> restore".
+  *
+  * Returns 0 for an empty image, -1 when the image can't be opened,
+  * otherwise the run_ip_tool() result.
+  */
+ static int restore_ip_dump(int type, int pid, char *cmd)
+ {
+ 	struct cr_img *img = open_image(type, O_RSTR, pid);
+ 	int ret;
+
+ 	if (empty_image(img)) {
+ 		close_image(img);
+ 		return 0;
+ 	}
+
+ 	if (!img)
+ 		return -1;
+
+ 	ret = run_ip_tool(cmd, "restore", NULL, img_raw_fd(img), -1, 0);
+ 	close_image(img);
+
+ 	return ret;
+ }
+
+ /* Restore interface addresses from the CR_FD_IFADDR image of @pid. */
+ static inline int restore_ifaddr(int pid)
+ {
+ 	int ret = restore_ip_dump(CR_FD_IFADDR, pid, "addr");
+
+ 	return ret;
+ }
+
+ /*
+  * Restore routes from the CR_FD_ROUTE image and then from the
+  * CR_FD_ROUTE6 one. The second image is not touched if the first
+  * restore fails. Returns 0 or -1.
+  */
+ static inline int restore_route(int pid)
+ {
+ 	if (restore_ip_dump(CR_FD_ROUTE, pid, "route") ||
+ 	    restore_ip_dump(CR_FD_ROUTE6, pid, "route"))
+ 		return -1;
+
+ 	return 0;
+ }
+
+ /*
+  * Restore routing rules from the CR_FD_RULE image of @pid.
+  *
+  * Nothing is done for an empty image. Otherwise "ip rule delete" is
+  * run three times before the saved rules are loaded. Returns 0 or -1.
+  */
+ static inline int restore_rule(int pid)
+ {
+ 	struct cr_img *img;
+ 	int ret = 0, i;
+
+ 	img = open_image(CR_FD_RULE, O_RSTR, pid);
+ 	if (!img)
+ 		return -1;
+
+ 	if (!empty_image(img)) {
+ 		/*
+ 		 * Delete 3 default rules to prevent duplicates. See kernel's
+ 		 * function fib_default_rules_init() for the details.
+ 		 */
+ 		for (i = 0; i < 3; i++)
+ 			run_ip_tool("rule", "delete", NULL, -1, -1, 0);
+
+ 		if (restore_ip_dump(CR_FD_RULE, pid, "rule"))
+ 			ret = -1;
+ 	}
+
+ 	close_image(img);
+ 	return ret;
+ }
+
+ /*
+  * Restore iptables rules from CR_FD_IPTABLES and then ip6tables rules
+  * from CR_FD_IP6TABLES; an empty ip6tables image is skipped.
+  *
+  * Returns 0 on success, -1 when an image can't be opened, otherwise
+  * the failing run_iptables_tool() result.
+  */
+ static inline int restore_iptables(int pid)
+ {
+ 	struct cr_img *img;
+ 	int ret;
+
+ 	img = open_image(CR_FD_IPTABLES, O_RSTR, pid);
+ 	if (!img)
+ 		return -1;
+
+ 	ret = run_iptables_tool("iptables-restore", img_raw_fd(img), -1);
+ 	close_image(img);
+ 	if (ret)
+ 		return ret;
+
+ 	img = open_image(CR_FD_IP6TABLES, O_RSTR, pid);
+ 	if (!img)
+ 		return -1;
+
+ 	if (!empty_image(img))
+ 		ret = run_iptables_tool("ip6tables-restore", img_raw_fd(img), -1);
+
+ 	close_image(img);
+ 	return ret;
+ }
+
+ /*
+  * Read the NetnsEntry from the CR_FD_NETNS image of @pid into *@netns
+  * and write its "default" and "all" ipv4 configuration back.
+  *
+  * An empty image is accepted for backward compatibility; *@netns is
+  * then left untouched. The caller owns the entry stored in *@netns.
+  *
+  * Returns 0 on success, non-zero on failure. The image is closed on
+  * every path, including a failed read.
+  */
+ static int restore_netns_conf(int pid, NetnsEntry **netns)
+ {
+ 	int ret = 0, n;
+ 	struct cr_img *img;
+
+ 	img = open_image(CR_FD_NETNS, O_RSTR, pid);
+ 	if (!img)
+ 		return -1;
+
+ 	if (empty_image(img))
+ 		/* Backward compatibility */
+ 		goto out;
+
+ 	ret = pb_read_one(img, netns, PB_NETNS);
+ 	if (ret < 0) {
+ 		pr_err("Can not read netns object\n");
+ 		/* Leave through "out" so the image is not leaked */
+ 		ret = -1;
+ 		goto out;
+ 	}
+
+ 	n = (*netns)->n_def_conf;
+ 	ret = ipv4_conf_op("default", (*netns)->def_conf, n, CTL_WRITE, NULL);
+ 	if (ret)
+ 		goto out;
+ 	ret = ipv4_conf_op("all", (*netns)->all_conf, n, CTL_WRITE, NULL);
+ out:
+ 	close_image(img);
+ 	return ret;
+ }
+
+ /*
+ * Mount a sysfs instance on a temporary directory inside a fresh mount
+ * namespace and store the descriptor returned by open_detach_mount()
+ * in ns_sysfs_fd.
+ *
+ * ns_sysfs_fd must be -1 on entry. Returns 0 on success, -1 on failure.
+ */
+ static int mount_ns_sysfs(void)
+ {
+ char sys_mount[] = "crtools-sys.XXXXXX";
+
+ BUG_ON(ns_sysfs_fd != -1);
+
+ /*
+ * A new mntns is required to avoid the race between
+ * open_detach_mount and creating mntns.
+ */
+ if (unshare(CLONE_NEWNS)) {
+ pr_perror("Can't create new mount namespace");
+ return -1;
+ }
+
+ if (mount(NULL, "/", NULL, MS_PRIVATE | MS_REC, NULL)) {
+ pr_perror("Can't mark the root mount as private");
+ return -1;
+ }
+
+ /* The template is relative, so the directory lands in the current working directory */
+ if (mkdtemp(sys_mount) == NULL) {
+ pr_perror("mkdtemp failed %s", sys_mount);
+ return -1;
+ }
+
+ /*
+ * The setns() is called, so we're in proper context,
+ * no need in pulling the mountpoint from parasite.
+ */
+ pr_info("Mount ns' sysfs in %s\n", sys_mount);
+ if (mount("sysfs", sys_mount, "sysfs", MS_MGC_VAL, NULL)) {
+ pr_perror("mount failed");
+ rmdir(sys_mount);
+ return -1;
+ }
+
+ ns_sysfs_fd = open_detach_mount(sys_mount);
+ return ns_sysfs_fd >= 0 ? 0 : -1;
+ }
+
+ /*
+  * Dump the network namespace @ns_id: configuration, links, addresses,
+  * routes, rules, iptables and both netfilter conntrack images.
+  *
+  * The steps run in that order and stop at the first failure. The sysfs
+  * descriptor and the image set are released on every path.
+  *
+  * Returns 0 on success, the failing step's result otherwise.
+  */
+ int dump_net_ns(int ns_id)
+ {
+ 	struct cr_imgset *fds;
+ 	int ret;
+
+ 	fds = cr_imgset_open(ns_id, NETNS, O_DUMP);
+ 	if (fds == NULL)
+ 		return -1;
+
+ 	ret = mount_ns_sysfs();
+ 	if (ret)
+ 		goto out;
+ 	ret = dump_netns_conf(fds);
+ 	if (ret)
+ 		goto out;
+ 	ret = dump_links(fds);
+ 	if (ret)
+ 		goto out;
+ 	ret = dump_ifaddr(fds);
+ 	if (ret)
+ 		goto out;
+ 	ret = dump_route(fds);
+ 	if (ret)
+ 		goto out;
+ 	ret = dump_rule(fds);
+ 	if (ret)
+ 		goto out;
+ 	ret = dump_iptables(fds);
+ 	if (ret)
+ 		goto out;
+ 	ret = dump_nf_ct(fds, CR_FD_NETNF_CT);
+ 	if (ret)
+ 		goto out;
+ 	ret = dump_nf_ct(fds, CR_FD_NETNF_EXP);
+ out:
+ 	close(ns_sysfs_fd);
+ 	ns_sysfs_fd = -1;
+
+ 	close_cr_imgset(&fds);
+ 	return ret;
+ }
+
+ /*
+  * Restore the network namespace of @pid: configuration, links,
+  * addresses, routes, rules, iptables and both netfilter conntrack
+  * images, stopping at the first failure.
+  *
+  * The NS_FD_OFF service fd is closed on every path.
+  * Returns 0 on success, the failing step's result otherwise.
+  */
+ int prepare_net_ns(int pid)
+ {
+ 	NetnsEntry *netns = NULL;
+ 	int ret;
+
+ 	ret = restore_netns_conf(pid, &netns);
+ 	if (!ret)
+ 		ret = restore_links(pid, &netns);
+
+ 	/* Only the two steps above use the netns entry */
+ 	if (netns)
+ 		netns_entry__free_unpacked(netns, NULL);
+
+ 	if (ret)
+ 		goto out;
+ 	ret = restore_ifaddr(pid);
+ 	if (ret)
+ 		goto out;
+ 	ret = restore_route(pid);
+ 	if (ret)
+ 		goto out;
+ 	ret = restore_rule(pid);
+ 	if (ret)
+ 		goto out;
+ 	ret = restore_iptables(pid);
+ 	if (ret)
+ 		goto out;
+ 	ret = restore_nf_ct(pid, CR_FD_NETNF_CT);
+ 	if (ret)
+ 		goto out;
+ 	ret = restore_nf_ct(pid, CR_FD_NETNF_EXP);
+ out:
+ 	close_service_fd(NS_FD_OFF);
+
+ 	return ret;
+ }
+
+ /*
+  * Save a reference to the current net namespace as the NS_FD_OFF
+  * service fd. Does nothing when CLONE_NEWNET is not in root_ns_mask.
+  * Returns 0 on success, -1 on failure.
+  */
+ int netns_keep_nsfd(void)
+ {
+ 	bool installed;
+ 	int ns_fd;
+
+ 	if (!(root_ns_mask & CLONE_NEWNET))
+ 		return 0;
+
+ 	/*
+ 	 * Restoring a net namespace requires talking to the original
+ 	 * (i.e. init) one, so grab a reference to it before the existing
+ 	 * namespaces are left.
+ 	 */
+ 	ns_fd = open("/proc/self/ns/net", O_RDONLY | O_CLOEXEC);
+ 	if (ns_fd < 0) {
+ 		pr_perror("Can't cache net fd");
+ 		return -1;
+ 	}
+
+ 	installed = install_service_fd(NS_FD_OFF, ns_fd) >= 0;
+ 	if (installed)
+ 		pr_info("Saved netns fd for links restore\n");
+ 	else
+ 		pr_err("Can't install ns net reference\n");
+
+ 	/* The temporary descriptor is closed whether or not the install worked */
+ 	close(ns_fd);
+
+ 	return installed ? 0 : -1;
+ }
+
+/*
+ * If we want to modify iptables, we need to recevied the current
+ * configuration, change it and load a new one into the kernel.
+ * iptables can change or add only one rule.
+ * iptables-restore allows to make a few changes for one iteration,
+ * so it works faster.
+ */
+static int iptables_restore(bool ipv6, char *buf, int size)
+{
+ int pfd[2], ret = -1;
+ char *cmd4[] = {"iptables-restore", "--noflush", NULL};
+ char *cmd6[] = {"ip6tables-restore", "--noflush", NULL};
+ char **cmd = ipv6 ? cmd6 : cmd4;;
+
+ if (pipe(pfd) < 0) {
+ pr_perror("Unable to create pipe");
+ return -1;
+ }
+
+ if (write(pfd[1], buf, size) < size) {
+ pr_perror("Unable to write iptables configugration");
+ goto err;
+ }
+ close_safe(&pfd[1]);
+
+ ret = cr_system(pfd[0], -1, -1, cmd[0], cmd, 0);
+err:
+ close_safe(&pfd[1]);
+ close_safe(&pfd[0]);
+ return ret;
+}
+
+ /*
+  * Block all traffic in the root task's net namespace by loading a CRIU
+  * chain that drops everything and is jumped to from INPUT and OUTPUT.
+  * The same rules are loaded for ip6tables when kdat.ipv6 is set.
+  *
+  * Declared with (void) so that it has a real prototype.
+  *
+  * Returns 0 on success, non-zero if loading the rules failed, -1 if a
+  * namespace switch failed.
+  */
+ static int network_lock_internal(void)
+ {
+ 	char conf[] =	"*filter\n"
+ 			":CRIU - [0:0]\n"
+ 			"-I INPUT -j CRIU\n"
+ 			"-I OUTPUT -j CRIU\n"
+ 			"-A CRIU -j DROP\n"
+ 			"COMMIT\n";
+ 	int ret = 0, nsret;
+
+ 	if (switch_ns(root_item->pid.real, &net_ns_desc, &nsret))
+ 		return -1;
+
+ 	/* sizeof - 1: the terminating NUL is not part of the rule set */
+ 	ret |= iptables_restore(false, conf, sizeof(conf) - 1);
+ 	if (kdat.ipv6)
+ 		ret |= iptables_restore(true, conf, sizeof(conf) - 1);
+
+ 	if (restore_ns(nsret, &net_ns_desc))
+ 		ret = -1;
+
+ 	return ret;
+ }
+
+ /*
+  * Undo network_lock_internal(): remove the jumps to the CRIU chain
+  * from INPUT and OUTPUT and delete the chain, in the root task's net
+  * namespace. The same is done for ip6tables when kdat.ipv6 is set.
+  *
+  * Declared with (void) so that it has a real prototype.
+  *
+  * Returns 0 on success, non-zero if loading the rules failed, -1 if a
+  * namespace switch failed.
+  */
+ static int network_unlock_internal(void)
+ {
+ 	char conf[] =	"*filter\n"
+ 			":CRIU - [0:0]\n"
+ 			"-D INPUT -j CRIU\n"
+ 			"-D OUTPUT -j CRIU\n"
+ 			"-X CRIU\n"
+ 			"COMMIT\n";
+ 	int ret = 0, nsret;
+
+ 	if (switch_ns(root_item->pid.real, &net_ns_desc, &nsret))
+ 		return -1;
+
+ 	/* sizeof - 1: the terminating NUL is not part of the rule set */
+ 	ret |= iptables_restore(false, conf, sizeof(conf) - 1);
+ 	if (kdat.ipv6)
+ 		ret |= iptables_restore(true, conf, sizeof(conf) - 1);
+
+ 	if (restore_ns(nsret, &net_ns_desc))
+ 		ret = -1;
+
+ 	return ret;
+ }
+
+ /*
+  * Lock the network before dumping. Without CLONE_NEWNET in
+  * root_ns_mask nothing is done here; otherwise the ACT_NET_LOCK
+  * scripts run first and then the whole namespace is locked.
+  * Returns 0 on success, non-zero on failure.
+  */
+ int network_lock(void)
+ {
+ 	pr_info("Lock network\n");
+
+ 	/* Each connection will be locked on dump */
+ 	if ((root_ns_mask & CLONE_NEWNET) == 0)
+ 		return 0;
+
+ 	return run_scripts(ACT_NET_LOCK) ? -1 : network_lock_internal();
+ }
+
+ /*
+  * Unlock the network: per-connection locks are always dropped; with
+  * CLONE_NEWNET in root_ns_mask the ACT_NET_UNLOCK scripts run and the
+  * namespace-wide lock is removed as well. Errors are not reported.
+  */
+ void network_unlock(void)
+ {
+ 	pr_info("Unlock network\n");
+
+ 	cpt_unlock_tcp_connections();
+ 	rst_unlock_tcp_connections();
+
+ 	if (!(root_ns_mask & CLONE_NEWNET))
+ 		return;
+
+ 	run_scripts(ACT_NET_UNLOCK);
+ 	network_unlock_internal();
+ }
+
+ /*
+  * Register a veth name mapping in opts.veth_pairs.
+  *
+  * @in:  name of the inside end
+  * @out: name of the outside end, optionally followed by "@<bridge>"
+  *
+  * @out is modified in place: the last '@' is overwritten with a NUL.
+  * The strings are referenced, not copied. Returns 0, or -1 when memory
+  * can't be allocated.
+  */
+ int veth_pair_add(char *in, char *out)
+ {
+ 	struct veth_pair *pair;
+ 	char *sep;
+
+ 	pair = xmalloc(sizeof(*pair));
+ 	if (!pair)
+ 		return -1;
+
+ 	pair->inside = in;
+ 	pair->outside = out;
+
+ 	/*
+ 	 * Does the out string name a bridge to move the outside
+ 	 * end of the veth pair to?
+ 	 */
+ 	sep = strrchr(out, '@');
+ 	if (sep != NULL) {
+ 		*sep = '\0';
+ 		pair->bridge = sep + 1;
+ 	} else {
+ 		pair->bridge = NULL;
+ 	}
+
+ 	list_add(&pair->node, &opts.veth_pairs);
+
+ 	if (pair->bridge)
+ 		pr_debug("Added %s:%s@%s veth map\n", in, out, pair->bridge);
+ 	else
+ 		pr_debug("Added %s:%s veth map\n", in, out);
+
+ 	return 0;
+ }
+
+/*
+ * The setns() syscall (called by switch_ns()) can be extremely
+ * slow. If we call it two or more times from the same task the
+ * kernel will synchonously go on a very slow routine called
+ * synchronize_rcu() trying to put a reference on old namespaces.
+ *
+ * To avoid doing this more than once we pre-create all the
+ * needed other-ns sockets in advance.
+ */
+
+static int prep_ns_sockets(struct ns_id *ns, bool for_dump)
+{
+ int nsret = -1, ret;
+
+ if (ns->type != NS_CRIU) {
+ pr_info("Switching to %d's net for collecting sockets\n", ns->ns_pid);
+ if (switch_ns(ns->ns_pid, &net_ns_desc, &nsret))
+ return -1;
+ }
+
+ if (for_dump) {
+ ret = ns->net.nlsk = socket(PF_NETLINK, SOCK_RAW, NETLINK_SOCK_DIAG);
+ if (ret < 0) {
+ pr_perror("Can't create sock diag socket");
+ goto err_nl;
+ }
+ } else
+ ns->net.nlsk = -1;
+
+ ret = ns->net.seqsk = socket(PF_UNIX, SOCK_SEQPACKET, 0);
+ if (ret < 0) {
+ pr_perror("Can't create seqsk for parasite");
+ goto err_sq;
+ }
+
+ ret = 0;
+out:
+ if (nsret >= 0 && restore_ns(nsret, &net_ns_desc) < 0) {
+ nsret = -1;
+ if (ret == 0)
+ goto err_ret;
+ }
+
+ return ret;
+
+err_ret:
+ close(ns->net.seqsk);
+err_sq:
+ if (ns->net.nlsk >= 0)
+ close(ns->net.nlsk);
+err_nl:
+ goto out;
+}
+
+ /*
+  * walk_namespaces() callback: prepare the helper sockets of @ns and,
+  * in dump mode, collect its sockets. @oarg is (void *)1 for dump mode.
+  */
+ static int collect_net_ns(struct ns_id *ns, void *oarg)
+ {
+ 	bool for_dump = (oarg == (void *)1);
+ 	int ret;
+
+ 	pr_info("Collecting netns %d/%d\n", ns->id, ns->ns_pid);
+
+ 	ret = prep_ns_sockets(ns, for_dump);
+ 	if (ret != 0)
+ 		return ret;
+
+ 	return for_dump ? collect_sockets(ns) : 0;
+ }
+
+ /*
+  * Run collect_net_ns() over every net namespace. The dump/restore mode
+  * is passed to the callback encoded in the opaque pointer argument.
+  */
+ int collect_net_namespaces(bool for_dump)
+ {
+ 	void *mode = (void *)(for_dump ? 1UL : 0);
+
+ 	return walk_namespaces(&net_ns_desc, collect_net_ns, mode);
+ }
+
+ /* Descriptor of the net namespace: CLONE_NEWNET flag and the "net" name */
+ struct ns_desc net_ns_desc = NS_DESC_ENTRY(CLONE_NEWNET, "net");
+
+ /*
+ * For every entry of opts.veth_pairs that names a bridge, add the
+ * outside device to that bridge and bring the device up if it is not
+ * up already.
+ *
+ * Returns 0 on success (also when no entry names a bridge), a negative
+ * value on the first failure; the remaining entries are then skipped.
+ */
+ int move_veth_to_bridge(void)
+ {
+ int s;
+ int ret;
+ struct veth_pair *n;
+ struct ifreq ifr;
+
+ s = -1;
+ ret = 0;
+ list_for_each_entry(n, &opts.veth_pairs, node) {
+ if (n->bridge == NULL)
+ continue;
+
+ pr_debug("\tMoving dev %s to bridge %s\n", n->outside, n->bridge);
+
+ /* The control socket is created lazily, on the first bridged pair */
+ if (s == -1) {
+ s = socket(AF_LOCAL, SOCK_STREAM|SOCK_CLOEXEC, 0);
+ if (s < 0) {
+ pr_perror("Can't create control socket");
+ return -1;
+ }
+ }
+
+ /*
+ * Add the device to the bridge. This is equivalent to:
+ * $ brctl addif <bridge> <device>
+ */
+ ifr.ifr_ifindex = if_nametoindex(n->outside);
+ if (ifr.ifr_ifindex == 0) {
+ pr_perror("Can't get index of %s", n->outside);
+ ret = -1;
+ break;
+ }
+ strlcpy(ifr.ifr_name, n->bridge, IFNAMSIZ);
+ ret = ioctl(s, SIOCBRADDIF, &ifr);
+ if (ret < 0) {
+ pr_perror("Can't add interface %s to bridge %s",
+ n->outside, n->bridge);
+ break;
+ }
+
+ /*
+ * Make sure the device is up. This is equivalent to:
+ * $ ip link set dev <device> up
+ */
+ ifr.ifr_ifindex = 0;
+ strlcpy(ifr.ifr_name, n->outside, IFNAMSIZ);
+ ret = ioctl(s, SIOCGIFFLAGS, &ifr);
+ if (ret < 0) {
+ pr_perror("Can't get flags of interface %s", n->outside);
+ break;
+ }
+ /* Already up: nothing to set for this device */
+ if (ifr.ifr_flags & IFF_UP)
+ continue;
+ ifr.ifr_flags |= IFF_UP;
+ ret = ioctl(s, SIOCSIFFLAGS, &ifr);
+ if (ret < 0) {
+ pr_perror("Can't set flags of interface %s to 0x%x",
+ n->outside, ifr.ifr_flags);
+ break;
+ }
+ }
+
+ if (s >= 0)
+ close(s);
+ return ret;
+ }
diff --git a/criu/netfilter.c b/criu/netfilter.c
new file mode 100644
index 000000000000..95e18aa97451
--- /dev/null
+++ b/criu/netfilter.c
@@ -0,0 +1,124 @@
+#include <sys/socket.h>
+#include <arpa/inet.h>
+#include <unistd.h>
+#include <string.h>
+#include <wait.h>
+#include <stdlib.h>
+
+#include "asm/types.h"
+#include "util.h"
+#include "list.h"
+#include "files.h"
+#include "netfilter.h"
+#include "sockets.h"
+#include "sk-inet.h"
+
+ /* Scratch buffer that holds the shell command line built by nf_connection_switch_raw() */
+ static char buf[512];
+
+ /*
+ * Need to configure simple netfilter rules for blocking connections.
+ * Any brave soul to write it using xtables-devel?
+ */
+
+ /*
+ * Command template. Arguments in order: tool name, rule action,
+ * chain, source address, source port, destination address,
+ * destination port.
+ */
+ static const char *nf_conn_cmd = "%s -t filter %s %s --protocol tcp "
+ "--source %s --sport %d --destination %s --dport %d -j DROP";
+
+ /* Tool names for the two address families */
+ static char iptable_cmd_ipv4[] = "iptables";
+ static char iptable_cmd_ipv6[] = "ip6tables";
+
+ /*
+  * Add (@lock) or delete (!@lock) one DROP rule for a TCP connection in
+  * the INPUT (@input) or OUTPUT chain by running iptables/ip6tables
+  * through "sh -c".
+  *
+  * @family selects the tool: AF_INET or AF_INET6; anything else fails.
+  * Note that the command's --source is filled from @dst_addr/@dst_port
+  * and --destination from @src_addr/@src_port.
+  *
+  * Returns 0 on success, -1 on failure.
+  */
+ static int nf_connection_switch_raw(int family, u32 *src_addr, u16 src_port,
+ 						u32 *dst_addr, u16 dst_port,
+ 						bool input, bool lock)
+ {
+ 	char sip[INET_ADDR_LEN], dip[INET_ADDR_LEN];
+ 	char *argv[4] = { "sh", "-c", buf, NULL };
+ 	const char *action = lock ? "-A" : "-D";
+ 	const char *chain = input ? "INPUT" : "OUTPUT";
+ 	char *cmd;
+ 	int ret;
+
+ 	if (family == AF_INET) {
+ 		cmd = iptable_cmd_ipv4;
+ 	} else if (family == AF_INET6) {
+ 		cmd = iptable_cmd_ipv6;
+ 	} else {
+ 		pr_err("Unknown socket family %d\n", family);
+ 		return -1;
+ 	}
+
+ 	if (!inet_ntop(family, (void *)src_addr, sip, INET_ADDR_LEN) ||
+ 	    !inet_ntop(family, (void *)dst_addr, dip, INET_ADDR_LEN)) {
+ 		pr_perror("nf: Can't translate ip addr");
+ 		return -1;
+ 	}
+
+ 	snprintf(buf, sizeof(buf), nf_conn_cmd, cmd, action, chain,
+ 			dip, (int)dst_port, sip, (int)src_port);
+
+ 	pr_debug("\tRunning iptables [%s]\n", buf);
+
+ 	/*
+ 	 * cr_system is used here, because it blocks SIGCHLD before waiting
+ 	 * a child and the child can't be waited from SIGCHLD handler.
+ 	 */
+ 	ret = cr_system(-1, -1, -1, "sh", argv, 0);
+ 	if (ret < 0 || !WIFEXITED(ret) || WEXITSTATUS(ret)) {
+ 		pr_perror("Iptables configuration failed");
+ 		return -1;
+ 	}
+
+ 	pr_info("%s %s:%d - %s:%d connection\n", lock ? "Locked" : "Unlocked",
+ 			sip, (int)src_port, dip, (int)dst_port);
+ 	return 0;
+ }
+
+ /*
+  * Lock or unlock both directions of the connection described by @sk:
+  * the INPUT rule first, then the OUTPUT rule with the addresses
+  * swapped. If the second step fails the first one is reverted.
+  *
+  * Returns 0 on success, non-zero on failure.
+  */
+ static int nf_connection_switch(struct inet_sk_desc *sk, bool lock)
+ {
+ 	int family = sk->sd.family;
+ 	int ret;
+
+ 	if (nf_connection_switch_raw(family,
+ 			sk->src_addr, sk->src_port,
+ 			sk->dst_addr, sk->dst_port, true, lock))
+ 		return -1;
+
+ 	ret = nf_connection_switch_raw(family,
+ 			sk->dst_addr, sk->dst_port,
+ 			sk->src_addr, sk->src_port, false, lock);
+ 	if (ret == 0)
+ 		return 0;
+
+ 	/* rollback */
+ 	nf_connection_switch_raw(family,
+ 			sk->src_addr, sk->src_port,
+ 			sk->dst_addr, sk->dst_port, true, !lock);
+ 	return ret;
+ }
+
+ /* Install the DROP rules for both directions of the connection @sk. */
+ int nf_lock_connection(struct inet_sk_desc *sk)
+ {
+ 	const bool lock = true;
+
+ 	return nf_connection_switch(sk, lock);
+ }
+
+ /* Remove the DROP rules for both directions of the connection @sk. */
+ int nf_unlock_connection(struct inet_sk_desc *sk)
+ {
+ 	const bool lock = false;
+
+ 	return nf_connection_switch(sk, lock);
+ }
+
+ /*
+  * Remove the DROP rules for the connection described by the image
+  * entry of @si. Both directions are always attempted; the results are
+  * OR-ed together.
+  */
+ int nf_unlock_connection_info(struct inet_sk_info *si)
+ {
+ 	int in_ret, out_ret;
+
+ 	in_ret = nf_connection_switch_raw(si->ie->family,
+ 			si->ie->src_addr, si->ie->src_port,
+ 			si->ie->dst_addr, si->ie->dst_port, true, false);
+ 	out_ret = nf_connection_switch_raw(si->ie->family,
+ 			si->ie->dst_addr, si->ie->dst_port,
+ 			si->ie->src_addr, si->ie->src_port, false, false);
+
+ 	/*
+ 	 * Nothing is rolled back on error, because nobody
+ 	 * checks the errors of this function.
+ 	 */
+ 	return in_ret | out_ret;
+ }
diff --git a/criu/page-pipe.c b/criu/page-pipe.c
new file mode 100644
index 000000000000..db58f6a59c8e
--- /dev/null
+++ b/criu/page-pipe.c
@@ -0,0 +1,238 @@
+#include <unistd.h>
+#include <fcntl.h>
+
+#undef LOG_PREFIX
+#define LOG_PREFIX "page-pipe: "
+
+#include "config.h"
+#include "util.h"
+#include "page-pipe.h"
+
+ /*
+  * Can the existing iov accumulate the page? If @addr starts right
+  * where @iov ends, extend @iov by one page and return true; otherwise
+  * leave it alone and return false.
+  */
+ static inline bool iov_grow_page(struct iovec *iov, unsigned long addr)
+ {
+ 	unsigned long iov_end = (unsigned long)iov->iov_base + iov->iov_len;
+
+ 	if (iov_end != addr)
+ 		return false;
+
+ 	iov->iov_len += PAGE_SIZE;
+ 	return true;
+ }
+
+ /* Make @iov describe exactly one page starting at @addr. */
+ static inline void iov_init(struct iovec *iov, unsigned long addr)
+ {
+ 	iov->iov_len = PAGE_SIZE;
+ 	iov->iov_base = (void *)addr;
+ }
+
+ /*
+  * Append one more buffer to the tail of pp->bufs.
+  *
+  * A buffer from pp->free_bufs is reused when there is one. Otherwise a
+  * new pipe is created, unless the pipe is in chunk mode and already
+  * has NR_PIPES_PER_CHUNK pipes. The buffer's iov window starts at the
+  * first free iov of the pipe.
+  *
+  * Returns 0 on success, -EAGAIN when the chunk limit is reached, -1 on
+  * allocation or pipe() failure.
+  */
+ static int page_pipe_grow(struct page_pipe *pp)
+ {
+ 	struct page_pipe_buf *ppb;
+
+ 	pr_debug("Will grow page pipe (iov off is %u)\n", pp->free_iov);
+
+ 	if (list_empty(&pp->free_bufs)) {
+ 		if (pp->chunk_mode && pp->nr_pipes == NR_PIPES_PER_CHUNK)
+ 			return -EAGAIN;
+
+ 		ppb = xmalloc(sizeof(*ppb));
+ 		if (!ppb)
+ 			return -1;
+
+ 		if (pipe(ppb->p)) {
+ 			xfree(ppb);
+ 			pr_perror("Can't make pipe for page-pipe");
+ 			return -1;
+ 		}
+
+ 		/* The pipe size is tracked in pages, not bytes */
+ 		ppb->pipe_size = fcntl(ppb->p[0], F_GETPIPE_SZ, 0) / PAGE_SIZE;
+ 		pp->nr_pipes++;
+
+ 		list_add_tail(&ppb->l, &pp->bufs);
+ 	} else {
+ 		ppb = list_first_entry(&pp->free_bufs, struct page_pipe_buf, l);
+ 		list_move_tail(&ppb->l, &pp->bufs);
+ 	}
+
+ 	ppb->iov = &pp->iovs[pp->free_iov];
+ 	ppb->nr_segs = 0;
+ 	ppb->pages_in = 0;
+
+ 	return 0;
+ }
+
+ /*
+  * Allocate a page pipe that uses the caller's array @iovs of @nr_segs
+  * entries, and give it its first buffer.
+  *
+  * @chunk_mode is stored in the pipe; it limits the number of pipes
+  * page_pipe_grow() may create.
+  *
+  * Returns the new pipe, or NULL when the allocation or the first
+  * page_pipe_grow() fails. Nothing is leaked on the failure path.
+  */
+ struct page_pipe *create_page_pipe(unsigned int nr_segs,
+ 		struct iovec *iovs, bool chunk_mode)
+ {
+ 	struct page_pipe *pp;
+
+ 	pr_debug("Create page pipe for %u segs\n", nr_segs);
+
+ 	pp = xmalloc(sizeof(*pp));
+ 	if (!pp)
+ 		return NULL;
+
+ 	pp->nr_pipes = 0;
+ 	INIT_LIST_HEAD(&pp->bufs);
+ 	INIT_LIST_HEAD(&pp->free_bufs);
+ 	pp->nr_iovs = nr_segs;
+ 	pp->iovs = iovs;
+ 	pp->free_iov = 0;
+
+ 	pp->nr_holes = 0;
+ 	pp->free_hole = 0;
+ 	pp->holes = NULL;
+
+ 	pp->chunk_mode = chunk_mode;
+
+ 	if (page_pipe_grow(pp)) {
+ 		/* A failed grow leaves both lists empty, so only pp needs freeing */
+ 		xfree(pp);
+ 		return NULL;
+ 	}
+
+ 	return pp;
+ }
+
+ /*
+  * Close both ends of every pipe owned by @pp, whether in use or on
+  * the free list, free the buffers and then @pp itself.
+  *
+  * NOTE(review): pp->holes is not released here -- confirm whether
+  * another place frees it.
+  */
+ void destroy_page_pipe(struct page_pipe *pp)
+ {
+ 	struct page_pipe_buf *cur, *next;
+
+ 	pr_debug("Killing page pipe\n");
+
+ 	/* Merge the free list in so that a single walk covers every buffer */
+ 	list_splice(&pp->free_bufs, &pp->bufs);
+
+ 	list_for_each_entry_safe(cur, next, &pp->bufs, l) {
+ 		close(cur->p[0]);
+ 		close(cur->p[1]);
+ 		xfree(cur);
+ 	}
+
+ 	xfree(pp);
+ }
+
+ /*
+  * Recycle a chunk-mode page pipe: move every used buffer to the free
+  * list, forget the collected holes and take one buffer back into use.
+  * Calling this on a pipe that is not in chunk mode is a BUG.
+  */
+ void page_pipe_reinit(struct page_pipe *pp)
+ {
+ 	struct page_pipe_buf *cur, *next;
+ 	int err;
+
+ 	BUG_ON(!pp->chunk_mode);
+
+ 	pr_debug("Clean up page pipe\n");
+
+ 	list_for_each_entry_safe(cur, next, &pp->bufs, l)
+ 		list_move(&cur->l, &pp->free_bufs);
+
+ 	pp->free_hole = 0;
+
+ 	/* It can't fail, because ppb is in free_bufs */
+ 	err = page_pipe_grow(pp);
+ 	if (err)
+ 		BUG();
+ }
+
+ /*
+ * Try to account the page at @addr in buffer @ppb of @pp.
+ *
+ * When the buffer is full its pipe is grown to twice the size first.
+ * The page then either extends the buffer's last iov or starts a new
+ * one.
+ *
+ * Returns 0 when the page was added and 1 when this buffer can't take
+ * it (pipe can't grow, or the iov limit is reached) and another buffer
+ * is needed.
+ */
+ static inline int try_add_page_to(struct page_pipe *pp, struct page_pipe_buf *ppb,
+ unsigned long addr)
+ {
+ if (ppb->pages_in == ppb->pipe_size) {
+ unsigned long new_size = ppb->pipe_size << 1;
+ int ret;
+
+ if (new_size > PIPE_MAX_SIZE)
+ return 1;
+
+ ret = fcntl(ppb->p[0], F_SETPIPE_SZ, new_size * PAGE_SIZE);
+ if (ret < 0)
+ return 1; /* need to add another buf */
+
+ /* fcntl() reports the new size in bytes; it is tracked in pages */
+ ret /= PAGE_SIZE;
+ BUG_ON(ret < ppb->pipe_size);
+
+ pr_debug("Grow pipe %x -> %x\n", ppb->pipe_size, ret);
+ ppb->pipe_size = ret;
+ }
+
+ if (ppb->nr_segs) {
+ /* A page adjacent to the last iov just extends it */
+ if (iov_grow_page(&ppb->iov[ppb->nr_segs - 1], addr))
+ goto out;
+
+ if (ppb->nr_segs == UIO_MAXIOV)
+ /* XXX -- shrink pipe back? */
+ return 1;
+ }
+
+ pr_debug("Add iov to page pipe (%u iovs, %u/%u total)\n",
+ ppb->nr_segs, pp->free_iov, pp->nr_iovs);
+ iov_init(&ppb->iov[ppb->nr_segs++], addr);
+ pp->free_iov++;
+ BUG_ON(pp->free_iov > pp->nr_iovs);
+ out:
+ ppb->pages_in++;
+ return 0;
+ }
+
+ /*
+  * Try to add the page at @addr to the last buffer of @pp. The buffer
+  * list must not be empty. Returns what try_add_page_to() returns.
+  */
+ static inline int try_add_page(struct page_pipe *pp, unsigned long addr)
+ {
+ 	struct page_pipe_buf *last;
+
+ 	BUG_ON(list_empty(&pp->bufs));
+
+ 	last = list_entry(pp->bufs.prev, struct page_pipe_buf, l);
+ 	return try_add_page_to(pp, last, addr);
+ }
+
+ /*
+  * Account the page at @addr in @pp, adding a new buffer when the
+  * current one can't take it.
+  *
+  * Returns 0 on success or the negative page_pipe_grow() result.
+  */
+ int page_pipe_add_page(struct page_pipe *pp, unsigned long addr)
+ {
+ 	int rc = try_add_page(pp, addr);
+
+ 	/* 0: the page went in; negative: error. Only a positive value needs a new buffer. */
+ 	if (rc <= 0)
+ 		return rc;
+
+ 	rc = page_pipe_grow(pp);
+ 	if (rc < 0)
+ 		return rc;
+
+ 	/* A fresh buffer must always be able to take one page */
+ 	rc = try_add_page(pp, addr);
+ 	BUG_ON(rc > 0);
+
+ 	return rc;
+ }
+
+ /* Number of hole iovs added to the array each time it fills up */
+ #define PP_HOLES_BATCH 32
+
+ /*
+  * Record the page at @addr as a hole in @pp.
+  *
+  * The holes array grows by PP_HOLES_BATCH entries when it is full. A
+  * page adjacent to the last hole extends it; any other page starts a
+  * new hole.
+  *
+  * Returns 0 on success and -1 when the array can't be grown; in that
+  * case the existing array and its contents stay valid.
+  */
+ int page_pipe_add_hole(struct page_pipe *pp, unsigned long addr)
+ {
+ 	if (pp->free_hole >= pp->nr_holes) {
+ 		struct iovec *holes;
+
+ 		/* Don't overwrite pp->holes until the resize is known to succeed */
+ 		holes = xrealloc(pp->holes,
+ 				(pp->nr_holes + PP_HOLES_BATCH) * sizeof(struct iovec));
+ 		if (!holes)
+ 			return -1;
+
+ 		pp->holes = holes;
+ 		pp->nr_holes += PP_HOLES_BATCH;
+ 	}
+
+ 	if (pp->free_hole &&
+ 			iov_grow_page(&pp->holes[pp->free_hole - 1], addr))
+ 		goto out;
+
+ 	iov_init(&pp->holes[pp->free_hole++], addr);
+ out:
+ 	return 0;
+ }
+
+ /*
+  * Print the contents of @pp at debug level: every buffer with its
+  * iovs, then every hole. Does nothing when debug output is quelled.
+  */
+ void debug_show_page_pipe(struct page_pipe *pp)
+ {
+ 	struct page_pipe_buf *ppb;
+ 	struct iovec *iov;
+ 	int idx;
+
+ 	if (pr_quelled(LOG_DEBUG))
+ 		return;
+
+ 	pr_debug("Page pipe:\n");
+ 	pr_debug("* %u pipes %u/%u iovs:\n",
+ 			pp->nr_pipes, pp->free_iov, pp->nr_iovs);
+
+ 	list_for_each_entry(ppb, &pp->bufs, l) {
+ 		pr_debug("\tbuf %u pages, %u iovs:\n",
+ 				ppb->pages_in, ppb->nr_segs);
+
+ 		for (idx = 0; idx < ppb->nr_segs; idx++) {
+ 			iov = ppb->iov + idx;
+ 			pr_debug("\t\t%p %lu\n", iov->iov_base, iov->iov_len / PAGE_SIZE);
+ 		}
+ 	}
+
+ 	pr_debug("* %u holes:\n", pp->free_hole);
+
+ 	for (idx = 0; idx < pp->free_hole; idx++) {
+ 		iov = pp->holes + idx;
+ 		pr_debug("\t%p %lu\n", iov->iov_base, iov->iov_len / PAGE_SIZE);
+ 	}
+ }
diff --git a/criu/page-read.c b/criu/page-read.c
new file mode 100644
index 000000000000..28ecd5bdb2bb
--- /dev/null
+++ b/criu/page-read.c
@@ -0,0 +1,360 @@
+#include <fcntl.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#include "image.h"
+#include "cr_options.h"
+#include "servicefd.h"
+#include "page-read.h"
+
+#include "protobuf.h"
+#include "protobuf/pagemap.pb-c.h"
+
+#ifndef SEEK_DATA
+#define SEEK_DATA 3
+#define SEEK_HOLE 4
+#endif
+
+/*
+ * Old image format: read the next page address from pr->pmi and describe
+ * it in @iov as a single page.
+ *
+ * Returns 1 when an address was read, otherwise the non-positive result
+ * of read_img_eof().
+ */
+static int get_page_vaddr(struct page_read *pr, struct iovec *iov)
+{
+	u64 raw_va;
+	int rc;
+
+	rc = read_img_eof(pr->pmi, &raw_va);
+	if (rc > 0) {
+		iov->iov_base = (void *)decode_pointer(raw_va);
+		iov->iov_len = PAGE_SIZE;
+		rc = 1;
+	}
+
+	return rc;
+}
+
+/*
+ * Old image format: read exactly one page from pr->pmi into @buf.
+ * @nr must be 1; @vaddr is not used here.
+ *
+ * Returns 1 on success, -1 when a full page could not be read.
+ */
+static int read_page(struct page_read *pr, unsigned long vaddr, int nr, void *buf)
+{
+	int got;
+
+	BUG_ON(nr != 1);
+
+	got = read(img_raw_fd(pr->pmi), buf, PAGE_SIZE);
+	if (got == PAGE_SIZE)
+		return 1;
+
+	pr_err("Can't read mapping page %d\n", got);
+	return -1;
+}
+
+/* Describe the pagemap entry @pe (start address, page count) as @iov */
+void pagemap2iovec(PagemapEntry *pe, struct iovec *iov)
+{
+	iov->iov_len = pe->nr_pages * PAGE_SIZE;
+	iov->iov_base = decode_pointer(pe->vaddr);
+}
+
+/* Store the range described by @iov into @pe as start address and page count */
+void iovec2pagemap(struct iovec *iov, PagemapEntry *pe)
+{
+	pe->nr_pages = iov->iov_len / PAGE_SIZE;
+	pe->vaddr = encode_pointer(iov->iov_base);
+}
+
+/*
+ * Read the next pagemap entry from pr->pmi.
+ *
+ * On success the entry is remembered in pr->pe, converted into @iov and
+ * pr->cvaddr is set to the start of the entry. An entry marked in_parent
+ * is an error when this reader has no parent.
+ *
+ * Returns 1 on success, -1 for an in_parent entry without parent,
+ * otherwise the non-positive result of pb_read_one_eof().
+ */
+static int get_pagemap(struct page_read *pr, struct iovec *iov)
+{
+	PagemapEntry *entry;
+	int rc;
+
+	rc = pb_read_one_eof(pr->pmi, &entry, PB_PAGEMAP);
+	if (rc <= 0)
+		return rc;
+
+	pagemap2iovec(entry, iov);
+	pr->pe = entry;
+	pr->cvaddr = (unsigned long)iov->iov_base;
+
+	if (!entry->in_parent || pr->parent)
+		return 1;
+
+	pr_err("No parent for snapshot pagemap\n");
+	return -1;
+}
+
+/*
+ * Release the pagemap entry held in pr->pe.
+ *
+ * The pointer is cleared afterwards: seek_pagemap_page() tests pr->pe on
+ * entry, so a stale pointer left behind after get_pagemap() hits the end
+ * of the image would make the next seek read freed memory.
+ */
+static void put_pagemap(struct page_read *pr)
+{
+	if (!pr->pe)
+		return;
+
+	pagemap_entry__free_unpacked(pr->pe, NULL);
+	pr->pe = NULL;
+}
+
+static int read_pagemap_page(struct page_read *pr, unsigned long vaddr, int nr, void *buf);
+
+/*
+ * Advance the reader by @len bytes within the current pagemap entry.
+ * For entries that are not in_parent the pages image is seeked forward
+ * too; pr->cvaddr is always moved.
+ */
+static void skip_pagemap_pages(struct page_read *pr, unsigned long len)
+{
+	if (len) {
+		pr_debug("\tpr%u Skip %lu bytes from page-dump\n", pr->id, len);
+
+		if (!pr->pe->in_parent)
+			lseek(img_raw_fd(pr->pi), len, SEEK_CUR);
+
+		pr->cvaddr += len;
+	}
+}
+
+/*
+ * Position the reader @pr at address @vaddr.
+ *
+ * Pagemap entries are consumed in order until one is found whose range
+ * contains @vaddr; pages in front of @vaddr are skipped.
+ *
+ * Returns 1 when the reader now stands at @vaddr, 0 when @vaddr lies
+ * below the current position (an error is logged if @warn is set) and
+ * otherwise whatever non-positive value get_pagemap() returned.
+ */
+int seek_pagemap_page(struct page_read *pr, unsigned long vaddr, bool warn)
+{
+ int ret;
+ struct iovec iov;
+
+ /* No entry loaded yet: jump straight to reading the first one */
+ if (pr->pe)
+ pagemap2iovec(pr->pe, &iov);
+ else
+ goto new_pagemap;
+
+ while (1) {
+ unsigned long iov_end;
+
+ /* The reader only moves forward; an address behind us can't be found */
+ if (vaddr < pr->cvaddr) {
+ if (warn)
+ pr_err("Missing %lx in parent pagemap, current iov: base=%lx,len=%zu\n",
+ vaddr, (unsigned long)iov.iov_base, iov.iov_len);
+ return 0;
+ }
+ iov_end = (unsigned long)iov.iov_base + iov.iov_len;
+
+ /* Current entry ends before vaddr: skip its remainder and load the next */
+ if (iov_end <= vaddr) {
+ skip_pagemap_pages(pr, iov_end - pr->cvaddr);
+ put_pagemap(pr);
+new_pagemap:
+ ret = get_pagemap(pr, &iov);
+ if (ret <= 0)
+ return ret;
+
+ continue;
+ }
+
+ /* vaddr is inside the current entry: skip up to it */
+ skip_pagemap_pages(pr, vaddr - pr->cvaddr);
+ return 1;
+ }
+}
+
+/*
+ * Make sure that @nr pages starting at @vaddr lie completely inside the
+ * pagemap entry @pe; log the mismatch and BUG() otherwise.
+ */
+static inline void pagemap_bound_check(PagemapEntry *pe, unsigned long vaddr, int nr)
+{
+	if (vaddr >= pe->vaddr &&
+	    (vaddr - pe->vaddr) / PAGE_SIZE + nr <= pe->nr_pages)
+		return;
+
+	pr_err("Page read err %"PRIx64":%u vs %lx:%u\n",
+	       pe->vaddr, pe->nr_pages, vaddr, nr);
+	BUG();
+}
+
+/*
+ * Read @nr pages starting at @vaddr into @buf.
+ *
+ * The range must lie inside the current pagemap entry of @pr. When that
+ * entry is marked in_parent the data is fetched from the parent reader,
+ * possibly in several pieces; otherwise it is read from the pages image
+ * of @pr itself and, with opts.auto_dedup set, punch_hole() is called
+ * for the range that was just read.
+ *
+ * Returns 1 on success, -1 on failure.
+ */
+static int read_pagemap_page(struct page_read *pr, unsigned long vaddr, int nr, void *buf)
+{
+ int ret;
+ unsigned long len = nr * PAGE_SIZE;
+
+ pr_info("pr%u Read %lx %u pages\n", pr->id, vaddr, nr);
+ pagemap_bound_check(pr->pe, vaddr, nr);
+
+ if (pr->pe->in_parent) {
+ struct page_read *ppr = pr->parent;
+
+ /*
+ * Parent pagemap at this point entry may be shorter
+ * than the current vaddr:nr needs, so we have to
+ * carefully 'split' the vaddr:nr into pieces and go
+ * to parent page-read with the longest requests it
+ * can handle.
+ */
+
+ do {
+ int p_nr;
+
+ pr_debug("\tpr%u Read from parent\n", pr->id);
+ ret = seek_pagemap_page(ppr, vaddr, true);
+ if (ret <= 0)
+ return -1;
+
+ /*
+ * This is how many pages we have in the parent
+ * page_read starting from vaddr. Go ahead and
+ * read as much as we can.
+ */
+ p_nr = ppr->pe->nr_pages - (vaddr - ppr->pe->vaddr) / PAGE_SIZE;
+ pr_info("\tparent has %u pages in\n", p_nr);
+ if (p_nr > nr)
+ p_nr = nr;
+
+ /* The parent may itself delegate to its own parent */
+ ret = read_pagemap_page(ppr, vaddr, p_nr, buf);
+ if (ret == -1)
+ return ret;
+
+ /*
+ * OK, let's see how much data we have left and go
+ * to parent page-read again for the next pagemap
+ * entry.
+ */
+ nr -= p_nr;
+ vaddr += p_nr * PAGE_SIZE;
+ buf += p_nr * PAGE_SIZE;
+ } while (nr);
+ } else {
+ int fd = img_raw_fd(pr->pi);
+ /* Despite its name this is the current offset in the pages image */
+ off_t current_vaddr = lseek(fd, 0, SEEK_CUR);
+
+ pr_debug("\tpr%u Read page from self %lx/%"PRIx64"\n", pr->id, pr->cvaddr, current_vaddr);
+ ret = read(fd, buf, len);
+ if (ret != len) {
+ pr_perror("Can't read mapping page %d", ret);
+ return -1;
+ }
+
+ if (opts.auto_dedup) {
+ ret = punch_hole(pr, current_vaddr, len, false);
+ if (ret == -1) {
+ return -1;
+ }
+ }
+ }
+
+ /* len was computed from the original nr, so the whole request is accounted */
+ pr->cvaddr += len;
+
+ return 1;
+}
+
+/*
+ * Release everything held by the page reader @pr: the pending punch
+ * bunch (if any), the parent reader chain and both images.
+ *
+ * A failure to flush the pending bunch is reported but does not stop the
+ * cleanup; bailing out early would leak the parent reader and the images.
+ */
+static void close_page_read(struct page_read *pr)
+{
+	if (pr->bunch.iov_len > 0) {
+		if (punch_hole(pr, 0, 0, true) == -1)
+			pr_err("Can't punch the pending hole on close\n");
+
+		pr->bunch.iov_len = 0;
+	}
+
+	if (pr->parent) {
+		close_page_read(pr->parent);
+		xfree(pr->parent);
+		pr->parent = NULL;
+	}
+
+	close_image(pr->pmi);
+	if (pr->pi)
+		close_image(pr->pi);
+}
+
+/*
+ * Open the page reader for the parent images, if there are any.
+ *
+ * CR_PARENT_LINK is looked up relative to @dfd. A missing link, or a
+ * parent for which open_page_read_at() returns 0, is not an error:
+ * pr->parent is then set to NULL. Any other failure to open the link is
+ * reported right away instead of being passed on as an invalid
+ * descriptor.
+ *
+ * Returns 0 on success (pr->parent set, possibly to NULL), -1 on error.
+ */
+static int try_open_parent(int dfd, int pid, struct page_read *pr, int pr_flags)
+{
+	int pfd, ret;
+	struct page_read *parent = NULL;
+
+	pfd = openat(dfd, CR_PARENT_LINK, O_RDONLY);
+	if (pfd < 0) {
+		if (errno == ENOENT)
+			goto out;
+
+		pr_perror("Can't open %s", CR_PARENT_LINK);
+		return -1;
+	}
+
+	parent = xmalloc(sizeof(*parent));
+	if (!parent)
+		goto err_cl;
+
+	ret = open_page_read_at(pfd, pid, parent, pr_flags);
+	if (ret < 0)
+		goto err_free;
+
+	if (!ret) {
+		/* Nothing to read from the parent for this pid */
+		xfree(parent);
+		parent = NULL;
+	}
+
+	close(pfd);
+out:
+	pr->parent = parent;
+	return 0;
+
+err_free:
+	xfree(parent);
+err_cl:
+	close(pfd);
+	return -1;
+}
+
+/*
+ * Set up the page reader @pr for task or shmem @pid, with images looked
+ * up relative to directory descriptor @dfd.
+ *
+ * The pagemap image is tried first. If it is empty the old-format image
+ * is used instead and @pr gets the page-by-page callbacks. @pr_flags
+ * selects the image type (PR_TASK or PR_SHMEM) and, with PR_MOD, opens
+ * the images read-write; opts.auto_dedup forces PR_MOD.
+ *
+ * Returns 1 when the reader is ready, 0 when the old-format image is
+ * empty as well, -1 on error.
+ */
+int open_page_read_at(int dfd, int pid, struct page_read *pr, int pr_flags)
+{
+ int flags, i_typ, i_typ_o;
+ /* Source of reader ids, used in log messages only */
+ static unsigned ids = 1;
+
+ if (opts.auto_dedup)
+ pr_flags |= PR_MOD;
+ if (pr_flags & PR_MOD)
+ flags = O_RDWR;
+ else
+ flags = O_RSTR;
+
+ switch (pr_flags & PR_TYPE_MASK) {
+ case PR_TASK:
+ i_typ = CR_FD_PAGEMAP;
+ i_typ_o = CR_FD_PAGES_OLD;
+ break;
+ case PR_SHMEM:
+ i_typ = CR_FD_SHMEM_PAGEMAP;
+ i_typ_o = CR_FD_SHM_PAGES_OLD;
+ break;
+ default:
+ BUG();
+ return -1;
+ }
+
+ pr->pe = NULL;
+ pr->parent = NULL;
+ pr->bunch.iov_len = 0;
+ pr->bunch.iov_base = NULL;
+
+ pr->pmi = open_image_at(dfd, i_typ, O_RSTR, (long)pid);
+ if (!pr->pmi)
+ return -1;
+
+ if (empty_image(pr->pmi)) {
+ close_image(pr->pmi);
+ goto open_old;
+ }
+
+ /* Only non-shmem pagemaps get a parent reader */
+ if ((i_typ != CR_FD_SHMEM_PAGEMAP) && try_open_parent(dfd, pid, pr, pr_flags)) {
+ close_image(pr->pmi);
+ return -1;
+ }
+
+ pr->pi = open_pages_image_at(dfd, flags, pr->pmi);
+ if (!pr->pi) {
+ /* Releases the parent and pmi; pr->pi is NULL and thus skipped */
+ close_page_read(pr);
+ return -1;
+ }
+
+ pr->get_pagemap = get_pagemap;
+ pr->put_pagemap = put_pagemap;
+ pr->read_pages = read_pagemap_page;
+ pr->close = close_page_read;
+ pr->id = ids++;
+
+ pr_debug("Opened page read %u (parent %u)\n",
+ pr->id, pr->parent ? pr->parent->id : 0);
+
+ return 1;
+
+open_old:
+ pr->pmi = open_image_at(dfd, i_typ_o, flags, pid);
+ if (!pr->pmi)
+ return -1;
+
+ if (empty_image(pr->pmi)) {
+ close_image(pr->pmi);
+ return 0;
+ }
+
+ /* Old format: addresses and pages come from the single pmi image */
+ pr->get_pagemap = get_page_vaddr;
+ pr->put_pagemap = NULL;
+ pr->read_pages = read_page;
+ pr->pi = NULL;
+ pr->close = close_page_read;
+
+ return 1;
+}
+
+/*
+ * Same as open_page_read_at(), with the images looked up relative to the
+ * IMG_FD_OFF service descriptor.
+ */
+int open_page_read(int pid, struct page_read *pr, int pr_flags)
+{
+	int img_dfd = get_service_fd(IMG_FD_OFF);
+
+	return open_page_read_at(img_dfd, pid, pr, pr_flags);
+}
diff --git a/criu/page-xfer.c b/criu/page-xfer.c
new file mode 100644
index 000000000000..eee8f5f17992
--- /dev/null
+++ b/criu/page-xfer.c
@@ -0,0 +1,880 @@
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <linux/falloc.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/stat.h>
+
+#include "cr_options.h"
+#include "servicefd.h"
+#include "image.h"
+#include "page-xfer.h"
+#include "page-pipe.h"
+#include "util.h"
+#include "protobuf.h"
+#include "protobuf/pagemap.pb-c.h"
+
+/*
+ * Command header exchanged with the page server. It is written to and
+ * read from the socket as a raw structure.
+ */
+struct page_server_iov {
+ u32 cmd; /* one of the PS_IOV_* commands */
+ u32 nr_pages; /* length of the range, in pages */
+ u64 vaddr; /* start of the range (encoded pointer) */
+ u64 dst_id; /* image type and id, see encode_pm_id() */
+};
+
+/* Describe the range carried by the page server header @ps as @iov */
+static void psi2iovec(struct page_server_iov *ps, struct iovec *iov)
+{
+	iov->iov_len = ps->nr_pages * PAGE_SIZE;
+	iov->iov_base = decode_pointer(ps->vaddr);
+}
+
+/* Put the range described by @iov into the page server header @ps */
+static void iovec2psi(struct iovec *iov, struct page_server_iov *ps)
+{
+	ps->nr_pages = iov->iov_len / PAGE_SIZE;
+	ps->vaddr = encode_pointer(iov->iov_base);
+}
+
+static int open_page_local_xfer(struct page_xfer *xfer, int fd_type, long id);
+
+/*
+ * Page server commands (page_server_iov.cmd), dispatched in
+ * page_server_serve():
+ *   ADD    - a pagemap entry followed by the page data
+ *   HOLE   - a hole entry, no data follows
+ *   OPEN   - open the images for dst_id, no reply
+ *   OPEN2  - same as OPEN, the server replies with a has_parent byte
+ *   PARENT - ask whether a parent image exists for dst_id
+ */
+#define PS_IOV_ADD 1
+#define PS_IOV_HOLE 2
+#define PS_IOV_OPEN 3
+#define PS_IOV_OPEN2 4
+#define PS_IOV_PARENT 5
+
+/* Both are answered with a status word; FLUSH_N_CLOSE also ends the session */
+#define PS_IOV_FLUSH 0x1023
+#define PS_IOV_FLUSH_N_CLOSE 0x1024
+
+/* The low PS_TYPE_BITS bits of dst_id hold the image type, the rest the id */
+#define PS_TYPE_BITS 8
+#define PS_TYPE_MASK ((1 << PS_TYPE_BITS) - 1)
+
+/* Pack image @type (low PS_TYPE_BITS bits) and @id (remaining bits) into a dst_id */
+static inline u64 encode_pm_id(int type, long id)
+{
+	u64 shifted_id = ((u64)id) << PS_TYPE_BITS;
+
+	return shifted_id | type;
+}
+
+/* Extract the image type stored in the low bits of @dst_id */
+static int decode_pm_type(u64 dst_id)
+{
+	int type = dst_id & PS_TYPE_MASK;
+
+	return type;
+}
+
+/* Extract the id stored above the type bits of @dst_id */
+static long decode_pm_id(u64 dst_id)
+{
+	u64 id_bits = dst_id >> PS_TYPE_BITS;
+
+	return (long)id_bits;
+}
+
+/* State of the transfer the page server is currently handling */
+struct page_xfer_job {
+ u64 dst_id; /* id of the opened local xfer, ~0 when none is open */
+ int p[2]; /* pipe the socket data is spliced through */
+ unsigned pipe_size; /* result of F_GETPIPE_SZ on p[0] */
+ struct page_xfer loc_xfer; /* local xfer writing the images */
+};
+
+/* The server handles a single job at a time */
+static struct page_xfer_job cxfer = {
+ .dst_id = ~0,
+};
+
+/*
+ * Close the local xfer of the current job, if one is open.
+ *
+ * The job id is reset afterwards so that calling this again (it is
+ * called both before every open and at the end of the session) does not
+ * close the same xfer a second time.
+ */
+static void page_server_close(void)
+{
+	if (cxfer.dst_id == ~0)
+		return;
+
+	cxfer.loc_xfer.close(&cxfer.loc_xfer);
+	cxfer.dst_id = ~0;
+}
+
+static void close_page_xfer(struct page_xfer *xfer);
+/*
+ * Handle an open request: close the previous local xfer and open the
+ * one selected by pi->dst_id.
+ *
+ * When @sk is not negative a single has_parent byte is sent back through
+ * it. cxfer.dst_id identifies an open xfer only while one really is
+ * open: it is reset before the new open is attempted and again when the
+ * reply cannot be sent, so the close at the end of the session never
+ * touches an xfer that has been closed already.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+static int page_server_open(int sk, struct page_server_iov *pi)
+{
+	int type;
+	long id;
+
+	type = decode_pm_type(pi->dst_id);
+	id = decode_pm_id(pi->dst_id);
+	pr_info("Opening %d/%ld\n", type, id);
+
+	page_server_close();
+	cxfer.dst_id = ~0;
+
+	if (open_page_local_xfer(&cxfer.loc_xfer, type, id))
+		return -1;
+
+	cxfer.dst_id = pi->dst_id;
+
+	if (sk >= 0) {
+		char has_parent = !!cxfer.loc_xfer.parent;
+
+		if (write(sk, &has_parent, 1) != 1) {
+			pr_perror("Unable to send reponse");
+			close_page_xfer(&cxfer.loc_xfer);
+			cxfer.dst_id = ~0;
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Make sure the local xfer for pi->dst_id is the one currently open.
+ * A request for a different id without a preceding open command is
+ * still served, with a warning, by opening the xfer implicitly.
+ *
+ * Returns 0 on success, -1 when the implicit open fails.
+ */
+static int prep_loc_xfer(struct page_server_iov *pi)
+{
+	if (cxfer.dst_id == pi->dst_id)
+		return 0;
+
+	pr_warn("Deprecated IO w/o open\n");
+	return page_server_open(-1, pi);
+}
+
+/*
+ * Handle PS_IOV_ADD: write the pagemap entry described by @pi and move
+ * the page data that follows it on @sk into the local images.
+ *
+ * The data is spliced from the socket into the transfer pipe in pieces
+ * of at most cxfer.pipe_size bytes and from there into the pages image.
+ * A splice() that returns 0 means the peer closed the connection before
+ * sending everything; this is an error, not a reason to retry forever.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+static int page_server_add(int sk, struct page_server_iov *pi)
+{
+	size_t len;
+	struct page_xfer *lxfer = &cxfer.loc_xfer;
+	struct iovec iov;
+
+	pr_debug("Adding %"PRIx64"/%u\n", pi->vaddr, pi->nr_pages);
+
+	if (prep_loc_xfer(pi))
+		return -1;
+
+	psi2iovec(pi, &iov);
+	if (lxfer->write_pagemap(lxfer, &iov))
+		return -1;
+
+	len = iov.iov_len;
+	while (len > 0) {
+		ssize_t chunk;
+
+		chunk = len;
+		if (chunk > cxfer.pipe_size)
+			chunk = cxfer.pipe_size;
+
+		chunk = splice(sk, NULL, cxfer.p[1], NULL, chunk, SPLICE_F_MOVE | SPLICE_F_NONBLOCK);
+		if (chunk < 0) {
+			pr_perror("Can't read from socket");
+			return -1;
+		}
+		if (chunk == 0) {
+			pr_err("Unexpected end of page data, %zu bytes missing\n", len);
+			return -1;
+		}
+
+		if (lxfer->write_pages(lxfer, cxfer.p[0], chunk))
+			return -1;
+
+		len -= chunk;
+	}
+
+	return 0;
+}
+
+/*
+ * Handle PS_IOV_HOLE: record the hole described by @pi in the local
+ * images. @sk is not used.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+static int page_server_hole(int sk, struct page_server_iov *pi)
+{
+	struct iovec hole;
+
+	pr_debug("Adding %"PRIx64"/%u hole\n", pi->vaddr, pi->nr_pages);
+
+	if (prep_loc_xfer(pi))
+		return -1;
+
+	psi2iovec(pi, &hole);
+
+	return cxfer.loc_xfer.write_hole(&cxfer.loc_xfer, &hole) ? -1 : 0;
+}
+
+static int page_server_check_parent(int sk, struct page_server_iov *pi);
+
+/*
+ * Serve one page server session on the connected socket @sk.
+ *
+ * Command headers are read from @sk and dispatched until the peer closes
+ * the connection, a command fails or PS_IOV_FLUSH_N_CLOSE is received.
+ * The session is successful only if the last command handled was a
+ * flush. @sk and the transfer pipe created here are closed before
+ * returning.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+static int page_server_serve(int sk)
+{
+	int ret = -1;
+	bool flushed = false;
+
+	/*
+	 * This socket only accepts data except one thing -- it
+	 * writes back the has_parent bit from time to time, so
+	 * make it NODELAY all the time.
+	 */
+	tcp_nodelay(sk, true);
+
+	if (pipe(cxfer.p)) {
+		pr_perror("Can't make pipe for xfer");
+		close(sk);
+		return -1;
+	}
+
+	cxfer.pipe_size = fcntl(cxfer.p[0], F_GETPIPE_SZ, 0);
+	pr_debug("Created xfer pipe size %u\n", cxfer.pipe_size);
+
+	while (1) {
+		struct page_server_iov pi;
+
+		ret = recv(sk, &pi, sizeof(pi), MSG_WAITALL);
+		if (!ret)
+			break;
+
+		if (ret != sizeof(pi)) {
+			pr_perror("Can't read pagemap from socket");
+			ret = -1;
+			break;
+		}
+
+		/* Any command following a flush makes the data unflushed again */
+		flushed = false;
+
+		switch (pi.cmd) {
+		case PS_IOV_OPEN:
+			ret = page_server_open(-1, &pi);
+			break;
+		case PS_IOV_OPEN2:
+			ret = page_server_open(sk, &pi);
+			break;
+		case PS_IOV_PARENT:
+			ret = page_server_check_parent(sk, &pi);
+			break;
+		case PS_IOV_ADD:
+			ret = page_server_add(sk, &pi);
+			break;
+		case PS_IOV_HOLE:
+			ret = page_server_hole(sk, &pi);
+			break;
+		case PS_IOV_FLUSH:
+		case PS_IOV_FLUSH_N_CLOSE:
+		{
+			int32_t status = 0;
+
+			ret = 0;
+
+			/*
+			 * An answer must be sent back to inform another side,
+			 * that all data were received
+			 */
+			if (write(sk, &status, sizeof(status)) != sizeof(status)) {
+				pr_perror("Can't send the final package");
+				ret = -1;
+			}
+
+			flushed = true;
+			break;
+		}
+		default:
+			pr_err("Unknown command %u\n", pi.cmd);
+			ret = -1;
+			break;
+		}
+
+		if (ret || (pi.cmd == PS_IOV_FLUSH_N_CLOSE))
+			break;
+	}
+
+	if (!ret && !flushed) {
+		pr_err("The data were not flushed\n");
+		ret = -1;
+	}
+
+	if (ret == 0 && opts.ps_socket == -1) {
+		char c;
+
+		/*
+		 * Wait when a remote side closes the connection
+		 * to avoid TIME_WAIT bucket
+		 */
+
+		if (read(sk, &c, sizeof(c)) != 0) {
+			pr_perror("Unexpected data");
+			ret = -1;
+		}
+	}
+
+	page_server_close();
+	pr_info("Session over\n");
+
+	/* The pipe belongs to this session only, don't leave it open */
+	close_safe(&cxfer.p[0]);
+	close_safe(&cxfer.p[1]);
+
+	close(sk);
+	return ret;
+}
+
+/*
+ * Fill @addr with the page server address taken from opts.addr and
+ * opts.port. Without opts.addr the wildcard address is used.
+ *
+ * Returns 0 on success, -1 when opts.addr is not a valid IPv4 address.
+ */
+static int get_sockaddr_in(struct sockaddr_in *addr)
+{
+	memset(addr, 0, sizeof(*addr));
+	addr->sin_family = AF_INET;
+
+	if (!opts.addr)
+		addr->sin_addr.s_addr = INADDR_ANY;
+	else if (!inet_aton(opts.addr, &addr->sin_addr)) {
+		/* inet_aton() doesn't set errno, so there's nothing for perror to show */
+		pr_err("Bad page server address\n");
+		return -1;
+	}
+
+	addr->sin_port = opts.port;
+	return 0;
+}
+
+/*
+ * Run the page server.
+ *
+ * With opts.ps_socket set the session is served on that already
+ * connected socket. Otherwise a TCP socket is bound to the configured
+ * address, put into listening state and a single connection is accepted.
+ *
+ * In @daemon_mode cr_daemon() is called first. The parent side gets a
+ * positive value back, optionally writes the pidfile and returns that
+ * value; the other side serves the session and exits with its result.
+ *
+ * Returns -1 on error, otherwise the value described above or the
+ * result of page_server_serve().
+ */
+int cr_page_server(bool daemon_mode, int cfd)
+{
+ int sk = -1, ask = -1, ret;
+ struct sockaddr_in saddr, caddr;
+ socklen_t slen = sizeof(saddr);
+ socklen_t clen = sizeof(caddr);
+
+ up_page_ids_base();
+
+ if (opts.ps_socket != -1) {
+ ret = 0;
+ ask = opts.ps_socket;
+ pr_info("Re-using ps socket %d\n", ask);
+ goto no_server;
+ }
+
+ pr_info("Starting page server on port %u\n", (int)ntohs(opts.port));
+
+ sk = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP);
+ if (sk < 0) {
+ pr_perror("Can't init page server");
+ return -1;
+ }
+
+ if (get_sockaddr_in(&saddr))
+ goto out;
+
+ if (bind(sk, (struct sockaddr *)&saddr, slen)) {
+ pr_perror("Can't bind page server");
+ goto out;
+ }
+
+ if (listen(sk, 1)) {
+ pr_perror("Can't listen on page server socket");
+ goto out;
+ }
+
+ /* Get socket port in case of autobind */
+ if (opts.port == 0) {
+ if (getsockname(sk, (struct sockaddr *)&saddr, &slen)) {
+ pr_perror("Can't get page server name");
+ goto out;
+ }
+
+ /*
+ * NOTE(review): opts.port is stored in host byte order here, while
+ * other places pass it through ntohs() or copy it straight into
+ * sin_port -- confirm that this is intended.
+ */
+ opts.port = ntohs(saddr.sin_port);
+ pr_info("Using %u port\n", opts.port);
+ }
+
+no_server:
+ if (daemon_mode) {
+ ret = cr_daemon(1, 0, &ask, cfd);
+ if (ret == -1) {
+ pr_err("Can't run in the background\n");
+ goto out;
+ }
+ if (ret > 0) { /* parent task, daemon started */
+ close_safe(&sk);
+ if (opts.pidfile) {
+ if (write_pidfile(ret) == -1) {
+ pr_perror("Can't write pidfile");
+ /* Don't leave a daemon behind that nobody can find */
+ kill(ret, SIGKILL);
+ waitpid(ret, NULL, 0);
+ return -1;
+ }
+ }
+
+ return ret;
+ }
+ }
+
+ /* sk is valid only when we listen ourselves, i.e. without ps_socket */
+ if (sk >= 0) {
+ ret = ask = accept(sk, (struct sockaddr *)&caddr, &clen);
+ if (ask < 0)
+ pr_perror("Can't accept connection to server");
+ else
+ pr_info("Accepted connection from %s:%u\n",
+ inet_ntoa(caddr.sin_addr),
+ (int)ntohs(caddr.sin_port));
+ close(sk);
+ }
+
+ if (ask >= 0)
+ ret = page_server_serve(ask);
+
+ if (daemon_mode)
+ exit(ret);
+
+ return ret;
+
+out:
+ close(sk);
+ return -1;
+}
+
+static int page_server_sk = -1;
+
+/*
+ * Connect to the page server if opts.use_page_server is set.
+ *
+ * An already connected opts.ps_socket is re-used, otherwise a TCP
+ * connection to the configured address is made. On success the socket is
+ * kept in page_server_sk and corked. When the address is invalid or the
+ * connection fails the freshly created socket is closed again and
+ * page_server_sk goes back to -1, so no descriptor is leaked.
+ *
+ * Returns 0 on success (or when no page server is used), -1 on error.
+ */
+int connect_to_page_server(void)
+{
+	struct sockaddr_in saddr;
+
+	if (!opts.use_page_server)
+		return 0;
+
+	if (opts.ps_socket != -1) {
+		page_server_sk = opts.ps_socket;
+		pr_info("Re-using ps socket %d\n", page_server_sk);
+		goto out;
+	}
+
+	pr_info("Connecting to server %s:%u\n",
+		opts.addr, (int)ntohs(opts.port));
+
+	page_server_sk = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP);
+	if (page_server_sk < 0) {
+		pr_perror("Can't create socket");
+		return -1;
+	}
+
+	if (get_sockaddr_in(&saddr))
+		goto err;
+
+	if (connect(page_server_sk, (struct sockaddr *)&saddr, sizeof(saddr)) < 0) {
+		pr_perror("Can't connect to server");
+		goto err;
+	}
+
+out:
+	/*
+	 * CORK the socket at the very beginning. As per ANK
+	 * the corked by default socket with sporadic NODELAY-s
+	 * on urgent data is the smartest mode ever.
+	 */
+	tcp_cork(page_server_sk, true);
+	return 0;
+
+err:
+	close_safe(&page_server_sk);
+	return -1;
+}
+
+/*
+ * Finish the conversation with the page server: send the final flush
+ * command, wait for the status word and close the socket.
+ *
+ * Returns 0 when no page server connection exists, -1 when the command
+ * cannot be sent or no answer arrives, otherwise the status received
+ * from the server.
+ */
+int disconnect_from_page_server(void)
+{
+	struct page_server_iov pi = { };
+	int32_t status = -1;
+	int ret = -1;
+
+	if (!opts.use_page_server || page_server_sk == -1)
+		return 0;
+
+	pr_info("Disconnect from the page server %s:%u\n",
+		opts.addr, (int)ntohs(opts.port));
+
+	/*
+	 * A re-used socket might not get closed (held by the parent
+	 * process), so in that case the page server is ordered to
+	 * terminate itself.
+	 */
+	pi.cmd = (opts.ps_socket != -1) ? PS_IOV_FLUSH_N_CLOSE : PS_IOV_FLUSH;
+
+	if (write(page_server_sk, &pi, sizeof(pi)) != sizeof(pi))
+		pr_perror("Can't write the fini command to server");
+	else if (read(page_server_sk, &status, sizeof(status)) != sizeof(status))
+		pr_perror("The page server doesn't answer");
+	else
+		ret = 0;
+
+	close_safe(&page_server_sk);
+	return ret ? ret : status;
+}
+
+/*
+ * Announce the page range @iov to the page server with a PS_IOV_ADD
+ * header. Returns 0 on success, -1 when the header cannot be written.
+ */
+static int write_pagemap_to_server(struct page_xfer *xfer,
+		struct iovec *iov)
+{
+	struct page_server_iov hdr;
+
+	iovec2psi(iov, &hdr);
+	hdr.cmd = PS_IOV_ADD;
+	hdr.dst_id = xfer->dst_id;
+
+	if (write(xfer->sk, &hdr, sizeof(hdr)) == sizeof(hdr))
+		return 0;
+
+	pr_perror("Can't write pagemap to server");
+	return -1;
+}
+
+/*
+ * Splice @len bytes of page data from the pipe end @p into the page
+ * server socket. Anything but a complete transfer is an error.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+static int write_pages_to_server(struct page_xfer *xfer,
+		int p, unsigned long len)
+{
+	ssize_t moved;
+
+	pr_debug("Splicing %lu bytes / %lu pages into socket\n", len, len / PAGE_SIZE);
+
+	moved = splice(p, NULL, xfer->sk, NULL, len, SPLICE_F_MOVE);
+	if (moved == len)
+		return 0;
+
+	pr_perror("Can't write pages to socket");
+	return -1;
+}
+
+/*
+ * Announce the hole @iov to the page server with a PS_IOV_HOLE header.
+ * Returns 0 on success, -1 when the header cannot be written.
+ */
+static int write_hole_to_server(struct page_xfer *xfer, struct iovec *iov)
+{
+	struct page_server_iov hdr;
+
+	iovec2psi(iov, &hdr);
+	hdr.cmd = PS_IOV_HOLE;
+	hdr.dst_id = xfer->dst_id;
+
+	if (write(xfer->sk, &hdr, sizeof(hdr)) == sizeof(hdr))
+		return 0;
+
+	pr_perror("Can't write pagehole to server");
+	return -1;
+}
+
+/*
+ * Close callback of a page server xfer. The socket is only forgotten,
+ * not closed: xfer->sk is a copy of page_server_sk, which is closed by
+ * disconnect_from_page_server().
+ */
+static void close_server_xfer(struct page_xfer *xfer)
+{
+ xfer->sk = -1;
+}
+
+/*
+ * Set up @xfer to send pages to the page server and open the
+ * destination (@fd_type, @id) there with PS_IOV_OPEN2.
+ *
+ * The server answers with one byte telling whether parent images exist;
+ * if so xfer->parent is set to a non-NULL marker value.
+ *
+ * Returns 0 on success, -1 when the server cannot be talked to.
+ */
+static int open_page_server_xfer(struct page_xfer *xfer, int fd_type, long id)
+{
+	struct page_server_iov req;
+	char parent_flag;
+
+	xfer->sk = page_server_sk;
+	xfer->dst_id = encode_pm_id(fd_type, id);
+	xfer->parent = NULL;
+
+	xfer->write_pagemap = write_pagemap_to_server;
+	xfer->write_pages = write_pages_to_server;
+	xfer->write_hole = write_hole_to_server;
+	xfer->close = close_server_xfer;
+
+	req.cmd = PS_IOV_OPEN2;
+	req.nr_pages = 0;
+	req.vaddr = 0;
+	req.dst_id = xfer->dst_id;
+
+	if (write(xfer->sk, &req, sizeof(req)) != sizeof(req)) {
+		pr_perror("Can't write to page server");
+		return -1;
+	}
+
+	/* Push the command NOW */
+	tcp_nodelay(xfer->sk, true);
+
+	if (read(xfer->sk, &parent_flag, 1) != 1) {
+		pr_perror("The page server doesn't answer");
+		return -1;
+	}
+
+	/* This is required for generate_iovs() */
+	if (parent_flag)
+		xfer->parent = (void *) 1;
+
+	return 0;
+}
+
+/*
+ * Write a pagemap entry for @iov into the local pagemap image. With
+ * opts.auto_dedup and a parent reader available the range is passed to
+ * dedup_one_iovec() for the parent first.
+ *
+ * Returns the result of pb_write_one(), or -1 when deduplication fails.
+ */
+static int write_pagemap_loc(struct page_xfer *xfer,
+		struct iovec *iov)
+{
+	PagemapEntry pe = PAGEMAP_ENTRY__INIT;
+
+	iovec2pagemap(iov, &pe);
+
+	if (opts.auto_dedup && xfer->parent != NULL &&
+	    dedup_one_iovec(xfer->parent, iov) == -1) {
+		pr_perror("Auto-deduplication failed");
+		return -1;
+	}
+
+	return pb_write_one(xfer->pmi, &pe, PB_PAGEMAP);
+}
+
+/*
+ * Splice @len bytes of page data from the pipe end @p into the local
+ * pages image. A short transfer is treated as an error.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+static int write_pages_loc(struct page_xfer *xfer,
+		int p, unsigned long len)
+{
+	ssize_t ret;
+
+	ret = splice(p, NULL, img_raw_fd(xfer->pi), NULL, len, SPLICE_F_MOVE);
+	if (ret == -1) {
+		pr_perror("Unable to splice data");
+		return -1;
+	}
+	if (ret != len) {
+		/* ret is signed, so it is printed with %zd */
+		pr_err("Only %zd of %lu bytes have been spliced\n", ret, len);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Check that the whole range @iov is covered by pagemap entries of the
+ * parent reader @p. Several parent entries may be needed to cover it.
+ *
+ * Returns 0 when the range is covered, -1 when the parent reader cannot
+ * be positioned at some part of it.
+ */
+static int check_pagehole_in_parent(struct page_read *p, struct iovec *iov)
+{
+ int ret;
+ unsigned long off, end;
+
+ /*
+ * Try to find pagemap entry in parent, from which
+ * the data will be read on restore.
+ *
+ * This is the optimized version of the page-by-page
+ * read_pagemap_page routine.
+ */
+
+ pr_debug("Checking %p/%zu hole\n", iov->iov_base, iov->iov_len);
+ off = (unsigned long)iov->iov_base;
+ end = off + iov->iov_len;
+ while (1) {
+ struct iovec piov;
+ unsigned long pend;
+
+ ret = seek_pagemap_page(p, off, true);
+ if (ret <= 0 || !p->pe)
+ return -1;
+
+ pagemap2iovec(p->pe, &piov);
+ pr_debug("\tFound %p/%zu\n", piov.iov_base, piov.iov_len);
+
+ /*
+ * The pagemap entry in parent may happen to be
+ * shorter than the hole we write. In this case
+ * we should go ahead and check the remainder.
+ */
+
+ pend = (unsigned long)piov.iov_base + piov.iov_len;
+ if (end <= pend)
+ return 0;
+
+ pr_debug("\t\tcontinue on %lx\n", pend);
+ off = pend;
+ }
+}
+
+/*
+ * Write a hole entry (a pagemap entry marked in_parent) for @iov into
+ * the local pagemap image. When a parent reader is available the range
+ * must be found there.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+static int write_pagehole_loc(struct page_xfer *xfer, struct iovec *iov)
+{
+	PagemapEntry pe = PAGEMAP_ENTRY__INIT;
+
+	if (xfer->parent != NULL &&
+	    check_pagehole_in_parent(xfer->parent, iov)) {
+		pr_err("Hole %p/%zu not found in parent\n",
+		       iov->iov_base, iov->iov_len);
+		return -1;
+	}
+
+	iovec2pagemap(iov, &pe);
+	pe.has_in_parent = true;
+	pe.in_parent = true;
+
+	return pb_write_one(xfer->pmi, &pe, PB_PAGEMAP) < 0 ? -1 : 0;
+}
+
+/*
+ * Close callback of a local xfer: closes and frees the parent reader
+ * (if any), then closes the pages and pagemap images.
+ */
+static void close_page_xfer(struct page_xfer *xfer)
+{
+	struct page_read *parent = xfer->parent;
+
+	if (parent != NULL) {
+		parent->close(parent);
+		xfree(parent);
+		xfer->parent = NULL;
+	}
+
+	close_image(xfer->pi);
+	close_image(xfer->pmi);
+}
+
+/*
+ * Send everything collected in the page pipe @pp through @xfer.
+ *
+ * Page iovs and holes are walked together, so that every hole starting
+ * below an iov is written before that iov. @off is subtracted from every
+ * address before it is written; note that the iovs and holes stored in
+ * @pp are modified in place by this.
+ *
+ * Returns 0 on success, -1 as soon as one of the xfer callbacks fails.
+ */
+int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp,
+ unsigned long off)
+{
+ struct page_pipe_buf *ppb;
+ /* Next hole to emit, NULL once all holes are written */
+ struct iovec *hole = NULL;
+
+ pr_debug("Transfering pages:\n");
+
+ if (pp->free_hole)
+ hole = &pp->holes[0];
+
+ list_for_each_entry(ppb, &pp->bufs, l) {
+ int i;
+
+ pr_debug("\tbuf %d/%d\n", ppb->pages_in, ppb->nr_segs);
+
+ for (i = 0; i < ppb->nr_segs; i++) {
+ struct iovec *iov = &ppb->iov[i];
+
+ /* Emit all holes that start below this iov first */
+ while (hole && (hole->iov_base < iov->iov_base)) {
+ BUG_ON(hole->iov_base < (void *)off);
+ hole->iov_base -= off;
+ pr_debug("\th %p [%u]\n", hole->iov_base,
+ (unsigned int)(hole->iov_len / PAGE_SIZE));
+ if (xfer->write_hole(xfer, hole))
+ return -1;
+
+ hole++;
+ if (hole >= &pp->holes[pp->free_hole])
+ hole = NULL;
+ }
+
+ BUG_ON(iov->iov_base < (void *)off);
+ iov->iov_base -= off;
+ pr_debug("\tp %p [%u]\n", iov->iov_base,
+ (unsigned int)(iov->iov_len / PAGE_SIZE));
+
+ /* The entry goes first, the page data from the buffer's pipe follows */
+ if (xfer->write_pagemap(xfer, iov))
+ return -1;
+ if (xfer->write_pages(xfer, ppb->p[0], iov->iov_len))
+ return -1;
+ }
+ }
+
+ /* Holes left over after the last iov */
+ while (hole) {
+ BUG_ON(hole->iov_base < (void *)off);
+ hole->iov_base -= off;
+ pr_debug("\th* %p [%u]\n", hole->iov_base,
+ (unsigned int)(hole->iov_len / PAGE_SIZE));
+ if (xfer->write_hole(xfer, hole))
+ return -1;
+
+ hole++;
+ if (hole >= &pp->holes[pp->free_hole])
+ hole = NULL;
+ }
+
+ return 0;
+}
+
+/*
+ * Set up @xfer to write pages into local images of type @fd_type for
+ * @id.
+ *
+ * For CR_FD_PAGEMAP a page reader for the parent images is opened as
+ * well when CR_PARENT_LINK exists. Failing to open that reader is not
+ * fatal: a message is logged and the xfer works without a parent.
+ *
+ * Returns 0 on success, -1 when the images cannot be opened or memory
+ * for the parent reader cannot be allocated.
+ */
+static int open_page_local_xfer(struct page_xfer *xfer, int fd_type, long id)
+{
+ xfer->pmi = open_image(fd_type, O_DUMP, id);
+ if (!xfer->pmi)
+ return -1;
+
+ xfer->pi = open_pages_image(O_DUMP, xfer->pmi);
+ if (!xfer->pi) {
+ close_image(xfer->pmi);
+ return -1;
+ }
+
+ /*
+ * Open page-read for parent images (if it exists). It will
+ * be used for two things:
+ * 1) when writing a page, those from parent will be dedup-ed
+ * 2) when writing a hole, the respective place would be checked
+ * to exist in parent (either pagemap or hole)
+ */
+ xfer->parent = NULL;
+ if (fd_type == CR_FD_PAGEMAP) {
+ int ret;
+ int pfd;
+
+ pfd = openat(get_service_fd(IMG_FD_OFF), CR_PARENT_LINK, O_RDONLY);
+ if (pfd < 0 && errno == ENOENT)
+ goto out;
+
+ /*
+ * NOTE(review): any other openat() failure is not handled here,
+ * pfd is used below even when it is negative.
+ */
+ xfer->parent = xmalloc(sizeof(*xfer->parent));
+ if (!xfer->parent) {
+ close(pfd);
+ return -1;
+ }
+
+ ret = open_page_read_at(pfd, id, xfer->parent, PR_TASK);
+ if (ret <= 0) {
+ pr_perror("No parent image found, though parent directory is set");
+ xfree(xfer->parent);
+ xfer->parent = NULL;
+ close(pfd);
+ goto out;
+ }
+ close(pfd);
+ }
+
+out:
+ xfer->write_pagemap = write_pagemap_loc;
+ xfer->write_pages = write_pages_loc;
+ xfer->write_hole = write_pagehole_loc;
+ xfer->close = close_page_xfer;
+ return 0;
+}
+
+/*
+ * Open a page xfer for (@fd_type, @id): towards the page server when
+ * opts.use_page_server is set, into local images otherwise.
+ */
+int open_page_xfer(struct page_xfer *xfer, int fd_type, long id)
+{
+	if (!opts.use_page_server)
+		return open_page_local_xfer(xfer, fd_type, id);
+
+	return open_page_server_xfer(xfer, fd_type, id);
+}
+
+/*
+ * Check whether the image of type @fd_type for @id exists in the parent
+ * images directory (CR_PARENT_LINK).
+ *
+ * Return:
+ * 1 - if a parent image exists
+ * 0 - if a parent image doesn't exist
+ * -1 - in error cases
+ */
+int check_parent_local_xfer(int fd_type, int id)
+{
+	char path[PATH_MAX];
+	struct stat st;
+	int ret, pfd;
+
+	pfd = openat(get_service_fd(IMG_FD_OFF), CR_PARENT_LINK, O_RDONLY);
+	if (pfd < 0) {
+		if (errno == ENOENT)
+			return 0;
+
+		/* Report the real problem instead of stat-ing via a bad fd */
+		pr_perror("Can't open %s", CR_PARENT_LINK);
+		return -1;
+	}
+
+	snprintf(path, sizeof(path), imgset_template[fd_type].fmt, id);
+	ret = fstatat(pfd, path, &st, 0);
+	if (ret == -1 && errno != ENOENT) {
+		pr_perror("Unable to stat %s", path);
+		close(pfd);
+		return -1;
+	}
+
+	close(pfd);
+	return (ret == 0);
+}
+
+/*
+ * Handle PS_IOV_PARENT: look up the parent image for pi->dst_id locally
+ * and send the result back through @sk as an int.
+ *
+ * Returns 0 on success, -1 when the check or the reply fails.
+ */
+static int page_server_check_parent(int sk, struct page_server_iov *pi)
+{
+	int has_parent;
+
+	has_parent = check_parent_local_xfer(decode_pm_type(pi->dst_id),
+					     decode_pm_id(pi->dst_id));
+	if (has_parent < 0)
+		return -1;
+
+	if (write(sk, &has_parent, sizeof(has_parent)) == sizeof(has_parent))
+		return 0;
+
+	pr_perror("Unable to send reponse");
+	return -1;
+}
+
+/*
+ * Ask the page server whether a parent image exists for
+ * (@fd_type, @id).
+ *
+ * Returns the int sent back by the server, or -1 when the server cannot
+ * be talked to.
+ */
+static int check_parent_server_xfer(int fd_type, long id)
+{
+	struct page_server_iov req = {};
+	int answer;
+
+	req.cmd = PS_IOV_PARENT;
+	req.dst_id = encode_pm_id(fd_type, id);
+
+	if (write(page_server_sk, &req, sizeof(req)) != sizeof(req)) {
+		pr_perror("Can't write to page server");
+		return -1;
+	}
+
+	tcp_nodelay(page_server_sk, true);
+
+	if (read(page_server_sk, &answer, sizeof(int)) == sizeof(int))
+		return answer;
+
+	pr_perror("The page server doesn't answer");
+	return -1;
+}
+
+/*
+ * Check for a parent image of (@fd_type, @id): on the page server when
+ * opts.use_page_server is set, locally otherwise.
+ */
+int check_parent_page_xfer(int fd_type, long id)
+{
+	if (!opts.use_page_server)
+		return check_parent_local_xfer(fd_type, id);
+
+	return check_parent_server_xfer(fd_type, id);
+}
diff --git a/criu/pagemap-cache.c b/criu/pagemap-cache.c
new file mode 100644
index 000000000000..c2e467b673be
--- /dev/null
+++ b/criu/pagemap-cache.c
@@ -0,0 +1,173 @@
+#include <unistd.h>
+#include <fcntl.h>
+
+#include "pagemap-cache.h"
+#include "compiler.h"
+#include "xmalloc.h"
+#include "util.h"
+#include "log.h"
+#include "vma.h"
+#include "kerndat.h"
+
+#undef LOG_PREFIX
+#define LOG_PREFIX "pagemap-cache: "
+
+/* To carry up to 2M of physical memory */
+#define PMC_SHIFT (21)
+#define PMC_SIZE (1ul << PMC_SHIFT)
+/* Rounds an address down to a PMC_SIZE boundary */
+#define PMC_MASK (~(PMC_SIZE - 1))
+/* A quarter of the cache window, see pmc_fill_cache() */
+#define PMC_SIZE_GAP (PMC_SIZE / 4)
+
+/* Bytes taken by the u64 pagemap entries for a range of addr bytes */
+#define PAGEMAP_LEN(addr) (PAGE_PFN(addr) * sizeof(u64))
+
+/* Bring @pmc into its pristine state: all fields zeroed, no descriptor */
+static inline void pmc_reset(pmc_t *pmc)
+{
+	memzero(pmc, sizeof(pmc_t));
+	pmc->fd = -1;
+}
+
+/* Forget the cached range, so that the next lookup has to refill the cache */
+static inline void pmc_zap(pmc_t *pmc)
+{
+	pmc->end = 0;
+	pmc->start = 0;
+}
+
+/*
+ * Prepare the pagemap cache @pmc for task @pid whose VMAs are listed in
+ * @vma_head.
+ *
+ * The map buffer is sized for the larger of @size and PMC_SIZE bytes of
+ * address space. When kdat.pmap is PM_DISABLED no pagemap file is opened
+ * and the cache works without a descriptor.
+ *
+ * Returns 0 on success, -1 on failure (the cache is finalized then).
+ */
+int pmc_init(pmc_t *pmc, pid_t pid, const struct list_head *vma_head, size_t size)
+{
+	size_t map_size = max(size, (size_t)PMC_SIZE);
+
+	pmc_reset(pmc);
+	BUG_ON(!vma_head);
+
+	pmc->pid = pid;
+	pmc->vma_head = vma_head;
+	pmc->map_len = PAGEMAP_LEN(map_size);
+
+	pmc->map = xmalloc(pmc->map_len);
+	if (!pmc->map)
+		goto err;
+
+	if (kdat.pmap != PM_DISABLED) {
+		pmc->fd = open_proc(pid, "pagemap");
+		if (pmc->fd < 0)
+			goto err;
+	} else {
+		pmc->fd = -1;
+		pr_warn("No pagemap for %d available, "
+			"switching to greedy mode\n", pid);
+	}
+
+	pr_debug("created for pid %d (takes %zu bytes)\n", pid, pmc->map_len);
+	return 0;
+
+err:
+	pr_err("Failed to init pagemap for %d\n", pid);
+	pmc_fini(pmc);
+	return -1;
+}
+
+/* Pagemap entry for @addr; no check is made that @addr is in the cached range */
+static inline u64 *__pmc_get_map(pmc_t *pmc, unsigned long addr)
+{
+	unsigned long offset = addr - pmc->start;
+
+	return &pmc->map[PAGE_PFN(offset)];
+}
+
+/*
+ * Fill the cache with pagemap entries, starting at the beginning of
+ * @vma.
+ *
+ * Normally just the range of @vma is read. For a small VMA sitting close
+ * to the start of its PMC_SIZE-aligned window the following VMAs that
+ * fit into the window completely are counted, and if there is at least
+ * one, the cached range is extended up to the end of the window.
+ *
+ * Returns 0 on success, -1 when the pagemap file cannot be read (the
+ * cached range is dropped in that case).
+ */
+static int pmc_fill_cache(pmc_t *pmc, const struct vma_area *vma)
+{
+ unsigned long low = vma->e->start & PMC_MASK;
+ unsigned long high = low + PMC_SIZE;
+ size_t len = vma_area_len(vma);
+ size_t size_map;
+
+ if (high > kdat.task_size)
+ high = kdat.task_size;
+
+ pmc->start = vma->e->start;
+ pmc->end = vma->e->end;
+
+ pr_debug("filling VMA %lx-%lx (%zuK) [l:%lx h:%lx]\n",
+ (long)vma->e->start, (long)vma->e->end, len >> 10, low, high);
+
+ /*
+ * If we meet a small VMA, lets try to fit 2M cache
+ * window at least 75% full, otherwise left as a plain
+ * "one vma at a time" read. Note the VMAs in cache must
+ * fit in solid manner, iow -- either the whole vma fits
+ * the cache window, either plain read is used.
+ *
+ * The benefit (apart reducing the number of read() calls)
+ * is to walk page tables less.
+ */
+ if (len < PMC_SIZE && (vma->e->start - low) < PMC_SIZE_GAP) {
+ size_t size_cov = len;
+ size_t nr_vmas = 1;
+
+ pr_debug("\t%16lx-%-16lx nr:%-5zu cov:%zu\n",
+ (long)vma->e->start, (long)vma->e->end, nr_vmas, size_cov);
+
+ /* Walk the VMAs following this one until one sticks out of the window */
+ list_for_each_entry_continue(vma, pmc->vma_head, list) {
+ if (vma->e->start > high || vma->e->end > high)
+ break;
+
+ BUG_ON(vma->e->start < low);
+ size_cov += vma_area_len(vma);
+ nr_vmas++;
+
+ pr_debug("\t%16lx-%-16lx nr:%-5zu cov:%zu\n",
+ (long)vma->e->start, (long)vma->e->end, nr_vmas, size_cov);
+ }
+
+ if (nr_vmas > 1) {
+ /*
+ * Note we don't touch low bound since it's set
+ * to first VMA start already and not updating it
+ * allows us to save a couple of code bytes.
+ */
+ pmc->end = high;
+ pr_debug("\tcache mode [l:%lx h:%lx]\n", pmc->start, pmc->end);
+ } else
+ pr_debug("\tsimple mode [l:%lx h:%lx]\n", pmc->start, pmc->end);
+ }
+
+ size_map = PAGEMAP_LEN(pmc->end - pmc->start);
+ BUG_ON(pmc->map_len < size_map);
+
+ if (unlikely(pmc->fd < 0)) {
+ /*
+ * We don't have access to the dumpee pagemap so fill
+ * everything as present. It's better than refuse
+ * to dump because it simply disables optimisation.
+ */
+ /*
+ * NOTE(review): this sets every byte of each u64 entry to 1 --
+ * confirm that users of the map treat such a value as "present".
+ */
+ memset(pmc->map, 1, size_map);
+ } else {
+ if (pread(pmc->fd, pmc->map, size_map, PAGEMAP_PFN_OFF(pmc->start)) != size_map) {
+ pmc_zap(pmc);
+ pr_perror("Can't read %d's pagemap file", pmc->pid);
+ return -1;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Get the pagemap entries for @vma, refilling the cache when the VMA is
+ * not completely inside the cached range.
+ *
+ * Returns a pointer to the entry of the first page of @vma, or NULL when
+ * the cache cannot be filled.
+ */
+u64 *pmc_get_map(pmc_t *pmc, const struct vma_area *vma)
+{
+	int hit = pmc->start <= vma->e->start && pmc->end >= vma->e->end;
+
+	if (likely(hit))
+		goto found;
+
+	/* Miss, refill the cache */
+	if (pmc_fill_cache(pmc, vma)) {
+		pr_err("Failed to fill cache for %d (%lx-%lx)\n",
+		       pmc->pid, (long)vma->e->start, (long)vma->e->end);
+		return NULL;
+	}
+
+found:
+	return __pmc_get_map(pmc, vma->e->start);
+}
+
+/*
+ * Release the resources of @pmc (map buffer, pagemap descriptor) and
+ * reset it.
+ */
+void pmc_fini(pmc_t *pmc)
+{
+	xfree(pmc->map);
+	close_safe(&pmc->fd);
+
+	pmc_reset(pmc);
+}
diff --git a/criu/parasite-syscall.c b/criu/parasite-syscall.c
new file mode 100644
index 000000000000..feb77b53b93d
--- /dev/null
+++ b/criu/parasite-syscall.c
@@ -0,0 +1,1408 @@
+#include <unistd.h>
+#include <inttypes.h>
+
+#include <sys/stat.h>
+#include <sys/wait.h>
+#include <sys/mman.h>
+
+#include "protobuf.h"
+#include "protobuf/sa.pb-c.h"
+#include "protobuf/timer.pb-c.h"
+#include "protobuf/creds.pb-c.h"
+#include "protobuf/core.pb-c.h"
+#include "protobuf/pagemap.pb-c.h"
+
+#include "imgset.h"
+#include "ptrace.h"
+#include "asm/processor-flags.h"
+#include "parasite-syscall.h"
+#include "parasite-blob.h"
+#include "parasite.h"
+#include "crtools.h"
+#include "namespaces.h"
+#include "kerndat.h"
+#include "config.h"
+#include "pstree.h"
+#include "posix-timer.h"
+#include "net.h"
+#include "mem.h"
+#include "vma.h"
+#include "proc_parse.h"
+#include "aio.h"
+
+#include <string.h>
+#include <stdlib.h>
+#include <elf.h>
+
+#include "asm/parasite-syscall.h"
+#include "asm/dump.h"
+#include "asm/restorer.h"
+#include "pie/pie-relocs.h"
+
+#define MEMFD_FNAME "CRIUMFD"
+#define MEMFD_FNAME_SZ sizeof(MEMFD_FNAME)
+
+/*
+ * Tell whether the syscall blob (code_syscall_size bytes) followed by
+ * @pad extra bytes can be placed at @ip inside the [@start, @end) area.
+ */
+static int can_run_syscall(unsigned long ip, unsigned long start,
+			   unsigned long end, unsigned long pad)
+{
+	if (ip < start)
+		return 0;
+
+	return ip < (end - code_syscall_size - pad);
+}
+
+/*
+ * Check whether the syscall blob plus @pad bytes fits into @vma_area
+ * when injected at the very start of the area.
+ */
+static int syscall_fits_vma_area(struct vma_area *vma_area, unsigned long pad)
+{
+	unsigned long area_start = (unsigned long)vma_area->e->start;
+	unsigned long area_end = (unsigned long)vma_area->e->end;
+
+	return can_run_syscall(area_start, area_start, area_end, pad);
+}
+
+/*
+ * Find the first executable VMA below kdat.task_size that is big enough
+ * to host the syscall blob plus @pad bytes at its start.
+ *
+ * NOTE(review): @ip is accepted but not consulted by the search.
+ *
+ * Returns the matching area or NULL when none is suitable.
+ */
+static struct vma_area *get_vma_by_ip(struct list_head *vma_area_list,
+				      unsigned long ip,
+				      unsigned long pad)
+{
+	struct vma_area *pos;
+
+	list_for_each_entry(pos, vma_area_list, list) {
+		if (pos->e->start < kdat.task_size &&
+		    (pos->e->prot & PROT_EXEC) &&
+		    syscall_fits_vma_area(pos, pad))
+			return pos;
+	}
+
+	return NULL;
+}
+
+/* Read the NT_PRSTATUS register set of @pid into @regs. */
+static inline int ptrace_get_regs(int pid, user_regs_struct_t *regs)
+{
+	struct iovec iov = {
+		.iov_base	= regs,
+		.iov_len	= sizeof(user_regs_struct_t),
+	};
+
+	return ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov);
+}
+
+/* Load the NT_PRSTATUS register set of @pid from @regs. */
+static inline int ptrace_set_regs(int pid, user_regs_struct_t *regs)
+{
+	struct iovec iov = {
+		.iov_base	= regs,
+		.iov_len	= sizeof(user_regs_struct_t),
+	};
+
+	return ptrace(PTRACE_SETREGSET, pid, NT_PRSTATUS, &iov);
+}
+
+/*
+ * Save the blocked-signal mask and the registers of thread @pid
+ * into @ctx. Returns 0 on success, -1 on the first failure.
+ */
+static int get_thread_ctx(int pid, struct thread_ctx *ctx)
+{
+	long err;
+
+	err = ptrace(PTRACE_GETSIGMASK, pid, sizeof(k_rtsigset_t), &ctx->sigmask);
+	if (err != 0) {
+		pr_perror("can't get signal blocking mask for %d", pid);
+		return -1;
+	}
+
+	err = ptrace_get_regs(pid, &ctx->regs);
+	if (err != 0) {
+		pr_perror("Can't obtain registers (pid: %d)", pid);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Put back the registers and then the signal mask saved in @ctx.
+ * Both steps are always attempted, even if the first one fails.
+ * Returns 0 when both succeeded, -1 otherwise.
+ */
+static int restore_thread_ctx(int pid, struct thread_ctx *ctx)
+{
+ int ret = 0;
+
+ if (ptrace_set_regs(pid, &ctx->regs)) {
+ pr_perror("Can't restore registers (pid: %d)", pid);
+ ret = -1;
+ }
+ if (ptrace(PTRACE_SETSIGMASK, pid, sizeof(k_rtsigset_t), &ctx->sigmask)) {
+ pr_perror("Can't block signals");
+ ret = -1;
+ }
+
+ return ret;
+}
+
+/*
+ * Start execution of injected code in task @pid.
+ *
+ * @cmd is the ptrace request used to resume the task, @ip and @stack
+ * are handed to parasite_setup_regs() to prepare @regs, and @octx is
+ * the original thread context used to undo the changes on failure.
+ *
+ * Returns 0 once the task has been resumed, -1 on error.
+ */
+static int parasite_run(pid_t pid, int cmd, unsigned long ip, void *stack,
+ user_regs_struct_t *regs, struct thread_ctx *octx)
+{
+ k_rtsigset_t block;
+
+ /* Block every signal in the task before it starts running our code */
+ ksigfillset(&block);
+ if (ptrace(PTRACE_SETSIGMASK, pid, sizeof(k_rtsigset_t), &block)) {
+ pr_perror("Can't block signals for %d", pid);
+ goto err_sig;
+ }
+
+ parasite_setup_regs(ip, stack, regs);
+ if (ptrace_set_regs(pid, regs)) {
+ pr_perror("Can't set registers for %d", pid);
+ goto err_regs;
+ }
+
+ if (ptrace(cmd, pid, NULL, NULL)) {
+ pr_perror("Can't run parasite at %d", pid);
+ goto err_cont;
+ }
+
+ return 0;
+
+ /* Unwind in reverse order: registers first, then the signal mask */
+err_cont:
+ if (ptrace_set_regs(pid, &octx->regs))
+ pr_perror("Can't restore regs for %d", pid);
+err_regs:
+ if (ptrace(PTRACE_SETSIGMASK, pid, sizeof(k_rtsigset_t), &octx->sigmask))
+ pr_perror("Can't restore sigmask for %d", pid);
+err_sig:
+ return -1;
+}
+
+/*
+ * Wait for the injected code (we run at @regs->ip) to trap.
+ *
+ * On return @regs holds the registers fetched at the stop, provided
+ * that step was reached. The original context @octx is restored on
+ * every path, success or failure.
+ *
+ * Returns 0 if the task stopped with the expected SIGTRAP and the
+ * context was restored, -1 otherwise.
+ */
+static int parasite_trap(struct parasite_ctl *ctl, pid_t pid,
+ user_regs_struct_t *regs,
+ struct thread_ctx *octx)
+{
+ siginfo_t siginfo;
+ int status;
+ int ret = -1;
+
+ /*
+ * Most ideas are taken from Tejun Heo's parasite thread
+ * https://code.google.com/p/ptrace-parasite/
+ */
+
+ if (wait4(pid, &status, __WALL, NULL) != pid) {
+ pr_perror("Waited pid mismatch (pid: %d)", pid);
+ goto err;
+ }
+
+ if (!WIFSTOPPED(status)) {
+ pr_err("Task is still running (pid: %d)\n", pid);
+ goto err;
+ }
+
+ if (ptrace(PTRACE_GETSIGINFO, pid, NULL, &siginfo)) {
+ pr_perror("Can't get siginfo (pid: %d)", pid);
+ goto err;
+ }
+
+ if (ptrace_get_regs(pid, regs)) {
+ pr_perror("Can't obtain registers (pid: %d)", pid);
+ goto err;
+ }
+
+ /* Anything but our own trap means the task was interrupted by something else */
+ if (WSTOPSIG(status) != SIGTRAP || siginfo.si_code != ARCH_SI_TRAP) {
+ pr_debug("** delivering signal %d si_code=%d\n",
+ siginfo.si_signo, siginfo.si_code);
+
+ pr_err("Unexpected %d task interruption, aborting\n", pid);
+ goto err;
+ }
+
+ /*
+ * We've reached this point if int3 is triggered inside our
+ * parasite code. So we're done.
+ */
+ ret = 0;
+err:
+ /* The original context is put back regardless of the outcome */
+ if (restore_thread_ctx(pid, octx))
+ ret = -1;
+
+ return ret;
+}
+
+/*
+ * Execute a syscall inside the task described by @ctl.
+ *
+ * The syscall blob (code_syscall) is temporarily swapped into the task
+ * at ctl->syscall_ip, run with the registers from @regs, and the
+ * original code is written back afterwards.
+ *
+ * Returns 0 on success, non-zero on error.
+ */
+int __parasite_execute_syscall(struct parasite_ctl *ctl, user_regs_struct_t *regs)
+{
+ pid_t pid = ctl->pid.real;
+ int err;
+ u8 code_orig[BUILTIN_SYSCALL_SIZE];
+
+ /*
+ * Inject syscall instruction and remember original code,
+ * we will need it to restore original program content.
+ */
+ memcpy(code_orig, code_syscall, sizeof(code_orig));
+ if (ptrace_swap_area(pid, (void *)ctl->syscall_ip,
+ (void *)code_orig, sizeof(code_orig))) {
+ pr_err("Can't inject syscall blob (pid: %d)\n", pid);
+ return -1;
+ }
+
+ err = parasite_run(pid, PTRACE_CONT, ctl->syscall_ip, 0, regs, &ctl->orig);
+ if (!err)
+ err = parasite_trap(ctl, pid, regs, &ctl->orig);
+
+ /* The original code is put back even when running the syscall failed */
+ if (ptrace_poke_area(pid, (void *)code_orig,
+ (void *)ctl->syscall_ip, sizeof(code_orig))) {
+ pr_err("Can't restore syscall blob (pid: %d)\n", ctl->pid.real);
+ err = -1;
+ }
+
+ return err;
+}
+
+/*
+ * Return the parasite arguments area of @ctl for a request that
+ * needs @args_size bytes. It is a BUG if the request exceeds
+ * ctl->args_size.
+ */
+void *parasite_args_s(struct parasite_ctl *ctl, int args_size)
+{
+ BUG_ON(args_size > ctl->args_size);
+ return ctl->addr_args;
+}
+
+/*
+ * Run parasite command @cmd in thread @pid in trap mode: start the
+ * parasite at ctl->parasite_ip on @stack, wait for it to trap back and
+ * pick up the value it left in the result register.
+ *
+ * @octx is the thread's original context; the parasite is started with
+ * registers derived from a copy of it and the context is restored once
+ * the parasite has trapped.
+ *
+ * Returns 0 on success, non-zero (the parasite's result, or -1 if it
+ * could not be run) on error.
+ */
+static int parasite_execute_trap_by_pid(unsigned int cmd,
+					struct parasite_ctl *ctl, pid_t pid,
+					void *stack,
+					struct thread_ctx *octx)
+{
+	user_regs_struct_t regs = octx->regs;
+	int ret;
+
+	*ctl->addr_cmd = cmd;
+
+	ret = parasite_run(pid, PTRACE_CONT, ctl->parasite_ip, stack, &regs, octx);
+	if (ret == 0)
+		ret = parasite_trap(ctl, pid, &regs, octx);
+	if (ret == 0)
+		ret = (int)REG_RES(regs);
+
+	if (ret)
+		pr_err("Parasite exited with %d\n", ret);
+
+	return ret;
+}
+
+/*
+ * Send control message @m to the parasite daemon over @sockfd.
+ * Returns 0 if the whole message went out, -1 otherwise.
+ */
+static int __parasite_send_cmd(int sockfd, struct ctl_msg *m)
+{
+	int sent = send(sockfd, m, sizeof(*m), 0);
+
+	if (sent == -1) {
+		pr_perror("Failed to send command %d to daemon", m->cmd);
+		return -1;
+	}
+
+	if (sent != sizeof(*m)) {
+		pr_err("Message to daemon is trimmed (%d/%d)\n",
+		       (int)sizeof(*m), sent);
+		return -1;
+	}
+
+	pr_debug("Sent msg to daemon %d %d %d\n", m->cmd, m->ack, m->err);
+	return 0;
+}
+
+/*
+ * Receive one control message from the daemon socket into @m and
+ * check that it acknowledges @cmd (both its cmd and ack fields must
+ * equal @cmd).
+ *
+ * Returns 0 on a matching ack, -1 on a receive error, a short message
+ * or an unexpected reply.
+ */
+static int parasite_wait_ack(int sockfd, unsigned int cmd, struct ctl_msg *m)
+{
+	int nread;
+
+	pr_debug("Wait for ack %d on daemon socket\n", cmd);
+
+	memzero(m, sizeof(*m));
+
+	nread = recv(sockfd, m, sizeof(*m), MSG_WAITALL);
+	if (nread == -1) {
+		pr_perror("Failed to read ack");
+		return -1;
+	}
+
+	if (nread != sizeof(*m)) {
+		pr_err("Message reply from daemon is trimmed (%d/%d)\n",
+		       (int)sizeof(*m), nread);
+		return -1;
+	}
+
+	pr_debug("Fetched ack: %d %d %d\n",
+		 m->cmd, m->ack, m->err);
+
+	if (m->cmd == cmd && m->ack == cmd)
+		return 0;
+
+	pr_err("Communication error, this is not "
+	       "the ack we expected\n");
+	return -1;
+}
+
+/*
+ * Wait for the daemon to acknowledge @cmd and check the error code
+ * it reported. Returns 0 when the command succeeded, -1 otherwise.
+ */
+int __parasite_wait_daemon_ack(unsigned int cmd,
+			       struct parasite_ctl *ctl)
+{
+	struct ctl_msg reply;
+
+	if (parasite_wait_ack(ctl->tsock, cmd, &reply) != 0)
+		return -1;
+
+	if (reply.err == 0)
+		return 0;
+
+	pr_err("Command %d for daemon failed with %d\n",
+	       cmd, reply.err);
+	return -1;
+}
+
+/*
+ * Send @cmd to the parasite daemon without waiting for the ack.
+ * Returns 0 if the command was sent, -1 otherwise.
+ */
+int __parasite_execute_daemon(unsigned int cmd, struct parasite_ctl *ctl)
+{
+	struct ctl_msg request = ctl_msg_cmd(cmd);
+
+	return __parasite_send_cmd(ctl->tsock, &request);
+}
+
+/*
+ * Send @cmd to the parasite daemon and wait until it is acknowledged.
+ * Returns 0 on success, non-zero if sending or the command failed.
+ */
+int parasite_execute_daemon(unsigned int cmd, struct parasite_ctl *ctl)
+{
+	int ret = __parasite_execute_daemon(cmd, ctl);
+
+	if (ret)
+		return ret;
+
+	return __parasite_wait_daemon_ack(cmd, ctl);
+}
+
+/*
+ * Fill @saddr with the AF_UNIX address used to talk to the parasite,
+ * derived from @key, and return the address length.
+ *
+ * The path is first written with a leading 'X' placeholder so that
+ * SUN_LEN() can measure it; the first byte is then replaced with '\0'.
+ * NOTE(review): presumably this makes it an abstract socket name --
+ * confirm against unix(7).
+ */
+static int gen_parasite_saddr(struct sockaddr_un *saddr, int key)
+{
+ int sun_len;
+
+ saddr->sun_family = AF_UNIX;
+ snprintf(saddr->sun_path, UNIX_PATH_MAX,
+ "X/crtools-pr-%d", key);
+
+ /* Measure the length while the placeholder byte is still non-zero */
+ sun_len = SUN_LEN(saddr);
+ *saddr->sun_path = '\0';
+
+ return sun_len;
+}
+
+/*
+ * Pass file descriptor @fd to the parasite over the transport socket.
+ * Returns 0 on success, -1 on error.
+ */
+int parasite_send_fd(struct parasite_ctl *ctl, int fd)
+{
+ if (send_fd(ctl->tsock, NULL, 0, fd) < 0) {
+ pr_perror("Can't send file descriptor");
+ return -1;
+ }
+ return 0;
+}
+
+/*
+ * We need to detect parasite crashes so as not to hang on socket
+ * operations. Since CRIU holds the parasite with ptrace, it will
+ * receive SIGCHLD if the latter crashes.
+ *
+ * This puts a restriction on how to execute a sub-process at the dump
+ * stage. One should use the cr_system helper, which blocks SIGCHLD and
+ * waits for the spawned program to finish.
+ *
+ * The handler reaps one child without blocking; if there was one, it
+ * logs what happened to it and terminates the process with exit(1).
+ */
+static void sigchld_handler(int signal, siginfo_t *siginfo, void *data)
+{
+ int pid, status;
+
+ /* Nothing to reap (or waitpid failed) -- ignore this notification */
+ pid = waitpid(-1, &status, WNOHANG);
+ if (pid <= 0)
+ return;
+
+ pr_err("si_code=%d si_pid=%d si_status=%d\n",
+ siginfo->si_code, siginfo->si_pid, siginfo->si_status);
+
+ if (WIFEXITED(status))
+ pr_err("%d exited with %d unexpectedly\n", pid, WEXITSTATUS(status));
+ else if (WIFSIGNALED(status))
+ pr_err("%d was killed by %d unexpectedly\n", pid, WTERMSIG(status));
+ else if (WIFSTOPPED(status))
+ pr_err("%d was stopped by %d unexpectedly\n", pid, WSTOPSIG(status));
+
+ exit(1);
+}
+
+/*
+ * Install sigchld_handler() as the SIGCHLD handler, with SIGCHLD
+ * itself added to the mask used while the handler runs.
+ * Returns 0 on success, -1 on error.
+ */
+static int setup_child_handler()
+{
+ struct sigaction sa = {
+ .sa_sigaction = sigchld_handler,
+ .sa_flags = SA_SIGINFO | SA_RESTART,
+ };
+
+ sigemptyset(&sa.sa_mask);
+ sigaddset(&sa.sa_mask, SIGCHLD);
+ if (sigaction(SIGCHLD, &sa, NULL)) {
+ pr_perror("Unable to setup SIGCHLD handler");
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Put the SIGCHLD disposition back to the default one, undoing
+ * setup_child_handler(). Returns 0 on success, -1 on error.
+ */
+static int restore_child_handler()
+{
+	struct sigaction sa = {
+		.sa_handler	= SIG_DFL,
+		.sa_flags	= SA_SIGINFO | SA_RESTART,
+	};
+
+	sigemptyset(&sa.sa_mask);
+	sigaddset(&sa.sa_mask, SIGCHLD);
+	if (sigaction(SIGCHLD, &sa, NULL)) {
+		/* Say "restore" here so the log tells the two call sites apart */
+		pr_perror("Unable to restore SIGCHLD handler");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Prepare the listening side of the transport socket and fill the
+ * address the parasite has to use into @args.
+ *
+ * The listening socket is taken over from @net (net->net.seqsk) on the
+ * first call and kept in a function-static variable, so it is bound
+ * and put into listening state only once.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+static int prepare_tsock(struct parasite_ctl *ctl, pid_t pid,
+ struct parasite_init_args *args, struct ns_id *net)
+{
+ static int ssock = -1;
+
+ pr_info("Putting tsock into pid %d\n", pid);
+ args->h_addr_len = gen_parasite_saddr(&args->h_addr, getpid());
+
+ if (ssock == -1) {
+ /* Take ownership of the socket so nobody else uses it via @net */
+ ssock = net->net.seqsk;
+ net->net.seqsk = -1;
+
+ if (bind(ssock, (struct sockaddr *)&args->h_addr, args->h_addr_len) < 0) {
+ pr_perror("Can't bind socket");
+ goto err;
+ }
+
+ if (listen(ssock, 1)) {
+ pr_perror("Can't listen on transport socket");
+ goto err;
+ }
+ }
+
+ /*
+ * Store the listening socket negated to prevent any accidental
+ * misuse of ctl->tsock as a connected socket. The only valid
+ * user of this value is accept_tsock(), which negates it back.
+ */
+ ctl->tsock = -ssock;
+ return 0;
+err:
+ close_safe(&ssock);
+ return -1;
+}
+
+/*
+ * Accept the parasite's connection on the listening socket stashed
+ * (negated) in ctl->tsock by prepare_tsock() and replace ctl->tsock
+ * with the connected socket. Returns 0 on success, -1 on error.
+ */
+static int accept_tsock(struct parasite_ctl *ctl)
+{
+ int sock;
+ int ask = -ctl->tsock; /* this '-' is explained above */
+
+ sock = accept(ask, NULL, 0);
+ if (sock < 0) {
+ pr_perror("Can't accept connection to the transport socket");
+ close(ask);
+ return -1;
+ }
+
+ ctl->tsock = sock;
+ return 0;
+}
+
+/*
+ * Switch the injected parasite into daemon mode.
+ *
+ * Fills the init arguments (sigframe address, log level, transport
+ * socket address), starts the parasite with PARASITE_CMD_INIT_DAEMON,
+ * accepts its connection on the transport socket, hands it the log
+ * descriptor and waits for the ack.
+ *
+ * Returns 0 on success (ctl->daemonized is set), -1 on error.
+ */
+static int parasite_init_daemon(struct parasite_ctl *ctl, struct ns_id *net)
+{
+	struct parasite_init_args *args;
+	pid_t pid = ctl->pid.real;
+	user_regs_struct_t regs;
+	struct ctl_msg m = { };
+
+	*ctl->addr_cmd = PARASITE_CMD_INIT_DAEMON;
+
+	args = parasite_args(ctl, struct parasite_init_args);
+
+	args->sigframe = ctl->rsigframe;
+	args->log_level = log_get_loglevel();
+
+	if (prepare_tsock(ctl, pid, args, net))
+		goto err;
+
+	/* after this we can catch parasite errors in chld handler */
+	if (setup_child_handler())
+		goto err;
+
+	/* Work on a copy so the saved original registers stay untouched */
+	regs = ctl->orig.regs;
+	if (parasite_run(pid, PTRACE_CONT, ctl->parasite_ip, ctl->rstack, &regs, &ctl->orig))
+		goto err;
+
+	if (accept_tsock(ctl) < 0)
+		goto err;
+
+	if (parasite_send_fd(ctl, log_get_fd()))
+		goto err;
+
+	pr_info("Wait for parasite being daemonized...\n");
+
+	if (parasite_wait_ack(ctl->tsock, PARASITE_CMD_INIT_DAEMON, &m)) {
+		pr_err("Can't switch parasite %d to daemon mode %d\n",
+		       pid, m.err);
+		goto err;
+	}
+
+	/* The parasite reports back where its sigreturn call lives */
+	ctl->sigreturn_addr = args->sigreturn_addr;
+	ctl->daemonized = true;
+	pr_info("Parasite %d has been switched to daemon mode\n", pid);
+	return 0;
+err:
+	return -1;
+}
+
+/*
+ * Copy the credentials collected by the parasite (@c) into the image
+ * entry @ce. The groups array is duplicated with xmemdup() into
+ * ce->groups.
+ *
+ * Returns 0 on success, -ENOMEM if the groups copy is NULL.
+ */
+static int alloc_groups_copy_creds(CredsEntry *ce, struct parasite_dump_creds *c)
+{
+ /* The arrays are copied with memcpy(), so element sizes must match */
+ BUILD_BUG_ON(sizeof(ce->groups[0]) != sizeof(c->groups[0]));
+ BUILD_BUG_ON(sizeof(ce->cap_inh[0]) != sizeof(c->cap_inh[0]));
+ BUILD_BUG_ON(sizeof(ce->cap_prm[0]) != sizeof(c->cap_prm[0]));
+ BUILD_BUG_ON(sizeof(ce->cap_eff[0]) != sizeof(c->cap_eff[0]));
+ BUILD_BUG_ON(sizeof(ce->cap_bnd[0]) != sizeof(c->cap_bnd[0]));
+
+ /* The destination capability arrays must already be CR_CAP_SIZE long */
+ BUG_ON(ce->n_cap_inh != CR_CAP_SIZE);
+ BUG_ON(ce->n_cap_prm != CR_CAP_SIZE);
+ BUG_ON(ce->n_cap_eff != CR_CAP_SIZE);
+ BUG_ON(ce->n_cap_bnd != CR_CAP_SIZE);
+
+ memcpy(ce->cap_inh, c->cap_inh, sizeof(c->cap_inh[0]) * CR_CAP_SIZE);
+ memcpy(ce->cap_prm, c->cap_prm, sizeof(c->cap_prm[0]) * CR_CAP_SIZE);
+ memcpy(ce->cap_eff, c->cap_eff, sizeof(c->cap_eff[0]) * CR_CAP_SIZE);
+ memcpy(ce->cap_bnd, c->cap_bnd, sizeof(c->cap_bnd[0]) * CR_CAP_SIZE);
+
+ ce->secbits = c->secbits;
+ ce->n_groups = c->ngroups;
+
+ ce->groups = xmemdup(c->groups, sizeof(c->groups[0]) * c->ngroups);
+
+ /* uids[]/gids[] are indexed 0..3 and map to real, effective, saved, fs */
+ ce->uid = c->uids[0];
+ ce->gid = c->gids[0];
+ ce->euid = c->uids[1];
+ ce->egid = c->gids[1];
+ ce->suid = c->uids[2];
+ ce->sgid = c->gids[2];
+ ce->fsuid = c->uids[3];
+ ce->fsgid = c->gids[3];
+
+ return ce->groups ? 0 : -ENOMEM;
+}
+
+/*
+ * Dump per-thread state of the thread group leader @pid through the
+ * parasite daemon: run PARASITE_CMD_DUMP_THREAD, copy the credentials
+ * into @core and finish with dump_thread_core().
+ *
+ * Returns 0 on success, a negative value on error.
+ */
+int parasite_dump_thread_leader_seized(struct parasite_ctl *ctl, int pid, CoreEntry *core)
+{
+ ThreadCoreEntry *tc = core->thread_core;
+ struct parasite_dump_thread *args;
+ struct parasite_dump_creds *pc;
+ int ret;
+
+ args = parasite_args(ctl, struct parasite_dump_thread);
+
+ /* Input for the parasite: the last capability number from kdat */
+ pc = args->creds;
+ pc->cap_last_cap = kdat.last_cap;
+
+ ret = parasite_execute_daemon(PARASITE_CMD_DUMP_THREAD, ctl);
+ if (ret < 0)
+ return ret;
+
+ ret = alloc_groups_copy_creds(tc->creds, pc);
+ if (ret) {
+ pr_err("Can't copy creds for thread leader %d\n", pid);
+ return -1;
+ }
+
+ return dump_thread_core(pid, core, args);
+}
+
+/*
+ * Dump per-thread state of a non-leader thread @tid (index @id, which
+ * must not be 0) by running PARASITE_CMD_DUMP_THREAD in trap mode in
+ * that very thread.
+ *
+ * Fills the blocked signal set, credentials and registers in @core and
+ * stores the thread ID reported by the parasite in tid->virt.
+ *
+ * Returns 0 on success, a negative value on error.
+ */
+int parasite_dump_thread_seized(struct parasite_ctl *ctl, int id,
+ struct pid *tid, CoreEntry *core)
+{
+ struct parasite_dump_thread *args;
+ pid_t pid = tid->real;
+ ThreadCoreEntry *tc = core->thread_core;
+ CredsEntry *creds = tc->creds;
+ struct parasite_dump_creds *pc;
+ int ret;
+ struct thread_ctx octx;
+
+ BUG_ON(id == 0); /* Leader is dumped in dump_task_core_all */
+
+ args = parasite_args(ctl, struct parasite_dump_thread);
+
+ /* Input for the parasite: the last capability number from kdat */
+ pc = args->creds;
+ pc->cap_last_cap = kdat.last_cap;
+
+ /* Remember the thread's own context before the parasite runs in it */
+ ret = get_thread_ctx(pid, &octx);
+ if (ret)
+ return -1;
+
+ tc->has_blk_sigset = true;
+ memcpy(&tc->blk_sigset, &octx.sigmask, sizeof(k_rtsigset_t));
+
+ ret = parasite_execute_trap_by_pid(PARASITE_CMD_DUMP_THREAD, ctl,
+ pid, ctl->r_thread_stack, &octx);
+ if (ret) {
+ pr_err("Can't init thread in parasite %d\n", pid);
+ return -1;
+ }
+
+ ret = alloc_groups_copy_creds(creds, pc);
+ if (ret) {
+ pr_err("Can't copy creds for thread %d\n", pid);
+ return -1;
+ }
+
+ /* Registers are taken from the context saved before the parasite ran */
+ ret = get_task_regs(pid, octx.regs, core);
+ if (ret) {
+ pr_err("Can't obtain regs for thread %d\n", pid);
+ return -1;
+ }
+
+ tid->virt = args->tid;
+ return dump_thread_core(pid, core, args);
+}
+
+/*
+ * Ask the parasite daemon for the task's signal actions and write one
+ * SaEntry per signal into the CR_FD_SIGACT image of @cr_imgset.
+ * SIGSTOP and SIGKILL are skipped.
+ *
+ * Returns 0 on success, a negative value on error.
+ */
+int parasite_dump_sigacts_seized(struct parasite_ctl *ctl, struct cr_imgset *cr_imgset)
+{
+ struct parasite_dump_sa_args *args;
+ int ret, sig;
+ struct cr_img *img;
+ SaEntry se = SA_ENTRY__INIT;
+
+ args = parasite_args(ctl, struct parasite_dump_sa_args);
+
+ ret = parasite_execute_daemon(PARASITE_CMD_DUMP_SIGACTS, ctl);
+ if (ret < 0)
+ return ret;
+
+ img = img_from_set(cr_imgset, CR_FD_SIGACT);
+
+ for (sig = 1; sig <= SIGMAX; sig++) {
+ /* Signal numbers start at 1, the sas[] array at 0 */
+ int i = sig - 1;
+
+ if (sig == SIGSTOP || sig == SIGKILL)
+ continue;
+
+ ASSIGN_TYPED(se.sigaction, encode_pointer(args->sas[i].rt_sa_handler));
+ ASSIGN_TYPED(se.flags, args->sas[i].rt_sa_flags);
+ ASSIGN_TYPED(se.restorer, encode_pointer(args->sas[i].rt_sa_restorer));
+ ASSIGN_TYPED(se.mask, args->sas[i].rt_sa_mask.sig[0]);
+
+ if (pb_write_one(img, &se, PB_SIGACT) < 0)
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Copy interval timer @v into image entry @ie: the interval goes to
+ * isec/iusec and the current value to vsec/vusec.
+ */
+static void encode_itimer(struct itimerval *v, ItimerEntry *ie)
+{
+ ie->isec = v->it_interval.tv_sec;
+ ie->iusec = v->it_interval.tv_usec;
+ ie->vsec = v->it_value.tv_sec;
+ ie->vusec = v->it_value.tv_usec;
+}
+
+/*
+ * Fetch the real, virtual and profiling interval timers through the
+ * parasite daemon and store them in the timers entry of the first
+ * core of @item.
+ *
+ * Returns 0 on success, a negative value on error.
+ */
+int parasite_dump_itimers_seized(struct parasite_ctl *ctl, struct pstree_item *item)
+{
+ CoreEntry *core = item->core[0];
+ struct parasite_dump_itimers_args *args;
+ int ret;
+
+ args = parasite_args(ctl, struct parasite_dump_itimers_args);
+
+ ret = parasite_execute_daemon(PARASITE_CMD_DUMP_ITIMERS, ctl);
+ if (ret < 0)
+ return ret;
+
+ encode_itimer(&args->real, core->tc->timers->real);
+ encode_itimer(&args->virt, core->tc->timers->virt);
+ encode_itimer(&args->prof, core->tc->timers->prof);
+
+ return 0;
+}
+
+/*
+ * Fill image entry @pte for one POSIX timer. The static description
+ * (IDs, signal, notification settings) comes from @vp, while the
+ * overrun count and the timer value/interval come from @v.
+ */
+static void encode_posix_timer(struct posix_timer *v, struct proc_posix_timer *vp, PosixTimerEntry *pte)
+{
+ pte->it_id = vp->spt.it_id;
+ pte->clock_id = vp->spt.clock_id;
+ pte->si_signo = vp->spt.si_signo;
+ pte->it_sigev_notify = vp->spt.it_sigev_notify;
+ pte->sival_ptr = encode_pointer(vp->spt.sival_ptr);
+
+ pte->overrun = v->overrun;
+
+ pte->isec = v->val.it_interval.tv_sec;
+ pte->insec = v->val.it_interval.tv_nsec;
+ pte->vsec = v->val.it_value.tv_sec;
+ pte->vnsec = v->val.it_value.tv_nsec;
+}
+
+/*
+ * Allocate room for @n POSIX timer entries in @tte.
+ *
+ * A single buffer holds the array of @n entry pointers (tte->posix)
+ * immediately followed by the @n entries themselves; *@pte is set to
+ * the first entry. The pointers are not filled in here.
+ *
+ * Returns 0 on success, -1 if the allocation failed.
+ */
+static int core_alloc_posix_timers(TaskTimersEntry *tte, int n,
+ PosixTimerEntry **pte)
+{
+ int sz;
+
+ /*
+ * Will be free()-ed in core_entry_free()
+ */
+
+ sz = n * (sizeof(PosixTimerEntry *) + sizeof(PosixTimerEntry));
+ tte->posix = xmalloc(sz);
+ if (!tte->posix)
+ return -1;
+
+ tte->n_posix = n;
+ /* The entries start right behind the n pointers */
+ *pte = (PosixTimerEntry *)(tte->posix + n);
+ return 0;
+}
+
+/*
+ * Dump the POSIX timers listed in @proc_args through the parasite
+ * daemon into the timers entry of the first core of @item.
+ *
+ * The timer IDs are passed to the parasite, which fills in the current
+ * state of each timer; the results are then encoded into image entries.
+ *
+ * @proc_args is released with free_posix_timers() on every path,
+ * including the early allocation failure.
+ *
+ * Returns 0 on success, a negative value on error.
+ */
+int parasite_dump_posix_timers_seized(struct proc_posix_timers_stat *proc_args,
+		struct parasite_ctl *ctl, struct pstree_item *item)
+{
+	CoreEntry *core = item->core[0];
+	TaskTimersEntry *tte = core->tc->timers;
+	PosixTimerEntry *pte;
+	struct parasite_dump_posix_timers_args * args;
+	struct proc_posix_timer *temp;
+	int i;
+	int ret = 0;
+
+	if (core_alloc_posix_timers(tte, proc_args->timer_n, &pte)) {
+		/* Don't leak the parsed timers list on allocation failure */
+		ret = -1;
+		goto end_posix;
+	}
+
+	args = parasite_args_s(ctl, posix_timers_dump_size(proc_args->timer_n));
+	args->timer_n = proc_args->timer_n;
+
+	/* Tell the parasite which timers to query */
+	i = 0;
+	list_for_each_entry(temp, &proc_args->timers, list) {
+		args->timer[i].it_id = temp->spt.it_id;
+		i++;
+	}
+
+	ret = parasite_execute_daemon(PARASITE_CMD_DUMP_POSIX_TIMERS, ctl);
+	if (ret < 0)
+		goto end_posix;
+
+	/* Same list order as above, so args->timer[i] matches @temp */
+	i = 0;
+	list_for_each_entry(temp, &proc_args->timers, list) {
+		posix_timer_entry__init(&pte[i]);
+		encode_posix_timer(&args->timer[i], temp, &pte[i]);
+		tte->posix[i] = &pte[i];
+		i++;
+	}
+
+end_posix:
+	free_posix_timers(proc_args);
+	return ret;
+}
+
+/*
+ * Run PARASITE_CMD_DUMP_MISC in the parasite daemon and copy the
+ * result from the arguments area into @misc.
+ * Returns 0 on success, -1 on error.
+ */
+int parasite_dump_misc_seized(struct parasite_ctl *ctl, struct parasite_dump_misc *misc)
+{
+ struct parasite_dump_misc *ma;
+
+ ma = parasite_args(ctl, struct parasite_dump_misc);
+ if (parasite_execute_daemon(PARASITE_CMD_DUMP_MISC, ctl) < 0)
+ return -1;
+
+ *misc = *ma;
+ return 0;
+}
+
+/*
+ * Run PARASITE_CMD_DUMP_TTY in the parasite daemon for descriptor @fd
+ * of the given @type.
+ *
+ * Returns a pointer into the parasite arguments area holding the
+ * result (not a private copy), or NULL on error.
+ */
+struct parasite_tty_args *parasite_dump_tty(struct parasite_ctl *ctl, int fd, int type)
+{
+ struct parasite_tty_args *p;
+
+ p = parasite_args(ctl, struct parasite_tty_args);
+ p->fd = fd;
+ p->type = type;
+
+ if (parasite_execute_daemon(PARASITE_CMD_DUMP_TTY, ctl) < 0)
+ return NULL;
+
+ return p;
+}
+
+/*
+ * Ask the parasite daemon to send over the descriptors listed in
+ * @dfds and receive them into @lfds (with their options in @opts).
+ *
+ * The daemon's ack is awaited even if receiving the descriptors
+ * failed. Returns 0 on success, non-zero on error.
+ */
+int parasite_drain_fds_seized(struct parasite_ctl *ctl,
+		struct parasite_drain_fd *dfds, int *lfds, struct fd_opts *opts)
+{
+	struct parasite_drain_fd *args;
+	int size = drain_fds_size(dfds);
+	int ret;
+
+	args = parasite_args_s(ctl, size);
+	memcpy(args, dfds, size);
+
+	ret = __parasite_execute_daemon(PARASITE_CMD_DRAIN_FDS, ctl);
+	if (ret) {
+		pr_err("Parasite failed to drain descriptors\n");
+		return ret;
+	}
+
+	ret = recv_fds(ctl->tsock, lfds, dfds->nr_fds, opts);
+	if (ret)
+		pr_err("Can't retrieve FDs from socket\n");
+
+	ret |= __parasite_wait_daemon_ack(PARASITE_CMD_DRAIN_FDS, ctl);
+	return ret;
+}
+
+/*
+ * Ask the parasite daemon for its proc descriptor
+ * (PARASITE_CMD_GET_PROC_FD) and receive it over the transport socket.
+ *
+ * Returns the received descriptor, or a negative/non-zero error value
+ * if the command could not be sent, the descriptor was not received
+ * or the daemon did not acknowledge the command.
+ */
+int parasite_get_proc_fd_seized(struct parasite_ctl *ctl)
+{
+ int ret = -1, fd;
+
+ ret = __parasite_execute_daemon(PARASITE_CMD_GET_PROC_FD, ctl);
+ if (ret) {
+ pr_err("Parasite failed to get proc fd\n");
+ return ret;
+ }
+
+ fd = recv_fd(ctl->tsock);
+ if (fd < 0)
+ pr_err("Can't retrieve FD from socket\n");
+ /* The ack is consumed even when receiving the descriptor failed */
+ if (__parasite_wait_daemon_ack(PARASITE_CMD_GET_PROC_FD, ctl)) {
+ close_safe(&fd);
+ return -1;
+ }
+
+ return fd;
+}
+
+/* This is officially the 50000'th line in the CRIU source code */
+
+/*
+ * Tell whether the instruction pointer in @regs lies inside the
+ * parasite blob mapped in the task (remote_map .. remote_map + map_length).
+ */
+static bool task_in_parasite(struct parasite_ctl *ctl, user_regs_struct_t *regs)
+{
+	void *ip = (void *) REG_IP(*regs);
+
+	if (ip < ctl->remote_map)
+		return false;
+
+	return ip < ctl->remote_map + ctl->map_length;
+}
+
+/*
+ * Bring a daemonized parasite back under direct control.
+ *
+ * The task is interrupted and checked to be inside the parasite blob,
+ * PARASITE_CMD_FINI is sent and the transport socket is closed. The
+ * task is then driven until it leaves the rt_sigreturn syscall.
+ *
+ * Returns 0 on success, a negative value on error.
+ */
+static int parasite_fini_seized(struct parasite_ctl *ctl)
+{
+	pid_t pid = ctl->pid.real;
+	user_regs_struct_t regs;
+	int status, ret = 0;
+	enum trace_flags flag;
+
+	/* stop getting chld from parasite -- we're about to step-by-step it */
+	if (restore_child_handler())
+		return -1;
+
+	/* Start to trace syscalls for each thread */
+	if (ptrace(PTRACE_INTERRUPT, pid, NULL, NULL)) {
+		pr_perror("Unable to interrupt the process");
+		return -1;
+	}
+
+	pr_debug("Waiting for %d to trap\n", pid);
+	if (wait4(pid, &status, __WALL, NULL) != pid) {
+		pr_perror("Waited pid mismatch (pid: %d)", pid);
+		return -1;
+	}
+
+	pr_debug("Daemon %d exited trapping\n", pid);
+	if (!WIFSTOPPED(status)) {
+		pr_err("Task is still running (pid: %d)\n", pid);
+		return -1;
+	}
+
+	ret = ptrace_get_regs(pid, &regs);
+	if (ret) {
+		pr_perror("Unable to get registers");
+		return -1;
+	}
+
+	if (!task_in_parasite(ctl, &regs)) {
+		pr_err("The task is not in parasite code\n");
+		return -1;
+	}
+
+	/* The socket is closed whether or not the command could be sent */
+	ret = __parasite_execute_daemon(PARASITE_CMD_FINI, ctl);
+	close_safe(&ctl->tsock);
+	if (ret)
+		return -1;
+
+	/* Go to sigreturn as closer as we can */
+	ret = ptrace_stop_pie(pid, ctl->sigreturn_addr, &flag);
+	if (ret < 0)
+		return ret;
+
+	if (parasite_stop_on_syscall(1, __NR_rt_sigreturn, flag))
+		return -1;
+
+	if (ptrace_flush_breakpoints(pid))
+		return -1;
+
+	/*
+	 * All signals are unblocked now. The kernel notifies about leaving
+	 * syscall before starting to deliver signals. All parasite code is
+	 * executed with blocked signals, so we can safely unmap a parasite blob.
+	 */
+
+	return 0;
+}
+
+/*
+ * Trap tasks on the exit from the specified syscall
+ *
+ * tasks  - number of processes, which should be trapped
+ * sys_nr - the required syscall number
+ * trace  - which syscall stop (enter or exit) is expected next;
+ *          forced to TRACE_ALL when more than one task is traced
+ *
+ * Returns 0 once every task has been stopped on the exit from
+ * @sys_nr, -1 on error.
+ */
+int parasite_stop_on_syscall(int tasks, const int sys_nr, enum trace_flags trace)
+{
+	user_regs_struct_t regs;
+	int status, ret;
+	pid_t pid;
+
+	if (tasks > 1)
+		trace = TRACE_ALL;
+
+	/* Stop all threads on the enter point in sys_rt_sigreturn */
+	while (tasks) {
+		pid = wait4(-1, &status, __WALL, NULL);
+		if (pid == -1) {
+			pr_perror("wait4 failed");
+			return -1;
+		}
+
+		if (!WIFSTOPPED(status) || WSTOPSIG(status) != SIGTRAP) {
+			pr_err("Task is in unexpected state: %x\n", status);
+			return -1;
+		}
+
+		pr_debug("%d was trapped\n", pid);
+
+		/*
+		 * With a single task the enter/exit stops alternate: an
+		 * exit stop is skipped without looking at the registers.
+		 */
+		if (trace == TRACE_EXIT) {
+			trace = TRACE_ENTER;
+			pr_debug("`- Expecting exit\n");
+			goto goon;
+		}
+		if (trace == TRACE_ENTER)
+			trace = TRACE_EXIT;
+
+		ret = ptrace_get_regs(pid, &regs);
+		if (ret) {
+			pr_perror("ptrace");
+			return -1;
+		}
+
+		pr_debug("%d is going to execute the syscall %lx\n", pid, REG_SYSCALL_NR(regs));
+		if (REG_SYSCALL_NR(regs) == sys_nr) {
+			/*
+			 * The process is going to execute the required syscall,
+			 * the next stop will be on the exit from this syscall
+			 */
+			ret = ptrace(PTRACE_SYSCALL, pid, NULL, NULL);
+			if (ret) {
+				pr_perror("ptrace");
+				return -1;
+			}
+
+			pid = wait4(pid, &status, __WALL, NULL);
+			if (pid == -1) {
+				pr_perror("wait4 failed");
+				return -1;
+			}
+
+			if (!WIFSTOPPED(status) || WSTOPSIG(status) != SIGTRAP) {
+				pr_err("Task is in unexpected state: %x\n", status);
+				return -1;
+			}
+
+			pr_debug("%d was stopped\n", pid);
+			tasks--;
+			continue;
+		}
+goon:
+		/* Not the syscall we are after -- let the task run to the next stop */
+		ret = ptrace(PTRACE_SYSCALL, pid, NULL, NULL);
+		if (ret) {
+			pr_perror("ptrace");
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Stop the parasite daemon, if it is running.
+ *
+ * Returns 0 when the parasite is not (or no longer) daemonized,
+ * -1 if it could not be stopped.
+ */
+int parasite_stop_daemon(struct parasite_ctl *ctl)
+{
+	/* Nothing to stop, the flag is already clear */
+	if (!ctl->daemonized)
+		return 0;
+
+	/*
+	 * Looks like a previous attempt failed, we should do
+	 * nothing in this case. parasite will try to cure itself.
+	 */
+	if (ctl->tsock < 0)
+		return -1;
+
+	if (parasite_fini_seized(ctl)) {
+		close_safe(&ctl->tsock);
+		return -1;
+	}
+
+	ctl->daemonized = false;
+	return 0;
+}
+
+/*
+ * Remove the parasite from the task: stop the daemon (if any) and
+ * unmap the parasite area from the task's address space.
+ *
+ * If the area was mapped but the blob was never set up (ctl->addr_cmd
+ * is still NULL because the map exchange failed half-way), the blob
+ * can't unmap itself, so the area is unmapped with a plain injected
+ * munmap() instead of dereferencing a NULL command pointer.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+int parasite_cure_remote(struct parasite_ctl *ctl)
+{
+	int ret = 0;
+
+	if (parasite_stop_daemon(ctl))
+		return -1;
+
+	if (!ctl->remote_map)
+		return 0;
+
+	if (ctl->addr_cmd) {
+		struct parasite_unmap_args *args;
+
+		/* The blob is in place -- let it unmap itself */
+		*ctl->addr_cmd = PARASITE_CMD_UNMAP;
+
+		args = parasite_args(ctl, struct parasite_unmap_args);
+		args->parasite_start = ctl->remote_map;
+		args->parasite_len = ctl->map_length;
+		if (parasite_unmap(ctl, ctl->parasite_ip))
+			ret = -1;
+	} else {
+		unsigned long sret = 0;
+
+		/* No blob there yet -- drop the bare mapping via a syscall */
+		if (syscall_seized(ctl, __NR_munmap, &sret,
+				   (unsigned long)ctl->remote_map, ctl->map_length,
+				   0, 0, 0, 0) || (long)sret) {
+			pr_err("Can't unmap parasite area (pid: %d)\n", ctl->pid.real);
+			ret = -1;
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * Release the local side of the parasite: unmap the local view of the
+ * parasite area (if any) and free @ctl itself. @ctl must not be used
+ * after this call, whatever the return value.
+ *
+ * Returns 0 on success, -1 if munmap() failed.
+ */
+int parasite_cure_local(struct parasite_ctl *ctl)
+{
+ int ret = 0;
+
+ if (ctl->local_map) {
+ if (munmap(ctl->local_map, ctl->map_length)) {
+ pr_err("munmap failed (pid: %d)\n", ctl->pid.real);
+ ret = -1;
+ }
+ }
+
+ free(ctl);
+ return ret;
+}
+
+/*
+ * Fully remove the parasite: first from the task, then the local
+ * resources. The local part (which frees @ctl) is skipped when the
+ * remote part fails.
+ */
+int parasite_cure_seized(struct parasite_ctl *ctl)
+{
+	int ret = parasite_cure_remote(ctl);
+
+	if (ret)
+		return ret;
+
+	return parasite_cure_local(ctl);
+}
+
+/*
+ * parasite_unmap() is used for unmapping parasite and restorer blobs.
+ * A blob can contain code for unmapping itself, so the process is
+ * trapped on the exit from the munmap syscall.
+ *
+ * @addr is the address inside the task at which execution is started.
+ * The original thread context is restored once the task has been
+ * trapped, whether or not trapping succeeded.
+ *
+ * Returns 0 on success, a negative value on error.
+ */
+int parasite_unmap(struct parasite_ctl *ctl, unsigned long addr)
+{
+	user_regs_struct_t regs = ctl->orig.regs;
+	pid_t pid = ctl->pid.real;
+	int ret = -1;
+
+	ret = parasite_run(pid, PTRACE_SYSCALL, addr, NULL, &regs, &ctl->orig);
+	if (ret)
+		goto err;
+
+	ret = parasite_stop_on_syscall(1, __NR_munmap, TRACE_ENTER);
+
+	if (restore_thread_ctx(pid, &ctl->orig))
+		ret = -1;
+err:
+	return ret;
+}
+
+/*
+ * Allocate and set up a parasite control block for task @pid.
+ *
+ * The task's original signal mask and registers are saved in
+ * ctl->orig. If vma_area_list is NULL, a place for injecting syscall
+ * will not be set.
+ *
+ * Returns the new control block, or NULL on error.
+ */
+struct parasite_ctl *parasite_prep_ctl(pid_t pid, struct vm_area_list *vma_area_list)
+{
+ struct parasite_ctl *ctl = NULL;
+ struct vma_area *vma_area;
+
+ if (!arch_can_dump_task(pid))
+ goto err;
+
+ /*
+ * Control block early setup.
+ */
+ ctl = xzalloc(sizeof(*ctl));
+ if (!ctl) {
+ pr_err("Parasite control block allocation failed (pid: %d)\n", pid);
+ goto err;
+ }
+
+ ctl->tsock = -1;
+
+ if (get_thread_ctx(pid, &ctl->orig))
+ goto err;
+
+ ctl->pid.real = pid;
+ ctl->pid.virt = 0;
+
+ if (vma_area_list == NULL)
+ return ctl;
+
+ /*
+ * Search a place for injecting syscall. MEMFD_FNAME_SZ extra bytes
+ * are requested as padding behind the syscall blob.
+ */
+ vma_area = get_vma_by_ip(&vma_area_list->h, REG_IP(ctl->orig.regs),
+ MEMFD_FNAME_SZ);
+ if (!vma_area) {
+ pr_err("No suitable VMA found to run parasite "
+ "bootstrap code (pid: %d)\n", pid);
+ goto err;
+ }
+
+ ctl->syscall_ip = vma_area->e->start;
+ pr_debug("Parasite syscall_ip at %p\n", (void *)ctl->syscall_ip);
+
+ return ctl;
+
+err:
+ xfree(ctl);
+ return NULL;
+}
+
+/*
+ * Set up the memory area shared with the task the legacy way: create
+ * an anonymous shared mapping inside the task and map the very same
+ * area locally through the task's map_files/ entry in proc.
+ *
+ * On success ctl->remote_map, ctl->local_map and ctl->map_length are
+ * set. Returns 0 on success, -1 on error.
+ */
+static int parasite_mmap_exchange(struct parasite_ctl *ctl, unsigned long size)
+{
+ int fd;
+
+ ctl->remote_map = mmap_seized(ctl, NULL, size,
+ PROT_READ | PROT_WRITE | PROT_EXEC,
+ MAP_ANONYMOUS | MAP_SHARED, -1, 0);
+ if (!ctl->remote_map) {
+ pr_err("Can't allocate memory for parasite blob (pid: %d)\n", ctl->pid.real);
+ return -1;
+ }
+
+ ctl->map_length = round_up(size, page_size());
+
+ /* The map_files entry is named after the mapping's start and end */
+ fd = open_proc_rw(ctl->pid.real, "map_files/%p-%p",
+ ctl->remote_map, ctl->remote_map + ctl->map_length);
+ if (fd < 0)
+ return -1;
+
+ ctl->local_map = mmap(NULL, size, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_FILE, fd, 0);
+ close(fd);
+
+ if (ctl->local_map == MAP_FAILED) {
+ ctl->local_map = NULL;
+ pr_perror("Can't map remote parasite map");
+ return -1;
+ }
+
+ return 0;
+}
+
+#ifdef CONFIG_HAS_MEMFD
+/*
+ * Set up the memory area shared with the task using a memfd created
+ * inside the task and opened locally via the task's fd/ entry in proc.
+ *
+ * Returns 0 on success, 1 if memfd_create is not available in the
+ * task (-ENOSYS) so the caller should fall back to another method,
+ * and a negative value on error.
+ */
+static int parasite_memfd_exchange(struct parasite_ctl *ctl, unsigned long size)
+{
+ /* The memfd name is placed right behind the injected syscall blob */
+ void *where = (void *)ctl->syscall_ip + BUILTIN_SYSCALL_SIZE;
+ u8 orig_code[MEMFD_FNAME_SZ] = MEMFD_FNAME;
+ pid_t pid = ctl->pid.real;
+ unsigned long sret = -ENOSYS;
+ int ret, fd, lfd;
+
+ BUILD_BUG_ON(sizeof(orig_code) < sizeof(long));
+
+ /* After the swap orig_code holds the task's original bytes */
+ if (ptrace_swap_area(pid, where, (void *)orig_code, sizeof(orig_code))) {
+ pr_err("Can't inject memfd args (pid: %d)\n", pid);
+ return -1;
+ }
+
+ ret = syscall_seized(ctl, __NR_memfd_create, &sret,
+ (unsigned long)where, 0, 0, 0, 0, 0);
+
+ /* Put the original bytes back before looking at the result */
+ if (ptrace_poke_area(pid, orig_code, where, sizeof(orig_code))) {
+ fd = (int)(long)sret;
+ if (fd >= 0)
+ syscall_seized(ctl, __NR_close, &sret, fd, 0, 0, 0, 0, 0);
+ pr_err("Can't restore memfd args (pid: %d)\n", pid);
+ return -1;
+ }
+
+ if (ret < 0)
+ return ret;
+
+ fd = (int)(long)sret;
+ if (fd == -ENOSYS)
+ return 1;
+ if (fd < 0)
+ return fd;
+
+ ctl->map_length = round_up(size, page_size());
+ lfd = open_proc_rw(ctl->pid.real, "fd/%d", fd);
+ if (lfd < 0)
+ goto err_cure;
+
+ if (ftruncate(lfd, ctl->map_length) < 0) {
+ pr_perror("Fail to truncate memfd for parasite");
+ goto err_cure;
+ }
+
+ ctl->remote_map = mmap_seized(ctl, NULL, size,
+ PROT_READ | PROT_WRITE | PROT_EXEC,
+ MAP_FILE | MAP_SHARED, fd, 0);
+ if (!ctl->remote_map) {
+ pr_err("Can't rmap memfd for parasite blob\n");
+ goto err_curef;
+ }
+
+ ctl->local_map = mmap(NULL, size, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_FILE, lfd, 0);
+ if (ctl->local_map == MAP_FAILED) {
+ ctl->local_map = NULL;
+ pr_perror("Can't lmap memfd for parasite blob");
+ goto err_curef;
+ }
+
+ /* Both mappings exist now, the descriptors are no longer needed */
+ syscall_seized(ctl, __NR_close, &sret, fd, 0, 0, 0, 0, 0);
+ close(lfd);
+
+ pr_info("Set up parasite blob using memfd\n");
+ return 0;
+
+err_curef:
+ close(lfd);
+err_cure:
+ syscall_seized(ctl, __NR_close, &sret, fd, 0, 0, 0, 0, 0);
+ return -1;
+}
+#else
+/* Built without memfd support: always ask the caller to fall back */
+static int parasite_memfd_exchange(struct parasite_ctl *ctl, unsigned long size)
+{
+ return 1;
+}
+#endif
+
+/*
+ * Set up the memory area shared between CRIU and the task. The memfd
+ * based method is tried first; when it reports that it is unavailable
+ * (returns 1) the legacy mmap based one is used instead.
+ *
+ * Returns 0 on success, non-zero on error.
+ */
+int parasite_map_exchange(struct parasite_ctl *ctl, unsigned long size)
+{
+	int ret = parasite_memfd_exchange(ctl, size);
+
+	if (ret != 1)
+		return ret;
+
+	pr_info("MemFD parasite doesn't work, goto legacy mmap\n");
+	return parasite_mmap_exchange(ctl, size);
+}
+
+/*
+ * Size requested for the parasite arguments area. It only grows via
+ * parasite_ensure_args_size() and never drops below
+ * PARASITE_ARG_SIZE_MIN.
+ */
+static unsigned long parasite_args_size = PARASITE_ARG_SIZE_MIN;
+/* Make sure the arguments area will be at least @sz bytes big. */
+void parasite_ensure_args_size(unsigned long sz)
+{
+ if (parasite_args_size < sz)
+ parasite_args_size = sz;
+}
+
+/*
+ * Save the task's registers into the core entry of @item, build the
+ * sigframe and switch the parasite into daemon mode.
+ * Returns 0 on success, -1 on error.
+ */
+static int parasite_start_daemon(struct parasite_ctl *ctl, struct pstree_item *item)
+{
+ pid_t pid = ctl->pid.real;
+
+ /*
+ * Get task registers before going daemon, since the
+ * get_task_regs needs to call ptrace on _stopped_ task,
+ * while in daemon it is not such.
+ */
+
+ if (get_task_regs(pid, ctl->orig.regs, item->core[0])) {
+ pr_err("Can't obtain regs for thread %d\n", pid);
+ return -1;
+ }
+
+ if (construct_sigframe(ctl->sigframe, ctl->rsigframe, item->core[0]))
+ return -1;
+
+ if (parasite_init_daemon(ctl, dmpi(item)->netns))
+ return -1;
+
+ return 0;
+}
+
+/*
+ * Inject the parasite into task @pid (which must be the first thread
+ * of @item) and start it in daemon mode.
+ *
+ * The shared area is laid out as: parasite blob, arguments area,
+ * sigframe, parasite stack and, for multi-threaded tasks, one more
+ * stack for running the parasite in other threads.
+ *
+ * Returns the parasite control block, or NULL on error (in which case
+ * the parasite is cured).
+ */
+struct parasite_ctl *parasite_infect_seized(pid_t pid, struct pstree_item *item,
+ struct vm_area_list *vma_area_list)
+{
+ int ret;
+ struct parasite_ctl *ctl;
+ unsigned long p, map_exchange_size;
+
+ BUG_ON(item->threads[0].real != pid);
+
+ ctl = parasite_prep_ctl(pid, vma_area_list);
+ if (!ctl)
+ return NULL;
+
+ parasite_ensure_args_size(dump_pages_args_size(vma_area_list));
+ parasite_ensure_args_size(aio_rings_args_size(vma_area_list));
+
+ /*
+ * Inject a parasite engine. Ie allocate memory inside alien
+ * space and copy engine code there. Then re-map the engine
+ * locally, so we will get an easy way to access engine memory
+ * without using ptrace at all.
+ */
+
+ ctl->args_size = round_up(parasite_args_size, PAGE_SIZE);
+ parasite_args_size = PARASITE_ARG_SIZE_MIN; /* reset for next task */
+ map_exchange_size = pie_size(parasite_blob) + ctl->args_size;
+ map_exchange_size += RESTORE_STACK_SIGFRAME + PARASITE_STACK_SIZE;
+ if (item->nr_threads > 1)
+ map_exchange_size += PARASITE_STACK_SIZE;
+
+ /* Record the blocked signals saved when the control block was prepared */
+ memcpy(&item->core[0]->tc->blk_sigset, &ctl->orig.sigmask, sizeof(k_rtsigset_t));
+
+ ret = parasite_map_exchange(ctl, map_exchange_size);
+ if (ret)
+ goto err_restore;
+
+ pr_info("Putting parasite blob into %p->%p\n", ctl->local_map, ctl->remote_map);
+ memcpy(ctl->local_map, parasite_blob, sizeof(parasite_blob));
+
+ ELF_RELOCS_APPLY_PARASITE(ctl->local_map, ctl->remote_map);
+
+ /*
+ * Setup the rest of a control block. The entry point is an address
+ * in the task (remote map), while the command and arguments are
+ * accessed by us through the local map.
+ */
+ ctl->parasite_ip = (unsigned long)parasite_sym(ctl->remote_map, __export_parasite_head_start);
+ ctl->addr_cmd = parasite_sym(ctl->local_map, __export_parasite_cmd);
+ ctl->addr_args = parasite_sym(ctl->local_map, __export_parasite_args);
+
+ p = pie_size(parasite_blob) + ctl->args_size;
+
+ ctl->rsigframe = ctl->remote_map + p;
+ ctl->sigframe = ctl->local_map + p;
+
+ /* rstack points at the far end of the stack region */
+ p += RESTORE_STACK_SIGFRAME;
+ p += PARASITE_STACK_SIZE;
+ ctl->rstack = ctl->remote_map + p;
+
+ if (item->nr_threads > 1) {
+ p += PARASITE_STACK_SIZE;
+ ctl->r_thread_stack = ctl->remote_map + p;
+ }
+
+ if (parasite_start_daemon(ctl, item))
+ goto err_restore;
+
+ return ctl;
+
+err_restore:
+ parasite_cure_seized(ctl);
+ return NULL;
+}
+
+/*
+ * Let task @pid run towards @addr inside a PIE blob.
+ *
+ * If a breakpoint could be set at @addr (ptrace_set_breakpoint()
+ * returned a positive value), *@tf is set to TRACE_EXIT. Otherwise the
+ * task is restarted in per-syscall tracing mode and *@tf is set to
+ * TRACE_ENTER.
+ *
+ * Returns 0 on success, a negative value on error.
+ */
+int ptrace_stop_pie(pid_t pid, void *addr, enum trace_flags *tf)
+{
+ int ret;
+
+ ret = ptrace_set_breakpoint(pid, addr);
+ if (ret < 0)
+ return ret;
+
+ if (ret > 0) {
+ /*
+ * PIE will stop on a breakpoint, next
+ * stop after that will be syscall enter.
+ */
+ *tf = TRACE_EXIT;
+ return 0;
+ }
+
+ /*
+ * No breakpoints available -- start tracing it
+ * in a per-syscall manner.
+ */
+ ret = ptrace(PTRACE_SYSCALL, pid, NULL, NULL);
+ if (ret) {
+ pr_perror("Unable to restart the %d process", pid);
+ return -1;
+ }
+
+ *tf = TRACE_ENTER;
+ return 0;
+}
diff --git a/criu/pie-util-fd.c b/criu/pie-util-fd.c
new file mode 120000
index 000000000000..4af261ede48c
--- /dev/null
+++ b/criu/pie-util-fd.c
@@ -0,0 +1 @@
+pie/util-fd.c
\ No newline at end of file
diff --git a/criu/pie-util-vdso.c b/criu/pie-util-vdso.c
new file mode 120000
index 000000000000..6e56238c237c
--- /dev/null
+++ b/criu/pie-util-vdso.c
@@ -0,0 +1 @@
+pie/util-vdso.c
\ No newline at end of file
diff --git a/criu/pie-util.c b/criu/pie-util.c
new file mode 120000
index 000000000000..238f297c82e7
--- /dev/null
+++ b/criu/pie-util.c
@@ -0,0 +1 @@
+pie/util.c
\ No newline at end of file
diff --git a/criu/pie/Makefile b/criu/pie/Makefile
new file mode 100644
index 000000000000..5197e1b2d442
--- /dev/null
+++ b/criu/pie/Makefile
@@ -0,0 +1,103 @@
+target += parasite
+target += restorer
+
+parasite-obj-y += parasite.o
+parasite-obj-y += ./$(ARCH_DIR)/parasite-head.o
+parasite-obj-e += ./$(ARCH_DIR)/syscalls.built-in.o
+
+restorer-obj-y += restorer.o
+restorer-obj-y += ./$(ARCH_DIR)/restorer.o
+restorer-obj-e += ./$(ARCH_DIR)/syscalls.built-in.o
+
+#
+# We can't provide proper mount implementation
+# in parasite code -- it requires run-time relocation
+# applications, which is not the target of the
+# project.
+#
+CFLAGS := $(filter-out -pg,$(CFLAGS)) -iquote pie/piegen -iquote arch/$(ARCH)/include -iquote $(SRC_DIR) -iquote $(SRC_DIR)/criu/include
+
+ifneq ($(filter-out ia32,$(ARCH)),)
+ ccflags-y += -DCR_NOGLIBC -fpie -Wa,--noexecstack -fno-stack-protector
+else
+ ccflags-y += -DCR_NOGLIBC -fno-pic -Wa,--noexecstack -fno-stack-protector
+endif
+
+ifeq ($(SRCARCH),arm)
+ ccflags-y += -marm
+endif
+
+asflags-y += -D__ASSEMBLY__
+
+GEN-OFFSETS := $(obj)/../../scripts/gen-offsets.sh
+BLOBS := $(obj)/parasite-blob.h $(obj)/restorer-blob.h
+
+PIELDS := pie.lds.S
+
+.SECONDARY:
+
+ifeq ($(piegen-y),y)
+target-name = $(patsubst pie/%-blob.h,%,$(1))
+
+ifeq ($(SRCARCH),ppc64)
+$(obj)/$(PIELDS): $(obj)/pie-reloc.lds.S.in
+ $(call msg-gen, $@)
+ $(Q) echo "OUTPUT_ARCH($(LDARCH))" > $(obj)/$(PIELDS)
+ $(Q) cat $< >> $(obj)/$(PIELDS)
+else
+ifeq ($(ARCH),x86)
+$(obj)/$(PIELDS): $(obj)/pie-reloc.lds.S.in
+ $(call msg-gen, $@)
+ $(Q) echo "OUTPUT_ARCH(i386:x86-64)" > $(obj)/$(PIELDS)
+ $(Q) echo "TARGET(elf64-x86-64)" >> $(obj)/$(PIELDS)
+ $(Q) cat $< >> $(obj)/$(PIELDS)
+else # i386 ia32
+$(obj)/$(PIELDS): $(obj)/pie-reloc.lds.S.in
+ $(call msg-gen, $@)
+ $(Q) echo "OUTPUT_ARCH(i386)" > $(obj)/$(PIELDS)
+ $(Q) echo "TARGET(elf32-i386)" >> $(obj)/$(PIELDS)
+ $(Q) cat $< >> $(obj)/$(PIELDS)
+endif
+endif
+
+ifeq ($(strip $(V)),)
+ piegen_stdout := >/dev/null
+endif
+
+$(obj)/%.built-in.bin.o: $(obj)/%.built-in.o $(obj)/lib.a $(obj)/$(PIELDS)
+ $(call msg-gen, $@)
+ $(Q) $(LD) -r -T $(obj)/$(PIELDS) -o $@ $< $(obj)/lib.a
+
+$(obj)/%-blob.h: $(obj)/%.built-in.bin.o $(obj)/$(PIELDS) pie/piegen
+ $(call msg-gen, $@)
+ $(Q) pie/piegen/piegen -f $< -v $(call target-name,$@)_relocs -p $(call target-name,$@)_blob_offset__ -s $(call target-name,$@)_blob -o $@ $(piegen_stdout)
+
+else
+
+$(obj)/$(PIELDS): $(obj)/$(PIELDS).in
+ $(call msg-gen, $@)
+ $(Q) $(SH) -c "echo 'OUTPUT_ARCH($(LDARCH))' > $(obj)/$(PIELDS)"
+ $(Q) $(SH) -c "cat $(obj)/$(PIELDS).in >> $(obj)/$(PIELDS)"
+
+$(obj)/%.built-in.bin.o: $(obj)/%.built-in.o $(obj)/$(PIELDS)
+ $(call msg-gen, $@)
+ $(Q) $(LD) -r -T $(obj)/$(PIELDS) -o $@ $<
+
+$(obj)/%.built-in.bin: $(obj)/%.built-in.bin.o
+ $(call msg-gen, $@)
+ $(Q) $(OBJCOPY) -O binary $^ $@
+
+$(obj)/%-blob.h: $(obj)/%.built-in.bin $(GEN-OFFSETS)
+ $(call msg-gen, $@)
+ $(Q) $(SH) $(GEN-OFFSETS) $(@:-blob.h=) $(notdir $(@:-blob.h=)) $(CROSS_COMPILE) > $@
+
+endif
+
+$(BLOBS): $(obj)/$(PIELDS)
+all-y += $(BLOBS)
+
+cleanup-y += $(obj)/$(PIELDS)
+cleanup-y += $(obj)/*.bin
+cleanup-y += $(BLOBS)
+cleanup-y += $(obj)/*.built-in.bin.o
+cleanup-y += $(obj)/*.built-in.bin
diff --git a/criu/pie/Makefile.library b/criu/pie/Makefile.library
new file mode 100644
index 000000000000..b96d2f05b578
--- /dev/null
+++ b/criu/pie/Makefile.library
@@ -0,0 +1,43 @@
+lib-y += log-simple.o
+lib-y += util-fd.o
+lib-y += util.o
+
+ifeq ($(VDSO),y)
+ lib-y += util-vdso.o
+ lib-y += parasite-vdso.o
+ lib-y += ./$(ARCH_DIR)/vdso-pie.o
+
+ ifeq ($(SRCARCH),aarch64)
+ lib-y += ./$(ARCH_DIR)/intraprocedure.o
+ endif
+
+ ifeq ($(SRCARCH),ppc64)
+ lib-y += ./$(ARCH_DIR)/vdso-trampoline.o
+ endif
+endif
+
+ifeq ($(SRCARCH),ppc64)
+ lib-y += ./$(ARCH_DIR)/memcpy_power7.o
+ lib-y += ./$(ARCH_DIR)/memcmp_64.o
+ lib-y += ./$(ARCH_DIR)/misc.o
+endif
+
+#
+# We can't provide proper mount implementation
+# in parasite code -- it requires run-time relocation
+# applications, which is not the target of the
+# project.
+#
+iquotes := -iquote pie/piegen -iquote arch/$(ARCH)/include -iquote $(SRC_DIR) -iquote $(SRC_DIR)/criu/include
+ccflags-y := $(filter-out -pg,$(CFLAGS)) $(iquotes)
+asflags-y := -D__ASSEMBLY__ $(iquotes)
+
+ifeq ($(SRCARCH),arm)
+ ccflags-y += -marm
+endif
+
+ifneq ($(filter-out ia32,$(ARCH)),)
+ ccflags-y += -DCR_NOGLIBC -fpie -Wa,--noexecstack -fno-stack-protector
+else
+ ccflags-y += -DCR_NOGLIBC -fno-pic -Wa,--noexecstack -fno-stack-protector
+endif
diff --git a/criu/pie/log-simple.c b/criu/pie/log-simple.c
new file mode 100644
index 000000000000..1cc877d2817f
--- /dev/null
+++ b/criu/pie/log-simple.c
@@ -0,0 +1,291 @@
+#include <stdarg.h>
+
+#include "asm/bitsperlong.h"
+
+#include "syscall.h"
+#include "log.h"
+
+ /* Fixed-size staging buffer in which one chunk of output text is assembled. */
+ struct simple_buf {
+ char buf[LOG_SIMPLE_CHUNK]; /* storage for the text being built */
+ char *bp; /* next free position inside buf */
+ void (*flush)(struct simple_buf *b); /* invoked by sbuf_putc() when buf fills up; may be NULL */
+ };
+
+static int logfd = -1;
+static int cur_loglevel = DEFAULT_LOGLEVEL;
+
+static void sbuf_log_flush(struct simple_buf *b);
+
+ /*
+  * Reset @b so that it holds only the "pie: " prefix and flushes
+  * through sbuf_log_flush().
+  */
+ static void sbuf_log_init(struct simple_buf *b)
+ {
+     char *p = b->buf;
+
+     *p++ = 'p';
+     *p++ = 'i';
+     *p++ = 'e';
+     *p++ = ':';
+     *p++ = ' ';
+
+     b->bp = p;
+     b->flush = sbuf_log_flush;
+ }
+
+ /*
+  * Write the accumulated text to logfd and re-arm the buffer.  Nothing
+  * is written while the buffer holds only the 5-byte prefix.
+  */
+ static void sbuf_log_flush(struct simple_buf *b)
+ {
+     if (b->bp != b->buf + 5) {
+         sys_write(logfd, b->buf, b->bp - b->buf);
+         sbuf_log_init(b);
+     }
+ }
+
+ /*
+  * Append one character to @b.  Once only two bytes of room remain, a
+  * ">\n" continuation marker is appended and the flush callback (if any)
+  * is invoked.  Characters arriving when the buffer is already full are
+  * dropped.
+  */
+ static void sbuf_putc(struct simple_buf *b, char c)
+ {
+     /* TODO: maybe some warning or error here? */
+     if (b->bp - b->buf >= LOG_SIMPLE_CHUNK)
+         return;
+
+     *b->bp++ = c;
+
+     if (b->bp - b->buf < LOG_SIMPLE_CHUNK - 2)
+         return;
+
+     *b->bp++ = '>';
+     *b->bp++ = '\n';
+     if (b->flush)
+         b->flush(b);
+ }
+
+ /*
+  * Switch logging to descriptor @fd.  The previously stored descriptor is
+  * closed first (this also happens for the initial value of -1).
+  */
+ void log_set_fd(int fd)
+ {
+ sys_close(logfd);
+ logfd = fd;
+ }
+
+ /* Set the level that print_on_level() compares message levels against. */
+ void log_set_loglevel(unsigned int level)
+ {
+ cur_loglevel = level;
+ }
+
+ /* Append the NUL-terminated string @msg to @b, one character at a time. */
+ static void print_string(const char *msg, struct simple_buf *b)
+ {
+     const char *c;
+
+     for (c = msg; *c; c++)
+         sbuf_putc(b, *c);
+ }
+
+ /*
+  * Render @num in decimal, right-aligned so that the last digit lands in
+  * buf[blen - 1].  No terminator is written; the caller provides it.
+  *
+  * @buf:  destination, must have room for every digit plus a sign
+  * @blen: number of usable bytes in @buf
+  * @num:  value to print
+  * @ps:   receives a pointer to the first character produced
+  *
+  * Returns the number of characters produced.
+  *
+  * The magnitude is computed in unsigned arithmetic: negating INT_MIN as
+  * a signed int is undefined and used to produce a bare "-".
+  */
+ int vprint_num(char *buf, int blen, int num, char **ps)
+ {
+     unsigned int mag = num;
+     char *s = buf + blen;
+
+     if (num < 0)
+         mag = 0U - mag;
+
+     /* do/while so that zero still yields a single '0' */
+     do {
+         *--s = (mag % 10) + '0';
+         mag /= 10;
+     } while (mag);
+
+     if (num < 0)
+         *--s = '-';
+
+     *ps = s;
+     return blen - (s - buf);
+ }
+
+ /* Append the decimal form of @num to @b. */
+ static void print_num(int num, struct simple_buf *b)
+ {
+     char digits[12];
+     char *start;
+
+     /* last byte is the terminator, the rest is for vprint_num() */
+     digits[sizeof(digits) - 1] = '\0';
+     vprint_num(digits, sizeof(digits) - 1, num, &start);
+     print_string(start, b);
+ }
+
+ /*
+  * Append the decimal form of the long @num to @b.
+  *
+  * The magnitude is computed in unsigned arithmetic: negating LONG_MIN as
+  * a signed long is undefined and used to produce a bare "-".
+  */
+ static void print_num_l(long num, struct simple_buf *b)
+ {
+     unsigned long mag = num;
+     char buf[22], *s = &buf[21];
+
+     *s = '\0';
+
+     if (num < 0)
+         mag = 0UL - mag;
+
+     /* do/while so that zero still yields a single '0' */
+     do {
+         *--s = (mag % 10) + '0';
+         mag /= 10;
+     } while (mag);
+
+     if (num < 0)
+         *--s = '-';
+
+     print_string(s, b);
+ }
+
+ /*
+  * Store the hex character for the low nibble of @v at @to.  When that
+  * character is not '0', *@z is pointed at @to so the caller can track
+  * the most recently written non-zero digit.
+  */
+ static void hexdigit(unsigned int v, char *to, char **z)
+ {
+     char digit = "0123456789abcdef"[v & 0xf];
+
+     *to = digit;
+     if (digit != '0')
+         *z = to;
+ }
+
+ /*
+  * Append @num to @b as "0x" followed by its hex digits, leading zeroes
+  * dropped (zero itself prints as "0x0").
+  */
+ static void print_hex(unsigned int num, struct simple_buf *b)
+ {
+     char buf[11], *z = &buf[9];
+     int nibble;
+
+     buf[10] = '\0';
+
+     /* least significant nibble lands in buf[9], most significant in buf[2] */
+     for (nibble = 0; nibble < 8; nibble++)
+         hexdigit(num >> (4 * nibble), &buf[9 - nibble], &z);
+
+     /* z is at the leading non-zero digit; put the prefix right before it */
+     z -= 2;
+     z[0] = '0';
+     z[1] = 'x';
+
+     print_string(z, b);
+ }
+
+ /*
+  * Append the unsigned long @num to @b as "0x" followed by its hex digits,
+  * leading zeroes dropped.  Sixteen nibbles are converted when
+  * BITS_PER_LONG is 64, eight otherwise.
+  */
+ static void print_hex_l(unsigned long num, struct simple_buf *b)
+ {
+ #if BITS_PER_LONG == 64
+     const int nr_nibbles = 16;
+ #else
+     const int nr_nibbles = 8;
+ #endif
+     char buf[19], *z = &buf[17];
+     int nibble;
+
+     buf[18] = '\0';
+
+     /* least significant nibble lands in buf[17] and we walk backwards */
+     for (nibble = 0; nibble < nr_nibbles; nibble++)
+         hexdigit(num >> (4 * nibble), &buf[17 - nibble], &z);
+
+     /* z is at the leading non-zero digit; put the prefix right before it */
+     z -= 2;
+     z[0] = '0';
+     z[1] = 'x';
+
+     print_string(z, b);
+ }
+
+ /*
+  * Minimal printf-style formatter that appends to @b.
+  *
+  * Supported conversions: %s, %d, %x, %p, with an optional "l" or "ll"
+  * prefix that makes %d and %x consume a long.  Anything else is reported
+  * inline as "UNKNOWN FORMAT " followed by the offending character.
+  *
+  * A format that ends right after '%' (or "%l"/"%ll") is reported as
+  * unknown and parsing stops there; previously the scan stepped over the
+  * terminating NUL and kept reading past the end of the string.
+  */
+ void sbuf_printf(struct simple_buf *b, const char *format, va_list args)
+ {
+     const char *s = format;
+
+     while (*s != '\0') {
+         int along = 0;
+
+         if (*s != '%') {
+             sbuf_putc(b, *s);
+             s++;
+             continue;
+         }
+
+         s++;
+         if (*s == 'l') {
+             along = 1;
+             s++;
+             if (*s == 'l')
+                 s++;
+         }
+
+         /* truncated conversion: never advance beyond the terminator */
+         if (*s == '\0') {
+             print_string("UNKNOWN FORMAT ", b);
+             break;
+         }
+
+         switch (*s) {
+         case 's':
+             print_string(va_arg(args, char *), b);
+             break;
+         case 'd':
+             if (along)
+                 print_num_l(va_arg(args, long), b);
+             else
+                 print_num(va_arg(args, int), b);
+             break;
+         case 'x':
+             if (along)
+                 print_hex_l(va_arg(args, unsigned long), b);
+             else
+                 print_hex(va_arg(args, unsigned int), b);
+             break;
+         case 'p':
+             print_hex_l((unsigned long)va_arg(args, void *), b);
+             break;
+         default:
+             print_string("UNKNOWN FORMAT ", b);
+             sbuf_putc(b, *s);
+             break;
+         }
+         s++;
+     }
+ }
+
+ /*
+  * Format a message and write it to the log descriptor, provided
+  * @loglevel does not exceed the currently configured level.
+  */
+ void print_on_level(unsigned int loglevel, const char *format, ...)
+ {
+     struct simple_buf b;
+     va_list args;
+
+     if (loglevel <= cur_loglevel) {
+         sbuf_log_init(&b);
+
+         va_start(args, format);
+         sbuf_printf(&b, format, args);
+         va_end(args);
+
+         sbuf_log_flush(&b);
+     }
+ }
+
+ /*
+  * Format into @output, which must hold LOG_SIMPLE_CHUNK bytes.  The
+  * result is always NUL-terminated.
+  *
+  * No flush callback is installed, so when the text fills the staging
+  * buffer sbuf_putc() leaves bp one past the end of buf.  The terminator
+  * position is clamped to the last byte of the buffer; otherwise the
+  * store and the copy below would run one byte past both buf and @output.
+  */
+ void simple_sprintf(char output[LOG_SIMPLE_CHUNK], const char *format, ...)
+ {
+     va_list args;
+     struct simple_buf b;
+     char *p;
+
+     b.bp = b.buf;
+     b.flush = NULL;
+
+     va_start(args, format);
+     sbuf_printf(&b, format, args);
+     va_end(args);
+
+     if (b.bp > b.buf + LOG_SIMPLE_CHUNK - 1)
+         b.bp = b.buf + LOG_SIMPLE_CHUNK - 1;
+     *b.bp = 0;
+
+     /* copy the text together with its terminator */
+     for (p = b.buf; p <= b.bp; p++)
+         output[p - b.buf] = *p;
+ }
diff --git a/criu/pie/parasite-vdso.c b/criu/pie/parasite-vdso.c
new file mode 100644
index 000000000000..9ee42e52875a
--- /dev/null
+++ b/criu/pie/parasite-vdso.c
@@ -0,0 +1,218 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <elf.h>
+#include <fcntl.h>
+#include <errno.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+
+#include "asm/string.h"
+#include "asm/types.h"
+
+#include "syscall.h"
+#include "image.h"
+#include "parasite-vdso.h"
+#include "vma.h"
+#include "log.h"
+#include "bug.h"
+
+#ifdef LOG_PREFIX
+# undef LOG_PREFIX
+#endif
+#define LOG_PREFIX "vdso: "
+
+
+ /*
+  * Move the @size bytes mapped at @from to exactly @to.  @who is only
+  * used for the debug message.  Returns 0 on success, -1 when mremap did
+  * not land at @to.
+  */
+ static int vdso_remap(char *who, unsigned long from, unsigned long to, size_t size)
+ {
+     unsigned long mapped;
+
+     pr_debug("Remap %s %lx -> %lx\n", who, from, to);
+
+     mapped = sys_mremap(from, size, size, MREMAP_MAYMOVE | MREMAP_FIXED, to);
+     if (mapped == to)
+         return 0;
+
+     pr_err("Unable to remap %lx -> %lx %lx\n",
+            from, to, mapped);
+     return -1;
+ }
+
+ /*
+  * Park runtime vDSO in some safe place where it can be accessible from restorer
+  *
+  * The vdso area and, when sym_rt->vvar_start is set, the vvar area are
+  * remapped back to back starting at @park_at, keeping whichever of the
+  * two has the lower start address first.  Results of the two remaps are
+  * OR-ed together, so the return value is 0 only when both succeeded.
+  */
+ int vdso_do_park(struct vdso_symtable *sym_rt, unsigned long park_at, unsigned long park_size)
+ {
+ int ret;
+
+ BUG_ON((vdso_vma_size(sym_rt) + vvar_vma_size(sym_rt)) < park_size);
+
+ if (sym_rt->vvar_start != VDSO_BAD_ADDR) {
+ if (sym_rt->vma_start < sym_rt->vvar_start) {
+ /* vdso comes first, vvar follows right after it */
+ ret = vdso_remap("rt-vdso", sym_rt->vma_start,
+ park_at, vdso_vma_size(sym_rt));
+ park_at += vdso_vma_size(sym_rt);
+ ret |= vdso_remap("rt-vvar", sym_rt->vvar_start,
+ park_at, vvar_vma_size(sym_rt));
+ } else {
+ /* vvar comes first, vdso follows right after it */
+ ret = vdso_remap("rt-vvar", sym_rt->vvar_start,
+ park_at, vvar_vma_size(sym_rt));
+ park_at += vvar_vma_size(sym_rt);
+ ret |= vdso_remap("rt-vdso", sym_rt->vma_start,
+ park_at, vdso_vma_size(sym_rt));
+ }
+ } else
+ ret = vdso_remap("rt-vdso", sym_rt->vma_start,
+ park_at, vdso_vma_size(sym_rt));
+ return ret;
+ }
+
+ /*
+  * Make the vdso recorded in the image usable on the running kernel.
+  *
+  * @who:               label used in error messages and passed to vdso_remap()
+  * @sym_rt:            symbol table of the runtime vdso
+  * @vdso_rt_parked_at: address the runtime vdso/vvar were parked at
+  * @index:             position in @vmas of the vdso or vvar entry; the
+  *                     entry right after it is inspected as well
+  * @vmas, @nr_vmas:    VMA entries from the image
+  *
+  * When the image vdso matches the runtime one (size, symbol offsets,
+  * vvar size and relative order) the runtime areas are remapped in place
+  * of the image ones.  Otherwise the image vdso is kept and its calls are
+  * redirected to the parked runtime vdso, which then gets a mark written
+  * into it.
+  *
+  * Returns 0 on success; -1 (or a non-zero remap result) on failure.
+  */
+ int vdso_proxify(char *who, struct vdso_symtable *sym_rt,
+ unsigned long vdso_rt_parked_at, size_t index,
+ VmaEntry *vmas, size_t nr_vmas)
+ {
+ VmaEntry *vma_vdso = NULL, *vma_vvar = NULL;
+ struct vdso_symtable s = VDSO_SYMTABLE_INIT;
+ bool remap_rt = false;
+
+ /*
+ * Figure out which kind of vdso tuple we get.
+ */
+ if (vma_entry_is(&vmas[index], VMA_AREA_VDSO))
+ vma_vdso = &vmas[index];
+ else if (vma_entry_is(&vmas[index], VMA_AREA_VVAR))
+ vma_vvar = &vmas[index];
+
+ if (index < (nr_vmas - 1)) {
+ if (vma_entry_is(&vmas[index + 1], VMA_AREA_VDSO))
+ vma_vdso = &vmas[index + 1];
+ else if (vma_entry_is(&vmas[index + 1], VMA_AREA_VVAR))
+ vma_vvar = &vmas[index + 1];
+ }
+
+ if (!vma_vdso) {
+ pr_err("Can't find vDSO area in image\n");
+ return -1;
+ }
+
+ /*
+ * vDSO mark overwrites Elf program header of proxy vDSO thus
+ * it must never ever be greater in size.
+ */
+ BUILD_BUG_ON(sizeof(struct vdso_mark) > sizeof(Elf64_Phdr));
+
+ /*
+ * Find symbols in vDSO zone read from image.
+ */
+ if (vdso_fill_symtable((void *)vma_vdso->start, vma_entry_len(vma_vdso), &s))
+ return -1;
+
+ /*
+ * Proxification strategy
+ *
+ * - There might be two vDSO zones: vdso code and optionally vvar data
+ * - To be able to use in-place remapping we need
+ *
+ * a) Size and order of vDSO zones are to match
+ * b) Symbols offsets must match
+ * c) Have same number of vDSO zones
+ */
+ if (vma_entry_len(vma_vdso) == vdso_vma_size(sym_rt)) {
+ size_t i;
+
+ for (i = 0; i < ARRAY_SIZE(s.symbols); i++) {
+ if (s.symbols[i].offset != sym_rt->symbols[i].offset)
+ break;
+ }
+
+ /* the loop ran to completion, i.e. every symbol offset matched */
+ if (i == ARRAY_SIZE(s.symbols)) {
+ if (vma_vvar && sym_rt->vvar_start != VVAR_BAD_ADDR) {
+ remap_rt = (vvar_vma_size(sym_rt) == vma_entry_len(vma_vvar));
+ if (remap_rt) {
+ long delta_rt = sym_rt->vvar_start - sym_rt->vma_start;
+ long delta_this = vma_vvar->start - vma_vdso->start;
+
+ /* XOR is negative when the two deltas differ in sign, i.e. vdso/vvar order differs */
+ remap_rt = (delta_rt ^ delta_this) < 0 ? false : true;
+ }
+ } else
+ remap_rt = true;
+ }
+ }
+
+ pr_debug("image [vdso] %lx-%lx [vvar] %lx-%lx\n",
+ vma_vdso->start, vma_vdso->end,
+ vma_vvar ? vma_vvar->start : VVAR_BAD_ADDR,
+ vma_vvar ? vma_vvar->end : VVAR_BAD_ADDR);
+
+ /*
+ * Easy case -- the vdso from image has same offsets, order and size
+ * as runtime, so we simply remap runtime vdso to dumpee position
+ * without generating any proxy.
+ *
+ * Note we may remap VVAR vdso as well which might not yet been mapped
+ * by a caller code. So drop VMA_AREA_REGULAR from it and caller would
+ * not touch it anymore.
+ */
+ if (remap_rt) {
+ int ret = 0;
+
+ pr_info("Runtime vdso/vvar matches dumpee, remap inplace\n");
+
+ if (sys_munmap((void *)vma_vdso->start, vma_entry_len(vma_vdso))) {
+ pr_err("Failed to unmap %s\n", who);
+ return -1;
+ }
+
+ if (vma_vvar) {
+ if (sys_munmap((void *)vma_vvar->start, vma_entry_len(vma_vvar))) {
+ pr_err("Failed to unmap %s\n", who);
+ return -1;
+ }
+
+ /* the parked areas are consumed in address order, lower one first */
+ if (vma_vdso->start < vma_vvar->start) {
+ ret = vdso_remap(who, vdso_rt_parked_at, vma_vdso->start, vdso_vma_size(sym_rt));
+ vdso_rt_parked_at += vdso_vma_size(sym_rt);
+ ret |= vdso_remap(who, vdso_rt_parked_at, vma_vvar->start, vvar_vma_size(sym_rt));
+ } else {
+ ret = vdso_remap(who, vdso_rt_parked_at, vma_vvar->start, vvar_vma_size(sym_rt));
+ vdso_rt_parked_at += vvar_vma_size(sym_rt);
+ ret |= vdso_remap(who, vdso_rt_parked_at, vma_vdso->start, vdso_vma_size(sym_rt));
+ }
+ } else
+ ret = vdso_remap(who, vdso_rt_parked_at, vma_vdso->start, vdso_vma_size(sym_rt));
+
+ return ret;
+ }
+
+ /*
+ * Now complex case -- we need to proxify calls. We redirect
+ * calls from dumpee vdso to runtime vdso, making dumpee
+ * to operate as proxy vdso.
+ */
+ pr_info("Runtime vdso mismatches dumpee, generate proxy\n");
+
+ /*
+ * Don't forget to shift if vvar is before vdso.
+ */
+ if (sym_rt->vvar_start != VDSO_BAD_ADDR &&
+ sym_rt->vvar_start < sym_rt->vma_start)
+ vdso_rt_parked_at += vvar_vma_size(sym_rt);
+
+ if (vdso_redirect_calls(vdso_rt_parked_at,
+ vma_vdso->start,
+ sym_rt, &s)) {
+ pr_err("Failed to proxify dumpee contents\n");
+ return -1;
+ }
+
+ /*
+ * Put a special mark into runtime vdso, thus at next checkpoint
+ * routine we could detect this vdso and do not dump it, since
+ * it's auto-generated every new session if proxy required.
+ */
+ /* NOTE(review): the mprotect results are not checked here */
+ sys_mprotect((void *)vdso_rt_parked_at, vdso_vma_size(sym_rt), PROT_WRITE);
+ vdso_put_mark((void *)vdso_rt_parked_at, vma_vdso->start, vma_vvar ? vma_vvar->start : VVAR_BAD_ADDR);
+ sys_mprotect((void *)vdso_rt_parked_at, vdso_vma_size(sym_rt), VDSO_PROT);
+ return 0;
+ }
diff --git a/criu/pie/parasite.c b/criu/pie/parasite.c
new file mode 100644
index 000000000000..7b1e324c05b6
--- /dev/null
+++ b/criu/pie/parasite.c
@@ -0,0 +1,727 @@
+#include <sys/mman.h>
+#include <errno.h>
+#include <signal.h>
+#include <linux/limits.h>
+#include <linux/capability.h>
+#include <sys/mount.h>
+#include <stdarg.h>
+#include <sys/ioctl.h>
+
+#include "syscall.h"
+#include "parasite.h"
+#include "config.h"
+#include "fcntl.h"
+#include "prctl.h"
+#include "lock.h"
+#include "parasite-vdso.h"
+#include "log.h"
+#include "tty.h"
+
+#include <string.h>
+
+#include "asm/types.h"
+#include "asm/parasite.h"
+#include "asm/restorer.h"
+
+static int tsock = -1;
+
+static struct rt_sigframe *sigframe;
+
+/*
+ * PARASITE_CMD_DUMPPAGES is called many times and the parasite args contains
+ * an array of VMAs at this time, so VMAs can be unprotected in any moment
+ */
+static struct parasite_dump_pages_args *mprotect_args = NULL;
+
+#ifndef SPLICE_F_GIFT
+#define SPLICE_F_GIFT 0x08
+#endif
+
+#ifndef PR_GET_PDEATHSIG
+#define PR_GET_PDEATHSIG 2
+#endif
+
+ /*
+  * Apply "original protection | args->add_prot" to every VMA listed in
+  * @args, stopping at the first failure.  The args are remembered in
+  * mprotect_args while extra protection bits are in effect and forgotten
+  * once add_prot is zero.  Returns the result of the last mprotect.
+  */
+ static int mprotect_vmas(struct parasite_dump_pages_args *args)
+ {
+     struct parasite_vma_entry *vma = pargs_vmas(args);
+     int ret = 0, i;
+
+     for (i = 0; i < args->nr_vmas; i++, vma++) {
+         ret = sys_mprotect((void *)vma->start, vma->len, vma->prot | args->add_prot);
+         if (!ret)
+             continue;
+
+         pr_err("mprotect(%08lx, %lu) failed with code %d\n",
+                vma->start, vma->len, ret);
+         break;
+     }
+
+     mprotect_args = args->add_prot ? args : NULL;
+
+     return ret;
+ }
+
+ /*
+  * Receive a pipe descriptor over the control socket and vmsplice the
+  * iovecs selected by args->off / args->nr_segs into it.  Succeeds only
+  * when exactly args->nr_pages pages were spliced.
+  */
+ static int dump_pages(struct parasite_dump_pages_args *args)
+ {
+     struct iovec *iovs;
+     int pipe_fd, spliced;
+
+     pipe_fd = recv_fd(tsock);
+     if (pipe_fd < 0)
+         return -1;
+
+     iovs = pargs_iovs(args);
+     spliced = sys_vmsplice(pipe_fd, &iovs[args->off], args->nr_segs,
+                            SPLICE_F_GIFT | SPLICE_F_NONBLOCK);
+
+     /* the descriptor is not needed any more, whatever the outcome */
+     sys_close(pipe_fd);
+
+     if (spliced != PAGE_SIZE * args->nr_pages) {
+         pr_err("Can't splice pages to pipe (%d/%d)\n", spliced, args->nr_pages);
+         return -1;
+     }
+
+     return 0;
+ }
+
+ /*
+  * Read the current action of every signal from 1 to SIGMAX, except
+  * SIGKILL and SIGSTOP, into da->sas[signal - 1].  Stops at the first
+  * failing sigaction and returns its result.
+  */
+ static int dump_sigact(struct parasite_dump_sa_args *da)
+ {
+     int slot, ret = 0;
+
+     for (slot = 0; slot < SIGMAX; slot++) {
+         int signo = slot + 1;
+
+         if (signo == SIGKILL || signo == SIGSTOP)
+             continue;
+
+         ret = sys_sigaction(signo, NULL, &da->sas[slot], sizeof(k_rtsigset_t));
+         if (ret >= 0)
+             continue;
+
+         pr_err("sys_sigaction failed (%d)\n", ret);
+         break;
+     }
+
+     return ret;
+ }
+
+ /*
+  * Fetch the REAL, VIRTUAL and PROF interval timers into @args, in that
+  * order, giving up at the first failure.  Returns the getitimer result.
+  */
+ static int dump_itimers(struct parasite_dump_itimers_args *args)
+ {
+     int ret;
+
+     ret = sys_getitimer(ITIMER_REAL, &args->real);
+     if (ret)
+         goto fail;
+
+     ret = sys_getitimer(ITIMER_VIRTUAL, &args->virt);
+     if (ret)
+         goto fail;
+
+     ret = sys_getitimer(ITIMER_PROF, &args->prof);
+     if (ret)
+         goto fail;
+
+     return 0;
+
+ fail:
+     pr_err("getitimer failed (%d)\n", ret);
+     return ret;
+ }
+
+ /*
+  * For each of the args->timer_n timers, read its current value and
+  * overrun count into args->timer[].
+  *
+  * Returns 0 on success or the negative syscall result on failure.  The
+  * overrun count is data, not a status: returning the last timer's count
+  * made a positive overrun look like a failure to the daemon loop, which
+  * treats any non-zero result as an error.
+  */
+ static int dump_posix_timers(struct parasite_dump_posix_timers_args *args)
+ {
+     int i;
+     int ret;
+
+     for (i = 0; i < args->timer_n; i++) {
+         ret = sys_timer_gettime(args->timer[i].it_id, &args->timer[i].val);
+         if (ret < 0) {
+             pr_err("sys_timer_gettime failed (%d)\n", ret);
+             return ret;
+         }
+
+         args->timer[i].overrun = sys_timer_getoverrun(args->timer[i].it_id);
+         ret = args->timer[i].overrun;
+         if (ret < 0) {
+             pr_err("sys_timer_getoverrun failed (%d)\n", ret);
+             return ret;
+         }
+     }
+
+     return 0;
+ }
+
+static int dump_creds(struct parasite_dump_creds *args);
+
+ /*
+  * Collect per-thread state into @ti: TLS, the tid address, the alternate
+  * signal stack, the parent-death signal and finally the credentials.
+  * Returns the first non-zero result, or the result of dump_creds().
+  */
+ static int dump_thread_common(struct parasite_dump_thread *ti)
+ {
+     int ret;
+
+     arch_get_tls(&ti->tls);
+
+     ret = sys_prctl(PR_GET_TID_ADDRESS, (unsigned long) &ti->tid_addr, 0, 0, 0);
+     if (ret)
+         return ret;
+
+     ret = sys_sigaltstack(NULL, &ti->sas);
+     if (ret)
+         return ret;
+
+     ret = sys_prctl(PR_GET_PDEATHSIG, (unsigned long)&ti->pdeath_sig, 0, 0, 0);
+     if (ret)
+         return ret;
+
+     return dump_creds(ti->creds);
+ }
+
+ /*
+  * Fill @args with the brk value, pid, sid, pgid, umask and dumpable flag
+  * of the task.  Always returns 0.
+  */
+ static int dump_misc(struct parasite_dump_misc *args)
+ {
+ args->brk = sys_brk(0);
+
+ args->pid = sys_getpid();
+ args->sid = sys_getsid();
+ args->pgid = sys_getpgid(0);
+ /* umask can only be read by setting it, so put the old value right back */
+ args->umask = sys_umask(0);
+ sys_umask(args->umask); /* never fails */
+ args->dumpable = sys_prctl(PR_GET_DUMPABLE, 0, 0, 0, 0);
+
+ return 0;
+ }
+
+ /*
+  * Collect the task's credentials into @args: effective, permitted,
+  * inheritable and bounding capability sets, securebits, supplementary
+  * groups and the real/effective/saved/fs uids and gids.
+  *
+  * args->cap_last_cap is read as input and bounds the capability numbers
+  * probed for the bounding set.
+  *
+  * Returns 0 on success, -1 on any failure.
+  */
+ static int dump_creds(struct parasite_dump_creds *args)
+ {
+     int ret, i, j;
+     struct cap_data data[_LINUX_CAPABILITY_U32S_3];
+     struct cap_header hdr = {_LINUX_CAPABILITY_VERSION_3, 0};
+
+     ret = sys_capget(&hdr, data);
+     if (ret < 0) {
+         pr_err("Unable to get capabilities: %d\n", ret);
+         return -1;
+     }
+
+     /*
+      * Loop through the capability constants until we reach cap_last_cap.
+      * The cap_bnd set is stored as a bitmask comprised of CR_CAP_SIZE number of
+      * 32-bit uints, hence the inner loop from 0 to 32.
+      */
+     for (i = 0; i < CR_CAP_SIZE; i++) {
+         args->cap_eff[i] = data[i].eff;
+         args->cap_prm[i] = data[i].prm;
+         args->cap_inh[i] = data[i].inh;
+         args->cap_bnd[i] = 0;
+
+         for (j = 0; j < 32; j++) {
+             if (j + i * 32 > args->cap_last_cap)
+                 break;
+             ret = sys_prctl(PR_CAPBSET_READ, j + i * 32, 0, 0, 0);
+             if (ret < 0) {
+                 pr_err("Unable to read capability %d: %d\n",
+                        j + i * 32, ret);
+                 return -1;
+             }
+             /* unsigned constant: shifting a signed 1 by 31 is undefined */
+             if (ret)
+                 args->cap_bnd[i] |= (1U << j);
+         }
+     }
+
+     args->secbits = sys_prctl(PR_GET_SECUREBITS, 0, 0, 0, 0);
+
+     /* a zero-sized query returns the number of supplementary groups */
+     ret = sys_getgroups(0, NULL);
+     if (ret < 0)
+         goto grps_err;
+
+     args->ngroups = ret;
+     if (args->ngroups >= PARASITE_MAX_GROUPS) {
+         pr_err("Too many groups in task %d\n", (int)args->ngroups);
+         return -1;
+     }
+
+     ret = sys_getgroups(args->ngroups, args->groups);
+     if (ret < 0)
+         goto grps_err;
+
+     if (ret != args->ngroups) {
+         pr_err("Groups changed on the fly %d -> %d\n",
+                args->ngroups, ret);
+         return -1;
+     }
+
+     ret = sys_getresuid(&args->uids[0], &args->uids[1], &args->uids[2]);
+     if (ret) {
+         pr_err("Unable to get uids: %d\n", ret);
+         return -1;
+     }
+
+     /* setfsuid(-1) is used only for the value it returns */
+     args->uids[3] = sys_setfsuid(-1L);
+
+     ret = sys_getresgid(&args->gids[0], &args->gids[1], &args->gids[2]);
+     if (ret) {
+         pr_err("Unable to get gids: %d\n", ret);
+         return -1;
+     }
+
+     /* setfsgid(-1) is used only for the value it returns */
+     args->gids[3] = sys_setfsgid(-1L);
+
+     return 0;
+
+ grps_err:
+     pr_err("Error calling getgroups (%d)\n", ret);
+     return -1;
+ }
+
+ /*
+  * Send the args->nr_fds descriptors listed in args->fds over the
+  * control socket.  Returns the send_fds() result.
+  */
+ static int drain_fds(struct parasite_drain_fd *args)
+ {
+     int err = send_fds(tsock, NULL, 0, args->fds, args->nr_fds, true);
+
+     if (err)
+         pr_err("send_fds failed (%d)\n", err);
+
+     return err;
+ }
+
+ /* Record the calling thread's tid, then the rest of the per-thread state. */
+ static int dump_thread(struct parasite_dump_thread *args)
+ {
+ args->tid = sys_gettid();
+ return dump_thread_common(args);
+ }
+
+static char proc_mountpoint[] = "proc.crtools";
+ /*
+  * Open a /proc directory and send its descriptor over the control socket.
+  *
+  * If /proc/self reads as "1" the existing /proc is opened directly;
+  * otherwise a fresh proc instance is mounted on proc_mountpoint and
+  * opened through open_detach_mount().
+  *
+  * Returns the send_fd() result, or a negative value on earlier failure.
+  */
+ static int parasite_get_proc_fd()
+ {
+ int ret, fd = -1;
+ char buf[2];
+
+ /* two bytes suffice: only a one-character link target of "1" is of interest */
+ ret = sys_readlinkat(AT_FDCWD, "/proc/self", buf, sizeof(buf));
+ if (ret < 0 && ret != -ENOENT) {
+ pr_err("Can't readlink /proc/self (%d)\n", ret);
+ return ret;
+ }
+
+ /* Fast path -- if /proc belongs to this pidns */
+ if (ret == 1 && buf[0] == '1') {
+ fd = sys_open("/proc", O_RDONLY, 0);
+ goto out_send_fd;
+ }
+
+ ret = sys_mkdir(proc_mountpoint, 0700);
+ if (ret) {
+ pr_err("Can't create a directory (%d)\n", ret);
+ return -1;
+ }
+
+ ret = sys_mount("proc", proc_mountpoint, "proc", MS_MGC_VAL, NULL);
+ if (ret) {
+ pr_err("mount failed (%d)\n", ret);
+ sys_rmdir(proc_mountpoint);
+ return -1;
+ }
+
+ /* NOTE(review): presumably this also unmounts and removes the directory -- confirm */
+ fd = open_detach_mount(proc_mountpoint);
+ out_send_fd:
+ if (fd < 0)
+ return fd;
+ ret = send_fd(tsock, NULL, 0, fd);
+ sys_close(fd);
+ return ret;
+ }
+
+ /*
+  * Issue a "get" ioctl on @fd storing the answer in *@arg.  -ENOTTY is
+  * not treated as an error: *@arg is zeroed and 0 is returned.  Any other
+  * negative result is passed back to the caller.
+  */
+ static inline int tty_ioctl(int fd, int cmd, int *arg)
+ {
+     int ret = sys_ioctl(fd, cmd, (unsigned long)arg);
+
+     if (ret >= 0)
+         return 0;
+
+     if (ret != -ENOTTY)
+         return ret;
+
+     *arg = 0;
+     return 0;
+ }
+
+/*
+ * Stolen from kernel/fs/aio.c
+ *
+ * Is it valid to go to memory and check it? Should be,
+ * as libaio does the same.
+ */
+
+#define AIO_RING_MAGIC 0xa10a10a1
+#define AIO_RING_COMPAT_FEATURES 1
+#define AIO_RING_INCOMPAT_FEATURES 0
+
+ /* Header found at the start of an AIO ring mapping; validated by sane_ring(). */
+ struct aio_ring {
+ unsigned id; /* kernel internal index number */
+ unsigned nr; /* number of io_events */
+ unsigned head; /* Written to by userland or under ring_lock
+ * mutex by aio_read_events_ring(). */
+ unsigned tail; /* equals head when no events are pending */
+
+ unsigned magic; /* expected to be AIO_RING_MAGIC */
+ unsigned compat_features; /* expected to be AIO_RING_COMPAT_FEATURES */
+ unsigned incompat_features; /* expected to be AIO_RING_INCOMPAT_FEATURES */
+ unsigned header_length; /* size of aio_ring */
+
+
+ /* struct io_event io_events[0]; */
+ };
+
+ /*
+  * Return non-zero when the header at @ring carries the expected magic,
+  * feature words and header length.
+  */
+ static int sane_ring(struct aio_ring *ring)
+ {
+     if (ring->magic != AIO_RING_MAGIC)
+         return 0;
+     if (ring->compat_features != AIO_RING_COMPAT_FEATURES)
+         return 0;
+     if (ring->incompat_features != AIO_RING_INCOMPAT_FEATURES)
+         return 0;
+
+     return ring->header_length == sizeof(struct aio_ring);
+ }
+
+ /*
+  * Validate every AIO ring listed in @args and record its capacity in
+  * args->ring[].max_reqs.  Fails when a ring header is not sane or when
+  * a ring still has requests pending (head != tail).
+  */
+ static int parasite_check_aios(struct parasite_check_aios_args *args)
+ {
+     int idx;
+
+     for (idx = 0; idx < args->nr_rings; idx++) {
+         struct aio_ring *ring = (struct aio_ring *)args->ring[idx].ctx;
+
+         if (!sane_ring(ring)) {
+             pr_err("Not valid ring #%d\n", idx);
+             pr_info(" `- magic %x\n", ring->magic);
+             pr_info(" `- cf %d\n", ring->compat_features);
+             pr_info(" `- if %d\n", ring->incompat_features);
+             pr_info(" `- size %d (%zd)\n", ring->header_length, sizeof(struct aio_ring));
+             return -1;
+         }
+
+         /*
+          * XXX what else can we do if there are requests
+          * in the ring?
+          */
+         if (ring->head != ring->tail) {
+             pr_err("Pending AIO requests in ring #%d\n", idx);
+             return -1;
+         }
+
+         args->ring[idx].max_reqs = ring->nr;
+     }
+
+     return 0;
+ }
+
+ /*
+  * Query terminal state of args->fd: session id, foreground process group
+  * and exclusive mode, plus packet mode and pty lock when args->type is
+  * TTY_TYPE__PTY.
+  *
+  * An -EIO from any of the ioctls is reported as args->hangup = true with
+  * a successful return.  Any other error returns -1.
+  */
+ static int parasite_dump_tty(struct parasite_tty_args *args)
+ {
+ int ret;
+
+ /* fallback definitions in case the headers in use lack these ioctls */
+ #ifndef TIOCGPKT
+ # define TIOCGPKT _IOR('T', 0x38, int)
+ #endif
+
+ #ifndef TIOCGPTLCK
+ # define TIOCGPTLCK _IOR('T', 0x39, int)
+ #endif
+
+ #ifndef TIOCGEXCL
+ # define TIOCGEXCL _IOR('T', 0x40, int)
+ #endif
+
+ args->sid = 0;
+ args->pgrp = 0;
+ args->st_pckt = 0;
+ args->st_lock = 0;
+ args->st_excl = 0;
+
+ /*
+ * Run one ioctl and branch out of the function on failure.
+ * tty_ioctl() already turns -ENOTTY into success, so the -ENOTTY
+ * arm below is not expected to trigger.
+ */
+ #define __tty_ioctl(cmd, arg) \
+ do { \
+ ret = tty_ioctl(args->fd, cmd, &arg); \
+ if (ret < 0) { \
+ if (ret == -ENOTTY) \
+ arg = 0; \
+ else if (ret == -EIO) \
+ goto err_io; \
+ else \
+ goto err; \
+ } \
+ } while (0)
+
+ __tty_ioctl(TIOCGSID, args->sid);
+ __tty_ioctl(TIOCGPGRP, args->pgrp);
+ __tty_ioctl(TIOCGEXCL, args->st_excl);
+
+ if (args->type == TTY_TYPE__PTY) {
+ __tty_ioctl(TIOCGPKT, args->st_pckt);
+ __tty_ioctl(TIOCGPTLCK, args->st_lock);
+ }
+
+ args->hangup = false;
+ return 0;
+
+ err:
+ pr_err("tty: Can't fetch params: err = %d\n", ret);
+ return -1;
+ err_io:
+
+ /* kernel reports EIO for get ioctls on pair-less ptys */
+ args->hangup = true;
+ return 0;
+ #undef __tty_ioctl
+ }
+
+ #ifdef CONFIG_VDSO
+ /*
+  * Inspect the memory at args->start for a vdso mark.
+  *
+  * With a mark present, the proxy vdso/vvar addresses stored in it are
+  * copied to @args and is_marked is set.  Without one, is_marked is
+  * cleared, the proxy addresses are set to the BAD_ADDR values and, if
+  * args->try_fill_symtable is set, is_vdso reports whether the area
+  * parses as a vdso.
+  *
+  * Returns 0, or -EINVAL when a mark with an unexpected version is found.
+  */
+ static int parasite_check_vdso_mark(struct parasite_vdso_vma_entry *args)
+ {
+ struct vdso_mark *m = (void *)args->start;
+
+ if (is_vdso_mark(m)) {
+ /*
+ * Make sure we don't meet some corrupted entry
+ * where signature matches but version does not!
+ */
+ if (m->version != VDSO_MARK_CUR_VERSION) {
+ pr_err("vdso: Mark version mismatch!\n");
+ return -EINVAL;
+ }
+ args->is_marked = 1;
+ args->proxy_vdso_addr = m->proxy_vdso_addr;
+ args->proxy_vvar_addr = m->proxy_vvar_addr;
+ } else {
+ args->is_marked = 0;
+ args->proxy_vdso_addr = VDSO_BAD_ADDR;
+ args->proxy_vvar_addr = VVAR_BAD_ADDR;
+
+ if (args->try_fill_symtable) {
+ /* the table itself is discarded, only the parse result matters */
+ struct vdso_symtable t;
+
+ if (vdso_fill_symtable((void *)args->start, args->len, &t))
+ args->is_vdso = false;
+ else
+ args->is_vdso = true;
+ }
+ }
+
+ return 0;
+ }
+ #else
+ /* Built without CONFIG_VDSO: the command is not expected, so fail. */
+ static inline int parasite_check_vdso_mark(struct parasite_vdso_vma_entry *args)
+ {
+ pr_err("Unexpected VDSO check command\n");
+ return -1;
+ }
+ #endif
+
+ /*
+  * Send an acknowledgement for @cmd carrying status @err over the control
+  * socket.  Returns 0 when the whole message went out, -1 otherwise.
+  */
+ static int __parasite_daemon_reply_ack(unsigned int cmd, int err)
+ {
+     struct ctl_msg m = ctl_msg_ack(cmd, err);
+     int sent;
+
+     sent = sys_sendto(tsock, &m, sizeof(m), 0, NULL, 0);
+     if (sent != sizeof(m)) {
+         pr_err("Sent only %d bytes while %zd expected\n", sent, sizeof(m));
+         return -1;
+     }
+
+     pr_debug("__sent ack msg: %d %d %d\n",
+              m.cmd, m.ack, m.err);
+
+     return 0;
+ }
+
+ /*
+  * Block until one complete control message arrives on the control
+  * socket and store it in *@m.  Returns 0 on success, -1 when the
+  * receive did not deliver a full message.
+  *
+  * The former "while (1)" wrapper returned on its first pass in every
+  * branch, so it and the unreachable trailing return are gone.
+  */
+ static int __parasite_daemon_wait_msg(struct ctl_msg *m)
+ {
+     int ret;
+
+     pr_debug("Daemon waits for command\n");
+
+     *m = (struct ctl_msg){ };
+     ret = sys_recvfrom(tsock, m, sizeof(*m), MSG_WAITALL, NULL, 0);
+     if (ret != sizeof(*m)) {
+         pr_err("Trimmed message received (%d/%d)\n",
+                (int)sizeof(*m), ret);
+         return -1;
+     }
+
+     pr_debug("__fetched msg: %d %d %d\n",
+              m->cmd, m->ack, m->err);
+     return 0;
+ }
+
+ /*
+  * Leave the parasite through ARCH_RT_SIGRETURN with @new_sp.  Its
+  * address is handed out via args->sigreturn_addr in parasite_init_daemon().
+  */
+ static noinline void fini_sigreturn(unsigned long new_sp)
+ {
+ ARCH_RT_SIGRETURN(new_sp);
+ }
+
+ /*
+  * Tear the parasite down: undo any extra VMA protection still in effect,
+  * close the control socket and the log descriptor, then leave through
+  * fini_sigreturn().  Reaching the code after that call is a BUG().
+  */
+ static int fini()
+ {
+ unsigned long new_sp;
+
+ /* extra protection bits are still applied -- drop them first */
+ if (mprotect_args) {
+ mprotect_args->add_prot = 0;
+ mprotect_vmas(mprotect_args);
+ }
+
+ new_sp = (long)sigframe + SIGFRAME_OFFSET;
+ pr_debug("%ld: new_sp=%lx ip %lx\n", sys_gettid(),
+ new_sp, RT_SIGFRAME_REGIP(sigframe));
+
+ sys_close(tsock);
+ log_set_fd(-1);
+
+ fini_sigreturn(new_sp);
+
+ BUG();
+
+ return -1;
+ }
+
+ /*
+  * Command loop of the parasite daemon.
+  *
+  * Acknowledges PARASITE_CMD_INIT_DAEMON, then keeps receiving commands
+  * from the control socket, running the matching handler on @args and
+  * acknowledging each with the handler's result.  After a handler has
+  * failed, the write side of the socket is shut down and every further
+  * command except PARASITE_CMD_FINI is rejected.  The loop ends on
+  * PARASITE_CMD_FINI or on a socket error, and fini() is called.
+  *
+  * The duplicated "noinline" specifier in the declaration was dropped.
+  */
+ static noinline __used int parasite_daemon(void *args)
+ {
+     struct ctl_msg m = { };
+     int ret = -1;
+
+     pr_debug("Running daemon thread leader\n");
+
+     /* Reply we're alive */
+     if (__parasite_daemon_reply_ack(PARASITE_CMD_INIT_DAEMON, 0))
+         goto out;
+
+     ret = 0;
+
+     while (1) {
+         if (__parasite_daemon_wait_msg(&m))
+             break;
+
+         /* a previous command failed: only FINI is honoured from now on */
+         if (ret && m.cmd != PARASITE_CMD_FINI) {
+             pr_err("Command rejected\n");
+             continue;
+         }
+
+         switch (m.cmd) {
+         case PARASITE_CMD_FINI:
+             goto out;
+         case PARASITE_CMD_DUMPPAGES:
+             ret = dump_pages(args);
+             break;
+         case PARASITE_CMD_MPROTECT_VMAS:
+             ret = mprotect_vmas(args);
+             break;
+         case PARASITE_CMD_DUMP_SIGACTS:
+             ret = dump_sigact(args);
+             break;
+         case PARASITE_CMD_DUMP_ITIMERS:
+             ret = dump_itimers(args);
+             break;
+         case PARASITE_CMD_DUMP_POSIX_TIMERS:
+             ret = dump_posix_timers(args);
+             break;
+         case PARASITE_CMD_DUMP_THREAD:
+             ret = dump_thread(args);
+             break;
+         case PARASITE_CMD_DUMP_MISC:
+             ret = dump_misc(args);
+             break;
+         case PARASITE_CMD_DRAIN_FDS:
+             ret = drain_fds(args);
+             break;
+         case PARASITE_CMD_GET_PROC_FD:
+             ret = parasite_get_proc_fd();
+             break;
+         case PARASITE_CMD_DUMP_TTY:
+             ret = parasite_dump_tty(args);
+             break;
+         case PARASITE_CMD_CHECK_AIOS:
+             ret = parasite_check_aios(args);
+             break;
+         case PARASITE_CMD_CHECK_VDSO_MARK:
+             ret = parasite_check_vdso_mark(args);
+             break;
+         default:
+             pr_err("Unknown command in parasite daemon thread leader: %d\n", m.cmd);
+             ret = -1;
+             break;
+         }
+
+         if (__parasite_daemon_reply_ack(m.cmd, ret))
+             break;
+
+         if (ret) {
+             pr_err("Close the control socket for writing\n");
+             sys_shutdown(tsock, SHUT_WR);
+         }
+     }
+
+ out:
+     fini();
+
+     return 0;
+ }
+
+ /*
+  * Unmap the region described by @data (a struct parasite_unmap_args).
+  * Execution is not expected to continue past the munmap; if it does,
+  * BUG() is hit.
+  */
+ static noinline int unmap_itself(void *data)
+ {
+ struct parasite_unmap_args *args = data;
+
+ sys_munmap(args->parasite_start, args->parasite_len);
+ /*
+ * This call to sys_munmap must never return. Instead, the controlling
+ * process must trap us on the exit from munmap.
+ */
+
+ BUG();
+ return -1;
+ }
+
+ /*
+  * Bring the daemon up: publish the sigreturn helper address, remember
+  * the sigframe, connect the control socket to the address supplied in
+  * @data, receive the log descriptor and enter the command loop.  Every
+  * path, including failures, ends in fini().
+  */
+ static noinline __used int parasite_init_daemon(void *data)
+ {
+     struct parasite_init_args *args = data;
+     int ret;
+
+     args->sigreturn_addr = fini_sigreturn;
+     sigframe = args->sigframe;
+
+     tsock = sys_socket(PF_UNIX, SOCK_SEQPACKET, 0);
+     if (tsock < 0) {
+         pr_err("Can't create socket: %d\n", tsock);
+         goto err;
+     }
+
+     ret = sys_connect(tsock, (struct sockaddr *)&args->h_addr, args->h_addr_len);
+     if (ret < 0) {
+         pr_err("Can't connect the control socket\n");
+         goto err;
+     }
+
+     /* the first descriptor sent over the socket is the log file */
+     ret = recv_fd(tsock);
+     if (ret < 0)
+         goto err;
+
+     log_set_fd(ret);
+     log_set_loglevel(args->log_level);
+
+     parasite_daemon(data);
+
+ err:
+     fini();
+     BUG();
+
+     return -1;
+ }
+
+#ifndef __parasite_entry
+# define __parasite_entry
+#endif
+
+ /*
+  * Entry point called with a command number and its argument block.
+  * Dispatches the three commands handled outside the daemon loop and
+  * returns -EINVAL for anything else.
+  */
+ int __used __parasite_entry parasite_service(unsigned int cmd, void *args)
+ {
+     pr_info("Parasite cmd %d/%x process\n", cmd, cmd);
+
+     if (cmd == PARASITE_CMD_DUMP_THREAD)
+         return dump_thread(args);
+
+     if (cmd == PARASITE_CMD_INIT_DAEMON)
+         return parasite_init_daemon(args);
+
+     if (cmd == PARASITE_CMD_UNMAP)
+         return unmap_itself(args);
+
+     pr_err("Unknown command to parasite: %d\n", cmd);
+     return -EINVAL;
+ }
diff --git a/criu/pie/pie-reloc.lds.S.in b/criu/pie/pie-reloc.lds.S.in
new file mode 100644
index 000000000000..051d1d42740d
--- /dev/null
+++ b/criu/pie/pie-reloc.lds.S.in
@@ -0,0 +1,30 @@
+SECTIONS
+{
+ .text : {
+ *(.head.text)
+ *(.text*)
+ }
+
+ .data : {
+ *(.data*)
+ *(.bss*)
+ }
+
+ .rodata : {
+ *(.rodata*)
+ *(.got*)
+ }
+
+ .toc : ALIGN(8) {
+ *(.toc*)
+ }
+
+ /DISCARD/ : {
+ *(.debug*)
+ *(.comment*)
+ *(.note*)
+ *(.group*)
+ *(.eh_frame*)
+ }
+__export_parasite_args = .;
+}
diff --git a/criu/pie/pie-relocs.c b/criu/pie/pie-relocs.c
new file mode 100644
index 000000000000..7e825b2320d9
--- /dev/null
+++ b/criu/pie/pie-relocs.c
@@ -0,0 +1,47 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <string.h>
+
+#include <fcntl.h>
+#include <elf.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+
+#include "asm-generic/int.h"
+
+#include "compiler.h"
+#include "piegen/uapi/types.h"
+#include "bug.h"
+
+__maybe_unused void elf_relocs_apply(void *mem, void *vbase, size_t size, elf_reloc_t *elf_relocs, size_t nr_relocs)
+{
+ size_t i, j; /* i walks the relocation table, j counts the GOTPCREL slots filled so far */
+ /* Patch the blob copy at @mem entry by entry, rebasing symbol values onto @vbase. */
+ for (i = 0, j = 0; i < nr_relocs; i++) {
+ if (elf_relocs[i].type & PIEGEN_TYPE_LONG) {
+ long *where = mem + elf_relocs[i].offset; /* location inside the blob to patch */
+ long *p = mem + size; /* slot array starting @size bytes into @mem */
+
+ if (elf_relocs[i].type & PIEGEN_TYPE_GOTPCREL) {
+ int *value = (int *)where;
+ int rel;
+
+ p[j] = (long)vbase + elf_relocs[i].value; /* slot receives the rebased symbol value */
+ rel = (unsigned)((void *)&p[j] - (void *)mem) - elf_relocs[i].offset + elf_relocs[i].addend;
+
+ *value = rel; /* 32-bit distance from the patched location to its slot, plus addend */
+ j++;
+ } else
+ *where = elf_relocs[i].value + elf_relocs[i].addend + (unsigned long)vbase;
+ } else if (elf_relocs[i].type & PIEGEN_TYPE_INT) {
+ int *where = (mem + elf_relocs[i].offset);
+ *where = elf_relocs[i].value + elf_relocs[i].addend + (unsigned long)vbase;
+ } else
+ BUG(); /* entry is neither PIEGEN_TYPE_LONG nor PIEGEN_TYPE_INT */
+ }
+}
diff --git a/criu/pie/pie-relocs.h b/criu/pie/pie-relocs.h
new file mode 100644
index 000000000000..1449ca630908
--- /dev/null
+++ b/criu/pie/pie-relocs.h
@@ -0,0 +1,29 @@
+#ifndef __PIE_RELOCS_H__
+#define __PIE_RELOCS_H__
+
+#include "piegen/uapi/types.h"
+
+#include "compiler.h"
+#include "config.h"
+
+#ifdef CONFIG_PIEGEN
+
+extern __maybe_unused void elf_relocs_apply(void *mem, void *vbase, size_t size,
+ elf_reloc_t *elf_relocs, size_t nr_relocs);
+#define pie_size(__blob_name) (round_up(sizeof(__blob_name) + nr_gotpcrel * sizeof(long), page_size()))
+#define ELF_RELOCS_APPLY_PARASITE(__mem, __vbase) \
+ elf_relocs_apply(__mem, __vbase, sizeof(parasite_blob), \
+ parasite_relocs, ARRAY_SIZE(parasite_relocs))
+#define ELF_RELOCS_APPLY_RESTORER(__mem, __vbase) \
+ elf_relocs_apply(__mem, __vbase, sizeof(restorer_blob), \
+ restorer_relocs, ARRAY_SIZE(restorer_relocs))
+
+#else
+
+#define pie_size(__blob_name) (round_up(sizeof(__blob_name), page_size()))
+#define ELF_RELOCS_APPLY_PARASITE(__mem, __vbase)
+#define ELF_RELOCS_APPLY_RESTORER(__mem, __vbase)
+
+#endif
+
+#endif /* __PIE_RELOCS_H__ */
diff --git a/criu/pie/pie.lds.S.in b/criu/pie/pie.lds.S.in
new file mode 100644
index 000000000000..9e9c97f003c3
--- /dev/null
+++ b/criu/pie/pie.lds.S.in
@@ -0,0 +1,29 @@
+SECTIONS
+{
+ .crblob 0x0 : {
+ *(.head.text)
+ *(.text*)
+ . = ALIGN(32);
+ *(.data*)
+ . = ALIGN(32);
+ *(.rodata*)
+ . = ALIGN(32);
+ *(.bss*)
+ . = ALIGN(32);
+ *(.got*)
+ . = ALIGN(32);
+ *(.toc*)
+ . = ALIGN(32);
+ } =0x00000000,
+
+ /DISCARD/ : {
+ *(.debug*)
+ *(.comment*)
+ *(.note*)
+ *(.group*)
+ *(.eh_frame*)
+ *(*)
+ }
+
+__export_parasite_args = .;
+}
diff --git a/criu/pie/piegen/Makefile b/criu/pie/piegen/Makefile
new file mode 100644
index 000000000000..5c3d68b84817
--- /dev/null
+++ b/criu/pie/piegen/Makefile
@@ -0,0 +1,17 @@
+CFLAGS += -iquote pie/piegen
+
+obj-y += main.o
+ifneq ($(filter ia32 x86, $(ARCH)),)
+obj-y += elf-x86-32.o
+obj-y += elf-x86-64.o
+endif
+ifeq ($(SRCARCH),ppc64)
+obj-y += elf-ppc64.o
+endif
+
+cleanup-y += $(obj)/piegen
+cleanup-y += $(obj)/*.o
+
+ifneq ($(MAKECMDGOALS),clean)
+incdeps := y
+endif
diff --git a/criu/pie/piegen/elf-ppc64.c b/criu/pie/piegen/elf-ppc64.c
new file mode 100644
index 000000000000..472725f9fe7c
--- /dev/null
+++ b/criu/pie/piegen/elf-ppc64.c
@@ -0,0 +1,16 @@
+#define ELF_PPC64
+#define handle_elf handle_elf_ppc64
+
+#define Ehdr_t Elf64_Ehdr
+#define Shdr_t Elf64_Shdr
+#define Sym_t Elf64_Sym
+#define Rel_t Elf64_Rel
+#define Rela_t Elf64_Rela
+
+#define ELF_ST_TYPE ELF64_ST_TYPE
+#define ELF_ST_BIND ELF64_ST_BIND
+
+#define ELF_R_SYM ELF64_R_SYM
+#define ELF_R_TYPE ELF64_R_TYPE
+
+#include "elf.c"
diff --git a/criu/pie/piegen/elf-x86-32.c b/criu/pie/piegen/elf-x86-32.c
new file mode 100644
index 000000000000..413113ef396b
--- /dev/null
+++ b/criu/pie/piegen/elf-x86-32.c
@@ -0,0 +1,16 @@
+#define ELF_X86_32
+#define handle_elf handle_elf_x86_32
+
+#define Ehdr_t Elf32_Ehdr
+#define Shdr_t Elf32_Shdr
+#define Sym_t Elf32_Sym
+#define Rel_t Elf32_Rel
+#define Rela_t Elf32_Rela
+
+#define ELF_ST_TYPE ELF32_ST_TYPE
+#define ELF_ST_BIND ELF32_ST_BIND
+
+#define ELF_R_SYM ELF32_R_SYM
+#define ELF_R_TYPE ELF32_R_TYPE
+
+#include "elf.c"
diff --git a/criu/pie/piegen/elf-x86-64.c b/criu/pie/piegen/elf-x86-64.c
new file mode 100644
index 000000000000..8ba26672bc82
--- /dev/null
+++ b/criu/pie/piegen/elf-x86-64.c
@@ -0,0 +1,16 @@
+#define ELF_X86_64
+#define handle_elf handle_elf_x86_64
+
+#define Ehdr_t Elf64_Ehdr
+#define Shdr_t Elf64_Shdr
+#define Sym_t Elf64_Sym
+#define Rel_t Elf64_Rel
+#define Rela_t Elf64_Rela
+
+#define ELF_ST_TYPE ELF64_ST_TYPE
+#define ELF_ST_BIND ELF64_ST_BIND
+
+#define ELF_R_SYM ELF64_R_SYM
+#define ELF_R_TYPE ELF64_R_TYPE
+
+#include "elf.c"
diff --git a/criu/pie/piegen/elf.c b/criu/pie/piegen/elf.c
new file mode 100644
index 000000000000..c6b97257ba61
--- /dev/null
+++ b/criu/pie/piegen/elf.c
@@ -0,0 +1,512 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <string.h>
+
+#include <fcntl.h>
+#include <elf.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+
+#include "asm-generic/int.h"
+
+#include "compiler.h"
+#include "piegen.h"
+
+static bool __ptr_oob(const void *ptr, const void *start, const size_t size)
+{
+ const void *end = (const void *)((const unsigned long)start + size); /* one past the last valid byte */
+ return ptr >= end || ptr < start; /* valid range is [start, end): @end itself must not be dereferenced */
+}
+
+static bool test_pointer(const void *ptr, const void *start, const size_t size,
+ const char *name, const char *file, const int line)
+{
+ /* report (with the caller-supplied name and location) and flag an out-of-range @ptr */
+ const bool corrupted = __ptr_oob(ptr, start, size);
+
+ if (corrupted)
+ pr_err("Corrupted pointer %p (%s) at %s:%d\n", ptr, name, file, line);
+ return corrupted;
+}
+
+#define ptr_func_exit(__ptr) \
+ do { /* expects mem, size and sec_hdrs in the caller's scope */ \
+ if (test_pointer((__ptr), mem, size, #__ptr, \
+ __FILE__, __LINE__)) { \
+ free(sec_hdrs); /* released here because the macro returns from the caller */ \
+ return -1; \
+ } \
+ } while (0)
+
+#ifdef ELF_PPC64
+static int do_relative_toc(long value, uint16_t *location,
+ unsigned long mask, int complain_signed)
+{
+ if (complain_signed && (value + 0x8000 > 0xffff)) { /* signed compare: only values above 0x7fff trip this */
+ pr_err("TOC16 relocation overflows (%ld)\n", value);
+ return -1;
+ }
+
+ if ((~mask & 0xffff) & value) { /* low-16 bits excluded by @mask must be clear in @value */
+ pr_err("bad TOC16 relocation (%ld) (0x%lx)\n", value, (~mask & 0xffff) & value);
+ return -1;
+ }
+
+ *location = (*location & ~mask) | (value & mask); /* splice the masked bits of @value into *location */
+ return 0;
+}
+#endif
+
+int handle_elf(void *mem, size_t size)
+{
+ const char *symstrings = NULL;
+ Shdr_t *symtab_hdr = NULL;
+ Sym_t *symbols = NULL;
+ Ehdr_t *hdr = mem;
+
+ Shdr_t *secstrings_hdr = NULL;
+ Shdr_t *strtab_hdr = NULL;
+ Shdr_t **sec_hdrs = NULL;
+ const char *secstrings;
+
+ size_t i, k, nr_gotpcrel = 0;
+#ifdef ELF_PPC64
+ s64 toc_offset = 0;
+#endif
+ /* Output goes to fout via pr_out(): __export* #defines, relocation table, GOTPCREL count, blob bytes. */
+ pr_debug("Header\n");
+ pr_debug("------------\n");
+ pr_debug("\ttype 0x%x machine 0x%x version 0x%x\n",
+ (unsigned)hdr->e_type, (unsigned)hdr->e_machine, (unsigned)hdr->e_version);
+
+#ifdef ELF_X86_64
+ if (hdr->e_type != ET_REL || hdr->e_machine != EM_X86_64 || hdr->e_version != EV_CURRENT) {
+ pr_err("Unsupported header detected\n");
+ goto err;
+ }
+#endif
+
+#ifdef ELF_X86_32
+ if (hdr->e_type != ET_REL || hdr->e_machine != EM_386 || hdr->e_version != EV_CURRENT) {
+ pr_err("Unsupported header detected\n");
+ goto err;
+ }
+#endif
+
+ sec_hdrs = malloc(sizeof(*sec_hdrs) * hdr->e_shnum);
+ if (!sec_hdrs) {
+ pr_err("No memory for section headers\n");
+ goto err;
+ }
+
+ secstrings_hdr = mem + hdr->e_shoff + hdr->e_shentsize * hdr->e_shstrndx;
+ ptr_func_exit(secstrings_hdr); /* validate before the header is dereferenced below */
+ secstrings = mem + secstrings_hdr->sh_offset;
+ ptr_func_exit(secstrings);
+
+ pr_debug("Sections\n");
+ pr_debug("------------\n");
+ for (i = 0; i < hdr->e_shnum; i++) {
+ Shdr_t *sh = mem + hdr->e_shoff + hdr->e_shentsize * i;
+ ptr_func_exit(sh);
+
+ if (sh->sh_type == SHT_SYMTAB)
+ symtab_hdr = sh;
+
+ ptr_func_exit(&secstrings[sh->sh_name]);
+ pr_debug("\t index %-2zd type 0x%-2x name %s\n", i,
+ (unsigned)sh->sh_type, &secstrings[sh->sh_name]);
+
+ sec_hdrs[i] = sh;
+
+#ifdef ELF_PPC64
+ if (!strcmp(&secstrings[sh->sh_name], ".toc")) {
+ toc_offset = sh->sh_addr + 0x8000;
+ pr_debug("\t\tTOC offset 0x%lx\n", toc_offset);
+ }
+#endif
+ }
+
+ if (!symtab_hdr) {
+ pr_err("No symbol table present\n");
+ goto err;
+ }
+
+ if (!symtab_hdr->sh_link || symtab_hdr->sh_link >= hdr->e_shnum) {
+ pr_err("Corrupted symtab header\n");
+ goto err;
+ }
+
+ pr_debug("Symbols\n");
+ pr_debug("------------\n");
+ strtab_hdr = sec_hdrs[symtab_hdr->sh_link];
+ ptr_func_exit(strtab_hdr);
+
+ symbols = mem + symtab_hdr->sh_offset;
+ ptr_func_exit(symbols);
+ symstrings = mem + strtab_hdr->sh_offset;
+ ptr_func_exit(symstrings);
+
+ if (sizeof(*symbols) != symtab_hdr->sh_entsize) {
+ pr_err("Symbol table align differ\n");
+ goto err;
+ }
+
+ pr_out("/* Autogenerated from %s */\n", opts.input_filename);
+ pr_out("#include \"piegen/uapi/types.h\"\n");
+
+ for (i = 0; i < symtab_hdr->sh_size / symtab_hdr->sh_entsize; i++) {
+ Sym_t *sym = &symbols[i];
+ const char *name;
+ Shdr_t *sh_src;
+
+ ptr_func_exit(sym);
+ name = &symstrings[sym->st_name];
+ ptr_func_exit(name);
+
+ if (*name) {
+ pr_debug("\ttype 0x%-2x bind 0x%-2x shndx 0x%-4x value 0x%-2lx name %s\n",
+ (unsigned)ELF_ST_TYPE(sym->st_info), (unsigned)ELF_ST_BIND(sym->st_info),
+ (unsigned)sym->st_shndx, (unsigned long)sym->st_value, name);
+#ifdef ELF_PPC64
+ if (!sym->st_value && !strncmp(name, ".TOC.", 6)) {
+ if (!toc_offset) {
+ pr_err("No TOC pointer\n");
+ goto err;
+ }
+ sym->st_value = toc_offset;
+ continue;
+ }
+#endif
+ if (strncmp(name, "__export", 8))
+ continue;
+ if ((sym->st_shndx && sym->st_shndx < hdr->e_shnum) || sym->st_shndx == SHN_ABS) {
+ if (sym->st_shndx == SHN_ABS) {
+ sh_src = NULL;
+ } else {
+ sh_src = sec_hdrs[sym->st_shndx];
+ ptr_func_exit(sh_src);
+ }
+ pr_out("#define %s%s 0x%lx\n",
+ opts.prefix_name, name,
+ (unsigned long)(sym->st_value + (sh_src ? sh_src->sh_addr : 0)));
+ }
+ }
+ }
+
+ pr_out("static __maybe_unused elf_reloc_t %s[] = {\n", opts.var_name);
+
+ pr_debug("Relocations\n");
+ pr_debug("------------\n");
+ for (i = 0; i < hdr->e_shnum; i++) {
+ Shdr_t *sh = sec_hdrs[i];
+ Shdr_t *sh_rel;
+
+ if (sh->sh_type != SHT_REL && sh->sh_type != SHT_RELA)
+ continue;
+
+ sh_rel = sec_hdrs[sh->sh_info]; /* NOTE(review): sh_info is not range-checked against e_shnum */
+ ptr_func_exit(sh_rel);
+
+ pr_debug("\tsection %2zd type 0x%-2x link 0x%-2x info 0x%-2x name %s\n", i,
+ (unsigned)sh->sh_type, (unsigned)sh->sh_link,
+ (unsigned)sh->sh_info, &secstrings[sh->sh_name]);
+
+ for (k = 0; k < sh->sh_size / sh->sh_entsize; k++) { /* NOTE(review): a zero sh_entsize would divide by zero */
+ s64 __maybe_unused addend64, __maybe_unused value64;
+ s32 addend32, value32;
+ unsigned long place;
+ const char *name;
+ void *where;
+ Sym_t *sym;
+
+ union {
+ Rel_t rel;
+ Rela_t rela;
+ } *r = mem + sh->sh_offset + sh->sh_entsize * k;
+ ptr_func_exit(r);
+
+ sym = &symbols[ELF_R_SYM(r->rel.r_info)];
+ ptr_func_exit(sym);
+
+ name = &symstrings[sym->st_name];
+ ptr_func_exit(name);
+
+ where = mem + sh_rel->sh_offset + r->rel.r_offset;
+ ptr_func_exit(where);
+
+ pr_debug("\t\tr_offset 0x%-4lx r_info 0x%-4lx / sym 0x%-2lx type 0x%-2lx symsecoff 0x%-4lx\n",
+ (unsigned long)r->rel.r_offset, (unsigned long)r->rel.r_info,
+ (unsigned long)ELF_R_SYM(r->rel.r_info),
+ (unsigned long)ELF_R_TYPE(r->rel.r_info),
+ (unsigned long)sh_rel->sh_addr);
+
+ if (sym->st_shndx == SHN_UNDEF) {
+#ifdef ELF_PPC64
+ /* On PowerPC, TOC symbols appear to be
+ * undefined but should be processed as well.
+ * Their type is STT_NOTYPE, so report any
+ * other one.
+ */
+ if (ELF_ST_TYPE(sym->st_info) != STT_NOTYPE
+ || strncmp(name, ".TOC.", 6)) {
+ pr_err("Unexpected undefined symbol:%s\n", name);
+ goto err;
+ }
+#else
+ continue;
+#endif
+ }
+
+ if (sh->sh_type == SHT_REL) {
+ addend32 = *(s32 *)where;
+ addend64 = *(s64 *)where;
+ } else {
+ addend32 = (s32)r->rela.r_addend;
+ addend64 = (s64)r->rela.r_addend;
+ }
+
+ place = sh_rel->sh_addr + r->rel.r_offset;
+
+ pr_debug("\t\t\tvalue 0x%-8lx addend32 %-4d addend64 %-8ld place %-8lx symname %s\n",
+ (unsigned long)sym->st_value, addend32, (long)addend64, (long)place, name);
+
+ if (sym->st_shndx == SHN_ABS) {
+ value32 = (s32)sym->st_value;
+ value64 = (s64)sym->st_value;
+ } else {
+ Shdr_t *sh_src;
+
+ if ((unsigned)sym->st_shndx >= (unsigned)hdr->e_shnum) { /* sec_hdrs[] holds e_shnum entries */
+ pr_err("Unexpected symbol section index %u/%u\n",
+ (unsigned)sym->st_shndx, hdr->e_shnum);
+ goto err;
+ }
+ sh_src = sec_hdrs[sym->st_shndx];
+ ptr_func_exit(sh_src);
+
+ value32 = (s32)sh_src->sh_addr + (s32)sym->st_value;
+ value64 = (s64)sh_src->sh_addr + (s64)sym->st_value;
+ }
+
+#ifdef ELF_PPC64
+/* Snippet from the OpenPOWER ABI for Linux Supplement:
+ * The OpenPOWER ABI uses the three most-significant bits in the symbol
+ * st_other field to specify the number of instructions between a function's
+ * global entry point and local entry point. The global entry point is used
+ * when it is necessary to set up the TOC pointer (r2) for the function. The
+ * local entry point is used when r2 is known to already be valid for the
+ * function. A value of zero in these bits asserts that the function does
+ * not use r2.
+ * The st_other values have the following meanings:
+ * 0 and 1, the local and global entry points are the same.
+ * 2, the local entry point is at 1 instruction past the global entry point.
+ * 3, the local entry point is at 2 instructions past the global entry point.
+ * 4, the local entry point is at 4 instructions past the global entry point.
+ * 5, the local entry point is at 8 instructions past the global entry point.
+ * 6, the local entry point is at 16 instructions past the global entry point.
+ * 7, reserved.
+ *
+ * Here we only handle the case '3', which is the most commonly seen.
+ */
+#define LOCAL_OFFSET(s) ((s->st_other >> 5) & 0x7)
+ if (LOCAL_OFFSET(sym)) {
+ if (LOCAL_OFFSET(sym) != 3) {
+ pr_err("Unexpected local offset value %d\n",
+ LOCAL_OFFSET(sym));
+ goto err;
+ }
+ pr_debug("\t\t\tUsing local offset\n");
+ value64 += 8;
+ value32 += 8;
+ }
+#endif
+
+ switch (ELF_R_TYPE(r->rel.r_info)) {
+#ifdef ELF_PPC64
+ case R_PPC64_REL24:
+ /* Update PC relative offset, linker has not done this yet */
+ pr_debug("\t\t\tR_PPC64_REL24 at 0x%-4lx val 0x%lx\n",
+ place, value64);
+ /* Convert value to relative */
+ value64 -= place;
+ if (value64 + 0x2000000 > 0x3ffffff || (value64 & 3) != 0) {
+ pr_err("REL24 %li out of range!\n", (long int)value64);
+ goto err;
+ }
+ /* Only replace bits 2 through 26 */
+ *(uint32_t *)where = (*(uint32_t *)where & ~0x03fffffc) |
+ (value64 & 0x03fffffc);
+ break;
+
+ case R_PPC64_ADDR32:
+ pr_debug("\t\t\tR_PPC64_ADDR32 at 0x%-4lx val 0x%x\n",
+ place, (unsigned int)(value32 + addend32));
+ pr_out(" { .offset = 0x%-8x, .type = PIEGEN_TYPE_INT, "
+ " .addend = %-8d, .value = 0x%-16x, "
+ "}, /* R_PPC64_ADDR32 */\n",
+ (unsigned int) place, addend32, value32);
+ break;
+
+ case R_PPC64_ADDR64:
+ case R_PPC64_REL64:
+ pr_debug("\t\t\tR_PPC64_ADDR64 at 0x%-4lx val 0x%lx\n",
+ place, value64 + addend64);
+ pr_out("\t{ .offset = 0x%-8x, .type = PIEGEN_TYPE_LONG,"
+ " .addend = %-8ld, .value = 0x%-16lx, "
+ "}, /* R_PPC64_ADDR64 */\n",
+ (unsigned int) place, (long)addend64, (long)value64);
+ break;
+
+ case R_PPC64_TOC16_HA:
+ pr_debug("\t\t\tR_PPC64_TOC16_HA at 0x%-4lx val 0x%lx\n",
+ place, value64 + addend64 - toc_offset + 0x8000);
+ if (do_relative_toc((value64 + addend64 - toc_offset + 0x8000) >> 16,
+ where, 0xffff, 1))
+ goto err;
+ break;
+
+ case R_PPC64_TOC16_LO:
+ pr_debug("\t\t\tR_PPC64_TOC16_LO at 0x%-4lx val 0x%lx\n",
+ place, value64 + addend64 - toc_offset);
+ if (do_relative_toc(value64 + addend64 - toc_offset,
+ where, 0xffff, 1))
+ goto err;
+ break;
+
+ case R_PPC64_TOC16_LO_DS:
+ pr_debug("\t\t\tR_PPC64_TOC16_LO_DS at 0x%-4lx val 0x%lx\n",
+ place, value64 + addend64 - toc_offset);
+ if (do_relative_toc(value64 + addend64 - toc_offset,
+ where, 0xfffc, 0))
+ goto err;
+ break;
+
+ case R_PPC64_REL16_HA:
+ value64 += addend64 - place;
+ pr_debug("\t\t\tR_PPC64_REL16_HA at 0x%-4lx val 0x%lx\n",
+ place, value64);
+ /* check that we are dealing with the addis 2,12 instruction */
+ if (((*(uint32_t*)where) & 0xffff0000) != 0x3c4c0000) {
+ pr_err("Unexpected instruction for R_PPC64_REL16_HA\n");
+ goto err;
+ }
+ *(uint16_t *)where = ((value64 + 0x8000) >> 16) & 0xffff;
+ break;
+
+ case R_PPC64_REL16_LO:
+ value64 += addend64 - place;
+ pr_debug("\t\t\tR_PPC64_REL16_LO at 0x%-4lx val 0x%lx\n",
+ place, value64);
+ /* check that we are dealing with the addi 2,2 instruction */
+ if (((*(uint32_t*)where) & 0xffff0000) != 0x38420000) {
+ pr_err("Unexpected instruction for R_PPC64_REL16_LO\n");
+ goto err;
+ }
+ *(uint16_t *)where = value64 & 0xffff;
+ break;
+
+#endif /* ELF_PPC64 */
+
+#ifdef ELF_X86_64
+ case R_X86_64_32: /* Symbol + Addend (4 bytes) */
+ pr_debug("\t\t\t\tR_X86_64_32 at 0x%-4lx val 0x%x\n", place, value32);
+ pr_out(" { .offset = 0x%-8x, .type = PIEGEN_TYPE_INT, "
+ ".addend = %-8d, .value = 0x%-16x, }, /* R_X86_64_32 */\n",
+ (unsigned int)place, addend32, value32);
+ break;
+ case R_X86_64_64: /* Symbol + Addend (8 bytes) */
+ pr_debug("\t\t\t\tR_X86_64_64 at 0x%-4lx val 0x%lx\n", place, (long)value64);
+ pr_out(" { .offset = 0x%-8x, .type = PIEGEN_TYPE_LONG, "
+ ".addend = %-8ld, .value = 0x%-16lx, }, /* R_X86_64_64 */\n",
+ (unsigned int)place, (long)addend64, (long)value64);
+ break;
+ case R_X86_64_PC32: /* Symbol + Addend - Place (4 bytes) */
+ pr_debug("\t\t\t\tR_X86_64_PC32 at 0x%-4lx val 0x%x\n", place, value32 + addend32 - (s32)place);
+ /*
+ * R_X86_64_PC32 are relative, patch them inplace.
+ */
+ *((s32 *)where) = value32 + addend32 - place;
+ break;
+ case R_X86_64_PLT32: /* ProcLinkage + Addend - Place (4 bytes) */
+ pr_debug("\t\t\t\tR_X86_64_PLT32 at 0x%-4lx val 0x%x\n", place, value32 + addend32 - (s32)place);
+ /*
+ * R_X86_64_PLT32 are relative, patch them inplace.
+ */
+ *((s32 *)where) = value32 + addend32 - place;
+ break;
+ case R_X86_64_GOTPCREL: /* SymbolOffsetInGot + GOT + Addend - Place (4 bytes) */
+ pr_debug("\t\t\t\tR_X86_64_GOTPCREL at 0x%-4lx val 0x%x\n", place, value32);
+ pr_out(" { .offset = 0x%-8x, .type = PIEGEN_TYPE_LONG | PIEGEN_TYPE_GOTPCREL, "
+ ".addend = %-8d, .value = 0x%-16x, }, /* R_X86_64_GOTPCREL */\n",
+ (unsigned int)place, addend32, value32);
+ nr_gotpcrel++;
+ break;
+#endif
+
+#ifdef ELF_X86_32
+ case R_386_32: /* Symbol + Addend */
+ pr_debug("\t\t\t\tR_386_32 at 0x%-4lx val 0x%x\n", place, value32 + addend32);
+ pr_out(" { .offset = 0x%-8x, .type = PIEGEN_TYPE_INT, "
+ ".addend = %-4d, .value = 0x%x, },\n",
+ (unsigned int)place, addend32, value32);
+ break;
+ case R_386_PC32: /* Symbol + Addend - Place */
+ pr_debug("\t\t\t\tR_386_PC32 at 0x%-4lx val 0x%x\n", place, value32 + addend32 - (s32)place);
+ /*
+ * R_386_PC32 are relative, patch them inplace.
+ */
+ *((s32 *)where) = value32 + addend32 - place;
+ break;
+#endif
+
+ default:
+ pr_err("Unsupported relocation\n");
+ goto err;
+ }
+ }
+ }
+ pr_out("};\n");
+ pr_out("static __maybe_unused size_t %s = %zd;\n", opts.nrgotpcrel_name, nr_gotpcrel);
+
+ pr_out("static __maybe_unused const char %s[] = {\n\t", opts.stream_name);
+
+ for (i=0, k=0; i < hdr->e_shnum; i++) {
+ Shdr_t *sh = sec_hdrs[i];
+ unsigned char *shdata;
+ size_t j;
+
+ if (!(sh->sh_flags & SHF_ALLOC) || !sh->sh_size)
+ continue;
+
+ shdata = mem + sh->sh_offset;
+ pr_debug("Copying section '%s'\n" \
+ "\tstart:0x%lx (gap:0x%lx) size:0x%lx\n",
+ &secstrings[sh->sh_name], (unsigned long) sh->sh_addr,
+ (unsigned long)(sh->sh_addr - k), (unsigned long) sh->sh_size);
+
+ /* write 0 in the gap between the 2 sections */
+ for (;k < sh->sh_addr; k++) {
+ if (k && (k % 8) == 0)
+ pr_out("\n\t");
+ pr_out("0x00,");
+ }
+
+ for (j=0; j < sh->sh_size; j++, k++) {
+ if (k && (k % 8) == 0)
+ pr_out("\n\t");
+ pr_out("0x%02x,", shdata[j]);
+ }
+ }
+ pr_out("};\n");
+ free(sec_hdrs);
+ return 0;
+err:
+ free(sec_hdrs);
+ return -1;
+}
diff --git a/criu/pie/piegen/main.c b/criu/pie/piegen/main.c
new file mode 100644
index 000000000000..d3ad823339bb
--- /dev/null
+++ b/criu/pie/piegen/main.c
@@ -0,0 +1,154 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <getopt.h>
+#include <string.h>
+
+#include <fcntl.h>
+#include <elf.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+
+#include "compiler.h"
+#include "config.h"
+#include "piegen.h"
+
+piegen_opt_t opts = {
+ .input_filename = "file.o",
+ .stream_name = "stream",
+ .prefix_name = "__",
+ .var_name = "elf_relocs",
+ .nrgotpcrel_name = "nr_gotpcrel",
+};
+
+FILE *fout;
+
+static int handle_elf(void *mem, size_t size)
+{
+#if defined(CONFIG_X86_32) || defined(CONFIG_X86_64)
+ unsigned char elf_ident_x86_32[EI_NIDENT] = { /* "\x7fELF", class byte 0x01 */
+ 0x7f, 0x45, 0x4c, 0x46, 0x01, 0x01, 0x01, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ };
+
+ unsigned char elf_ident_x86_64[EI_NIDENT] = { /* same, but class byte 0x02 */
+ 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ };
+
+ if (memcmp(mem, elf_ident_x86_32, sizeof(elf_ident_x86_32)) == 0) /* all EI_NIDENT bytes must match */
+ return handle_elf_x86_32(mem, size);
+ else if (memcmp(mem, elf_ident_x86_64, sizeof(elf_ident_x86_64)) == 0)
+ return handle_elf_x86_64(mem, size);
+#endif
+
+#if defined(CONFIG_PPC64)
+ const unsigned char elf_ident[EI_NIDENT] = { /* the sixth byte follows the build's byte order */
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+#else
+ 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x02, 0x01, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+#endif
+ };
+
+ if (memcmp(mem, elf_ident, sizeof(elf_ident)) == 0)
+ return handle_elf_ppc64(mem, size);
+#endif /* CONFIG_PPC64 */
+
+ pr_err("Unsupported Elf format detected\n"); /* no compiled-in identification matched */
+ return -1;
+}
+
+/*
+ * piegen: read the ELF object given with -f and write generated C to -o.
+ */
+int main(int argc, char *argv[])
+{
+ struct stat st;
+ void *mem;
+ int opt;
+ int fd;
+
+ static const char short_opts[] = "f:o:s:p:v:h";
+ static struct option long_opts[] = {
+ { "file", required_argument, 0, 'f' },
+ { "output", required_argument, 0, 'o' },
+ { "stream", required_argument, 0, 's' },
+ { "sym-prefix", required_argument, 0, 'p' },
+ { "variable", required_argument, 0, 'v' },
+ { "help", no_argument, 0, 'h' }, /* matches the colon-less 'h' in short_opts */
+ { },
+ };
+
+ if (argc < 3)
+ goto usage;
+
+ while ((opt = getopt_long(argc, argv, short_opts, long_opts, NULL)) != -1) {
+ switch (opt) {
+ case 'f':
+ opts.input_filename = optarg;
+ break;
+ case 'o':
+ opts.output_filename = optarg;
+ break;
+ case 's':
+ opts.stream_name = optarg;
+ break;
+ case 'p':
+ opts.prefix_name = optarg;
+ break;
+ case 'v':
+ opts.var_name = optarg;
+ break;
+ case 'h':
+ default:
+ goto usage;
+ }
+ }
+
+ /* the output name has no default, and fopen(NULL, ...) is undefined */
+ if (!opts.output_filename)
+ goto usage;
+
+ fd = open(opts.input_filename, O_RDONLY);
+ if (fd < 0) {
+ pr_perror("Can't open file %s", opts.input_filename);
+ goto err;
+ }
+
+ if (fstat(fd, &st)) {
+ pr_perror("Can't stat file %s", opts.input_filename);
+ goto err;
+ }
+
+ fout = fopen(opts.output_filename, "w");
+ if (fout == NULL) {
+ pr_perror("Can't open %s", opts.output_filename);
+ goto err;
+ }
+
+ mem = mmap(NULL, st.st_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_FILE, fd, 0); /* private: in-place patches never reach the input file */
+ if (mem == MAP_FAILED) {
+ pr_perror("Can't mmap file %s", opts.input_filename);
+ goto err;
+ }
+
+ if (handle_elf(mem, st.st_size)) {
+ fclose(fout);
+ unlink(opts.output_filename); /* do not leave a partial output behind */
+ goto err;
+ }
+ fclose(fout);
+ printf("%s generated successfully.\n", opts.output_filename);
+ return 0;
+usage:
+ fprintf(stderr, "Usage: %s -f filename -o output [-s stream] [-p sym-prefix] [-v variable]\n", argv[0]);
+err:
+ return 1;
+}
diff --git a/criu/pie/piegen/piegen.h b/criu/pie/piegen/piegen.h
new file mode 100644
index 000000000000..8488c0abb989
--- /dev/null
+++ b/criu/pie/piegen/piegen.h
@@ -0,0 +1,35 @@
+#ifndef __ELFTIL_H__
+#define __ELFTIL_H__
+
+#include <stdio.h>
+#include <unistd.h>
+
+typedef struct {
+ char *input_filename;
+ char *output_filename;
+ char *stream_name;
+ char *prefix_name;
+ char *var_name;
+ char *nrgotpcrel_name;
+} piegen_opt_t;
+
+extern piegen_opt_t opts;
+extern FILE *fout;
+
+#if defined(CONFIG_X86_32) || defined(CONFIG_X86_64)
+extern int handle_elf_x86_32(void *mem, size_t size);
+extern int handle_elf_x86_64(void *mem, size_t size);
+#endif
+
+#if defined(CONFIG_PPC64)
+extern int handle_elf_ppc64(void *mem, size_t size);
+#endif
+
+#define pr_out(fmt, ...) fprintf(fout, fmt, ##__VA_ARGS__)
+
+#define pr_debug(fmt, ...) printf("%s: "fmt, opts.stream_name, ##__VA_ARGS__)
+
+#define pr_err(fmt, ...) fprintf(stderr, "%s: Error (%s:%d): "fmt, opts.stream_name, __FILE__, __LINE__, ##__VA_ARGS__)
+#define pr_perror(fmt, ...) fprintf(stderr, "%s: Error (%s:%d): "fmt ": %m\n", opts.stream_name, __FILE__, __LINE__, ##__VA_ARGS__) /* ": " keeps the %m errno text apart from the message */
+
+#endif /* __ELFTIL_H__ */
diff --git a/criu/pie/piegen/uapi/types.h b/criu/pie/piegen/uapi/types.h
new file mode 100644
index 000000000000..34696e8c6aa5
--- /dev/null
+++ b/criu/pie/piegen/uapi/types.h
@@ -0,0 +1,15 @@
+#ifndef __PIEGEN_TYPES_H__
+#define __PIEGEN_TYPES_H__
+
+#define PIEGEN_TYPE_INT (1u << 0)
+#define PIEGEN_TYPE_LONG (1u << 1)
+#define PIEGEN_TYPE_GOTPCREL (1u << 2)
+
+typedef struct {
+ unsigned int offset;
+ unsigned int type;
+ long addend;
+ long value;
+} elf_reloc_t;
+
+#endif /* __PIEGEN_TYPES_H__ */
diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
new file mode 100644
index 000000000000..8cec7f9c419b
--- /dev/null
+++ b/criu/pie/restorer.c
@@ -0,0 +1,1335 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <linux/securebits.h>
+#include <linux/capability.h>
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+#include <sys/time.h>
+#include <sys/shm.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sched.h>
+#include <sys/resource.h>
+#include <signal.h>
+
+#include "compiler.h"
+#include "asm/types.h"
+#include "syscall.h"
+#include "config.h"
+#include "prctl.h"
+#include "log.h"
+#include "util.h"
+#include "image.h"
+#include "sk-inet.h"
+#include "vma.h"
+
+#include "crtools.h"
+#include "lock.h"
+#include "restorer.h"
+#include "aio.h"
+#include "seccomp.h"
+
+#include "protobuf/creds.pb-c.h"
+#include "protobuf/mm.pb-c.h"
+
+#include "asm/restorer.h"
+
+#ifndef PR_SET_PDEATHSIG
+#define PR_SET_PDEATHSIG 1
+#endif
+
+/*
+ * Call sys_prctl() with a zero fifth argument. A non-zero result is
+ * logged together with the line of the call site; the macro evaluates
+ * to that result either way.
+ */
+#define sys_prctl_safe(opcode, val1, val2, val3)				\
+({										\
+	long prctl_res__ = sys_prctl(opcode, val1, val2, val3, 0);		\
+										\
+	if (prctl_res__ != 0)							\
+		pr_err("prctl failed @%d with %ld\n", __LINE__, prctl_res__);	\
+	prctl_res__;								\
+})
+
+static struct task_entries *task_entries; /* copy of args->task_entries, set on entry to __export_restore_task() */
+static futex_t thread_inprogress; /* set to nr_threads by the leader, decremented by every restored thread */
+static pid_t *helpers; /* args->helpers: children whose SIGCHLD is ignored by sigchld_handler() */
+static int n_helpers; /* number of entries in helpers[] */
+static pid_t *zombies; /* args->zombies: children whose SIGCHLD is ignored by sigchld_handler() */
+static int n_zombies; /* number of entries in zombies[] */
+
+extern void cr_restore_rt (void) asm ("__cr_restore_rt")
+ __attribute__ ((visibility ("hidden"))); /* installed as rt_sa_restorer of the temporary SIGCHLD handler */
+
+/*
+ * SIGCHLD handler that is active while the restore is in progress.
+ *
+ * Children listed in helpers[] or zombies[] are expected to change state
+ * and are silently ignored. A SIGCHLD from any other child means the
+ * restore went wrong: the event is logged, waiters on
+ * task_entries->nr_in_progress are aborted and the thread group exits.
+ *
+ * @signal:  signal number (unused)
+ * @siginfo: describes which child changed state and how
+ * @data:    ucontext pointer (unused)
+ */
+static void sigchld_handler(int signal, siginfo_t *siginfo, void *data)
+{
+	const char *r;
+	int i;
+
+	/* We can ignore helpers that die, we expect them to after
+	 * CR_STATE_RESTORE is finished. */
+	for (i = 0; i < n_helpers; i++)
+		if (siginfo->si_pid == helpers[i])
+			return;
+
+	for (i = 0; i < n_zombies; i++)
+		if (siginfo->si_pid == zombies[i])
+			return;
+
+	/*
+	 * si_code holds exactly one CLD_* enumerator, it is not a bit
+	 * mask. Testing it with '&' would, for instance, report a child
+	 * that dumped core or was stopped as "exited", so compare for
+	 * equality.
+	 */
+	if (siginfo->si_code == CLD_EXITED)
+		r = " exited, status=";
+	else if (siginfo->si_code == CLD_KILLED)
+		r = " killed by signal ";
+	else
+		r = "disappeared with ";
+
+	pr_info("Task %d %s %d\n", siginfo->si_pid, r, siginfo->si_status);
+
+	futex_abort_and_wake(&task_entries->nr_in_progress);
+	/* sa_restorer may be unmapped, so we can't go back to userspace */
+	sys_kill(sys_getpid(), SIGSTOP);
+	sys_exit_group(1);
+}
+
+/*
+ * Write @label into the calling thread's attr/current file, opened
+ * relative to @procfd. A NULL label means there is nothing to restore.
+ *
+ * Returns 0 on success (or when there is no label) and -1 when the file
+ * cannot be opened or written.
+ */
+static int lsm_set_label(char *label, int procfd)
+{
+	char path[LOG_SIMPLE_CHUNK];
+	int fd, written, len = 0;
+
+	if (label == NULL)
+		return 0;
+
+	pr_info("restoring lsm profile %s\n", label);
+
+	simple_sprintf(path, "self/task/%ld/attr/current", sys_gettid());
+
+	fd = sys_openat(procfd, path, O_WRONLY, 0);
+	if (fd < 0) {
+		pr_err("failed openat %d\n", fd);
+		return -1;
+	}
+
+	/* Measure the label by hand */
+	while (label[len] != '\0')
+		len++;
+
+	written = sys_write(fd, label, len);
+	sys_close(fd);
+	if (written < 0) {
+		pr_err("can't write lsm profile %d\n", written);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Restore the credentials of the calling thread: supplementary groups,
+ * real/effective/saved/fs user and group IDs, securebits, the capability
+ * bounding set, the effective/permitted/inheritable capability sets and
+ * finally the LSM label.
+ *
+ * @args:   creds entry plus the group and capability arrays to apply
+ * @procfd: descriptor passed on to lsm_set_label()
+ *
+ * Returns 0 on success and -1, after logging, on the first failing step.
+ */
+static int restore_creds(struct thread_creds_args *args, int procfd)
+{
+	CredsEntry *ce = &args->creds;
+	int b, i, ret;
+	struct cap_header hdr;
+	struct cap_data data[_LINUX_CAPABILITY_U32S_3];
+
+	/*
+	 * We're still root here and thus can do it without failures.
+	 */
+
+	/*
+	 * Setup supplementary group IDs early.
+	 */
+	if (args->groups) {
+		ret = sys_setgroups(ce->n_groups, args->groups);
+		if (ret) {
+			pr_err("Can't setup supplementary group IDs: %d\n", ret);
+			return -1;
+		}
+	}
+
+	/*
+	 * First -- set the SECURE_NO_SETUID_FIXUP bit not to
+	 * lose caps bits when changing xids.
+	 */
+
+	ret = sys_prctl(PR_SET_SECUREBITS, 1 << SECURE_NO_SETUID_FIXUP, 0, 0, 0);
+	if (ret) {
+		pr_err("Unable to set SECURE_NO_SETUID_FIXUP: %d\n", ret);
+		return -1;
+	}
+
+	/*
+	 * Second -- restore xids. Since we still have the CAP_SETUID
+	 * capability nothing should fail. But call the setfsXid last
+	 * to override the setresXid settings.
+	 */
+
+	ret = sys_setresuid(ce->uid, ce->euid, ce->suid);
+	if (ret) {
+		pr_err("Unable to set real, effective and saved user ID: %d\n", ret);
+		return -1;
+	}
+
+	/* The second call only reads back the value the first one set */
+	sys_setfsuid(ce->fsuid);
+	if (sys_setfsuid(-1) != ce->fsuid) {
+		pr_err("Unable to set fsuid\n");
+		return -1;
+	}
+
+	ret = sys_setresgid(ce->gid, ce->egid, ce->sgid);
+	if (ret) {
+		pr_err("Unable to set real, effective and saved group ID: %d\n", ret);
+		return -1;
+	}
+
+	sys_setfsgid(ce->fsgid);
+	if (sys_setfsgid(-1) != ce->fsgid) {
+		pr_err("Unable to set fsgid\n");
+		return -1;
+	}
+
+	/*
+	 * Third -- restore securebits. We don't need them in any
+	 * special state any longer.
+	 */
+
+	ret = sys_prctl(PR_SET_SECUREBITS, ce->secbits, 0, 0, 0);
+	if (ret) {
+		pr_err("Unable to set PR_SET_SECUREBITS: %d\n", ret);
+		return -1;
+	}
+
+	/*
+	 * Fourth -- trim bset. This can only be done while
+	 * having the CAP_SETPCAP capability.
+	 */
+
+	for (b = 0; b < CR_CAP_SIZE; b++) {
+		for (i = 0; i < 32; i++) {
+			if (b * 32 + i > args->cap_last_cap)
+				break;
+			/*
+			 * The mask has to be unsigned: for i == 31 a
+			 * signed "1 << i" overflows int, which is
+			 * undefined behaviour.
+			 */
+			if (args->cap_bnd[b] & (1U << i))
+				/* already set */
+				continue;
+			ret = sys_prctl(PR_CAPBSET_DROP, i + b * 32, 0, 0, 0);
+			if (ret) {
+				pr_err("Unable to drop capability %d: %d\n",
+					i + b * 32, ret);
+				return -1;
+			}
+		}
+	}
+
+	/*
+	 * Fifth -- restore caps. Nothing but cap bits are changed
+	 * at this stage, so just do it.
+	 */
+
+	hdr.version = _LINUX_CAPABILITY_VERSION_3;
+	hdr.pid = 0;
+
+	BUILD_BUG_ON(_LINUX_CAPABILITY_U32S_3 != CR_CAP_SIZE);
+
+	for (i = 0; i < CR_CAP_SIZE; i++) {
+		data[i].eff = args->cap_eff[i];
+		data[i].prm = args->cap_prm[i];
+		data[i].inh = args->cap_inh[i];
+	}
+
+	ret = sys_capset(&hdr, data);
+	if (ret) {
+		pr_err("Unable to restore capabilities: %d\n", ret);
+		return -1;
+	}
+
+	if (lsm_set_label(args->lsm_profile, procfd) < 0)
+		return -1;
+	return 0;
+}
+
+/*
+ * Re-arm the parent-death signal when one was recorded for the thread.
+ * This should be done after creds restore, as some creds changes might
+ * drop the value back to zero.
+ *
+ * Returns 0 when no signal is recorded, the sys_prctl() result otherwise.
+ */
+static inline int restore_pdeath_sig(struct thread_restore_args *ta)
+{
+	if (!ta->pdeath_sig)
+		return 0;
+
+	return sys_prctl(PR_SET_PDEATHSIG, ta->pdeath_sig, 0, 0, 0);
+}
+
+/*
+ * Restore the "dumpable" attribute recorded in @mme.
+ *
+ * Values 0 and 1 are applied as they are. Any other value cannot be set
+ * through prctl: if the current value already matches nothing is done,
+ * otherwise the flag falls back to 0.
+ *
+ * Returns 0 on success (including "flag absent from the image") and -1
+ * when PR_SET_DUMPABLE fails.
+ */
+static int restore_dumpable_flag(MmEntry *mme)
+{
+	int wanted, cur, err;
+
+	if (!mme->has_dumpable) {
+		pr_warn("Dumpable flag not present in criu dump.\n");
+		return 0;
+	}
+
+	if (mme->dumpable == 0 || mme->dumpable == 1) {
+		wanted = mme->dumpable;
+	} else {
+		/*
+		 * The value is likely correct already if sysctl
+		 * fs.suid_dumpable is the same when dump and restore
+		 * are run. Otherwise 0 should be a secure fallback.
+		 */
+		cur = sys_prctl(PR_GET_DUMPABLE, 0, 0, 0, 0);
+		if (mme->dumpable == cur)
+			return 0;
+
+		pr_warn("Dumpable flag [%d] does not match current [%d]. "
+			"Will fallback to setting it to 0 to disable it.\n",
+			mme->dumpable, cur);
+		wanted = 0;
+	}
+
+	err = sys_prctl(PR_SET_DUMPABLE, wanted, 0, 0, 0);
+	if (err) {
+		pr_err("Unable to set PR_SET_DUMPABLE: %d\n", err);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Apply the nice value, scheduling policy and priority from @p to the
+ * caller. Results of the two syscalls are not checked.
+ */
+static void restore_sched_info(struct rst_sched_param *p)
+{
+	struct sched_param sp;
+
+	pr_info("Restoring scheduler params %d.%d.%d\n",
+		p->policy, p->nice, p->prio);
+
+	sp.sched_priority = p->prio;
+
+	sys_setpriority(PRIO_PROCESS, 0, p->nice);
+	sys_sched_setscheduler(0, p->policy, &sp);
+}
+
+/*
+ * Set every resource limit stored in @ta; the array index is used as
+ * the resource number. Failures of sys_setrlimit() are not reported.
+ */
+static void restore_rlims(struct task_restore_args *ta)
+{
+	int res = 0;
+
+	while (res < ta->rlims_n) {
+		struct krlimit lim;
+
+		lim.rlim_cur = ta->rlims[res].rlim_cur;
+		lim.rlim_max = ta->rlims[res].rlim_max;
+		sys_setrlimit(res, &lim);
+		res++;
+	}
+}
+
+/*
+ * Re-queue @nr saved siginfo entries starting at @ptr.
+ *
+ * With @group set the signals are queued to the process, otherwise to
+ * the calling thread only. Returns 0 when all were queued and -1 on the
+ * first failure.
+ */
+static int restore_signals(siginfo_t *ptr, int nr, bool group)
+{
+	int idx;
+
+	for (idx = 0; idx < nr; idx++) {
+		siginfo_t *si = &ptr[idx];
+		int err;
+
+		pr_info("Restore signal %d group %d\n", si->si_signo, group);
+
+		if (!group)
+			err = sys_rt_tgsigqueueinfo(sys_getpid(),
+					sys_gettid(), si->si_signo, si);
+		else
+			err = sys_rt_sigqueueinfo(sys_getpid(), si->si_signo, si);
+
+		if (!err)
+			continue;
+
+		pr_err("Unable to send siginfo %d %x with code %d\n",
+			si->si_signo, si->si_code, err);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Put the task back into the seccomp mode recorded in @args.
+ *
+ * Returns 0 on success (or when seccomp was disabled) and -1 when the
+ * kernel rejects the request or the mode is unknown.
+ */
+static int restore_seccomp(struct task_restore_args *args)
+{
+	void *insns;
+	int idx, err;
+
+	if (args->seccomp_mode == SECCOMP_MODE_DISABLED)
+		return 0;
+
+	if (args->seccomp_mode == SECCOMP_MODE_STRICT) {
+		err = sys_prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, 0, 0, 0);
+		if (err < 0) {
+			pr_err("prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT) returned %d\n", err);
+			return -1;
+		}
+		return 0;
+	}
+
+	if (args->seccomp_mode != SECCOMP_MODE_FILTER)
+		return -1;
+
+	/* The instructions are laid out right behind the array of programs */
+	insns = &args->seccomp_filters[args->seccomp_filters_n];
+
+	for (idx = 0; idx < args->seccomp_filters_n; idx++) {
+		struct sock_fprog *prog = &args->seccomp_filters[idx];
+
+		prog->filter = insns;
+
+		/* We always TSYNC here, since we require that the
+		 * creds for all threads be the same; this means we
+		 * don't have to restore_seccomp() in threads, and that
+		 * future TSYNC behavior will be correct.
+		 */
+		err = sys_seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC, (char *) prog);
+		if (err < 0) {
+			pr_err("sys_seccomp() returned %d\n", err);
+			return -1;
+		}
+
+		/* Step over this program's instructions */
+		insns += prog->len * sizeof(struct sock_filter);
+	}
+
+	return 0;
+}
+
+/*
+ * Restore the per-thread state shared by the leader and the other
+ * threads: clear_tid address, futex robust list, scheduler parameters,
+ * the registers handled by restore_nonsigframe_gpregs() and TLS.
+ *
+ * @sigframe is accepted but not used here.
+ *
+ * Returns 0 on success, -1 when the robust list or the registers
+ * cannot be restored.
+ */
+static int restore_thread_common(struct rt_sigframe *sigframe,
+		struct thread_restore_args *args)
+{
+	int err;
+
+	sys_set_tid_address((int *)decode_pointer(args->clear_tid_addr));
+
+	if (args->has_futex && args->futex_rla_len) {
+		err = sys_set_robust_list(decode_pointer(args->futex_rla),
+					  args->futex_rla_len);
+		if (err) {
+			pr_err("Failed to recover futex robust list: %d\n", err);
+			return -1;
+		}
+	}
+
+	restore_sched_info(&args->sp);
+
+	if (restore_nonsigframe_gpregs(&args->gpregs))
+		return -1;
+
+	restore_tls(&args->tls);
+
+	return 0;
+}
+
+/*
+ * Hand @sigframe_sp to the arch-specific sigreturn sequence. Kept out of
+ * line: __export_restore_task() stores this function's address into
+ * *args->breakpoint.
+ */
+static void noinline rst_sigreturn(unsigned long sigframe_sp)
+{
+	ARCH_RT_SIGRETURN(sigframe_sp);
+}
+
+/*
+ * Restore one non-leader thread via sigreturn.
+ *
+ * The thread blocks all signals, restores its common state, creds and
+ * dumpable flag, re-queues its pending signals and then passes through
+ * the restore stages in step with the other tasks. On success it
+ * decrements thread_inprogress and never returns; on failure it aborts
+ * the nr_in_progress futex and exits the whole thread group.
+ */
+long __export_restore_thread(struct thread_restore_args *args)
+{
+	struct rt_sigframe *frame;
+	k_rtsigset_t all_sigs;
+	unsigned long sp;
+	int tid = sys_gettid();
+	int err;
+
+	if (tid != args->pid) {
+		pr_err("Thread pid mismatch %d/%d\n", tid, args->pid);
+		goto core_restore_end;
+	}
+
+	/* All signals must be handled by thread leader */
+	ksigfillset(&all_sigs);
+	err = sys_sigprocmask(SIG_SETMASK, &all_sigs, NULL, sizeof(k_rtsigset_t));
+	if (err) {
+		pr_err("Unable to block signals %d\n", err);
+		goto core_restore_end;
+	}
+
+	frame = (void *)args->mem_zone.rt_sigframe;
+
+	if (restore_thread_common(frame, args))
+		goto core_restore_end;
+
+	if (restore_creds(args->creds_args, args->ta->proc_fd))
+		goto core_restore_end;
+
+	if (restore_dumpable_flag(&args->ta->mm))
+		goto core_restore_end;
+
+	pr_info("%ld: Restored\n", sys_gettid());
+
+	restore_finish_stage(CR_STATE_RESTORE);
+
+	if (restore_signals(args->siginfo, args->siginfo_n, false))
+		goto core_restore_end;
+
+	restore_finish_stage(CR_STATE_RESTORE_SIGCHLD);
+	restore_pdeath_sig(args);
+
+	/* Only logged here, the mode itself is not set by this function */
+	if (args->ta->seccomp_mode != SECCOMP_MODE_DISABLED)
+		pr_info("Restoring seccomp mode %d for %ld\n", args->ta->seccomp_mode, sys_getpid());
+
+	restore_finish_stage(CR_STATE_RESTORE_CREDS);
+
+	futex_dec_and_wake(&thread_inprogress);
+
+	sp = (long)frame + SIGFRAME_OFFSET;
+	rst_sigreturn(sp);
+
+core_restore_end:
+	pr_err("Restorer abnormal termination for %ld\n", sys_getpid());
+	futex_abort_and_wake(&task_entries->nr_in_progress);
+	sys_exit_group(1);
+	return -1;
+}
+
+/*
+ * Point the exe link at args->fd_exe_link via PR_SET_MM_EXE_FILE and
+ * close that descriptor whatever the outcome. Returns the prctl result.
+ */
+static long restore_self_exe_late(struct task_restore_args *args)
+{
+	int exe_fd = args->fd_exe_link;
+	int err;
+
+	pr_info("Restoring EXE link\n");
+
+	err = sys_prctl_safe(PR_SET_MM, PR_SET_MM_EXE_FILE, exe_fd, 0);
+	if (err != 0)
+		pr_err("Can't restore EXE link (%d)\n", err);
+
+	sys_close(exe_fd);
+	return err;
+}
+
+/*
+ * Map one VMA at the address recorded in @vma_entry.
+ *
+ * SysV IPC areas are attached with sys_shmat(); everything else goes
+ * through sys_mmap() with MAP_FIXED. The descriptor stored in the entry,
+ * if any, is closed once the mapping is made.
+ *
+ * Returns the raw sys_shmat()/sys_mmap() result; the caller compares it
+ * against vma_entry->start.
+ */
+static unsigned long restore_mapping(const VmaEntry *vma_entry)
+{
+	int prot = vma_entry->prot;
+	int flags = vma_entry->flags | MAP_FIXED;
+	unsigned long addr;
+
+	if (vma_entry_is(vma_entry, VMA_AREA_SYSVIPC))
+		return sys_shmat(vma_entry->fd, decode_pointer(vma_entry->start),
+				 (vma_entry->prot & PROT_WRITE) ? 0 : SHM_RDONLY);
+
+	/*
+	 * Restore of shared mappings is tricky, since
+	 * we open anonymous mapping via map_files/
+	 * MAP_ANONYMOUS should be eliminated so fd would
+	 * be taken into account by a kernel.
+	 *
+	 * "No descriptor" is tested as plain -1, exactly like the two
+	 * checks further down. The former "-1UL" form gives the same
+	 * answer only while fd is no wider than unsigned long
+	 * (NOTE(review): fd's declared type is not visible here).
+	 */
+	if (vma_entry_is(vma_entry, VMA_ANON_SHARED) && (vma_entry->fd != -1))
+		flags &= ~MAP_ANONYMOUS;
+
+	/* A mapping of file with MAP_SHARED is up to date */
+	if (vma_entry->fd == -1 || !(vma_entry->flags & MAP_SHARED))
+		prot |= PROT_WRITE;
+
+	pr_debug("\tmmap(%"PRIx64" -> %"PRIx64", %x %x %d)\n",
+			vma_entry->start, vma_entry->end,
+			prot, flags, (int)vma_entry->fd);
+	/*
+	 * Should map memory here. Note we map them as
+	 * writable since we're going to restore page
+	 * contents.
+	 */
+	addr = sys_mmap(decode_pointer(vma_entry->start),
+			vma_entry_len(vma_entry),
+			prot, flags,
+			vma_entry->fd,
+			vma_entry->pgoff);
+
+	if (vma_entry->fd != -1)
+		sys_close(vma_entry->fd);
+
+	return addr;
+}
+
+/*
+ * Turn repair mode off on the socket in @rts and then re-apply its saved
+ * SO_REUSEADDR value. A setsockopt failure is logged only.
+ */
+static void rst_tcp_repair_off(struct rst_tcp_sock *rts)
+{
+	int reuse = rts->reuseaddr;
+	int err;
+
+	pr_debug("pie: Turning repair off for %d (reuse %d)\n", rts->sk, reuse);
+	tcp_repair_off(rts->sk);
+
+	err = sys_setsockopt(rts->sk, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof(reuse));
+	if (err < 0)
+		pr_err("Failed to restore of SO_REUSEADDR on socket (%d)\n", err);
+}
+
+/* Run rst_tcp_repair_off() on every TCP socket listed in @ta. */
+static void rst_tcp_socks_all(struct task_restore_args *ta)
+{
+	int idx = 0;
+
+	while (idx < ta->tcp_socks_n) {
+		rst_tcp_repair_off(&ta->tcp_socks[idx]);
+		idx++;
+	}
+}
+
+/*
+ * Move @len bytes of mapping from @src to @dst.
+ *
+ * When the two ranges overlap the source is first parked at a temporary
+ * address, see the comment inside. Returns 0 on success (also when
+ * @src == @dst) and -1 when any of the mmap/mremap steps fails.
+ */
+static int vma_remap(unsigned long src, unsigned long dst, unsigned long len)
+{
+	unsigned long guard_page = 0, spare, res;
+
+	pr_info("Remap %lx->%lx len %lx\n", src, dst, len);
+
+	if (src == dst)
+		return 0;
+
+	/* Unsigned distance below len means the ranges overlap */
+	if (src - dst < len)
+		guard_page = dst;
+	else if (dst - src < len)
+		guard_page = dst + len - PAGE_SIZE;
+
+	if (guard_page) {
+		/*
+		 * mremap() returns an error if a target and source vma-s are
+		 * overlapped. In this case the source vma are remapped in
+		 * a temporary place and then remapped to the target address.
+		 * Here is one hack to find non-overlapped temporary place.
+		 *
+		 * 1. initial placement. We need to move src -> tgt.
+		 * |       |+++++src+++++|
+		 * |-----tgt-----|       |
+		 *
+		 * 2. map a guard page at the non-overlapped border of a target vma.
+		 * |       |+++++src+++++|
+		 * |G|----tgt----|       |
+		 *
+		 * 3. remap src to any other place.
+		 *    G prevents src from being remapped on tgt again
+		 * |       |-------------| -> |+++++src+++++|
+		 * |G|---tgt-----|       |
+		 *
+		 * 4. remap src to tgt, no overlapping any longer
+		 * |+++++src+++++|   <---- |-------------|
+		 * |G|---tgt-----|       |
+		 */
+
+		/* Map guard page (step 2) */
+		res = sys_mmap((void *) guard_page, PAGE_SIZE, PROT_NONE,
+				MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
+		if (res != guard_page) {
+			pr_err("Unable to map a guard page %lx (%lx)\n", guard_page, res);
+			return -1;
+		}
+
+		/* Move src to non-overlapping place (step 3) */
+		spare = sys_mmap(NULL, len, PROT_NONE,
+				MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
+		if (spare == (unsigned long) MAP_FAILED) {
+			pr_err("Unable to reserve memory (%lx)\n", spare);
+			return -1;
+		}
+
+		res = sys_mremap(src, len, len,
+				MREMAP_MAYMOVE | MREMAP_FIXED, spare);
+		if (res != spare) {
+			pr_err("Unable to remap %lx -> %lx (%lx)\n", src, spare, res);
+			return -1;
+		}
+
+		src = spare;
+	}
+
+	/* Final move onto the target (step 4, or the only step) */
+	res = sys_mremap(src, len, len, MREMAP_MAYMOVE | MREMAP_FIXED, dst);
+	if (res != dst) {
+		pr_err("Unable to remap %lx -> %lx\n", src, dst);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Arm every timerfd listed in @args and restore its tick counter.
+ *
+ * Returns 0 on success, -1 when the clock cannot be read, or the
+ * combined settime/ioctl result when arming fails.
+ */
+static int timerfd_arm(struct task_restore_args *args)
+{
+	int idx;
+
+	for (idx = 0; idx < args->timerfd_n; idx++) {
+		struct restore_timerfd *tfd = &args->timerfd[idx];
+		int err;
+
+		pr_debug("timerfd: arm for fd %d (%d)\n", tfd->fd, idx);
+
+		if (tfd->settime_flags & TFD_TIMER_ABSTIME) {
+			struct timespec now = { };
+
+			/*
+			 * We might need to adjust value because the checkpoint
+			 * and restore procedure takes some time itself. Note
+			 * we don't adjust nanoseconds, since the result may
+			 * overflow the limit NSEC_PER_SEC FIXME
+			 */
+			if (sys_clock_gettime(tfd->clockid, &now)) {
+				pr_err("Can't get current time\n");
+				return -1;
+			}
+
+			tfd->val.it_value.tv_sec += (time_t)now.tv_sec;
+
+			pr_debug("Ajust id %#x it_value(%llu, %llu) -> it_value(%llu, %llu)\n",
+				tfd->id, (unsigned long long)now.tv_sec,
+				(unsigned long long)now.tv_nsec,
+				(unsigned long long)tfd->val.it_value.tv_sec,
+				(unsigned long long)tfd->val.it_value.tv_nsec);
+		}
+
+		err = sys_timerfd_settime(tfd->fd, tfd->settime_flags, &tfd->val, NULL);
+		if (tfd->ticks)
+			err |= sys_ioctl(tfd->fd, TFD_IOC_SET_TICKS, (unsigned long)&tfd->ticks);
+
+		if (err) {
+			pr_err("Can't restore ticks/time for timerfd - %d\n", idx);
+			return err;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Create the POSIX timers listed in @args so that each one gets the id
+ * it had at dump time.
+ *
+ * A timer is created repeatedly, and deleted again, until the kernel
+ * hands out the wanted id. Returns 0 on success, a negative syscall
+ * result when creation or deletion fails, and -1 when the kernel has
+ * already gone past the wanted id.
+ */
+static int create_posix_timers(struct task_restore_args *args)
+{
+	struct sigevent sev;
+	kernel_timer_t got_id;
+	int idx, err;
+
+	for (idx = 0; idx < args->posix_timers_n; idx++) {
+		sev.sigev_notify = args->posix_timers[idx].spt.it_sigev_notify;
+		sev.sigev_signo = args->posix_timers[idx].spt.si_signo;
+		sev.sigev_value.sival_ptr = args->posix_timers[idx].spt.sival_ptr;
+
+		for (;;) {
+			err = sys_timer_create(args->posix_timers[idx].spt.clock_id, &sev, &got_id);
+			if (err < 0) {
+				pr_err("Can't create posix timer - %d\n", idx);
+				return err;
+			}
+
+			if (got_id == args->posix_timers[idx].spt.it_id)
+				break;
+
+			/* Wrong id: drop this timer and try the next one */
+			err = sys_timer_delete(got_id);
+			if (err < 0) {
+				pr_err("Can't remove temporaty posix timer 0x%x\n", got_id);
+				return err;
+			}
+
+			if ((long)got_id > args->posix_timers[idx].spt.it_id) {
+				pr_err("Can't create timers, kernel don't give them consequently\n");
+				return -1;
+			}
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Arm the POSIX timers created earlier with their saved values.
+ * Results of sys_timer_settime() are not checked.
+ */
+static void restore_posix_timers(struct task_restore_args *args)
+{
+	int idx;
+
+	for (idx = 0; idx < args->posix_timers_n; idx++) {
+		struct restore_posix_timer *timer = &args->posix_timers[idx];
+
+		sys_timer_settime((kernel_timer_t)timer->spt.it_id, 0, &timer->val, NULL);
+	}
+}
+static void *bootstrap_start; /* copied from args->bootstrap_start in __export_restore_task() */
+static unsigned int bootstrap_len; /* copied from args->bootstrap_len in __export_restore_task() */
+
+/*
+ * The sys_munmap() call in __export_unmap() below must not return
+ * here. The control process must trap us on the exit from sys_munmap.
+ *
+ * The unmapped length excludes the trailing vdso_rt_size bytes; that
+ * size is zero when CONFIG_VDSO is not defined.
+ */
+#ifdef CONFIG_VDSO
+static unsigned long vdso_rt_size;
+#else
+#define vdso_rt_size (0)
+#endif
+
+void __export_unmap(void)
+{
+ sys_munmap(bootstrap_start, bootstrap_len - vdso_rt_size);
+}
+
+/*
+ * This function unmaps all VMAs, which don't belong to
+ * the restored process or the restorer.
+ *
+ * The restorer memory is two regions -- area with restorer, its stack
+ * and arguments and the one with private vmas of the tasks we restore
+ * (a.k.a. premmaped area):
+ *
+ * 0                       task_size
+ * +----+====+----+====+---+
+ *
+ * Thus to unmap old memory we have to do 3 unmaps:
+ * [ 0 -- 1st area start ]
+ * [ 1st end -- 2nd start ]
+ * [ 2nd end -- task_size ]
+ *
+ * Returns 0 on success and -1 as soon as one of the unmaps fails.
+ */
+static int unmap_old_vmas(void *premmapped_addr, unsigned long premmapped_len,
+		void *bootstrap_start, unsigned long bootstrap_len,
+		unsigned long task_size)
+{
+	void *lo, *hi, *lo_end, *hi_end;
+	unsigned long lo_len, hi_len;
+	int err;
+
+	/* Order the two areas we keep by address */
+	if (premmapped_addr < bootstrap_start) {
+		lo = premmapped_addr;
+		lo_len = premmapped_len;
+		hi = bootstrap_start;
+		hi_len = bootstrap_len;
+	} else {
+		lo = bootstrap_start;
+		lo_len = bootstrap_len;
+		hi = premmapped_addr;
+		hi_len = premmapped_len;
+	}
+
+	lo_end = lo + lo_len;
+	hi_end = hi + hi_len;
+
+	/* Everything below the lower area */
+	err = sys_munmap(NULL, lo - NULL);
+	if (err) {
+		pr_err("Unable to unmap (%p-%p): %d\n", NULL, lo, err);
+		return -1;
+	}
+
+	/* The gap between the two areas */
+	err = sys_munmap(lo_end, hi - lo_end);
+	if (err) {
+		pr_err("Unable to unmap (%p-%p): %d\n", lo_end, hi, err);
+		return -1;
+	}
+
+	/* Everything above the upper area */
+	err = sys_munmap(hi_end, task_size - (unsigned long)hi_end);
+	if (err) {
+		pr_err("Unable to unmap (%p-%p): %d\n",
+				hi_end, (void *)task_size, err);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Wait for every helper process listed in @task_args and check that it
+ * exited with status zero.
+ *
+ * Returns 0 when all helpers are fine (or could not be waited for) and
+ * -1 when one of them was killed or exited with a non-zero code.
+ */
+static int wait_helpers(struct task_restore_args *task_args)
+{
+	int i;
+
+	for (i = 0; i < task_args->helpers_n; i++) {
+		int status = 0;
+		pid_t pid = task_args->helpers[i];
+
+		/*
+		 * Check that a helper completed.
+		 *
+		 * The sys_* wrappers hand back the raw kernel result,
+		 * i.e. a negative errno, not libc's -1. Comparing with
+		 * -1 would only match EPERM and let e.g. ECHILD fall
+		 * through to the checks below with a status that was
+		 * never written.
+		 */
+		if (sys_wait4(pid, &status, 0, NULL) < 0) {
+			/* Nothing to inspect for this helper */
+			continue;
+		}
+		if (!WIFEXITED(status) || WEXITSTATUS(status)) {
+			pr_err("%d exited with non-zero code (%d,%d)\n", pid,
+				WEXITSTATUS(status), WTERMSIG(status));
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Wait until every zombie listed in @task_args has exited, without
+ * reaping it (WNOWAIT), and account for each one in nr_in_progress.
+ *
+ * Returns 0 on success, -1 when waiting for one of them fails.
+ */
+static int wait_zombies(struct task_restore_args *task_args)
+{
+	int idx = 0;
+
+	atomic_add(task_args->zombies_n, &task_entries->nr_zombies);
+
+	while (idx < task_args->zombies_n) {
+		if (sys_waitid(P_PID, task_args->zombies[idx], NULL, WNOWAIT | WEXITED, NULL) < 0) {
+			pr_err("Wait on %d zombie failed\n", task_args->zombies[idx]);
+			return -1;
+		}
+
+		pr_debug("%ld: Collect a zombie with pid %d\n",
+			sys_getpid(), task_args->zombies[idx]);
+		futex_dec_and_wake(&task_entries->nr_in_progress);
+		idx++;
+	}
+
+	return 0;
+}
+
+/*
+ * The main routine to restore task via sigreturn.
+ * This one is very special, we never return from here
+ * but use sigreturn facility to restore core registers
+ * and jump execution to some predefined ip read from
+ * core file.
+ *
+ * In order, it: installs a temporary SIGCHLD handler, unmaps
+ * everything except the restorer and the premapped area, moves
+ * and maps the VMAs, sets up AIO rings and the mm fields, clones
+ * the other threads, restores rlimits and timers, re-queues
+ * pending signals, restores seccomp and creds and finally calls
+ * rst_sigreturn(). On any failure it aborts the nr_in_progress
+ * futex and exits the thread group with status 1.
+ */
+long __export_restore_task(struct task_restore_args *args)
+{
+ long ret = -1;
+ int i;
+ VmaEntry *vma_entry;
+ unsigned long va;
+
+ struct rt_sigframe *rt_sigframe;
+ struct prctl_mm_map prctl_map;
+ unsigned long new_sp;
+ k_rtsigset_t to_block;
+ pid_t my_pid = sys_getpid();
+ rt_sigaction_t act;
+
+ bootstrap_start = args->bootstrap_start;
+ bootstrap_len = args->bootstrap_len;
+
+#ifdef CONFIG_VDSO
+ vdso_rt_size = args->vdso_rt_size;
+#endif
+
+ task_entries = args->task_entries;
+ helpers = args->helpers;
+ n_helpers = args->helpers_n;
+ zombies = args->zombies;
+ n_zombies = args->zombies_n;
+ *args->breakpoint = rst_sigreturn; /* publish the address of the sigreturn trampoline */
+
+ ksigfillset(&act.rt_sa_mask);
+ act.rt_sa_handler = sigchld_handler;
+ act.rt_sa_flags = SA_SIGINFO | SA_RESTORER | SA_RESTART;
+ act.rt_sa_restorer = cr_restore_rt;
+ sys_sigaction(SIGCHLD, &act, NULL, sizeof(k_rtsigset_t));
+
+ log_set_fd(args->logfd);
+ log_set_loglevel(args->loglevel);
+
+ pr_info("Switched to the restorer %d\n", my_pid);
+
+ if (vdso_do_park(&args->vdso_sym_rt, args->vdso_rt_parked_at, vdso_rt_size))
+ goto core_restore_end;
+
+ if (unmap_old_vmas((void *)args->premmapped_addr, args->premmapped_len,
+ bootstrap_start, bootstrap_len, args->task_size))
+ goto core_restore_end;
+
+ /* Shift private vma-s to the left */
+ for (i = 0; i < args->vmas_n; i++) {
+ vma_entry = args->vmas + i;
+
+ if (!vma_entry_is_private(vma_entry, args->task_size))
+ continue;
+
+ if (vma_entry->end >= args->task_size)
+ continue;
+
+ if (vma_entry->start > vma_entry->shmid)
+ break;
+
+ if (vma_remap(vma_premmaped_start(vma_entry),
+ vma_entry->start, vma_entry_len(vma_entry)))
+ goto core_restore_end;
+ }
+
+ /* Shift private vma-s to the right */
+ for (i = args->vmas_n - 1; i >= 0; i--) {
+ vma_entry = args->vmas + i;
+
+ if (!vma_entry_is_private(vma_entry, args->task_size))
+ continue;
+
+ if (vma_entry->start > args->task_size)
+ continue;
+
+ if (vma_entry->start < vma_entry->shmid)
+ break;
+
+ if (vma_remap(vma_premmaped_start(vma_entry),
+ vma_entry->start, vma_entry_len(vma_entry)))
+ goto core_restore_end;
+ }
+
+ /*
+ * OK, lets try to map new one.
+ */
+ for (i = 0; i < args->vmas_n; i++) {
+ vma_entry = args->vmas + i;
+
+ if (!vma_entry_is(vma_entry, VMA_AREA_REGULAR))
+ continue;
+
+ if (vma_entry_is_private(vma_entry, args->task_size))
+ continue;
+
+ va = restore_mapping(vma_entry);
+
+ if (va != vma_entry->start) {
+ pr_err("Can't restore %"PRIx64" mapping with %lx\n", vma_entry->start, va);
+ goto core_restore_end;
+ }
+ }
+
+#ifdef CONFIG_VDSO
+ /*
+ * Proxify vDSO.
+ */
+ for (i = 0; i < args->vmas_n; i++) {
+ if (vma_entry_is(&args->vmas[i], VMA_AREA_VDSO) ||
+ vma_entry_is(&args->vmas[i], VMA_AREA_VVAR)) {
+ if (vdso_proxify("dumpee", &args->vdso_sym_rt,
+ args->vdso_rt_parked_at,
+ i, args->vmas, args->vmas_n))
+ goto core_restore_end;
+ break;
+ }
+ }
+#endif
+
+ /*
+ * Walk through all VMAs again to drop PROT_WRITE
+ * if it was not there.
+ */
+ for (i = 0; i < args->vmas_n; i++) {
+ vma_entry = args->vmas + i;
+
+ if (!(vma_entry_is(vma_entry, VMA_AREA_REGULAR)))
+ continue;
+
+ if (vma_entry->prot & PROT_WRITE)
+ continue;
+
+ sys_mprotect(decode_pointer(vma_entry->start),
+ vma_entry_len(vma_entry),
+ vma_entry->prot);
+ }
+
+ /*
+ * Finally restore madvise() bits
+ */
+ for (i = 0; i < args->vmas_n; i++) {
+ unsigned long m;
+
+ vma_entry = args->vmas + i;
+ if (!vma_entry->has_madv || !vma_entry->madv)
+ continue;
+
+ for (m = 0; m < sizeof(vma_entry->madv) * 8; m++) {
+ if (vma_entry->madv & (1ul << m)) {
+ ret = sys_madvise(vma_entry->start,
+ vma_entry_len(vma_entry),
+ m);
+ if (ret) {
+ pr_err("madvise(%"PRIx64", %"PRIu64", %ld) "
+ "failed with %ld\n",
+ vma_entry->start,
+ vma_entry_len(vma_entry),
+ m, ret);
+ goto core_restore_end;
+ }
+ }
+ }
+ }
+
+ /*
+ * Now when all VMAs are in their places time to set
+ * up AIO rings.
+ */
+
+ for (i = 0; i < args->rings_n; i++) {
+ struct rst_aio_ring *raio = &args->rings[i];
+ unsigned long ctx = 0;
+ int ret;
+
+ ret = sys_io_setup(raio->nr_req, &ctx);
+ if (ret < 0) {
+ pr_err("Ring setup failed with %d\n", ret);
+ goto core_restore_end;
+ }
+
+ if (ctx == raio->addr) /* The ring already sits at the wanted address */
+ continue;
+
+ /*
+ * If we failed to get the proper nr_req right and
+ * created smaller or larger ring, then this remap
+ * will (should) fail, since AIO rings has immutable
+ * size.
+ *
+ * This is not great, but anyway better than putting
+ * a ring of wrong size into correct place.
+ */
+
+ ctx = sys_mremap(ctx, raio->len, raio->len,
+ MREMAP_FIXED | MREMAP_MAYMOVE,
+ raio->addr);
+ if (ctx != raio->addr) {
+ pr_err("Ring remap failed with %ld\n", ctx);
+ goto core_restore_end;
+ }
+
+ /*
+ * Now check that kernel not just remapped the
+ * ring into new place, but updated the internal
+ * context state respectively.
+ */
+
+ ret = sys_io_getevents(ctx, 0, 1, NULL, NULL);
+ if (ret != 0) {
+ if (ret < 0)
+ pr_err("Kernel doesn't remap AIO rings\n");
+ else
+ pr_err("AIO context screwed up\n");
+
+ goto core_restore_end;
+ }
+ }
+
+ ret = 0;
+
+ /*
+ * Tune up the task fields.
+ */
+ ret = sys_prctl_safe(PR_SET_NAME, (long)args->comm, 0, 0);
+ if (ret)
+ goto core_restore_end;
+
+ /*
+ * New kernel interface with @PR_SET_MM_MAP will become
+ * more widespread once kernel get deployed over the world.
+ * Thus lets be opportunistic and use new interface as a try.
+ * On -EINVAL we fall back to setting the fields one by one.
+ */
+ prctl_map = (struct prctl_mm_map) {
+ .start_code = args->mm.mm_start_code,
+ .end_code = args->mm.mm_end_code,
+ .start_data = args->mm.mm_start_data,
+ .end_data = args->mm.mm_end_data,
+ .start_stack = args->mm.mm_start_stack,
+ .start_brk = args->mm.mm_start_brk,
+ .brk = args->mm.mm_brk,
+ .arg_start = args->mm.mm_arg_start,
+ .arg_end = args->mm.mm_arg_end,
+ .env_start = args->mm.mm_env_start,
+ .env_end = args->mm.mm_env_end,
+ .auxv = (void *)args->mm_saved_auxv,
+ .auxv_size = args->mm_saved_auxv_size,
+ .exe_fd = args->fd_exe_link,
+ };
+ ret = sys_prctl(PR_SET_MM, PR_SET_MM_MAP, (long)&prctl_map, sizeof(prctl_map), 0);
+ if (ret == -EINVAL) {
+ ret = sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_CODE, (long)args->mm.mm_start_code, 0);
+ ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_END_CODE, (long)args->mm.mm_end_code, 0);
+ ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_DATA, (long)args->mm.mm_start_data, 0);
+ ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_END_DATA, (long)args->mm.mm_end_data, 0);
+ ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_STACK, (long)args->mm.mm_start_stack, 0);
+ ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_BRK, (long)args->mm.mm_start_brk, 0);
+ ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_BRK, (long)args->mm.mm_brk, 0);
+ ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ARG_START, (long)args->mm.mm_arg_start, 0);
+ ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ARG_END, (long)args->mm.mm_arg_end, 0);
+ ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ENV_START, (long)args->mm.mm_env_start, 0);
+ ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ENV_END, (long)args->mm.mm_env_end, 0);
+ ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_AUXV, (long)args->mm_saved_auxv, args->mm_saved_auxv_size);
+
+ /*
+ * Because of requirements applied from kernel side
+ * we need to restore /proc/pid/exe symlink late,
+ * after old existing VMAs are superseded with
+ * new ones from image file.
+ */
+ ret |= restore_self_exe_late(args);
+ } else
+ sys_close(args->fd_exe_link);
+
+ if (ret)
+ goto core_restore_end;
+
+ /*
+ * We need to prepare a valid sigframe here, so
+ * after sigreturn the kernel will pick up the
+ * registers from the frame, set them up and
+ * finally pass execution to the new IP.
+ */
+ rt_sigframe = (void *)args->t->mem_zone.rt_sigframe;
+
+ if (restore_thread_common(rt_sigframe, args->t))
+ goto core_restore_end;
+
+ /*
+ * Threads restoration. This requires some more comments. This
+ * restorer routine and thread restorer routine has the following
+ * memory map, prepared by a caller code.
+ *
+ * | <-- low addresses high addresses --> |
+ * +-------------------------------------------------------+-----------------------+
+ * | this proc body | own stack | rt_sigframe space | thread restore zone |
+ * +-------------------------------------------------------+-----------------------+
+ *
+ * where each thread restore zone is the following
+ *
+ * | <-- low addresses high addresses --> |
+ * +--------------------------------------------------------------------------+
+ * | thread restore proc | thread1 stack | thread1 rt_sigframe |
+ * +--------------------------------------------------------------------------+
+ */
+
+ if (args->nr_threads > 1) {
+ struct thread_restore_args *thread_args = args->thread_args;
+ long clone_flags = CLONE_VM | CLONE_FILES | CLONE_SIGHAND |
+ CLONE_THREAD | CLONE_SYSVSEM;
+ long last_pid_len;
+ long parent_tid;
+ int i, fd;
+
+ fd = sys_openat(args->proc_fd, LAST_PID_PATH, O_RDWR, 0);
+ if (fd < 0) {
+ pr_err("can't open last pid fd %d\n", fd);
+ goto core_restore_end;
+ }
+
+ ret = sys_flock(fd, LOCK_EX);
+ if (ret) {
+ pr_err("Can't lock last_pid %d\n", fd);
+ sys_close(fd);
+ goto core_restore_end;
+ }
+
+ for (i = 0; i < args->nr_threads; i++) {
+ char last_pid_buf[16], *s;
+
+ /* skip self */
+ if (thread_args[i].pid == args->t->pid)
+ continue;
+
+ new_sp = restorer_stack(thread_args + i);
+ last_pid_len = vprint_num(last_pid_buf, sizeof(last_pid_buf), thread_args[i].pid - 1, &s);
+ sys_lseek(fd, 0, SEEK_SET);
+ ret = sys_write(fd, s, last_pid_len);
+ if (ret < 0) {
+ pr_err("Can't set last_pid %ld/%s\n", ret, last_pid_buf);
+ sys_close(fd);
+ goto core_restore_end;
+ }
+
+ /*
+ * To achieve functionality like libc's clone()
+ * we need a pure assembly here, because clone()'ed
+ * thread will run with own stack and we must not
+ * have any additional instructions... oh, dear...
+ */
+
+ RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, thread_args, args->clone_restore_fn);
+ }
+
+ ret = sys_flock(fd, LOCK_UN);
+ if (ret) {
+ pr_err("Can't unlock last_pid %ld\n", ret);
+ sys_close(fd);
+ goto core_restore_end;
+ }
+
+ sys_close(fd);
+ }
+
+ restore_rlims(args);
+
+ ret = create_posix_timers(args);
+ if (ret < 0) {
+ pr_err("Can't restore posix timers %ld\n", ret);
+ goto core_restore_end;
+ }
+
+ ret = timerfd_arm(args);
+ if (ret < 0) {
+ pr_err("Can't restore timerfd %ld\n", ret);
+ goto core_restore_end;
+ }
+
+ pr_info("%ld: Restored\n", sys_getpid());
+
+ restore_finish_stage(CR_STATE_RESTORE);
+
+ if (wait_zombies(args) < 0)
+ goto core_restore_end;
+
+ if (wait_helpers(args) < 0)
+ goto core_restore_end;
+
+ ksigfillset(&to_block);
+ ret = sys_sigprocmask(SIG_SETMASK, &to_block, NULL, sizeof(k_rtsigset_t));
+ if (ret) {
+ pr_err("Unable to block signals %ld\n", ret);
+ goto core_restore_end;
+ }
+
+ sys_sigaction(SIGCHLD, &args->sigchld_act, NULL, sizeof(k_rtsigset_t));
+
+ ret = restore_signals(args->siginfo, args->siginfo_n, true);
+ if (ret)
+ goto core_restore_end;
+
+ ret = restore_signals(args->t->siginfo, args->t->siginfo_n, false);
+ if (ret)
+ goto core_restore_end;
+
+ restore_finish_stage(CR_STATE_RESTORE_SIGCHLD);
+
+ rst_tcp_socks_all(args);
+
+ /* The kernel restricts setting seccomp to uid 0 in the current user
+ * ns, so we must do this before restore_creds.
+ */
+ pr_info("restoring seccomp mode %d for %ld\n", args->seccomp_mode, sys_getpid());
+ if (restore_seccomp(args))
+ goto core_restore_end;
+
+ /*
+ * Writing to last-pid is CAP_SYS_ADMIN protected,
+ * turning off TCP repair is CAP_NET_ADMIN protected,
+ * thus restore* creds _after_ all of the above.
+ */
+ ret = restore_creds(args->t->creds_args, args->proc_fd);
+ ret = ret || restore_dumpable_flag(&args->mm);
+ ret = ret || restore_pdeath_sig(args->t);
+
+ futex_set_and_wake(&thread_inprogress, args->nr_threads);
+
+ restore_finish_stage(CR_STATE_RESTORE_CREDS);
+
+ if (ret)
+ BUG();
+
+ /* Wait until children stop to use args->task_entries */
+ futex_wait_while_gt(&thread_inprogress, 1);
+
+ sys_close(args->proc_fd);
+ log_set_fd(-1);
+
+ /*
+ * The code that prepared the itimers makes sure the
+ * code below doesn't fail due to bad timing values.
+ */
+
+#define itimer_armed(args, i) \
+ (args->itimers[i].it_interval.tv_sec || \
+ args->itimers[i].it_interval.tv_usec)
+
+ if (itimer_armed(args, 0))
+ sys_setitimer(ITIMER_REAL, &args->itimers[0], NULL);
+ if (itimer_armed(args, 1))
+ sys_setitimer(ITIMER_VIRTUAL, &args->itimers[1], NULL);
+ if (itimer_armed(args, 2))
+ sys_setitimer(ITIMER_PROF, &args->itimers[2], NULL);
+
+ restore_posix_timers(args);
+
+ sys_munmap(args->rst_mem, args->rst_mem_size);
+
+ /*
+ * Sigframe stack.
+ */
+ new_sp = (long)rt_sigframe + SIGFRAME_OFFSET;
+
+ /*
+ * Prepare the stack and call for sigreturn,
+ * pure assembly since we don't need any additional
+ * code insns from gcc.
+ */
+ rst_sigreturn(new_sp);
+
+core_restore_end:
+ futex_abort_and_wake(&task_entries->nr_in_progress);
+ pr_err("Restorer fail %ld\n", sys_getpid());
+ sys_exit_group(1);
+ return -1;
+}
diff --git a/criu/pie/util-fd.c b/criu/pie/util-fd.c
new file mode 100644
index 000000000000..d90fd12b236c
--- /dev/null
+++ b/criu/pie/util-fd.c
@@ -0,0 +1,168 @@
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <sys/mount.h>
+
+#include <errno.h>
+
+#include "compiler.h"
+#include "log.h"
+#include "asm/string.h"
+#include "asm/types.h"
+
+#ifdef CR_NOGLIBC
+# include "syscall.h"
+# define __sys(foo) sys_##foo
+#else
+# define __sys(foo) foo
+#endif
+
+#include "util-pie.h"
+#include "fcntl.h"
+
+#include "bug.h"
+
+/*
+ * Resize the control message of @fdset so that the next message
+ * carries exactly @nr_fds descriptors.
+ */
+static void scm_fdset_init_chunk(struct scm_fdset *fdset, int nr_fds)
+{
+	struct cmsghdr *first;
+
+	/* The header length must be set first: CMSG_FIRSTHDR() looks at it. */
+	fdset->hdr.msg_controllen = CMSG_LEN(sizeof(int) * nr_fds);
+
+	first = CMSG_FIRSTHDR(&fdset->hdr);
+	first->cmsg_len = fdset->hdr.msg_controllen;
+}
+
+/*
+ * Prepare @fdset for passing descriptors via SCM_RIGHTS.
+ *
+ * The payload iovec covers the whole per-descriptor options array when
+ * @with_flags is set and only its first byte otherwise; the control
+ * buffer is sized for CR_SCM_MAX_FD descriptors. @saddr/@saddr_len
+ * become the message name.
+ *
+ * Returns a pointer to the descriptor slots of the control message.
+ */
+static int *scm_fdset_init(struct scm_fdset *fdset, struct sockaddr_un *saddr,
+		int saddr_len, bool with_flags)
+{
+	struct cmsghdr *ctl;
+
+	BUILD_BUG_ON(sizeof(fdset->msg_buf) < (CMSG_SPACE(sizeof(int) * CR_SCM_MAX_FD)));
+
+	/* Message name (address of the peer, if any). */
+	fdset->hdr.msg_name = (struct sockaddr *)saddr;
+	fdset->hdr.msg_namelen = saddr_len;
+
+	/* Payload. */
+	fdset->iov.iov_base = fdset->opts;
+	if (with_flags)
+		fdset->iov.iov_len = sizeof(fdset->opts);
+	else
+		fdset->iov.iov_len = 1;
+	fdset->hdr.msg_iov = &fdset->iov;
+	fdset->hdr.msg_iovlen = 1;
+
+	/* Ancillary data: a single SCM_RIGHTS message of maximum size. */
+	fdset->hdr.msg_control = &fdset->msg_buf;
+	fdset->hdr.msg_controllen = CMSG_LEN(sizeof(int) * CR_SCM_MAX_FD);
+
+	ctl = CMSG_FIRSTHDR(&fdset->hdr);
+	ctl->cmsg_len = fdset->hdr.msg_controllen;
+	ctl->cmsg_level = SOL_SOCKET;
+	ctl->cmsg_type = SCM_RIGHTS;
+
+	return (int *)CMSG_DATA(ctl);
+}
+
+/*
+ * Send @nr_fds descriptors from @fds over @sock to @saddr (@len is the
+ * address length), at most CR_SCM_MAX_FD descriptors per message.
+ *
+ * When @with_flags is set, every message additionally carries one
+ * struct fd_opts per descriptor holding its F_GETFD flags and, if an
+ * owner is set, the owner pid/type and uids.
+ *
+ * Returns 0 on success, -1 when querying a descriptor fails, and the
+ * sendmsg() result (or -1 if that result is 0) when sending fails.
+ */
+int send_fds(int sock, struct sockaddr_un *saddr, int len,
+ int *fds, int nr_fds, bool with_flags)
+{
+ struct scm_fdset fdset;
+ int *cmsg_data;
+ int i, min_fd, ret;
+
+ cmsg_data = scm_fdset_init(&fdset, saddr, len, with_flags);
+ for (i = 0; i < nr_fds; i += min_fd) {
+ /* Size of this chunk: whatever is left, capped by CR_SCM_MAX_FD */
+ min_fd = min(CR_SCM_MAX_FD, nr_fds - i);
+ scm_fdset_init_chunk(&fdset, min_fd);
+ builtin_memcpy(cmsg_data, &fds[i], sizeof(int) * min_fd);
+
+ if (with_flags) {
+ int j;
+
+ /* Fill the options slot matching each descriptor of the chunk */
+ for (j = 0; j < min_fd; j++) {
+ int flags, fd = fds[i + j];
+ struct fd_opts *p = fdset.opts + j;
+ struct f_owner_ex owner_ex;
+ u32 v[2];
+
+ flags = __sys(fcntl)(fd, F_GETFD, 0);
+ if (flags < 0) {
+ pr_err("fcntl(%d, F_GETFD) -> %d\n", fd, flags);
+ return -1;
+ }
+
+ p->flags = (char)flags;
+
+ ret = __sys(fcntl)(fd, F_GETOWN_EX, (long)&owner_ex);
+ if (ret) {
+ pr_err("fcntl(%d, F_GETOWN_EX) -> %d\n", fd, ret);
+ return -1;
+ }
+
+ /*
+ * Simple case -- nothing is changed.
+ */
+ if (owner_ex.pid == 0) {
+ p->fown.pid = 0;
+ continue;
+ }
+
+ ret = __sys(fcntl)(fd, F_GETOWNER_UIDS, (long)&v);
+ if (ret) {
+ pr_err("fcntl(%d, F_GETOWNER_UIDS) -> %d\n", fd, ret);
+ return -1;
+ }
+
+ p->fown.uid = v[0];
+ p->fown.euid = v[1];
+ p->fown.pid_type = owner_ex.type;
+ p->fown.pid = owner_ex.pid;
+ }
+ }
+
+ ret = __sys(sendmsg)(sock, &fdset.hdr, 0);
+ /* A zero-byte send is reported as an error too */
+ if (ret <= 0)
+ return ret ? : -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Receive @nr_fds descriptors from @sock into @fds, reading one
+ * SCM_RIGHTS message per chunk of at most CR_SCM_MAX_FD descriptors.
+ * If @opts is not NULL the per-descriptor options that came as message
+ * payload are copied into it as well.
+ *
+ * Returns 0 on success; on failure the recvmsg() result (or -1 if that
+ * result is 0), -EINVAL when no SCM_RIGHTS control message is present,
+ * -ENFILE when the control data got truncated, or -1 when the message
+ * carries no descriptors.
+ */
+int recv_fds(int sock, int *fds, int nr_fds, struct fd_opts *opts)
+{
+ struct scm_fdset fdset;
+ struct cmsghdr *cmsg;
+ int *cmsg_data;
+ int ret;
+ int i, min_fd;
+
+ cmsg_data = scm_fdset_init(&fdset, NULL, 0, opts != NULL);
+ for (i = 0; i < nr_fds; i += min_fd) {
+ /* Ask for as many descriptors as are still missing, capped per message */
+ min_fd = min(CR_SCM_MAX_FD, nr_fds - i);
+ scm_fdset_init_chunk(&fdset, min_fd);
+
+ ret = __sys(recvmsg)(sock, &fdset.hdr, 0);
+ if (ret <= 0)
+ return ret ? : -1;
+
+ cmsg = CMSG_FIRSTHDR(&fdset.hdr);
+ if (!cmsg || cmsg->cmsg_type != SCM_RIGHTS)
+ return -EINVAL;
+ if (fdset.hdr.msg_flags & MSG_CTRUNC)
+ return -ENFILE;
+
+ /* Number of descriptors actually delivered in this message */
+ min_fd = (cmsg->cmsg_len - sizeof(struct cmsghdr)) / sizeof(int);
+ /*
+ * In case the kernel screwed the recipient, most probably
+ * the caller's stack frame will be overwritten, just scream
+ * and exit.
+ *
+ * FIXME Need to sanitize util.h to be able to include it
+ * into files which do not have glibc and a couple of
+ * sys_write_ helpers. Meanwhile opencoded BUG_ON here.
+ */
+ BUG_ON(min_fd > CR_SCM_MAX_FD);
+
+ if (unlikely(min_fd <= 0))
+ return -1;
+ builtin_memcpy(&fds[i], cmsg_data, sizeof(int) * min_fd);
+ if (opts)
+ builtin_memcpy(opts + i, fdset.opts, sizeof(struct fd_opts) * min_fd);
+ }
+
+ return 0;
+}
diff --git a/criu/pie/util-vdso.c b/criu/pie/util-vdso.c
new file mode 100644
index 000000000000..e93b110fe43b
--- /dev/null
+++ b/criu/pie/util-vdso.c
@@ -0,0 +1,210 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <elf.h>
+#include <fcntl.h>
+#include <errno.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+
+#include "asm/string.h"
+#include "asm/types.h"
+
+#include "image.h"
+#include "util-vdso.h"
+#include "vma.h"
+#include "log.h"
+#include "bug.h"
+
+#ifdef LOG_PREFIX
+# undef LOG_PREFIX
+#endif
+#define LOG_PREFIX "vdso: "
+
+/*
+ * Check if pointer is out-of-bound.
+ *
+ * The valid area is [@start, @start + @size). The one-past-the-end
+ * address is treated as out of bounds as well, since callers go on to
+ * dereference the pointer they have just checked.
+ */
+static bool __ptr_oob(void *ptr, void *start, size_t size)
+{
+	void *end = (void *)((unsigned long)start + size);
+	return ptr >= end || ptr < start;
+}
+
+/*
+ * Symbol name hash as defined by the ELF format specification
+ * for the DT_HASH table.
+ */
+static unsigned long elf_hash(const unsigned char *name)
+{
+	unsigned long hash = 0;
+	const unsigned char *c;
+
+	for (c = name; *c != '\0'; c++) {
+		unsigned long top;
+
+		hash = (hash << 4) + *c;
+
+		/* Fold the top nibble of the 32-bit value back in and drop it. */
+		top = hash & 0xf0000000ul;
+		if (top != 0)
+			hash ^= top >> 24;
+		hash &= ~top;
+	}
+
+	return hash;
+}
+
+/*
+ * Parse the ELF image found at @mem (@size bytes) and record in @t the
+ * offsets of the symbols listed in ARCH_VDSO_SYMBOLS, relative to the
+ * p_vaddr of the single PT_LOAD segment.
+ *
+ * Symbols are looked up through the DT_HASH table. A symbol that is
+ * not found leaves its slot in @t untouched.
+ *
+ * Returns 0 on success, -EINVAL if the image is not a supported ELF
+ * file or lacks a required header/dynamic tag, -EFAULT if one of the
+ * checked structures lies outside of the image.
+ */
+int vdso_fill_symtable(char *mem, size_t size, struct vdso_symtable *t)
+{
+	const char *vdso_symbols[VDSO_SYMBOL_MAX] = {
+		ARCH_VDSO_SYMBOLS
+	};
+
+	Elf64_Phdr *dynamic = NULL, *load = NULL;
+	Elf64_Ehdr *ehdr = (void *)mem;
+	Elf64_Dyn *dyn_strtab = NULL;
+	Elf64_Dyn *dyn_symtab = NULL;
+	Elf64_Dyn *dyn_strsz = NULL;
+	Elf64_Dyn *dyn_syment = NULL;
+	Elf64_Dyn *dyn_hash = NULL;
+	Elf64_Word *hash = NULL;
+	Elf64_Phdr *phdr;
+	Elf64_Dyn *d;
+
+	Elf64_Word *bucket, *chain;
+	Elf64_Word nbucket, nchain;
+
+	/*
+	 * See Elf specification for these magic values.
+	 */
+	static const char elf_ident[] = {
+		0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00,
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	};
+
+	char *dynsymbol_names;
+	unsigned int i, j, k;
+
+	BUILD_BUG_ON(sizeof(elf_ident) != sizeof(ehdr->e_ident));
+
+	pr_debug("Parsing at %lx %lx\n", (long)mem, (long)mem + (long)size);
+
+	/*
+	 * Make sure it's a file we support.
+	 */
+	if (builtin_memcmp(ehdr->e_ident, elf_ident, sizeof(elf_ident))) {
+		pr_err("Elf header magic mismatch\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * We need PT_LOAD and PT_DYNAMIC here. Each once.
+	 */
+	phdr = (void *)&mem[ehdr->e_phoff];
+	for (i = 0; i < ehdr->e_phnum; i++, phdr++) {
+		if (__ptr_oob(phdr, mem, size))
+			goto err_oob;
+		switch (phdr->p_type) {
+		case PT_DYNAMIC:
+			if (dynamic) {
+				pr_err("Second PT_DYNAMIC header\n");
+				return -EINVAL;
+			}
+			dynamic = phdr;
+			break;
+		case PT_LOAD:
+			if (load) {
+				pr_err("Second PT_LOAD header\n");
+				return -EINVAL;
+			}
+			load = phdr;
+			break;
+		}
+	}
+
+	if (!load || !dynamic) {
+		pr_err("One of obligated program headers is missed\n");
+		return -EINVAL;
+	}
+
+	pr_debug("PT_LOAD p_vaddr: %lx\n", (unsigned long)load->p_vaddr);
+
+	/*
+	 * Dynamic section tags should provide us the rest of information
+	 * needed. Note that we're interested in a small set of tags.
+	 */
+	d = (void *)&mem[dynamic->p_offset];
+	for (i = 0; i < dynamic->p_filesz / sizeof(*d); i++, d++) {
+		if (__ptr_oob(d, mem, size))
+			goto err_oob;
+
+		if (d->d_tag == DT_NULL) {
+			break;
+		} else if (d->d_tag == DT_STRTAB) {
+			dyn_strtab = d;
+			pr_debug("DT_STRTAB: %lx\n", (unsigned long)d->d_un.d_ptr);
+		} else if (d->d_tag == DT_SYMTAB) {
+			dyn_symtab = d;
+			pr_debug("DT_SYMTAB: %lx\n", (unsigned long)d->d_un.d_ptr);
+		} else if (d->d_tag == DT_STRSZ) {
+			dyn_strsz = d;
+			pr_debug("DT_STRSZ: %lx\n", (unsigned long)d->d_un.d_val);
+		} else if (d->d_tag == DT_SYMENT) {
+			dyn_syment = d;
+			pr_debug("DT_SYMENT: %lx\n", (unsigned long)d->d_un.d_val);
+		} else if (d->d_tag == DT_HASH) {
+			dyn_hash = d;
+			pr_debug("DT_HASH: %lx\n", (unsigned long)d->d_un.d_ptr);
+		}
+	}
+
+	if (!dyn_strtab || !dyn_symtab || !dyn_strsz || !dyn_syment || !dyn_hash) {
+		pr_err("Not all dynamic entries are present\n");
+		return -EINVAL;
+	}
+
+	dynsymbol_names = &mem[dyn_strtab->d_un.d_val - load->p_vaddr];
+	if (__ptr_oob(dynsymbol_names, mem, size))
+		goto err_oob;
+
+	hash = (void *)&mem[(unsigned long)dyn_hash->d_un.d_ptr - (unsigned long)load->p_vaddr];
+	if (__ptr_oob(hash, mem, size))
+		goto err_oob;
+
+	/* Hash table layout: nbucket, nchain, bucket[nbucket], chain[nchain] */
+	nbucket = hash[0];
+	nchain = hash[1];
+	bucket = &hash[2];
+	chain = &hash[nbucket + 2];
+
+	pr_debug("nbucket %lx nchain %lx bucket %lx chain %lx\n",
+		 (long)nbucket, (long)nchain, (unsigned long)bucket, (unsigned long)chain);
+
+	/* The bucket index below is computed modulo nbucket. */
+	if (!nbucket) {
+		pr_err("Empty hash table\n");
+		return -EINVAL;
+	}
+
+	for (i = 0; i < VDSO_SYMBOL_MAX; i++) {
+		const char * symbol = vdso_symbols[i];
+		k = elf_hash((const unsigned char *)symbol);
+
+		/*
+		 * Walk the chain until the STN_UNDEF index terminates it.
+		 * The test is on the index itself, so that the last symbol
+		 * of a chain (the one whose chain[] slot holds STN_UNDEF)
+		 * is examined too.
+		 */
+		for (j = bucket[k % nbucket]; j < nchain && j != STN_UNDEF; j = chain[j]) {
+			Elf64_Sym *sym = (void *)&mem[dyn_symtab->d_un.d_ptr - load->p_vaddr];
+			char *name;
+
+			sym = &sym[j];
+			if (__ptr_oob(sym, mem, size))
+				continue;
+
+			if (ELF64_ST_TYPE(sym->st_info) != STT_FUNC &&
+			    ELF64_ST_BIND(sym->st_info) != STB_GLOBAL)
+				continue;
+
+			name = &dynsymbol_names[sym->st_name];
+			if (__ptr_oob(name, mem, size))
+				continue;
+
+			if (builtin_strcmp(name, symbol))
+				continue;
+
+			builtin_memcpy(t->symbols[i].name, name, sizeof(t->symbols[i].name));
+			t->symbols[i].offset = (unsigned long)sym->st_value - load->p_vaddr;
+			break;
+		}
+	}
+
+	return 0;
+
+err_oob:
+	pr_err("Corrupted Elf data\n");
+	return -EFAULT;
+}
+
diff --git a/criu/pie/util.c b/criu/pie/util.c
new file mode 100644
index 000000000000..354667294e37
--- /dev/null
+++ b/criu/pie/util.c
@@ -0,0 +1,47 @@
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <sys/mount.h>
+#include <unistd.h>
+#include <errno.h>
+
+#include "compiler.h"
+#include "asm/string.h"
+#include "asm/types.h"
+#include "fcntl.h"
+#include "log.h"
+#include "util-pie.h"
+
+#ifdef CR_NOGLIBC
+# include "syscall.h"
+# define __sys(foo) sys_##foo
+#else
+# define __sys(foo) foo
+#endif
+
+/*
+ * Open the directory @dir, then detach whatever is mounted on it
+ * (MNT_DETACH) and remove the directory itself.
+ *
+ * A failure to open is only logged: the detach and removal are still
+ * attempted and the (negative) open result is what gets returned when
+ * they succeed. Returns -1 if the detach or the removal fails.
+ */
+int open_detach_mount(char *dir)
+{
+	int fd, ret;
+
+	fd = __sys(open)(dir, O_RDONLY | O_DIRECTORY, 0);
+	if (fd < 0)
+		pr_err("Can't open directory %s: %d\n", dir, fd);
+
+	ret = __sys(umount2)(dir, MNT_DETACH);
+	if (ret) {
+		pr_err("Can't detach mount %s: %d\n", dir, ret);
+	} else {
+		ret = __sys(rmdir)(dir);
+		if (!ret)
+			return fd;
+
+		pr_err("Can't remove tmp dir %s: %d\n", dir, ret);
+	}
+
+	/* Something went wrong, don't leak the descriptor. */
+	if (fd >= 0)
+		__sys(close)(fd);
+	return -1;
+}
diff --git a/criu/pipes.c b/criu/pipes.c
new file mode 100644
index 000000000000..a1552127837f
--- /dev/null
+++ b/criu/pipes.c
@@ -0,0 +1,521 @@
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+
+#include "imgset.h"
+#include "image.h"
+#include "files.h"
+#include "pipes.h"
+#include "util-pie.h"
+
+#include "protobuf.h"
+#include "protobuf/pipe.pb-c.h"
+#include "protobuf/pipe-data.pb-c.h"
+
+/* All collected pipe entries, linked via pipe_info->list (see collect_one_pipe) */
+static LIST_HEAD(pipes);
+
+/*
+ * Log the entry ID of @pi followed by every fd/pid pair linked to
+ * its file descriptor.
+ */
+static void show_saved_pipe_fds(struct pipe_info *pi)
+{
+	struct fdinfo_list_entry *fle;
+
+	/* The line used to end in a literal "pn" instead of a newline */
+	pr_info(" `- ID %p %#x\n", pi, pi->pe->id);
+	list_for_each_entry(fle, &pi->d.fd_info_head, desc_list)
+		pr_info(" `- FD %d pid %d\n", fle->fe->fd, fle->pid);
+}
+
+/*
+ * Read the payload described by r->pde from @img into a freshly mapped
+ * anonymous area stored in r->data.
+ *
+ * Returns 0 right away when there is nothing to read, -1 when the
+ * mapping fails, otherwise whatever read_img_buf() returns.
+ */
+static int pipe_data_read(struct cr_img *img, struct pipe_data_rst *r)
+{
+	unsigned long len = r->pde->bytes;
+
+	if (len == 0)
+		return 0;
+
+	/*
+	 * The mapping may be bigger than the data itself, which is fine:
+	 * restore_pipe_data vmsplice-s it into the kernel with the F_GIFT
+	 * flag (this works on non-aligned data for some time already),
+	 * i.e. the pages are handed over to the pipe buffer. The kernel
+	 * allocates pipe buffers in pages anyway, so no extra memory is
+	 * consumed.
+	 */
+	r->data = mmap(NULL, len, PROT_READ | PROT_WRITE,
+		       MAP_SHARED | MAP_ANON, 0, 0);
+	if (r->data != MAP_FAILED)
+		return read_img_buf(img, r->data, len);
+
+	pr_perror("Can't map mem for pipe buffers");
+	return -1;
+}
+
+/*
+ * Read every PB_PIPE_DATA record (with its payload) from the image of
+ * type @img_type and link it into @hash, indexed by
+ * pipe_id & PIPE_DATA_HASH_MASK.
+ *
+ * Returns -1 if the image can't be opened or a record can't be
+ * allocated, otherwise the result of the read that ended the loop.
+ */
+int collect_pipe_data(int img_type, struct pipe_data_rst **hash)
+{
+	int ret;
+	struct cr_img *img;
+	struct pipe_data_rst *r = NULL;
+
+	img = open_image(img_type, O_RSTR);
+	if (!img)
+		return -1;
+
+	while (1) {
+		ret = -1;
+		r = xmalloc(sizeof(*r));
+		if (!r)
+			break;
+
+		/*
+		 * xmalloc() hands out uninitialised memory, while the
+		 * cleanup below looks at r->pde. Start from a known state
+		 * so that it never sees garbage if reading fails early.
+		 */
+		r->pde = NULL;
+		r->data = NULL;
+
+		ret = pb_read_one_eof(img, &r->pde, PB_PIPE_DATA);
+		if (ret <= 0)
+			break;
+
+		ret = pipe_data_read(img, r);
+		if (ret < 0)
+			break;
+
+		ret = r->pde->pipe_id & PIPE_DATA_HASH_MASK;
+		r->next = hash[ret];
+		hash[ret] = r;
+
+		pr_info("Collected pipe data for %#x (chain %u)\n",
+			r->pde->pipe_id, ret);
+	}
+
+	/* The record the loop stopped on was not hashed, release it. */
+	if (r && r->pde)
+		pipe_data_entry__free_unpacked(r->pde, NULL);
+	xfree(r);
+
+	close_image(img);
+	return ret;
+}
+
+/*
+ * Choose who will restore a pipe.
+ *
+ * Takes entries off the global "pipes" list one group at a time: the
+ * first entry plus everything chained to it via ->pipe_list. Within a
+ * group, the entry that ends up as "p" (the first one, replaced by any
+ * later one for which fdinfo_rst_prio() returns true against the
+ * current candidate) gets ->create set. The first entry seen without
+ * O_LARGEFILE for each direction (O_WRONLY set / not set) gets ->reopen
+ * cleared. Handled entries are parked on a local list and spliced back
+ * into "pipes" at the end.
+ */
+void mark_pipe_master(void)
+{
+ LIST_HEAD(head);
+
+ pr_info("Pipes:\n");
+
+ while (1) {
+ struct fdinfo_list_entry *fle;
+ struct pipe_info *pi, *pic, *p;
+ struct pipe_info *pr = NULL, *pw = NULL;
+
+ if (list_empty(&pipes))
+ break;
+
+ pi = list_first_entry(&pipes, struct pipe_info, list);
+ list_move(&pi->list, &head);
+
+ pr_info(" `- PIPE ID %#x\n", pi->pe->pipe_id);
+ show_saved_pipe_fds(pi);
+
+ /* The first entry is the initial candidate for creating the pipe */
+ fle = file_master(&pi->d);
+ p = pi;
+ if (!(pi->pe->flags & O_LARGEFILE)) {
+ if (pi->pe->flags & O_WRONLY) {
+ if (pw == NULL)
+ pw = pi;
+ } else {
+ if (pr == NULL)
+ pr = pi;
+ }
+ }
+
+ /* Same checks for every other entry of this pipe */
+ list_for_each_entry(pic, &pi->pipe_list, pipe_list) {
+ struct fdinfo_list_entry *f;
+
+ list_move(&pic->list, &head);
+ f = file_master(&pic->d);
+ if (fdinfo_rst_prio(f, fle)) {
+ p = pic;
+ fle = f;
+ }
+
+ if (!(pic->pe->flags & O_LARGEFILE)) {
+ if (pic->pe->flags & O_WRONLY) {
+ if (pw == NULL)
+ pw = pic;
+ } else {
+ if (pr == NULL)
+ pr = pic;
+ }
+ }
+
+ show_saved_pipe_fds(pic);
+ }
+ p->create = 1;
+ if (pr)
+ pr->reopen = 0;
+ if (pw)
+ pw->reopen = 0;
+ pr_info(" by %#x\n", p->pe->id);
+ }
+
+ /* Put everything back onto the global list */
+ list_splice(&head, &pipes);
+}
+
+/* Data read from the CR_FD_PIPES_DATA image, filled in by collect_pipes() */
+static struct pipe_data_rst *pd_hash_pipes[PIPE_DATA_HASH_SIZE];
+
+/*
+ * Push the data collected for pipe @id (looked up in @hash) into the
+ * pipe descriptor @pfd and, if the record carries a size, restore the
+ * pipe size with F_SETPIPE_SZ.
+ *
+ * @img_type is not used by this function.
+ *
+ * Returns 0 on success or when there is no record for @id, -1 when the
+ * data has already been restored (pd->data is NULL) or when vmsplice()
+ * moves zero or too many bytes, and the negative vmsplice()/fcntl()
+ * result when one of those calls fails.
+ */
+int restore_pipe_data(int img_type, int pfd, u32 id, struct pipe_data_rst **hash)
+{
+ int ret;
+ struct pipe_data_rst *pd;
+ struct iovec iov;
+
+ /* Find the record for this pipe in its hash chain */
+ for (pd = hash[id & PIPE_DATA_HASH_MASK]; pd != NULL; pd = pd->next)
+ if (pd->pde->pipe_id == id)
+ break;
+
+ if (!pd) { /* no data for this pipe */
+ pr_info("No data for pipe %#x\n", id);
+ return 0;
+ }
+
+ /* Nothing to splice, but the size may still need restoring */
+ if (!pd->pde->bytes)
+ goto out;
+
+ if (!pd->data) {
+ pr_err("Double data restore occurred on %#x\n", id);
+ return -1;
+ }
+
+ iov.iov_base = pd->data;
+ iov.iov_len = pd->pde->bytes;
+
+ /* vmsplice() may move less than asked for, so loop until done */
+ while (iov.iov_len > 0) {
+ ret = vmsplice(pfd, &iov, 1, SPLICE_F_GIFT | SPLICE_F_NONBLOCK);
+ if (ret < 0) {
+ pr_perror("%#x: Error splicing data", id);
+ goto err;
+ }
+
+ if (ret == 0 || ret > iov.iov_len /* sanity */) {
+ pr_err("%#x: Wanted to restore %zu bytes, but got %d\n", id,
+ iov.iov_len, ret);
+ ret = -1;
+ goto err;
+ }
+
+ iov.iov_base += ret;
+ iov.iov_len -= ret;
+ }
+
+ /*
+ * 3 reasons for killing the buffer from our address space:
+ *
+ * 1. We gifted the pages to the kernel to optimize memory usage, thus
+ * accidental memory corruption can change the pipe buffer.
+ * 2. This will make the vmas restoration a bit faster due to less self
+ * mappings to be unmapped.
+ * 3. We can catch bugs with double pipe data restore.
+ */
+
+ munmap(pd->data, pd->pde->bytes);
+ pd->data = NULL;
+out:
+ ret = 0;
+ if (pd->pde->has_size) {
+ pr_info("Restoring size %#x for %#x\n",
+ pd->pde->size, pd->pde->pipe_id);
+ ret = fcntl(pfd, F_SETPIPE_SZ, pd->pde->size);
+ if (ret < 0)
+ pr_perror("Can't restore pipe size");
+ else
+ ret = 0;
+ }
+err:
+ return ret;
+}
+
+/*
+ * Open the pipe behind @fd once more with @flags, going through its
+ * /proc/self/fd link. @fd is closed no matter how the open went.
+ *
+ * Returns the new descriptor or the negative open() result.
+ */
+static int reopen_pipe(int fd, int flags)
+{
+	char proc_link[PSFDS];
+	int new_fd;
+
+	sprintf(proc_link, "/proc/self/fd/%d", fd);
+
+	new_fd = open(proc_link, flags);
+	if (new_fd < 0)
+		pr_perror("Unable to reopen the pipe %s", proc_link);
+
+	close(fd);
+	return new_fd;
+}
+
+/*
+ * Get the pipe descriptor for @pi from whoever created the pipe.
+ *
+ * The descriptor is received over the master fd of @pi, which is
+ * closed afterwards. The received pipe is reopened when pi->reopen is
+ * set and then gets its file parameters restored.
+ *
+ * Returns the resulting descriptor, a negative value if reopening
+ * failed, or -1 if receiving or restoring the parameters failed.
+ */
+static int recv_pipe_fd(struct pipe_info *pi)
+{
+	struct fdinfo_list_entry *master;
+	int transport, received, fd;
+
+	master = file_master(&pi->d);
+	transport = master->fe->fd;
+
+	pr_info("\tWaiting fd for %d\n", transport);
+
+	received = recv_fd(transport);
+	if (received < 0) {
+		pr_err("Can't get fd %d\n", received);
+		return -1;
+	}
+	close(transport);
+
+	fd = pi->reopen ? reopen_pipe(received, pi->pe->flags) : received;
+	if (fd < 0)
+		return fd;
+
+	if (rst_file_params(fd, pi->pe->fown, pi->pe->flags)) {
+		close(fd);
+		return -1;
+	}
+
+	return fd;
+}
+
+/*
+ * Format the "pipe:[<pipe_id>]" name of the pipe behind @d into @buf,
+ * which is @s bytes long.
+ *
+ * The id is printed as an unsigned number so that large values don't
+ * come out negative. Returns @buf, or NULL if formatting failed or the
+ * name did not fit.
+ */
+static char *pipe_d_name(struct file_desc *d, char *buf, size_t s)
+{
+	struct pipe_info *pi;
+	int len;
+
+	pi = container_of(d, struct pipe_info, d);
+	len = snprintf(buf, s, "pipe:[%u]", pi->pe->pipe_id);
+	/* snprintf() returns int: rule out errors before comparing with @s */
+	if (len < 0 || (size_t)len >= s) {
+		pr_err("Not enough room for pipe %u identifier string\n",
+		       pi->pe->pipe_id);
+		return NULL;
+	}
+
+	return buf;
+}
+
+/*
+ * Open the pipe end described by @d.
+ *
+ * Three cases are handled:
+ *  - the descriptor is inherited: it is used as is (after a reopen);
+ *  - this entry does not create the pipe: the descriptor is received
+ *    from the creator, see recv_pipe_fd();
+ *  - this entry creates the pipe: the pipe is made, its data restored,
+ *    the proper end is sent to every entry on pi->pipe_list and the
+ *    end this entry does not need is closed.
+ *
+ * Returns the descriptor on success and a negative value on failure.
+ * Descriptors created here are closed on every failure path.
+ */
+static int open_pipe(struct file_desc *d)
+{
+	struct pipe_info *pi, *p;
+	int ret, tmp;
+	int pfd[2];
+	int sock;
+
+	pi = container_of(d, struct pipe_info, d);
+	pr_info("\t\tCreating pipe pipe_id=%#x id=%#x\n", pi->pe->pipe_id, pi->pe->id);
+	if (inherited_fd(d, &tmp)) {
+		if (tmp < 0)
+			return tmp;
+
+		pi->reopen = 1;
+		goto out;
+	}
+
+	if (!pi->create)
+		return recv_pipe_fd(pi);
+
+	if (pipe(pfd) < 0) {
+		pr_perror("Can't create pipe");
+		return -1;
+	}
+
+	ret = restore_pipe_data(CR_FD_PIPES_DATA, pfd[1],
+				pi->pe->pipe_id, pd_hash_pipes);
+	if (ret)
+		goto err_close_pipe;
+
+	sock = socket(PF_UNIX, SOCK_DGRAM, 0);
+	if (sock < 0) {
+		pr_perror("Can't create socket");
+		goto err_close_pipe;
+	}
+
+	list_for_each_entry(p, &pi->pipe_list, pipe_list) {
+		struct fdinfo_list_entry *fle;
+		int fd;
+
+		fle = file_master(&p->d);
+		/* pfd[0] is the read end, pfd[1] the write end */
+		fd = pfd[p->pe->flags & O_WRONLY];
+
+		if (send_fd_to_peer(fd, fle, sock)) {
+			pr_perror("Can't send file descriptor");
+			close(sock);
+			goto err_close_pipe;
+		}
+	}
+
+	close(sock);
+
+	/* Keep only the end this entry stands for */
+	close(pfd[!(pi->pe->flags & O_WRONLY)]);
+	tmp = pfd[pi->pe->flags & O_WRONLY];
+
+out:
+	if (pi->reopen)
+		tmp = reopen_pipe(tmp, pi->pe->flags);
+
+	if (tmp >= 0) {
+		if (rst_file_params(tmp, pi->pe->fown, pi->pe->flags)) {
+			/* Same as recv_pipe_fd(): don't leak the descriptor */
+			close(tmp);
+			return -1;
+		}
+	}
+
+	return tmp;
+
+err_close_pipe:
+	close(pfd[0]);
+	close(pfd[1]);
+	return -1;
+}
+
+/*
+ * Tell whether the pipe behind @d wants a transport: true (1) for
+ * every entry that is not marked to create the pipe. @fe is not used.
+ */
+static int want_transport(FdinfoEntry *fe, struct file_desc *d)
+{
+	struct pipe_info *pi = container_of(d, struct pipe_info, d);
+
+	if (pi->create)
+		return 0;
+
+	return 1;
+}
+
+/*
+ * Restore-side operations for FD_TYPES__PIPE descriptors; attached to
+ * each pipe entry by collect_one_pipe() via file_desc_add().
+ */
+static struct file_desc_ops pipe_desc_ops = {
+ .type = FD_TYPES__PIPE,
+ .open = open_pipe,
+ .want_transport = want_transport,
+ .name = pipe_d_name,
+};
+
+/*
+ * Set up the pipe_info @o for the PipeEntry message @base: the entry
+ * starts with create = 0 and reopen = 1, is registered as a file
+ * descriptor with pipe_desc_ops and appended to the global "pipes"
+ * list. Unless the descriptor is inherited, the entry is also chained
+ * (via pipe_list) to the first already collected entry with the same
+ * pipe_id.
+ *
+ * Returns 0 on success, -1 if file_desc_add() fails.
+ */
+static int collect_one_pipe(void *o, ProtobufCMessage *base)
+{
+ struct pipe_info *pi = o, *tmp;
+
+ pi->pe = pb_msg(base, PipeEntry);
+
+ pi->create = 0;
+ pi->reopen = 1;
+ pr_info("Collected pipe entry ID %#x PIPE ID %#x\n",
+ pi->pe->id, pi->pe->pipe_id);
+
+ if (file_desc_add(&pi->d, pi->pe->id, &pipe_desc_ops))
+ return -1;
+
+ INIT_LIST_HEAD(&pi->pipe_list);
+ if (!inherited_fd(&pi->d, NULL)) {
+ list_for_each_entry(tmp, &pipes, list)
+ if (pi->pe->pipe_id == tmp->pe->pipe_id)
+ break;
+
+ /* The cursor only points at the list head if nothing matched */
+ if (&tmp->list != &pipes)
+ list_add(&pi->pipe_list, &tmp->pipe_list);
+ }
+
+ list_add_tail(&pi->list, &pipes);
+
+ return 0;
+}
+
+/*
+ * Collect descriptor for the CR_FD_PIPES image: PB_PIPE entries, each
+ * with sizeof(struct pipe_info) bytes of private data, handled by
+ * collect_one_pipe().
+ */
+struct collect_image_info pipe_cinfo = {
+ .fd_type = CR_FD_PIPES,
+ .pb_type = PB_PIPE,
+ .priv_size = sizeof(struct pipe_info),
+ .collect = collect_one_pipe,
+};
+
+/*
+ * Load the contents of the CR_FD_PIPES_DATA image into pd_hash_pipes.
+ * Returns whatever collect_pipe_data() returns.
+ */
+int collect_pipes(void)
+{
+	int ret;
+
+	ret = collect_pipe_data(CR_FD_PIPES_DATA, pd_hash_pipes);
+
+	return ret;
+}
+
+/*
+ * Dump the data sitting in the pipe behind @lfd into the image selected
+ * by pd->img_type.
+ *
+ * Nothing is done for write ends (O_WRONLY) and for pipes whose id is
+ * already recorded in @pd. Otherwise the data is duplicated with tee()
+ * into a temporary "steal" pipe, so the original pipe keeps its
+ * contents, and spliced from there into the image right after a
+ * PB_PIPE_DATA record carrying the pipe id, byte count and pipe size.
+ *
+ * Returns 0 on success (or when there is nothing to do), -1 on error.
+ */
+int dump_one_pipe_data(struct pipe_data_dump *pd, int lfd, const struct fd_parms *p)
+{
+	struct cr_img *img;
+	int pipe_size, i, bytes;
+	int steal_pipe[2];
+	int ret = -1;
+	PipeDataEntry pde = PIPE_DATA_ENTRY__INIT;
+
+	if (p->flags & O_WRONLY)
+		return 0;
+
+	/* Maybe we've dumped it already */
+	for (i = 0; i < pd->nr; i++) {
+		if (pd->ids[i] == pipe_id(p))
+			return 0;
+	}
+
+	pr_info("Dumping data from pipe %#x fd %d\n", pipe_id(p), lfd);
+
+	if (pd->nr >= NR_PIPES_WITH_DATA) {
+		pr_err("OOM storing pipe\n");
+		return -1;
+	}
+
+	img = img_from_set(glob_imgset, pd->img_type);
+	pd->ids[pd->nr++] = pipe_id(p);
+
+	pipe_size = fcntl(lfd, F_GETPIPE_SZ);
+	if (pipe_size < 0) {
+		pr_err("Can't obtain piped data size\n");
+		goto err;
+	}
+
+	if (pipe(steal_pipe) < 0) {
+		pr_perror("Can't create pipe for stealing data");
+		goto err;
+	}
+
+	/*
+	 * A new pipe comes with the default capacity, which may be
+	 * smaller than the pipe being dumped. tee() would then copy
+	 * only what fits and the rest of the data would silently be
+	 * missing from the image, so make the steal pipe as big as
+	 * the original one.
+	 */
+	if (fcntl(steal_pipe[1], F_SETPIPE_SZ, pipe_size) < 0) {
+		pr_perror("Unable to set the steal pipe size");
+		goto err_close;
+	}
+
+	bytes = tee(lfd, steal_pipe[1], pipe_size, SPLICE_F_NONBLOCK);
+	if (bytes < 0) {
+		/* EAGAIN is taken as "no data in the pipe" */
+		if (errno != EAGAIN) {
+			pr_perror("Can't pick pipe data");
+			goto err_close;
+		}
+
+		bytes = 0;
+	}
+
+	pde.pipe_id = pipe_id(p);
+	pde.bytes = bytes;
+	pde.has_size = true;
+	pde.size = pipe_size;
+
+	if (pb_write_one(img, &pde, PB_PIPE_DATA))
+		goto err_close;
+
+	if (bytes) {
+		int wrote;
+
+		wrote = splice(steal_pipe[0], NULL, img_raw_fd(img), NULL, bytes, 0);
+		if (wrote < 0) {
+			pr_perror("Can't push pipe data");
+			goto err_close;
+		} else if (wrote != bytes) {
+			pr_err("%#x: Wanted to write %d bytes, but wrote %d\n",
+			       pipe_id(p), bytes, wrote);
+			goto err_close;
+		}
+	}
+
+	ret = 0;
+
+err_close:
+	close(steal_pipe[0]);
+	close(steal_pipe[1]);
+err:
+	return ret;
+}
+
+/* Dump state handed to dump_one_pipe_data(): data goes to CR_FD_PIPES_DATA */
+static struct pipe_data_dump pd_pipes = { .img_type = CR_FD_PIPES_DATA, };
+
+/*
+ * Write the PipeEntry for the pipe descriptor @lfd (file id @id,
+ * parameters @p) into the CR_FD_PIPES image and then dump the pipe's
+ * data.
+ *
+ * Pipes opened with O_DIRECT are refused. Returns -1 on error,
+ * otherwise the result of dump_one_pipe_data().
+ */
+static int dump_one_pipe(int lfd, u32 id, const struct fd_parms *p)
+{
+	PipeEntry entry = PIPE_ENTRY__INIT;
+
+	pr_info("Dumping pipe %d with id %#x pipe_id %#x\n",
+		lfd, id, pipe_id(p));
+
+	if (p->flags & O_DIRECT) {
+		pr_err("The packetized mode for pipes is not supported yet\n");
+		return -1;
+	}
+
+	entry.id = id;
+	entry.pipe_id = pipe_id(p);
+	entry.flags = p->flags;
+	entry.fown = (FownEntry *)&p->fown;
+
+	if (pb_write_one(img_from_set(glob_imgset, CR_FD_PIPES), &entry, PB_PIPE) != 0)
+		return -1;
+
+	return dump_one_pipe_data(&pd_pipes, lfd, p);
+}
+
+/* Dump-side operations for FD_TYPES__PIPE descriptors, see dump_one_pipe() */
+const struct fdtype_ops pipe_dump_ops = {
+ .type = FD_TYPES__PIPE,
+ .dump = dump_one_pipe,
+};
diff --git a/criu/plugin.c b/criu/plugin.c
new file mode 100644
index 000000000000..f764ae7d3e63
--- /dev/null
+++ b/criu/plugin.c
@@ -0,0 +1,247 @@
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <dirent.h>
+#include <stdio.h>
+#include <errno.h>
+#include <dlfcn.h>
+
+#include "cr_options.h"
+#include "compiler.h"
+#include "xmalloc.h"
+#include "plugin.h"
+#include "list.h"
+#include "log.h"
+
+/* Plugin registry: the list of loaded plugins (head) and the per-hook chains */
+cr_plugin_ctl_t cr_plugin_ctl;
+
+/*
+ * If we met an old version of a plugin, self-generate a plugin
+ * descriptor for it.
+ *
+ * The descriptor is allocated here, named after @path and filled with
+ * whatever old-style cr_plugin_* symbols can be found in the library
+ * handle @h. Returns NULL when running out of memory.
+ */
+static cr_plugin_desc_t *cr_gen_plugin_desc(void *h, char *path)
+{
+	cr_plugin_desc_t *d;
+
+	d = xzalloc(sizeof(*d));
+	if (!d)
+		return NULL;
+
+	d->name = strdup(path);
+	if (!d->name) {
+		/* The name is printed with %s later on, NULL won't do */
+		pr_err("Can't allocate name for plugin %s\n", path);
+		xfree(d);
+		return NULL;
+	}
+	d->max_hooks = CR_PLUGIN_HOOK__MAX;
+	d->version = CRIU_PLUGIN_VERSION_OLD;
+
+	/* Mind the trailing spaces: the literals are glued together */
+	pr_warn("Generating dynamic descriptor for plugin `%s'. "
+		"Won't work in next version of the program. "
+		"Please update your plugin.\n", path);
+
+#define __assign_hook(__hook, __name)					\
+	do {								\
+		void *name;						\
+		name = dlsym(h, __name);				\
+		if (name)						\
+			d->hooks[CR_PLUGIN_HOOK__ ##__hook] = name;	\
+	} while (0)
+
+	__assign_hook(DUMP_UNIX_SK, "cr_plugin_dump_unix_sk");
+	__assign_hook(RESTORE_UNIX_SK, "cr_plugin_restore_unix_sk");
+	__assign_hook(DUMP_EXT_FILE, "cr_plugin_dump_file");
+	__assign_hook(RESTORE_EXT_FILE, "cr_plugin_restore_file");
+	__assign_hook(DUMP_EXT_MOUNT, "cr_plugin_dump_ext_mount");
+	__assign_hook(RESTORE_EXT_MOUNT, "cr_plugin_restore_ext_mount");
+	__assign_hook(DUMP_EXT_LINK, "cr_plugin_dump_ext_link");
+
+#undef __assign_hook
+
+	d->init = dlsym(h, "cr_plugin_init");
+	d->exit = dlsym(h, "cr_plugin_fini");
+
+	return d;
+}
+
+/*
+ * Print the name, version and hook count of plugin descriptor @d
+ * followed by one line per hook that is set (debug log level).
+ */
+static void show_plugin_desc(cr_plugin_desc_t *d)
+{
+	size_t hook;
+
+	pr_debug("Plugin \"%s\" (version %u hooks %u)\n",
+		 d->name, d->version, d->max_hooks);
+
+	for (hook = 0; hook < d->max_hooks; hook++) {
+		if (!d->hooks[hook])
+			continue;
+
+		pr_debug("\t%4zu -> %p\n", hook, d->hooks[hook]);
+	}
+}
+
+/*
+ * Check that plugin descriptor @d is something we can work with:
+ * neither its version nor its number of hooks may exceed what this
+ * build supports.
+ *
+ * Returns 0 if the descriptor is fine, -1 otherwise.
+ */
+static int verify_plugin(cr_plugin_desc_t *d)
+{
+	int ret = -1;
+
+	if (d->version > CRIU_PLUGIN_VERSION) {
+		pr_debug("Plugin %s has version %x while max %x supported\n",
+			 d->name, d->version, CRIU_PLUGIN_VERSION);
+	} else if (d->max_hooks > CR_PLUGIN_HOOK__MAX) {
+		pr_debug("Plugin %s has %u assigned while max %u supported\n",
+			 d->name, d->max_hooks, CR_PLUGIN_HOOK__MAX);
+	} else {
+		ret = 0;
+	}
+
+	return ret;
+}
+
+/*
+ * Load the plugin library at @path, register it in cr_plugin_ctl and
+ * run its init callback for @stage.
+ *
+ * Returns 0 on success and -1 on failure, in which case everything
+ * acquired here (library handle, bookkeeping entry and a
+ * self-generated descriptor, if any) is released again.
+ */
+static int cr_lib_load(int stage, char *path)
+{
+	cr_plugin_desc_t *d;
+	plugin_desc_t *this = NULL;
+	int generated = 0;
+	size_t i;
+	void *h;
+
+	h = dlopen(path, RTLD_LAZY);
+	if (h == NULL) {
+		pr_err("Unable to load %s: %s\n", path, dlerror());
+		return -1;
+	}
+
+	/*
+	 * Load plugin descriptor. If plugin is too old -- create
+	 * dynamic plugin descriptor. In most cases this won't
+	 * be a common operation and plugins are not supposed to
+	 * be changing own format frequently.
+	 */
+	d = dlsym(h, "CR_PLUGIN_DESC");
+	if (!d) {
+		d = cr_gen_plugin_desc(h, path);
+		/* This one lives on the heap and is ours to free */
+		generated = 1;
+	}
+	if (!d) {
+		pr_err("Can't load plugin %s\n", path);
+		goto err;
+	}
+
+	this = xzalloc(sizeof(*this));
+	if (!this)
+		goto err;
+
+	if (verify_plugin(d)) {
+		pr_err("Corrupted plugin %s\n", path);
+		goto err;
+	}
+
+	this->d = d;
+	this->dlhandle = h;
+	INIT_LIST_HEAD(&this->list);
+
+	for (i = 0; i < d->max_hooks; i++)
+		INIT_LIST_HEAD(&this->link[i]);
+
+	list_add_tail(&this->list, &cr_plugin_ctl.head);
+	show_plugin_desc(d);
+
+	if (d->init && d->init(stage)) {
+		pr_err("Failed in init(%d) of \"%s\"\n", stage, d->name);
+		list_del(&this->list);
+		goto err;
+	}
+
+	/*
+	 * Chain hooks into appropriate places for
+	 * fast handler access.
+	 */
+	for (i = 0; i < d->max_hooks; i++) {
+		if (!d->hooks[i])
+			continue;
+		list_add_tail(&this->link[i], &cr_plugin_ctl.hook_chain[i]);
+	}
+
+	return 0;
+
+err:
+	xfree(this);
+	if (generated && d) {
+		xfree((void *)d->name);
+		xfree(d);
+	}
+	/* A descriptor exported by the library dies with the handle */
+	dlclose(h);
+	return -1;
+}
+
+/*
+ * Shut down every loaded plugin: call its exit callback with @stage
+ * and @ret, unchain its hooks, release the descriptor if it was
+ * self-generated (old-version plugin), close the library and free the
+ * bookkeeping entry.
+ */
+void cr_plugin_fini(int stage, int ret)
+{
+	plugin_desc_t *this, *tmp;
+
+	list_for_each_entry_safe(this, tmp, &cr_plugin_ctl.head, list) {
+		void *h = this->dlhandle;
+		size_t i;
+
+		list_del(&this->list);
+		if (this->d->exit)
+			this->d->exit(stage, ret);
+
+		for (i = 0; i < this->d->max_hooks; i++) {
+			if (!list_empty(&this->link[i]))
+				list_del(&this->link[i]);
+		}
+
+		/*
+		 * Must happen before dlclose(): a descriptor exported
+		 * by the library is gone once the handle is closed.
+		 */
+		if (this->d->version == CRIU_PLUGIN_VERSION_OLD) {
+			/* Generated descriptor: the name was strdup-ed */
+			xfree((void *)this->d->name);
+			xfree(this->d);
+		}
+		dlclose(h);
+
+		/* Unlinked from all the lists above, nobody needs it anymore */
+		xfree(this);
+	}
+}
+
+/*
+ * Initialize the plugin registry and load every "*.so" found in the
+ * plugins directory for @stage.
+ *
+ * The directory is opts.libdir if set, else $CRIU_LIBS_DIR, else
+ * CR_PLUGIN_DEFAULT; opts.libdir is updated with the choice. A missing
+ * default directory is not an error.
+ *
+ * Returns 0 on success. On failure all the plugins loaded so far are
+ * finalized and -1 is returned.
+ */
+int cr_plugin_init(int stage)
+{
+	int exit_code = -1;
+	char *env_dir;
+	size_t i;
+	DIR *d;
+
+	INIT_LIST_HEAD(&cr_plugin_ctl.head);
+	for (i = 0; i < ARRAY_SIZE(cr_plugin_ctl.hook_chain); i++)
+		INIT_LIST_HEAD(&cr_plugin_ctl.hook_chain[i]);
+
+	if (opts.libdir == NULL) {
+		env_dir = getenv("CRIU_LIBS_DIR");
+		if (env_dir)
+			opts.libdir = env_dir;
+		else {
+			if (access(CR_PLUGIN_DEFAULT, F_OK))
+				return 0;
+
+			opts.libdir = CR_PLUGIN_DEFAULT;
+		}
+	}
+
+	d = opendir(opts.libdir);
+	if (d == NULL) {
+		pr_perror("Unable to open directory %s", opts.libdir);
+		return -1;
+	}
+
+	while (1) {
+		char lib_path[PATH_MAX];
+		struct dirent *de;
+		int len, n;
+
+		/* readdir() returns NULL both at the end and on error */
+		errno = 0;
+		de = readdir(d);
+		if (de == NULL) {
+			if (errno == 0)
+				break;
+			pr_perror("Unable to read the libraries directory");
+			goto err;
+		}
+
+		len = strlen(de->d_name);
+
+		if (len < 3 || strncmp(de->d_name + len - 3, ".so", 3))
+			continue;
+
+		/* Never try to load a library by a cut-off path */
+		n = snprintf(lib_path, sizeof(lib_path), "%s/%s", opts.libdir, de->d_name);
+		if (n < 0 || (size_t)n >= sizeof(lib_path)) {
+			pr_err("Path to plugin %s/%s is too long\n",
+			       opts.libdir, de->d_name);
+			goto err;
+		}
+
+		if (cr_lib_load(stage, lib_path))
+			goto err;
+	}
+
+	exit_code = 0;
+err:
+	closedir(d);
+
+	if (exit_code)
+		cr_plugin_fini(stage, exit_code);
+
+	return exit_code;
+}
diff --git a/criu/proc_parse.c b/criu/proc_parse.c
new file mode 100644
index 000000000000..c7c577554bae
--- /dev/null
+++ b/criu/proc_parse.c
@@ -0,0 +1,2444 @@
+#include <stdio.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <errno.h>
+#include <sys/stat.h>
+#include <string.h>
+#include <ctype.h>
+#include <linux/fs.h>
+
+#include "asm/types.h"
+#include "list.h"
+#include "util.h"
+#include "mount.h"
+#include "mman.h"
+#include "cpu.h"
+#include "file-lock.h"
+#include "pstree.h"
+#include "fsnotify.h"
+#include "posix-timer.h"
+#include "kerndat.h"
+#include "vdso.h"
+#include "vma.h"
+#include "bfd.h"
+#include "proc_parse.h"
+#include "cr_options.h"
+#include "sysfs_parse.h"
+#include "seccomp.h"
+#include "string.h"
+#include "namespaces.h"
+#include "files-reg.h"
+
+#include "protobuf.h"
+#include "protobuf/fdinfo.pb-c.h"
+#include "protobuf/mnt.pb-c.h"
+
+#include <stdlib.h>
+
+/*
+ * Page-sized scratch area used by the parsers in this file, followed by
+ * one extra byte. NOTE(review): @end is presumably left untouched so it
+ * acts as a terminator when @buf is completely filled -- confirm.
+ */
+struct buffer {
+ char buf[PAGE_SIZE];
+ char end; /* '\0' */
+};
+
+/* The single static instance and a shorthand pointer to its bytes */
+static struct buffer __buf;
+static char *buf = __buf.buf;
+
+/* Size of the scratch bytes only; the trailing @end byte is excluded */
+#define BUF_SIZE sizeof(__buf.buf)
+
+/*
+ * This is what AIO ring buffers look like in proc
+ */
+
+#define AIO_FNAME "/[aio]"
+
+/* Skip a run of lower-case hexadecimal digits and return where it ends */
+static const char *vma_addr_skip(const char *p)
+{
+	while ((*p >= '0' && *p <= '9') || (*p >= 'a' && *p <= 'f'))
+		p++;
+	return p;
+}
+
+/*
+ * Check that @line starts with the "%lx-%lx " range used in maps/smaps:
+ * a (possibly empty) run of lower-case hex digits, a dash, another such
+ * run and a single space.
+ */
+static bool is_vma_range_fmt(char *line)
+{
+	const char *p = vma_addr_skip(line);
+
+	if (*p != '-')
+		return false;
+
+	p = vma_addr_skip(p + 1);
+
+	return *p == ' ';
+}
+
+/*
+ * Translate the two-letter codes of a "VmFlags:" smaps line into the
+ * flags, madv and status fields of @vma_area->e.
+ *
+ * @buf is tokenized in place with strtok() on spaces and newlines, so it
+ * is modified. Only the first two characters of each token are compared.
+ * Unknown codes are ignored. Always returns 0.
+ */
+static int parse_vmflags(char *buf, struct vma_area *vma_area)
+{
+ char *tok;
+ bool shared = false;
+ bool maywrite = false;
+
+ if (!buf[0])
+ return 0;
+
+ tok = strtok(buf, " \n");
+ if (!tok)
+ return 0;
+
+#define _vmflag_match(_t, _s) (_t[0] == _s[0] && _t[1] == _s[1])
+
+ do {
+ /* open() block */
+ if (_vmflag_match(tok, "sh"))
+ shared = true;
+ else if (_vmflag_match(tok, "mw"))
+ maywrite = true;
+
+ /* mmap() block */
+ if (_vmflag_match(tok, "gd"))
+ vma_area->e->flags |= MAP_GROWSDOWN;
+ else if (_vmflag_match(tok, "lo"))
+ vma_area->e->flags |= MAP_LOCKED;
+ else if (_vmflag_match(tok, "nr"))
+ vma_area->e->flags |= MAP_NORESERVE;
+ else if (_vmflag_match(tok, "ht"))
+ vma_area->e->flags |= MAP_HUGETLB;
+
+ /* madvise() block */
+ /* each advice is recorded as a bit at position MADV_* */
+ if (_vmflag_match(tok, "sr"))
+ vma_area->e->madv |= (1ul << MADV_SEQUENTIAL);
+ else if (_vmflag_match(tok, "rr"))
+ vma_area->e->madv |= (1ul << MADV_RANDOM);
+ else if (_vmflag_match(tok, "dc"))
+ vma_area->e->madv |= (1ul << MADV_DONTFORK);
+ else if (_vmflag_match(tok, "dd"))
+ vma_area->e->madv |= (1ul << MADV_DONTDUMP);
+ else if (_vmflag_match(tok, "mg"))
+ vma_area->e->madv |= (1ul << MADV_MERGEABLE);
+ else if (_vmflag_match(tok, "hg"))
+ vma_area->e->madv |= (1ul << MADV_HUGEPAGE);
+ else if (_vmflag_match(tok, "nh"))
+ vma_area->e->madv |= (1ul << MADV_NOHUGEPAGE);
+
+ /* vmsplice doesn't work for VM_IO and VM_PFNMAP mappings. */
+ if (_vmflag_match(tok, "io") || _vmflag_match(tok, "pf")) {
+ /*
+ * VVAR area mapped by the kernel as
+ * VM_IO | VM_PFNMAP| VM_DONTEXPAND | VM_DONTDUMP
+ */
+ if (!vma_area_is(vma_area, VMA_AREA_VVAR))
+ vma_area->e->status |= VMA_UNSUPP;
+ }
+
+ /*
+ * Anything else is just ignored.
+ */
+ } while ((tok = strtok(NULL, " \n")));
+
+#undef _vmflag_match
+
+ /* The file is to be opened read-write only for shared may-write maps */
+ if (shared && maywrite)
+ vma_area->e->fdflags = O_RDWR;
+ else
+ vma_area->e->fdflags = O_RDONLY;
+ vma_area->e->has_fdflags = true;
+
+ if (vma_area->e->madv)
+ vma_area->e->has_madv = true;
+
+ return 0;
+}
+
+/* Non-zero when @dev is the device number recorded in kdat.shmem_dev */
+static inline int is_anon_shmem_map(dev_t dev)
+{
+	return dev == kdat.shmem_dev;
+}
+
+/*
+ * Identity of the file behind a mapping as printed in smaps
+ * (device major:minor and inode), plus the vma it was read for.
+ */
+struct vma_file_info {
+	int dev_maj;
+	int dev_min;
+	unsigned long ino;
+	struct vma_area *vma;
+};
+
+/* Return 1 when @a and @b name the same dev:ino pair, 0 otherwise */
+static inline int vfi_equal(struct vma_file_info *a, struct vma_file_info *b)
+{
+	if (a->ino != b->ino)
+		return 0;
+	if (a->dev_maj != b->dev_maj)
+		return 0;
+	return a->dev_min == b->dev_min;
+}
+
+/*
+ * Find and open the file (if any) that backs @vma.
+ *
+ * @fname    - path column of the smaps line for this mapping
+ * @vma      - area being filled; vm_file_fd, vmst, status etc. are set here
+ * @mfd      - opened /proc/<pid>/map_files directory
+ * @vfi      - dev:ino parsed from the smaps line of @vma
+ * @prev_vfi - dev:ino and vma of the previously handled mapping
+ *
+ * When the previous mapping refers to the same dev:ino its descriptor is
+ * borrowed instead of opening the link again. Otherwise the map_files
+ * link is opened and the ENOENT, ENXIO and EPERM failures are handled
+ * individually (see the comments below).
+ *
+ * Returns 0 on success (including "no file behind this mapping") and -1
+ * on error.
+ */
+static int vma_get_mapfile(char *fname, struct vma_area *vma, DIR *mfd,
+		struct vma_file_info *vfi, struct vma_file_info *prev_vfi)
+{
+	char path[32];
+	int flags;
+
+	if (prev_vfi->vma && vfi_equal(vfi, prev_vfi)) {
+		struct vma_area *prev = prev_vfi->vma;
+
+		/*
+		 * If vfi is equal (!) and negative @vm_file_fd --
+		 * we have nothing to borrow for sure.
+		 */
+		if (prev->vm_file_fd < 0)
+			return 0;
+
+		pr_debug("vma %"PRIx64" borrows vfi from previous %"PRIx64"\n",
+				vma->e->start, prev->e->start);
+		vma->vm_file_fd = prev->vm_file_fd;
+		if (prev->e->status & VMA_AREA_SOCKET)
+			vma->e->status |= VMA_AREA_SOCKET | VMA_AREA_REGULAR;
+
+		/*
+		 * FIXME -- in theory there can be vmas that have
+		 * dev:ino match, but live in different mount
+		 * namespaces. However, we only borrow files for
+		 * subsequent vmas. These are _very_ likely to
+		 * have files from the same namespaces.
+		 */
+		vma->file_borrowed = true;
+
+		return 0;
+	}
+
+	/* Figure out if it's file mapping */
+	snprintf(path, sizeof(path), "%"PRIx64"-%"PRIx64, vma->e->start, vma->e->end);
+
+	/*
+	 * Note that we "open" it in dumper process space
+	 * so later we might refer to it via /proc/self/fd/vm_file_fd
+	 * if needed.
+	 */
+	flags = O_PATH;
+	if (vfi->dev_maj == 0)
+		/*
+		 * Opening with O_PATH omits calling kernel ->open
+		 * method, thus for some special files their type
+		 * detection might be broken. Thus we open those with
+		 * the O_RDONLY to potentially get ENXIO and check
+		 * it below.
+		 */
+		flags = O_RDONLY;
+
+	vma->vm_file_fd = openat(dirfd(mfd), path, flags);
+	if (vma->vm_file_fd < 0) {
+		if (errno == ENOENT)
+			/* Just mapping w/o map_files link */
+			return 0;
+
+		if (errno == ENXIO) {
+			struct stat buf;
+
+			if (fstatat(dirfd(mfd), path, &buf, 0))
+				return -1;
+
+			if (S_ISSOCK(buf.st_mode)) {
+				pr_info("Found socket mapping @%"PRIx64"\n", vma->e->start);
+				vma->vm_socket_id = buf.st_ino;
+				vma->e->status |= VMA_AREA_SOCKET | VMA_AREA_REGULAR;
+				return 0;
+			}
+
+			if ((buf.st_mode & S_IFMT) == 0 && !strncmp(fname, AIO_FNAME, sizeof(AIO_FNAME) - 1)) {
+				/*
+				 * AIO ring, let's try. The open above failed,
+				 * so there is no descriptor to close here.
+				 */
+				vma->aio_nr_req = -1;
+				vma->e->status = VMA_AREA_AIORING;
+				return 0;
+			}
+
+			pr_err("Unknown shit %o (%s)\n", buf.st_mode, fname);
+			return -1;
+		}
+
+		if (errno == EPERM && !opts.aufs) {
+			int fd;
+			dev_t vfi_dev;
+
+			/*
+			 * Kernel prohibits reading map_files for users. The
+			 * best we can do here is fill stat using the information
+			 * from smaps file and ... hope for the better :\
+			 *
+			 * Here we'll miss AIO-s and sockets :(
+			 */
+
+			if (fname[0] == '\0') {
+				/*
+				 * Another bad thing is that kernel first checks
+				 * for permission access to ANY map_files link,
+				 * then checks for its existence. So we have to
+				 * check for file path being empty to "emulate"
+				 * the ENOENT case.
+				 */
+
+				if (vfi->dev_maj != 0 || vfi->dev_min != 0 || vfi->ino != 0) {
+					pr_err("Strange file mapped at %lx [%s]:%d.%d.%ld\n",
+					       (unsigned long)vma->e->start, fname,
+					       vfi->dev_maj, vfi->dev_min, vfi->ino);
+					return -1;
+				}
+
+				return 0;
+			} else if (fname[0] != '/') {
+				/*
+				 * This should be some kind of
+				 * special mapping like [heap], [vdso]
+				 * and such, the caller should take care
+				 * of the @fname and vma status.
+				 */
+				return 0;
+			}
+
+			vfi_dev = makedev(vfi->dev_maj, vfi->dev_min);
+			if (is_anon_shmem_map(vfi_dev)) {
+				if (!(vma->e->flags & MAP_SHARED))
+					return -1;
+
+				vma->e->flags |= MAP_ANONYMOUS;
+				vma->e->status |= VMA_ANON_SHARED;
+				vma->e->shmid = vfi->ino;
+
+				if (!strncmp(fname, "/SYSV", 5))
+					vma->e->status |= VMA_AREA_SYSVIPC;
+
+				return 0;
+			}
+
+			pr_info("Failed to open map_files/%s, try to go via [%s] path\n", path, fname);
+			fd = open(fname, O_RDONLY);
+			if (fd < 0) {
+				pr_perror("Can't open mapped [%s]", fname);
+				return -1;
+			}
+
+			vma->vmst = xmalloc(sizeof(struct stat));
+			if (!vma->vmst) {
+				close(fd);
+				return -1;
+			}
+
+			if (fstat(fd, vma->vmst) < 0) {
+				/* no "\n" here, as in the other pr_perror() calls */
+				pr_perror("Can't stat [%s]", fname);
+				close(fd);
+				return -1;
+			}
+
+			/* The path must lead to the very file smaps reported */
+			if (vma->vmst->st_dev != vfi_dev ||
+					vma->vmst->st_ino != vfi->ino) {
+				pr_err("Failed to resolve mapping %lx filename\n",
+				       (unsigned long)vma->e->start);
+				close(fd);
+				return -1;
+			}
+
+			vma->vm_file_fd = fd;
+			return 0;
+		}
+
+		pr_perror("Can't open map_files");
+		return -1;
+	}
+
+	vma->vmst = xmalloc(sizeof(struct stat));
+	if (!vma->vmst)
+		return -1;
+
+	/*
+	 * For AUFS support, we need to check if the symbolic link
+	 * points to a branch. If it does, we cannot fstat() its file
+	 * descriptor because it would return a different dev/ino than
+	 * the real file. If fixup_aufs_vma_fd() returns positive,
+	 * it means that it has stat()'ed using the full pathname.
+	 * Zero return means that the symbolic link does not point to
+	 * a branch and we can do fstat() below.
+	 */
+	if (opts.aufs) {
+		int ret;
+
+		ret = fixup_aufs_vma_fd(vma);
+		if (ret < 0)
+			return -1;
+		if (ret > 0)
+			return 0;
+	}
+
+	if (fstat(vma->vm_file_fd, vma->vmst) < 0) {
+		pr_perror("Failed fstat on map %"PRIx64"", vma->e->start);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Read /proc/self/maps and append one vma_area per line to @vms,
+ * filling in only the start and end addresses.
+ *
+ * Returns 0 on success, -1 if the file cannot be opened or a vma_area
+ * cannot be allocated.
+ */
+int parse_self_maps_lite(struct vm_area_list *vms)
+{
+	FILE *maps;
+	int ret = 0;
+
+	vm_area_list_init(vms);
+
+	maps = fopen_proc(PROC_SELF, "maps");
+	if (maps == NULL) {
+		pr_perror("Can't open self maps");
+		return -1;
+	}
+
+	for (;;) {
+		struct vma_area *area;
+		char *dash;
+
+		if (fgets(buf, BUF_SIZE, maps) == NULL)
+			break;
+
+		area = alloc_vma_area();
+		if (!area) {
+			ret = -1;
+			break;
+		}
+
+		/* "start-end ...": the end address follows the separator */
+		area->e->start = strtoul(buf, &dash, 16);
+		area->e->end = strtoul(dash + 1, NULL, 16);
+
+		list_add_tail(&area->list, &vms->h);
+		vms->nr++;
+
+		pr_debug("Parsed %"PRIx64"-%"PRIx64" vma\n", area->e->start, area->e->end);
+	}
+
+	fclose(maps);
+	return ret;
+}
+
+/*
+ * Helpers for the "[vdso]" and "[vvar]" mappings. With CONFIG_VDSO the
+ * area is marked regular and, when its protection bits include the
+ * expected VDSO_PROT / VVAR_PROT set, additionally tagged as vDSO / VVAR.
+ * Without CONFIG_VDSO such areas are reported once and rejected with -1.
+ */
+#ifdef CONFIG_VDSO
+static inline int handle_vdso_vma(struct vma_area *vma)
+{
+ vma->e->status |= VMA_AREA_REGULAR;
+ if ((vma->e->prot & VDSO_PROT) == VDSO_PROT)
+ vma->e->status |= VMA_AREA_VDSO;
+ return 0;
+}
+
+static inline int handle_vvar_vma(struct vma_area *vma)
+{
+ vma->e->status |= VMA_AREA_REGULAR;
+ if ((vma->e->prot & VVAR_PROT) == VVAR_PROT)
+ vma->e->status |= VMA_AREA_VVAR;
+ return 0;
+}
+#else
+static inline int handle_vdso_vma(struct vma_area *vma)
+{
+ pr_warn_once("Found vDSO area without support\n");
+ return -1;
+}
+
+static inline int handle_vvar_vma(struct vma_area *vma)
+{
+ pr_warn_once("Found VVAR area without support\n");
+ return -1;
+}
+#endif
+
+/*
+ * Classify one mapping of task @pid and fill the status, flags, shmid
+ * and mnt_id fields of @vma_area.
+ *
+ * @file_path is the path column of the smaps line, @map_files_dir the
+ * opened map_files directory, @vfi / @prev_vfi the dev:ino of this and of
+ * the previous mapping. @vma_area_list only has its nr_aios counter
+ * updated here.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+static int handle_vma(pid_t pid, struct vma_area *vma_area,
+ char *file_path, DIR *map_files_dir,
+ struct vma_file_info *vfi,
+ struct vma_file_info *prev_vfi,
+ struct vm_area_list *vma_area_list)
+{
+ if (vma_get_mapfile(file_path, vma_area, map_files_dir, vfi, prev_vfi))
+ goto err_bogus_mapfile;
+
+ /* A status set by vma_get_mapfile() means the area is fully handled */
+ if (vma_area->e->status != 0) {
+ if (vma_area->e->status & VMA_AREA_AIORING)
+ vma_area_list->nr_aios++;
+ return 0;
+ } else if (!strcmp(file_path, "[vsyscall]") ||
+ !strcmp(file_path, "[vectors]")) {
+ vma_area->e->status |= VMA_AREA_VSYSCALL;
+ } else if (!strcmp(file_path, "[vdso]")) {
+ if (handle_vdso_vma(vma_area))
+ goto err;
+ } else if (!strcmp(file_path, "[vvar]")) {
+ if (handle_vvar_vma(vma_area))
+ goto err;
+ } else if (!strcmp(file_path, "[heap]")) {
+ vma_area->e->status |= VMA_AREA_REGULAR | VMA_AREA_HEAP;
+ } else {
+ vma_area->e->status = VMA_AREA_REGULAR;
+ }
+
+ /*
+ * Some mapping hints for restore, we save this on
+ * disk and restore might need to analyze it.
+ */
+ if (vma_area->file_borrowed) {
+ struct vma_area *prev = prev_vfi->vma;
+
+ /*
+ * Pick-up flags that might be set in the branch below.
+ * Status is copied as-is as it should be zero here,
+ * and have full match with the previous.
+ */
+ vma_area->e->flags |= (prev->e->flags & MAP_ANONYMOUS);
+ vma_area->e->status = prev->e->status;
+ vma_area->e->shmid = prev->e->shmid;
+ vma_area->vmst = prev->vmst;
+ vma_area->mnt_id = prev->mnt_id;
+ } else if (vma_area->vm_file_fd >= 0) {
+ struct stat *st_buf = vma_area->vmst;
+
+ if (S_ISREG(st_buf->st_mode))
+ /* regular file mapping -- supported */;
+ else if (S_ISCHR(st_buf->st_mode) && (st_buf->st_rdev == DEVZERO))
+ /* devzero mapping -- also makes sense */;
+ else {
+ pr_err("Can't handle non-regular mapping on %d's map %"PRIx64"\n", pid, vma_area->e->start);
+ goto err;
+ }
+
+ /*
+ * /dev/zero stands for anon-shared mapping
+ * otherwise it's some file mapping.
+ */
+ if (is_anon_shmem_map(st_buf->st_dev)) {
+ if (!(vma_area->e->flags & MAP_SHARED))
+ goto err_bogus_mapping;
+ vma_area->e->flags |= MAP_ANONYMOUS;
+ vma_area->e->status |= VMA_ANON_SHARED;
+ vma_area->e->shmid = st_buf->st_ino;
+
+ if (!strncmp(file_path, "/SYSV", 5)) {
+ pr_info("path: %s\n", file_path);
+ vma_area->e->status |= VMA_AREA_SYSVIPC;
+ }
+ } else {
+ if (vma_area->e->flags & MAP_PRIVATE)
+ vma_area->e->status |= VMA_FILE_PRIVATE;
+ else
+ vma_area->e->status |= VMA_FILE_SHARED;
+ }
+
+ /*
+ * We cannot use the mnt_id value provided by the kernel
+ * for vm_file_fd if it is an AUFS file (the value is
+ * wrong). In such a case, fixup_aufs_vma_fd() has set
+ * mnt_id to -1 to mimic pre-3.15 kernels that didn't
+ * have mnt_id.
+ */
+ if (vma_area->mnt_id != -1 &&
+ get_fd_mntid(vma_area->vm_file_fd, &vma_area->mnt_id))
+ return -1;
+ } else {
+ /*
+ * No file but mapping -- anonymous one.
+ */
+ if (vma_area->e->flags & MAP_SHARED) {
+ vma_area->e->status |= VMA_ANON_SHARED;
+ vma_area->e->shmid = vfi->ino;
+ } else {
+ vma_area->e->status |= VMA_ANON_PRIVATE;
+ }
+ vma_area->e->flags |= MAP_ANONYMOUS;
+ }
+
+ return 0;
+err:
+ return -1;
+err_bogus_mapping:
+ pr_err("Bogus mapping 0x%"PRIx64"-0x%"PRIx64" (flags: %#x vm_file_fd: %d)\n",
+ vma_area->e->start, vma_area->e->end,
+ vma_area->e->flags, vma_area->vm_file_fd);
+ goto err;
+
+err_bogus_mapfile:
+ pr_perror("Can't open %d's mapfile link %"PRIx64, pid, vma_area->e->start);
+ goto err;
+}
+
+/*
+ * Append a completely parsed @vma_area to @vma_area_list.
+ *
+ * Unsupported areas are rejected with -1. For grows-down areas the start
+ * is lowered by one page when the previous area ends below it. The
+ * private-pages accounting of the list is updated, and @vfi together
+ * with the area itself is remembered in @prev_vfi for the next mapping.
+ */
+static int vma_list_add(struct vma_area *vma_area,
+		struct vm_area_list *vma_area_list,
+		unsigned long *prev_end,
+		struct vma_file_info *vfi, struct vma_file_info *prev_vfi)
+{
+	if (vma_area->e->status & VMA_UNSUPP) {
+		pr_err("Unsupported mapping found %016"PRIx64"-%016"PRIx64"\n",
+				vma_area->e->start, vma_area->e->end);
+		return -1;
+	}
+
+	/* Add a guard page only if here is enough space for it */
+	if (vma_area->e->flags & MAP_GROWSDOWN) {
+		if (*prev_end < vma_area->e->start)
+			vma_area->e->start -= PAGE_SIZE; /* Guard page */
+	}
+	*prev_end = vma_area->e->end;
+
+	list_add_tail(&vma_area->list, &vma_area_list->h);
+	vma_area_list->nr++;
+
+	if (vma_area_is_private(vma_area, kdat.task_size)) {
+		unsigned long nr_pages = vma_area_len(vma_area) / PAGE_SIZE;
+
+		vma_area_list->priv_size += nr_pages;
+		vma_area_list->longest = max(vma_area_list->longest, nr_pages);
+	}
+
+	/* Remember this mapping so the next one may borrow its file */
+	*prev_vfi = *vfi;
+	prev_vfi->vma = vma_area;
+
+	return 0;
+}
+
+/*
+ * Parse /proc/<pid>/smaps into @vma_area_list.
+ *
+ * Each "start-end perms pgoff maj:min ino path" header line starts a new
+ * vma_area; the "VmFlags:" line that follows refines it. An area is put
+ * on the list by vma_list_add() only when the next header line (or the
+ * end of file) is reached.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+int parse_smaps(pid_t pid, struct vm_area_list *vma_area_list)
+{
+ struct vma_area *vma_area = NULL;
+ unsigned long start, end, pgoff, prev_end = 0;
+ char r, w, x, s;
+ int ret = -1;
+ struct vma_file_info vfi;
+ struct vma_file_info prev_vfi = {};
+
+ DIR *map_files_dir = NULL;
+ struct bfd f;
+
+ vma_area_list->nr = 0;
+ vma_area_list->nr_aios = 0;
+ vma_area_list->longest = 0;
+ vma_area_list->priv_size = 0;
+ INIT_LIST_HEAD(&vma_area_list->h);
+
+ f.fd = open_proc(pid, "smaps");
+ if (f.fd < 0)
+ goto err_n;
+
+ /* NOTE(review): presumably bfdopenr() closes f.fd on failure -- confirm */
+ if (bfdopenr(&f))
+ goto err_n;
+
+ map_files_dir = opendir_proc(pid, "map_files");
+ if (!map_files_dir) /* old kernel? */
+ goto err;
+
+ while (1) {
+ int num, path_off;
+ bool eof;
+ char *str;
+
+ str = breadline(&f);
+ if (IS_ERR(str))
+ goto err;
+ eof = (str == NULL);
+
+ /* Lines that are not a range header describe the current area */
+ if (!eof && !is_vma_range_fmt(str)) {
+ if (!strncmp(str, "Nonlinear", 9)) {
+ BUG_ON(!vma_area);
+ pr_err("Nonlinear mapping found %016"PRIx64"-%016"PRIx64"\n",
+ vma_area->e->start, vma_area->e->end);
+ /*
+ * VMA is already on list and will be
+ * freed later as list get destroyed.
+ */
+ /*
+ * NOTE(review): in this function the area is only
+ * listed by the vma_list_add() call below, which
+ * has not run for it yet, so the statement above
+ * looks stale -- confirm.
+ */
+ vma_area = NULL;
+ goto err;
+ } else if (!strncmp(str, "VmFlags: ", 9)) {
+ BUG_ON(!vma_area);
+ if (parse_vmflags(&str[9], vma_area))
+ goto err;
+ continue;
+ } else
+ continue;
+ }
+
+ /* A new header or EOF: the previous area is complete, list it */
+ if (vma_area && vma_list_add(vma_area, vma_area_list,
+ &prev_end, &vfi, &prev_vfi))
+ goto err;
+
+ if (eof)
+ break;
+
+ vma_area = alloc_vma_area();
+ if (!vma_area)
+ goto err;
+
+ /* %n stores where the path column begins within @str */
+ num = sscanf(str, "%lx-%lx %c%c%c%c %lx %x:%x %lu %n",
+ &start, &end, &r, &w, &x, &s, &pgoff,
+ &vfi.dev_maj, &vfi.dev_min, &vfi.ino, &path_off);
+ if (num < 10) {
+ pr_err("Can't parse: %s\n", str);
+ goto err;
+ }
+
+ vma_area->e->start = start;
+ vma_area->e->end = end;
+ vma_area->e->pgoff = pgoff;
+ vma_area->e->prot = PROT_NONE;
+
+ if (r == 'r')
+ vma_area->e->prot |= PROT_READ;
+ if (w == 'w')
+ vma_area->e->prot |= PROT_WRITE;
+ if (x == 'x')
+ vma_area->e->prot |= PROT_EXEC;
+
+ if (s == 's')
+ vma_area->e->flags = MAP_SHARED;
+ else if (s == 'p')
+ vma_area->e->flags = MAP_PRIVATE;
+ else {
+ pr_err("Unexpected VMA met (%c)\n", s);
+ goto err;
+ }
+
+ if (handle_vma(pid, vma_area, str + path_off, map_files_dir,
+ &vfi, &prev_vfi, vma_area_list))
+ goto err;
+ }
+
+ /* The last area is on the list now, it must not be freed below */
+ vma_area = NULL;
+ ret = 0;
+
+err:
+ bclose(&f);
+err_n:
+ if (map_files_dir)
+ closedir(map_files_dir);
+
+ xfree(vma_area);
+ return ret;
+
+}
+
+/*
+ * Parse /proc/<pid>/stat into @s.
+ *
+ * The command name is taken from between the first '(' and the last ')'
+ * so that names containing spaces or parentheses are handled; the
+ * remaining fields are scanned from the text after that last ')'.
+ *
+ * Returns 0 on success and -1 if the file cannot be read or fewer than
+ * 50 of the trailing fields could be parsed.
+ */
+int parse_pid_stat(pid_t pid, struct proc_pid_stat *s)
+{
+	char *tok, *p;
+	int fd;
+	int n;
+
+	fd = open_proc(pid, "stat");
+	if (fd < 0)
+		return -1;
+
+	/* Leave room for the terminator written below */
+	n = read(fd, buf, BUF_SIZE - 1);
+	close(fd);
+	if (n < 1) {
+		pr_err("stat for %d is corrupted\n", pid);
+		return -1;
+	}
+
+	/*
+	 * The scratch buffer is shared with other parsers: terminate what
+	 * was just read so that the string functions below never run into
+	 * leftovers of a previous, longer read.
+	 */
+	buf[n] = '\0';
+
+	memset(s, 0, sizeof(*s));
+
+	tok = strchr(buf, ' ');
+	if (!tok)
+		goto err;
+	*tok++ = '\0';
+	if (*tok != '(')
+		goto err;
+
+	s->pid = atoi(buf);
+
+	p = strrchr(tok + 1, ')');
+	if (!p)
+		goto err;
+	*tok = '\0';
+	*p = '\0';
+
+	strlcpy(s->comm, tok + 1, sizeof(s->comm));
+
+	n = sscanf(p + 1,
+	       " %c %d %d %d %d %d %u %lu %lu %lu %lu "
+	       "%lu %lu %ld %ld %ld %ld %d %d %llu %lu %ld %lu %lu %lu %lu "
+	       "%lu %lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld "
+	       "%lu %lu %lu %lu %lu %lu %lu %d",
+		&s->state,
+		&s->ppid,
+		&s->pgid,
+		&s->sid,
+		&s->tty_nr,
+		&s->tty_pgrp,
+		&s->flags,
+		&s->min_flt,
+		&s->cmin_flt,
+		&s->maj_flt,
+		&s->cmaj_flt,
+		&s->utime,
+		&s->stime,
+		&s->cutime,
+		&s->cstime,
+		&s->priority,
+		&s->nice,
+		&s->num_threads,
+		&s->zero0,
+		&s->start_time,
+		&s->vsize,
+		&s->mm_rss,
+		&s->rsslim,
+		&s->start_code,
+		&s->end_code,
+		&s->start_stack,
+		&s->esp,
+		&s->eip,
+		&s->sig_pending,
+		&s->sig_blocked,
+		&s->sig_ignored,
+		&s->sig_handled,
+		&s->wchan,
+		&s->zero1,
+		&s->zero2,
+		&s->exit_signal,
+		&s->task_cpu,
+		&s->rt_priority,
+		&s->policy,
+		&s->delayacct_blkio_ticks,
+		&s->gtime,
+		&s->cgtime,
+		&s->start_data,
+		&s->end_data,
+		&s->start_brk,
+		&s->arg_start,
+		&s->arg_end,
+		&s->env_start,
+		&s->env_end,
+		&s->exit_code);
+	if (n < 50)
+		goto err;
+
+	return 0;
+
+err:
+	pr_err("Parsing %d's stat failed (#fields do not match)\n", pid);
+	return -1;
+}
+
+/*
+ * Write @value, as decimal text, into /proc/self/loginuid.
+ *
+ * A failed write is reported at log level @loglevel. Returns 0 on
+ * success, -1 if the file cannot be opened or written.
+ */
+int prepare_loginuid(unsigned int value, unsigned int loglevel)
+{
+	int fd, ret = 0, len;
+	char buf[11]; /* 4294967295 is maximum for u32 */
+
+	fd = open_proc_rw(PROC_SELF, "loginuid");
+	if (fd < 0)
+		return -1;
+
+	len = snprintf(buf, sizeof(buf), "%u", value);
+
+	/*
+	 * Write only the digits that were formatted; the bytes after the
+	 * terminator were never initialized and must not be sent out.
+	 */
+	if (write(fd, buf, len) < 0) {
+		print_on_level(loglevel,
+			"Write %s to /proc/self/loginuid failed: %s",
+			buf, strerror(errno));
+		ret = -1;
+	}
+	close(fd);
+	return ret;
+}
+
+/*
+ * Read the login uid of task @pid from /proc/<pid>/loginuid.
+ *
+ * @err is set to 0 on success and to -1 on failure, in which case
+ * INVALID_UID is returned. With @ignore_noent set, ENOENT is passed to
+ * __open_proc() as the error code it should tolerate.
+ */
+unsigned int parse_pid_loginuid(pid_t pid, int *err, bool ignore_noent)
+{
+	int fd;
+	ssize_t num;
+
+	*err = 0;
+	fd = __open_proc(pid, (ignore_noent) ? ENOENT : 0,
+			O_RDONLY, "loginuid");
+	if (fd < 0)
+		goto out;
+
+	num = read(fd, buf, 10);
+	close(fd);
+	if (num < 0) {
+		pr_perror("Unable to read /proc/%d/loginuid", pid);
+		goto out;
+	}
+	buf[num] = '\0';
+
+	/*
+	 * The value is an unsigned 32-bit number (4294967295 when unset):
+	 * parse it as unsigned so it does not saturate at LONG_MAX where
+	 * long is 32 bits wide.
+	 */
+	return strtoul(buf, NULL, 10);
+
+out:
+	*err = -1;
+	return INVALID_UID; /* unset value */
+}
+
+/*
+ * Read /proc/<pid>/oom_score_adj and return its numeric value.
+ *
+ * @err is set to 0 on success; on failure it is set to -1 and 0 is
+ * returned.
+ */
+int parse_pid_oom_score_adj(pid_t pid, int *err)
+{
+	ssize_t nread;
+	int fd;
+
+	*err = 0;
+
+	fd = open_proc(pid, "oom_score_adj");
+	if (fd < 0) {
+		*err = -1;
+		return 0;
+	}
+
+	nread = read(fd, buf, 10);
+	close(fd);
+	if (nread < 0) {
+		pr_perror("Unable to read /proc/%d/oom_score_adj", pid);
+		*err = -1;
+		return 0;
+	}
+
+	/* Terminate exactly what was read before converting it */
+	buf[nread] = '\0';
+
+	return strtol(buf, NULL, 10);
+}
+
+/*
+ * Parse four decimal ids, each separated by a single character, from
+ * @str into @arr[0..3] (the "Uid:" / "Gid:" lines of /proc/<pid>/status).
+ *
+ * Returns 0 when the four numbers are followed by the end of the string,
+ * -1 if the string ends early or has trailing characters.
+ */
+static int ids_parse(char *str, unsigned int *arr)
+{
+	char *pos = str;
+	char *end;
+	int i;
+
+	for (i = 0; i < 4; i++) {
+		/* ids are unsigned: strtol() would saturate on 32-bit long */
+		arr[i] = strtoul(pos, &end, 10);
+		if (i == 3)
+			break;
+
+		/* Do not step over the terminator of a short line */
+		if (*end == '\0')
+			return -1;
+		pos = end + 1;
+	}
+
+	return *end ? -1 : 0;
+}
+
+/*
+ * Parse a capability mask printed as PROC_CAP_SIZE groups of eight hex
+ * digits. The first group in @str goes into the last slot of @res, so
+ * @res ends up ordered from the lowest word to the highest.
+ *
+ * Returns 0 on success, -1 if a group cannot be scanned.
+ */
+static int cap_parse(char *str, unsigned int *res)
+{
+	unsigned int *slot = res + PROC_CAP_SIZE;
+
+	while (slot != res) {
+		slot--;
+		if (sscanf(str, "%08x", slot) != 1)
+			return -1;
+		str += 8;
+	}
+
+	return 0;
+}
+
+/*
+ * Parse the fields of /proc/<pid>/status that are needed for the task
+ * credentials into @cr: State, PPid, Uid, Gid, the four Cap* masks,
+ * Seccomp, ShdPnd and SigPnd.
+ *
+ * Every recognized line bumps @done. The parse succeeds when at least
+ * 11 fields were seen, or exactly 10 when the Seccomp line was absent.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+int parse_pid_status(pid_t pid, struct proc_status_creds *cr)
+{
+ struct bfd f;
+ int done = 0;
+ int ret = -1;
+ char *str;
+ bool parsed_seccomp = false;
+
+ f.fd = open_proc(pid, "status");
+ if (f.fd < 0) {
+ pr_perror("Can't open proc status");
+ return -1;
+ }
+
+ /* The pending masks are accumulated with |= below, start from zero */
+ cr->sigpnd = 0;
+ cr->shdpnd = 0;
+
+ if (bfdopenr(&f))
+ return -1;
+
+ while (done < 12) {
+ str = breadline(&f);
+ if (str == NULL)
+ break;
+ if (IS_ERR(str))
+ goto err_parse;
+
+ if (!strncmp(str, "State:", 6)) {
+ /* the state letter follows the one-character separator */
+ cr->state = str[7];
+ done++;
+ continue;
+ }
+
+ if (!strncmp(str, "PPid:", 5)) {
+ if (sscanf(str, "PPid:\t%d", &cr->ppid) != 1) {
+ pr_err("Unable to parse: %s\n", str);
+ goto err_parse;
+ }
+ done++;
+ continue;
+ }
+
+ if (!strncmp(str, "Uid:", 4)) {
+ if (ids_parse(str + 5, cr->uids))
+ goto err_parse;
+
+ done++;
+ continue;
+ }
+
+ if (!strncmp(str, "Gid:", 4)) {
+ if (ids_parse(str + 5, cr->gids))
+ goto err_parse;
+
+ done++;
+ continue;
+ }
+
+ if (!strncmp(str, "CapInh:", 7)) {
+ if (cap_parse(str + 8, cr->cap_inh))
+ goto err_parse;
+
+ done++;
+ continue;
+ }
+
+ if (!strncmp(str, "CapEff:", 7)) {
+ if (cap_parse(str + 8, cr->cap_eff))
+ goto err_parse;
+
+ done++;
+ continue;
+ }
+
+ if (!strncmp(str, "CapPrm:", 7)) {
+ if (cap_parse(str + 8, cr->cap_prm))
+ goto err_parse;
+
+ done++;
+ continue;
+ }
+
+ if (!strncmp(str, "CapBnd:", 7)) {
+ if (cap_parse(str + 8, cr->cap_bnd))
+ goto err_parse;
+
+ done++;
+ continue;
+ }
+
+ if (!strncmp(str, "Seccomp:", 8)) {
+ if (sscanf(str + 9, "%d", &cr->seccomp_mode) != 1) {
+ goto err_parse;
+ }
+
+ parsed_seccomp = true;
+ done++;
+ continue;
+ }
+
+ if (!strncmp(str, "ShdPnd:", 7)) {
+ unsigned long long sigpnd;
+
+ if (sscanf(str + 7, "%llx", &sigpnd) != 1)
+ goto err_parse;
+ cr->shdpnd |= sigpnd;
+
+ done++;
+ continue;
+ }
+ if (!strncmp(str, "SigPnd:", 7)) {
+ unsigned long long sigpnd;
+
+ if (sscanf(str + 7, "%llx", &sigpnd) != 1)
+ goto err_parse;
+ cr->sigpnd |= sigpnd;
+
+ done++;
+ continue;
+ }
+ }
+
+ /* seccomp is optional */
+ if (done >= 11 || (done == 10 && !parsed_seccomp))
+ ret = 0;
+
+err_parse:
+ if (ret)
+ pr_err("Error parsing proc status file\n");
+ bclose(&f);
+ return ret;
+}
+
+/* One row of an option table: textual option name and the flag it sets */
+struct opt2flag {
+ char *opt;
+ unsigned flag;
+};
+
+/*
+ * Callback for do_opt2flag(): handle "gid=N" and "uid=N" options.
+ *
+ * The id is passed through userns_gid() / userns_uid() and the resulting
+ * option, followed by a comma, is appended to @unknown at offset *@uoff,
+ * which is advanced accordingly. Returns true when @opt was consumed.
+ *
+ * NOTE(review): @id is unsigned but scanned and printed with "%d", and
+ * the sprintf() is not bounded by the size of @unknown -- confirm both
+ * are fine for the callers.
+ */
+static bool sb_opt_cb(char *opt, char *unknown, size_t *uoff)
+{
+ unsigned int id;
+
+ if (sscanf(opt, "gid=%d", &id) == 1) {
+ *uoff += sprintf(unknown + *uoff, "gid=%d", userns_gid(id));
+ unknown[*uoff] = ',';
+ (*uoff)++;
+ return true;
+ } else if (sscanf(opt, "uid=%d", &id) == 1) {
+ *uoff += sprintf(unknown + *uoff, "uid=%d", userns_uid(id));
+ unknown[*uoff] = ',';
+ (*uoff)++;
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Walk the comma-separated option string @opt (modified in place: the
+ * commas are replaced with terminators) and OR the flag of every option
+ * found in the @opts table into *@flags.
+ *
+ * An option missing from the table is offered to @cb; if @cb declines
+ * it, the option is copied into @unknown (comma separated), or reported
+ * as an error when @unknown is NULL.
+ *
+ * NOTE(review): when @cb is NULL the whole "unknown option" branch is
+ * skipped, so unmatched options are silently accepted -- confirm that
+ * this is intended for parse_mnt_flags().
+ *
+ * Returns 0 on success, -1 on an unknown option that cannot be stored.
+ */
+static int do_opt2flag(char *opt, unsigned *flags,
+ const struct opt2flag *opts, char *unknown,
+ bool (*cb)(char *opt, char *unknown, size_t *uoff))
+{
+ int i;
+ char *end;
+ size_t uoff = 0;
+
+ while (1) {
+ end = strchr(opt, ',');
+ if (end)
+ *end = '\0';
+
+ for (i = 0; opts[i].opt != NULL; i++)
+ if (!strcmp(opts[i].opt, opt)) {
+ (*flags) |= opts[i].flag;
+ break;
+ }
+
+ /* i stopped on the table terminator: the option was not matched */
+ if (opts[i].opt == NULL && cb && !cb(opt, unknown, &uoff)) {
+ if (!unknown) {
+ pr_err("Unknown option [%s]\n", opt);
+ return -1;
+ }
+
+ strcpy(unknown + uoff, opt);
+ uoff += strlen(opt);
+ unknown[uoff] = ',';
+ uoff++;
+ }
+
+ if (!end) {
+ /* drop the trailing comma, if anything was collected */
+ if (uoff)
+ uoff--;
+ if (unknown)
+ unknown[uoff] = '\0';
+ break;
+ } else
+ opt = end + 1;
+ }
+
+ return 0;
+}
+
+/*
+ * Convert the per-mount option list of a mountinfo line (@opt, modified
+ * in place) into MS_* bits that are OR-ed into *@flags.
+ *
+ * When neither relatime nor noatime is present, MS_STRICTATIME is added.
+ * Returns 0 on success, -1 on error.
+ */
+static int parse_mnt_flags(char *opt, unsigned *flags)
+{
+	static const struct opt2flag mnt_opt2flag[] = {
+		{ "rw", 0, },
+		{ "ro", MS_RDONLY, },
+		{ "nosuid", MS_NOSUID, },
+		{ "nodev", MS_NODEV, },
+		{ "noexec", MS_NOEXEC, },
+		{ "noatime", MS_NOATIME, },
+		{ "nodiratime", MS_NODIRATIME, },
+		{ "relatime", MS_RELATIME, },
+		{ },
+	};
+
+	if (do_opt2flag(opt, flags, mnt_opt2flag, NULL, NULL))
+		return -1;
+
+	/*
+	 * Otherwise the kernel assumes RELATIME by default.
+	 * The bit is added to, not assigned over, the flags parsed above
+	 * so that ro/nosuid/nodev/... are not lost.
+	 */
+	if ((*flags & (MS_RELATIME | MS_NOATIME)) == 0)
+		*flags |= MS_STRICTATIME;
+
+	return 0;
+}
+
+/*
+ * Convert the super-block option list of a mountinfo line (@opt,
+ * modified in place) into MS_* bits OR-ed into *@flags. Options that are
+ * not in the table are collected into @uopt by do_opt2flag(), with
+ * uid=/gid= values translated by sb_opt_cb().
+ *
+ * Returns 0 on success, -1 on error.
+ */
+static int parse_sb_opt(char *opt, unsigned *flags, char *uopt)
+{
+	static const struct opt2flag sb_opt2flag[] = {
+		{ "rw", 0, },
+		{ "ro", MS_RDONLY, },
+		/* MS_SYNC is the msync() flag; the mount flag is MS_SYNCHRONOUS */
+		{ "sync", MS_SYNCHRONOUS, },
+		{ "dirsync", MS_DIRSYNC, },
+		/* the kernel prints mandatory locking as "mand" */
+		{ "mand", MS_MANDLOCK, },
+		{ },
+	};
+
+	return do_opt2flag(opt, flags, sb_opt2flag, uopt, sb_opt_cb);
+}
+
+/*
+ * Parse the optional, space-separated fields of a mountinfo line
+ * ("shared:N", "master:N", "propagate_from:N", "unbindable") up to and
+ * including the "-" separator, updating @mi accordingly.
+ *
+ * @str is modified in place (spaces become terminators). On success
+ * *@off receives the offset, relative to the initial @str, of the text
+ * that follows the separator.
+ *
+ * Returns 0 on success, -1 on a malformed line or an unknown field.
+ */
+static int parse_mnt_opt(char *str, struct mount_info *mi, int *off)
+{
+ char *istr = str, *end;
+
+ while (1) {
+ end = strchr(str, ' ');
+ if (!end) {
+ pr_err("Error parsing mount options\n");
+ return -1;
+ }
+
+ *end = '\0';
+ if (!strncmp(str, "-", 1))
+ break;
+ else if (!strncmp(str, "shared:", 7)) {
+ mi->flags |= MS_SHARED;
+ mi->shared_id = atoi(str + 7);
+ } else if (!strncmp(str, "master:", 7)) {
+ mi->flags |= MS_SLAVE;
+ mi->master_id = atoi(str + 7);
+ } else if (!strncmp(str, "propagate_from:", 15)) {
+ /* skip */;
+ } else if (!strncmp(str, "unbindable", 11))
+ /* 11 includes the terminator, i.e. this is an exact match */
+ mi->flags |= MS_UNBINDABLE;
+ else {
+ pr_err("Unknown option [%s]\n", str);
+ return -1;
+ }
+
+ str = end + 1;
+ }
+
+ *off = end - istr + 1;
+ return 0;
+}
+
+/*
+ * Paths in mountinfo are mangled: space, tab and backslash are printed
+ * as the octal escapes \040, \011 and \134. Decode them in place; any
+ * other backslash sequence is left as it is.
+ */
+static void cure_path(char *path)
+{
+	static const struct {
+		const char *esc;
+		char ch;
+	} escapes[] = {
+		{ "\\040", ' ' },
+		{ "\\011", '\t' },
+		{ "\\134", '\\' },
+	};
+	char *src, *dst;
+
+	/* Nothing before the first backslash can change */
+	src = strchr(path, '\\');
+	if (src == NULL) /* fast path */
+		return;
+
+	dst = src;
+	while (*src) {
+		size_t k;
+
+		for (k = 0; k < sizeof(escapes) / sizeof(escapes[0]); k++)
+			if (!strncmp(src, escapes[k].esc, 4))
+				break;
+
+		if (k < sizeof(escapes) / sizeof(escapes[0])) {
+			/* four input characters collapse into one */
+			*dst++ = escapes[k].ch;
+			src += 4;
+		} else {
+			*dst++ = *src++;
+		}
+	}
+	*dst = 0;
+}
+
+/*
+ * Parse one line of /proc/<pid>/mountinfo (@str, modified in place)
+ * into @new.
+ *
+ * The mountpoint is stored with a leading '.', root and mountpoint are
+ * un-escaped with cure_path(), and a root recognized by strip_deleted()
+ * marks the entry as deleted. *@fsname receives a newly allocated copy
+ * of the filesystem type (cut at the first '.', i.e. without subtype).
+ *
+ * Returns 0 on success, -1 on error. NOTE(review): on error the fields
+ * already allocated in @new are left in place -- presumably released by
+ * the caller through mnt_entry_free(); confirm.
+ */
+static int parse_mountinfo_ent(char *str, struct mount_info *new, char **fsname)
+{
+ struct fd_link root_link;
+ unsigned int kmaj, kmin;
+ int ret, n;
+ char *sub, *opt = NULL;
+
+ new->mountpoint = xmalloc(PATH_MAX);
+ if (new->mountpoint == NULL)
+ goto err;
+ new->ns_mountpoint = new->mountpoint;
+
+ /* %ms makes sscanf() allocate the strings; %n reports the offset */
+ new->mountpoint[0] = '.';
+ ret = sscanf(str, "%i %i %u:%u %ms %s %ms %n",
+ &new->mnt_id, &new->parent_mnt_id,
+ &kmaj, &kmin, &new->root, new->mountpoint + 1,
+ &opt, &n);
+ if (ret != 7)
+ goto err;
+
+ cure_path(new->mountpoint);
+ cure_path(new->root);
+
+ root_link.len = strlen(new->root);
+ strcpy(root_link.name, new->root);
+ if (strip_deleted(&root_link)) {
+ strcpy(new->root, root_link.name);
+ new->deleted = true;
+ }
+
+ /* Shrink the PATH_MAX buffer to what the mountpoint really needs */
+ new->mountpoint = xrealloc(new->mountpoint, strlen(new->mountpoint) + 1);
+ if (!new->mountpoint)
+ goto err;
+
+ new->s_dev = new->s_dev_rt = MKKDEV(kmaj, kmin);
+ new->flags = 0;
+ if (parse_mnt_flags(opt, &new->flags))
+ goto err;
+
+ free(opt); /* we are going to reallocate/reuse this buffer */
+ opt = NULL;
+
+ str += n;
+ if (parse_mnt_opt(str, new, &n))
+ goto err;
+
+ str += n;
+ ret = sscanf(str, "%ms %ms %ms", fsname, &new->source, &opt);
+ if (ret == 2) {
+ /* src may be empty */
+ opt = new->source;
+ new->source = xstrdup("");
+ if (new->source == NULL)
+ goto err;
+ } else if (ret != 3)
+ goto err;
+
+ cure_path(new->source);
+
+ /*
+ * The kernel reports "subtypes" sometimes and the valid
+ * type-vs-subtype delimiter is the dot symbol. We disregard
+ * any subtypes for the purpose of finding the fstype.
+ */
+ sub = strchr(*fsname, '.');
+ if (sub)
+ *sub = 0;
+
+ new->fstype = find_fstype_by_name(*fsname);
+
+ /* The unknown options can never be longer than the whole list */
+ new->options = xmalloc(strlen(opt) + 1);
+ if (!new->options)
+ goto err;
+
+ if (parse_sb_opt(opt, &new->sb_flags, new->options))
+ goto err;
+
+ ret = 0;
+ret:
+ xfree(opt);
+ return ret;
+err:
+ ret = -1;
+ goto ret;
+}
+
+/* Mountpoints registered with add_skip_mount() */
+static LIST_HEAD(skip_mount_list);
+
+/* List node carrying a string stored inline after the link */
+struct str_node {
+ struct list_head node;
+ char string[];
+};
+
+/*
+ * Register @mountpoint on skip_mount_list; a private copy of the string
+ * is stored. Returns false if the node cannot be allocated.
+ */
+bool add_skip_mount(const char *mountpoint)
+{
+	size_t size = strlen(mountpoint) + 1;
+	struct str_node *skip;
+
+	skip = xmalloc(sizeof(struct str_node) + size);
+	if (skip == NULL)
+		return false;
+
+	/* @size covers the terminator, so this copies the whole string */
+	memcpy(skip->string, mountpoint, size);
+	list_add(&skip->node, &skip_mount_list);
+
+	return true;
+}
+
+/* Tell whether @mountpoint was registered with add_skip_mount() */
+static bool should_skip_mount(const char *mountpoint)
+{
+	struct str_node *entry;
+
+	list_for_each_entry(entry, &skip_mount_list, node)
+		if (!strcmp(entry->string, mountpoint))
+			return true;
+
+	return false;
+}
+
+/*
+ * Parse /proc/<pid>/mountinfo into a singly linked list of mount_info
+ * entries, each tagged with @nsid. Entries are pushed at the head, so
+ * the list is in reverse file order.
+ *
+ * With @for_dump set, mountpoints registered through add_skip_mount()
+ * are dropped. Returns the list head, or NULL on error (everything
+ * parsed so far is freed) or when the file cannot be opened.
+ *
+ * NOTE(review): lines are read into a 1024-byte buffer; a longer
+ * mountinfo line would be split by fgets() -- confirm this cannot
+ * happen in practice.
+ */
+struct mount_info *parse_mountinfo(pid_t pid, struct ns_id *nsid, bool for_dump)
+{
+ struct mount_info *list = NULL;
+ FILE *f;
+ char str[1024];
+
+ f = fopen_proc(pid, "mountinfo");
+ if (!f) {
+ pr_perror("Can't open %d mountinfo", pid);
+ return NULL;
+ }
+
+ while (fgets(str, sizeof(str), f)) {
+ struct mount_info *new;
+ int ret = -1;
+ char *fsname = NULL;
+
+ new = mnt_entry_alloc();
+ if (!new)
+ goto end;
+
+ new->nsid = nsid;
+
+ ret = parse_mountinfo_ent(str, new, &fsname);
+ if (ret < 0) {
+ pr_err("Bad format in %d mountinfo: '%s'\n", pid, str);
+ goto end;
+ }
+
+ /*
+ * Drop this mountpoint early, so that lookup_mnt_id/etc will
+ * fail loudly at "dump" stage if an opened file or another mnt
+ * depends on this one.
+ */
+ if (for_dump && should_skip_mount(new->mountpoint + 1)) {
+ pr_info("\tskip %s @ %s\n", fsname, new->mountpoint);
+ mnt_entry_free(new);
+ new = NULL;
+ goto end;
+ }
+
+ pr_info("\ttype %s source %s mnt_id %d s_dev %#x %s @ %s flags %#x options %s\n",
+ fsname, new->source,
+ new->mnt_id, new->s_dev, new->root, new->mountpoint,
+ new->flags, new->options);
+
+ if (new->fstype->parse) {
+ ret = new->fstype->parse(new);
+ if (ret) {
+ pr_err("Failed to parse FS specific data on %s\n",
+ new->mountpoint);
+ goto end;
+ }
+ }
+end:
+ if (fsname)
+ free(fsname);
+
+ /*
+ * The entry is linked even when parsing failed, so that the
+ * error path below frees it together with the rest of the list.
+ */
+ if (new) {
+ new->next = list;
+ list = new;
+ }
+
+ if (ret)
+ goto err;
+ }
+out:
+ fclose(f);
+ return list;
+
+err:
+ while (list) {
+ struct mount_info *next = list->next;
+ mnt_entry_free(list);
+ list = next;
+ }
+ goto out;
+}
+
+/*
+ * Convert one hexadecimal digit (either case) into its value 0..15.
+ * Any other character yields 0.
+ */
+static char nybble(const char n)
+{
+	char value = 0;
+
+	if (n >= 'a' && n <= 'f')
+		value = n - 'a' + 10;
+	else if (n >= 'A' && n <= 'F')
+		value = n - 'A' + 10;
+	else if (n >= '0' && n <= '9')
+		value = n - '0';
+
+	return value;
+}
+
+/*
+ * Give @fh a handle array of FH_ENTRY_SIZES__min_entries elements.
+ * Returns 0 on success, -1 if the allocation fails.
+ */
+static int alloc_fhandle(FhEntry *fh)
+{
+	fh->n_handle = FH_ENTRY_SIZES__min_entries;
+	fh->handle = xmalloc(pb_repeated_size(fh, handle));
+	if (fh->handle == NULL)
+		return -1;
+
+	return 0;
+}
+
+/* Release the handle array of @fh, if there is one */
+static void free_fhandle(FhEntry *fh)
+{
+ if (fh->handle)
+ xfree(fh->handle);
+}
+
+/* Free an inotify watch entry: first its file handle, then @e itself */
+void free_inotify_wd_entry(union fdinfo_entries *e)
+{
+ free_fhandle(e->ify.e.f_handle);
+ xfree(e);
+}
+
+/*
+ * Free a fanotify mark entry; the file handle is released only when the
+ * entry carries an inode part.
+ * NOTE(review): the test reads ffy.e.ie while the handle is taken from
+ * ffy.ie -- presumably the pointer and the object it points to; confirm.
+ */
+void free_fanotify_mark_entry(union fdinfo_entries *e)
+{
+ if (e->ffy.e.ie)
+ free_fhandle(e->ffy.ie.f_handle);
+ xfree(e);
+}
+
+/* Free an eventpoll entry; only @e itself is released here */
+void free_event_poll_entry(union fdinfo_entries *e)
+{
+ xfree(e);
+}
+
+/*
+ * Decode the hex string @tok (leading spaces are skipped) into the
+ * handle array of @fh, two hex digits per byte.
+ *
+ * The array is zeroed first; decoding stops at the end of the string or
+ * when pb_repeated_size(fh, handle) bytes have been written. A trailing
+ * single digit becomes the high nibble of the last byte.
+ */
+static void parse_fhandle_encoded(char *tok, FhEntry *fh)
+{
+ char *d = (char *)fh->handle;
+ int i = 0;
+
+ memzero(d, pb_repeated_size(fh, handle));
+
+ while (*tok == ' ')
+ tok++;
+
+ while (*tok) {
+ if (i >= pb_repeated_size(fh, handle))
+ break;
+ d[i++] = (nybble(tok[0]) << 4) | nybble(tok[1]);
+ /* tok[1] is the terminator when the digit count is odd */
+ if (tok[1])
+ tok += 2;
+ else
+ break;
+ }
+}
+
+/*
+ * Parse the timerfd part of an fdinfo file into @tfy.
+ *
+ * @str is the already read "clockid:" line; the four lines that follow
+ * are read from @f with breadline(). Returns 0 on success, -1 if a line
+ * does not match the expected format or the file ends early.
+ */
+static int parse_timerfd(struct bfd *f, char *str, TimerfdEntry *tfy)
+{
+ /*
+ * Format is
+ * clockid: 0
+ * ticks: 0
+ * settime flags: 01
+ * it_value: (0, 49406829)
+ * it_interval: (1, 0)
+ */
+ if (sscanf(str, "clockid: %d", &tfy->clockid) != 1)
+ goto parse_err;
+
+ str = breadline(f);
+ if (IS_ERR_OR_NULL(str))
+ goto nodata;
+ if (sscanf(str, "ticks: %llu", (unsigned long long *)&tfy->ticks) != 1)
+ goto parse_err;
+
+ str = breadline(f);
+ if (IS_ERR_OR_NULL(str))
+ goto nodata;
+ if (sscanf(str, "settime flags: 0%o", &tfy->settime_flags) != 1)
+ goto parse_err;
+
+ str = breadline(f);
+ if (IS_ERR_OR_NULL(str))
+ goto nodata;
+ if (sscanf(str, "it_value: (%llu, %llu)",
+ (unsigned long long *)&tfy->vsec,
+ (unsigned long long *)&tfy->vnsec) != 2)
+ goto parse_err;
+
+ str = breadline(f);
+ if (IS_ERR_OR_NULL(str))
+ goto nodata;
+ if (sscanf(str, "it_interval: (%llu, %llu)",
+ (unsigned long long *)&tfy->isec,
+ (unsigned long long *)&tfy->insec) != 2)
+ goto parse_err;
+ return 0;
+
+parse_err:
+ return -1;
+nodata:
+ pr_err("No data left in proc file while parsing timerfd\n");
+ goto parse_err;
+}
+
+/*
+ * True when @str starts with the literal @field followed by ':'.
+ * sizeof(field) counts the terminating NUL of the literal, which is
+ * exactly the length of field":" without its own NUL.
+ * The expansion is fully parenthesized so it is safe inside larger
+ * expressions.
+ */
+#define fdinfo_field(str, field)	(!strncmp((str), field":", sizeof(field)))
+
+static int parse_file_lock_buf(char *buf, struct file_lock *fl,
+ bool is_blocked);
+/*
+ * Parse /proc/<pid>/fdinfo/<fd> line by line.
+ *
+ * With @type == FD_TYPES__UND only the generic "pos", "flags",
+ * "mnt_id" and "lock" lines are handled; @arg is then used as a
+ * struct fdinfo_common and @cb is not called. Parsed locks are
+ * appended to file_lock_list.
+ *
+ * With any other @type the type specific lines are parsed into a
+ * union fdinfo_entries and passed to @cb together with @arg. A line
+ * that belongs to a different file type is a parse error. For
+ * "fanotify flags" lines @arg is used as a struct fsnotify_params.
+ *
+ * eventfd, timerfd and signalfd entries live on the stack; eventpoll,
+ * fanotify and inotify entries are xmalloc()-ed and handed to @cb.
+ * They are freed here only when the line itself cannot be parsed.
+ *
+ * Returns 0 on success, -1 on error. Finding no record at all is an
+ * error too, except for eventpoll and inotify files.
+ */
+static int parse_fdinfo_pid_s(int pid, int fd, int type,
+		int (*cb)(union fdinfo_entries *e, void *arg), void *arg)
+{
+	struct bfd f;
+	char *str;
+	bool entry_met = false;
+	int ret, exit_code = -1;
+
+	f.fd = open_proc(pid, "fdinfo/%d", fd);
+	if (f.fd < 0) {
+		pr_perror("Can't open fdinfo/%d to parse", fd);
+		return -1;
+	}
+
+	if (bfdopenr(&f))
+		return -1;
+
+	while (1) {
+		union fdinfo_entries entry;
+
+		str = breadline(&f);
+		if (!str)
+			break;
+		if (IS_ERR(str))
+			goto out;
+
+		if (fdinfo_field(str, "pos") ||
+		    fdinfo_field(str, "flags") ||
+		    fdinfo_field(str, "mnt_id")) {
+			unsigned long long val;
+			struct fdinfo_common *fdinfo = arg;
+
+			if (type != FD_TYPES__UND)
+				continue;
+			ret = sscanf(str, "%*s %lli", &val);
+			if (ret != 1)
+				goto parse_err;
+
+			if (fdinfo_field(str, "pos"))
+				fdinfo->pos = val;
+			else if (fdinfo_field(str, "flags"))
+				fdinfo->flags = val;
+			else if (fdinfo_field(str, "mnt_id"))
+				fdinfo->mnt_id = val;
+
+			entry_met = true;
+			continue;
+		}
+
+		if (fdinfo_field(str, "lock")) {
+			struct file_lock *fl;
+			struct fdinfo_common *fdinfo = arg;
+
+			if (type != FD_TYPES__UND)
+				continue;
+
+			fl = alloc_file_lock();
+			if (!fl) {
+				pr_perror("Alloc file lock failed!");
+				goto out;
+			}
+
+			/* skip the "lock:" prefix and the separator after it */
+			if (parse_file_lock_buf(str + 6, fl, 0)) {
+				xfree(fl);
+				goto parse_err;
+			}
+
+			pr_info("lockinfo: %lld:%d %x %d %02x:%02x:%ld %lld %s\n",
+				fl->fl_id, fl->fl_kind, fl->fl_ltype,
+				fl->fl_owner, fl->maj, fl->min, fl->i_no,
+				fl->start, fl->end);
+
+			if (fl->fl_kind == FL_UNKNOWN) {
+				pr_err("Unknown file lock!\n");
+				xfree(fl);
+				goto out;
+			}
+
+			fl->real_owner = fdinfo->owner;
+			fl->owners_fd = fd;
+			list_add_tail(&fl->list, &file_lock_list);
+		}
+
+		/* Everything below is type specific */
+		if (type == FD_TYPES__UND)
+			continue;
+
+		if (fdinfo_field(str, "eventfd-count")) {
+			eventfd_file_entry__init(&entry.efd);
+
+			if (type != FD_TYPES__EVENTFD)
+				goto parse_err;
+			ret = sscanf(str, "eventfd-count: %"PRIx64,
+					&entry.efd.counter);
+			if (ret != 1)
+				goto parse_err;
+			ret = cb(&entry, arg);
+			if (ret)
+				goto out;
+
+			entry_met = true;
+			continue;
+		}
+		if (fdinfo_field(str, "clockid")) {
+			timerfd_entry__init(&entry.tfy);
+
+			if (type != FD_TYPES__TIMERFD)
+				goto parse_err;
+			ret = parse_timerfd(&f, str, &entry.tfy);
+			if (ret)
+				goto parse_err;
+			ret = cb(&entry, arg);
+			if (ret)
+				goto out;
+
+			entry_met = true;
+			continue;
+		}
+		if (fdinfo_field(str, "tfd")) {
+			union fdinfo_entries *e;
+
+			if (type != FD_TYPES__EVENTPOLL)
+				goto parse_err;
+
+			e = xmalloc(sizeof(union fdinfo_entries));
+			if (!e)
+				goto out;
+
+			eventpoll_tfd_entry__init(&e->epl.e);
+
+			ret = sscanf(str, "tfd: %d events: %x data: %"PRIx64,
+					&e->epl.e.tfd, &e->epl.e.events, &e->epl.e.data);
+			if (ret != 3) {
+				free_event_poll_entry(e);
+				goto parse_err;
+			}
+			ret = cb(e, arg);
+			if (ret)
+				goto out;
+
+			entry_met = true;
+			continue;
+		}
+		if (fdinfo_field(str, "sigmask")) {
+			signalfd_entry__init(&entry.sfd);
+
+			if (type != FD_TYPES__SIGNALFD)
+				goto parse_err;
+			ret = sscanf(str, "sigmask: %Lx",
+					(unsigned long long *)&entry.sfd.sigmask);
+			if (ret != 1)
+				goto parse_err;
+			ret = cb(&entry, arg);
+			if (ret)
+				goto out;
+
+			entry_met = true;
+			continue;
+		}
+		if (fdinfo_field(str, "fanotify flags")) {
+			struct fsnotify_params *p = arg;
+
+			if (type != FD_TYPES__FANOTIFY)
+				goto parse_err;
+
+			ret = sscanf(str, "fanotify flags:%x event-flags:%x",
+				     &p->faflags, &p->evflags);
+			if (ret != 2)
+				goto parse_err;
+			entry_met = true;
+			continue;
+		}
+		if (fdinfo_field(str, "fanotify ino")) {
+			union fdinfo_entries *e;
+			int hoff = 0;
+
+			if (type != FD_TYPES__FANOTIFY)
+				goto parse_err;
+
+			e = xmalloc(sizeof(*e));
+			if (!e)
+				goto parse_err;
+
+			fanotify_mark_entry__init(&e->ffy.e);
+			fanotify_inode_mark_entry__init(&e->ffy.ie);
+			fh_entry__init(&e->ffy.f_handle);
+			e->ffy.e.ie = &e->ffy.ie;
+			e->ffy.ie.f_handle = &e->ffy.f_handle;
+
+			ret = sscanf(str,
+				     "fanotify ino:%"PRIx64" sdev:%x mflags:%x mask:%x ignored_mask:%x "
+				     "fhandle-bytes:%x fhandle-type:%x f_handle: %n",
+				     &e->ffy.ie.i_ino, &e->ffy.e.s_dev,
+				     &e->ffy.e.mflags, &e->ffy.e.mask, &e->ffy.e.ignored_mask,
+				     &e->ffy.f_handle.bytes, &e->ffy.f_handle.type,
+				     &hoff);
+			if (ret != 7 || hoff == 0) {
+				free_fanotify_mark_entry(e);
+				goto parse_err;
+			}
+
+			if (alloc_fhandle(&e->ffy.f_handle)) {
+				free_fanotify_mark_entry(e);
+				goto out;
+			}
+			parse_fhandle_encoded(str + hoff, &e->ffy.f_handle);
+
+			e->ffy.e.type = MARK_TYPE__INODE;
+			ret = cb(e, arg);
+			if (ret)
+				goto out;
+
+			entry_met = true;
+			continue;
+		}
+		if (fdinfo_field(str, "fanotify mnt_id")) {
+			union fdinfo_entries *e;
+
+			if (type != FD_TYPES__FANOTIFY)
+				goto parse_err;
+
+			e = xmalloc(sizeof(*e));
+			if (!e)
+				goto parse_err;
+
+			fanotify_mark_entry__init(&e->ffy.e);
+			fanotify_mount_mark_entry__init(&e->ffy.me);
+			e->ffy.e.me = &e->ffy.me;
+
+			ret = sscanf(str,
+				     "fanotify mnt_id:%x mflags:%x mask:%x ignored_mask:%x",
+				     &e->ffy.e.me->mnt_id, &e->ffy.e.mflags,
+				     &e->ffy.e.mask, &e->ffy.e.ignored_mask);
+			if (ret != 4) {
+				/*
+				 * Nothing else hangs off a mount mark yet,
+				 * so the entry alone has to go.
+				 */
+				xfree(e);
+				goto parse_err;
+			}
+
+			e->ffy.e.type = MARK_TYPE__MOUNT;
+			ret = cb(e, arg);
+			if (ret)
+				goto out;
+
+			entry_met = true;
+			continue;
+		}
+		if (fdinfo_field(str, "inotify wd")) {
+			InotifyWdEntry *ify;
+			union fdinfo_entries *e;
+			/* stays 0 unless sscanf() reaches the trailing %n */
+			int hoff = 0;
+
+			if (type != FD_TYPES__INOTIFY)
+				goto parse_err;
+
+			e = xmalloc(sizeof(*e));
+			if (!e)
+				goto parse_err;
+			ify = &e->ify.e;
+
+			inotify_wd_entry__init(ify);
+			ify->f_handle = &e->ify.f_handle;
+			fh_entry__init(ify->f_handle);
+
+			ret = sscanf(str,
+					"inotify wd:%x ino:%"PRIx64" sdev:%x "
+					"mask:%x ignored_mask:%x "
+					"fhandle-bytes:%x fhandle-type:%x "
+					"f_handle: %n",
+					&ify->wd, &ify->i_ino, &ify->s_dev,
+					&ify->mask, &ify->ignored_mask,
+					&ify->f_handle->bytes, &ify->f_handle->type,
+					&hoff);
+			if (ret != 7 || hoff == 0) {
+				free_inotify_wd_entry(e);
+				goto parse_err;
+			}
+
+			if (alloc_fhandle(ify->f_handle)) {
+				free_inotify_wd_entry(e);
+				goto out;
+			}
+
+			parse_fhandle_encoded(str + hoff, ify->f_handle);
+
+			ret = cb(e, arg);
+			if (ret)
+				goto out;
+
+			entry_met = true;
+			continue;
+		}
+	}
+
+	exit_code = 0;
+	if (entry_met)
+		goto out;
+	/*
+	 * An eventpoll/inotify file may have no target fds set thus
+	 * resulting in no tfd: lines in proc. This is normal.
+	 */
+	if (type == FD_TYPES__EVENTPOLL || type == FD_TYPES__INOTIFY)
+		goto out;
+
+	pr_err("No records of type %d found in fdinfo file\n", type);
+parse_err:
+	exit_code = -1;
+	/* str is NULL when we get here after the end of file was hit */
+	pr_perror("%s: error parsing [%s] for %d", __func__, str ? str : "", type);
+out:
+	bclose(&f);
+	return exit_code;
+}
+
+/*
+ * Parse the fdinfo file of descriptor @fd that belongs to task @pid.
+ * All arguments are passed unchanged to parse_fdinfo_pid_s(), whose
+ * return value is returned as is.
+ */
+int parse_fdinfo_pid(int pid, int fd, int type,
+ int (*cb)(union fdinfo_entries *e, void *arg), void *arg)
+{
+ return parse_fdinfo_pid_s(pid, fd, type, cb, arg);
+}
+
+/*
+ * Same as parse_fdinfo_pid(), but for a descriptor of the calling
+ * process itself (PROC_SELF).
+ */
+int parse_fdinfo(int fd, int type,
+ int (*cb)(union fdinfo_entries *e, void *arg), void *arg)
+{
+ return parse_fdinfo_pid_s(PROC_SELF, fd, type, cb, arg);
+}
+
+/*
+ * Fetch the mount id of our own descriptor @fd from its fdinfo file.
+ *
+ * On success 0 is returned and the id is stored in @mnt_id; the
+ * stored value is -1 when the file has no "mnt_id" line. On failure
+ * -1 is returned and @mnt_id is left untouched.
+ */
+int get_fd_mntid(int fd, int *mnt_id)
+{
+	struct fdinfo_common common = { .mnt_id = -1};
+	int err;
+
+	err = parse_fdinfo(fd, FD_TYPES__UND, NULL, &common);
+	if (!err)
+		*mnt_id = common.mnt_id;
+
+	return err ? -1 : 0;
+}
+
+/*
+ * Parse one file lock description from @buf into @fl.
+ *
+ * @is_blocked selects the layout with the "->" marker after the lock
+ * id. The three textual tokens (kind, type, option) are read with
+ * explicit field widths so that an over-long token cannot overflow
+ * the local buffers; such a line ends up rejected or reported as an
+ * unknown lock instead.
+ *
+ * Returns 0 on success and -1 when the line is malformed or carries
+ * an unknown lock option. An unknown lock kind is not an error here:
+ * it is reported to the caller as FL_UNKNOWN in fl->fl_kind.
+ *
+ * NOTE(review): fl->end is still filled with an unbounded %s because
+ * its capacity is not visible from here - confirm against the
+ * struct file_lock definition.
+ */
+static int parse_file_lock_buf(char *buf, struct file_lock *fl,
+				bool is_blocked)
+{
+	int num;
+	char fl_flag[10], fl_type[15], fl_option[10];
+
+	if (is_blocked) {
+		num = sscanf(buf, "%lld: -> %9s %14s %9s %d %x:%x:%ld %lld %s",
+			&fl->fl_id, fl_flag, fl_type, fl_option,
+			&fl->fl_owner, &fl->maj, &fl->min, &fl->i_no,
+			&fl->start, fl->end);
+	} else {
+		num = sscanf(buf, "%lld:%9s %14s %9s %d %x:%x:%ld %lld %s",
+			&fl->fl_id, fl_flag, fl_type, fl_option,
+			&fl->fl_owner, &fl->maj, &fl->min, &fl->i_no,
+			&fl->start, fl->end);
+	}
+
+	if (num < 10) {
+		pr_err("Invalid file lock info (%d): %s\n", num, buf);
+		return -1;
+	}
+
+	if (!strcmp(fl_flag, "POSIX"))
+		fl->fl_kind = FL_POSIX;
+	else if (!strcmp(fl_flag, "FLOCK"))
+		fl->fl_kind = FL_FLOCK;
+	else
+		fl->fl_kind = FL_UNKNOWN;
+
+	if (!strcmp(fl_type, "MSNFS")) {
+		fl->fl_ltype |= LOCK_MAND;
+
+		if (!strcmp(fl_option, "READ")) {
+			fl->fl_ltype |= LOCK_READ;
+		} else if (!strcmp(fl_option, "RW")) {
+			fl->fl_ltype |= LOCK_RW;
+		} else if (!strcmp(fl_option, "WRITE")) {
+			fl->fl_ltype |= LOCK_WRITE;
+		} else {
+			pr_err("Unknown lock option!\n");
+			return -1;
+		}
+	} else {
+		if (!strcmp(fl_option, "UNLCK")) {
+			fl->fl_ltype |= F_UNLCK;
+		} else if (!strcmp(fl_option, "WRITE")) {
+			fl->fl_ltype |= F_WRLCK;
+		} else if (!strcmp(fl_option, "READ")) {
+			fl->fl_ltype |= F_RDLCK;
+		} else {
+			pr_err("Unknown lock option!\n");
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Collect file locks from the "locks" proc file into file_lock_list.
+ *
+ * Nothing is done (and 0 is returned) when kdat.has_fdinfo_lock is
+ * set. Lines are read into the file-scope buffer buf.
+ *
+ * Returns 0 on success, -1 on error. Locks that were already queued
+ * before an error happened stay on file_lock_list.
+ */
+int parse_file_locks(void)
+{
+ struct file_lock *fl;
+
+ FILE *fl_locks;
+ int exit_code = -1;
+ bool is_blocked;
+
+ if (kdat.has_fdinfo_lock)
+ return 0;
+
+ fl_locks = fopen_proc(PROC_GEN, "locks");
+ if (!fl_locks) {
+ pr_perror("Can't open file locks file!");
+ return -1;
+ }
+
+ while (fgets(buf, BUF_SIZE, fl_locks)) {
+ /* a "->" marker on the line selects the blocked-waiter layout */
+ is_blocked = strstr(buf, "->") != NULL;
+
+ fl = alloc_file_lock();
+ if (!fl) {
+ pr_perror("Alloc file lock failed!");
+ goto err;
+ }
+
+ if (parse_file_lock_buf(buf, fl, is_blocked)) {
+ xfree(fl);
+ goto err;
+ }
+
+ pr_info("lockinfo: %lld:%d %x %d %02x:%02x:%ld %lld %s\n",
+ fl->fl_id, fl->fl_kind, fl->fl_ltype,
+ fl->fl_owner, fl->maj, fl->min, fl->i_no,
+ fl->start, fl->end);
+
+
+ if (fl->fl_kind == FL_UNKNOWN) {
+ pr_err("Unknown file lock: %s!\n", buf);
+ xfree(fl);
+ goto err;
+ }
+
+ if (is_blocked) {
+ /*
+ * All target processes are stopped at this moment and
+ * can't be waiting for any locks.
+ */
+ pr_debug("Skip blocked processes\n");
+ xfree(fl);
+ continue;
+ }
+
+ if ((fl->fl_kind == FL_POSIX) &&
+ !pid_in_pstree(fl->fl_owner)) {
+ /*
+ * We only care about tasks which are taken
+ * into the dump, so we only collect file locks
+ * that belong to these tasks.
+ */
+ xfree(fl);
+ continue;
+ }
+
+ list_add_tail(&fl->list, &file_lock_list);
+ }
+
+ exit_code = 0;
+err:
+ fclose(fl_locks);
+ return exit_code;
+}
+
+/* Unlink and free every timer queued on st->timers. */
+void free_posix_timers(struct proc_posix_timers_stat *st)
+{
+	struct proc_posix_timer *timer, *next;
+
+	list_for_each_entry_safe(timer, next, &st->timers, list) {
+		list_del(&timer->list);
+		xfree(timer);
+	}
+}
+
+/*
+ * Parse the "timers" proc file of task @pid into @args.
+ *
+ * Every timer is described by four consecutive lines ("ID:",
+ * "signal:", "notify:" and "ClockID:"). A timer is queued on
+ * args->timers and counted in args->timer_n once its last line has
+ * been parsed.
+ *
+ * Returns 0 on success. On error -1 is returned and all timers
+ * collected so far are freed.
+ */
+int parse_posix_timers(pid_t pid, struct proc_posix_timers_stat *args)
+{
+	int exit_code = -1;
+	int notify_pid;
+	int i = 0;
+
+	struct bfd f;
+	char *s;
+	char sigpid[7];
+	char tidpid[4];
+	/*
+	 * Filled on the "signal:" line and consumed on the "ClockID:"
+	 * line two iterations later, so it has to outlive the loop body.
+	 */
+	char pbuf[17]; /* 16 + eol */
+
+	struct proc_posix_timer *timer = NULL;
+
+	INIT_LIST_HEAD(&args->timers);
+	args->timer_n = 0;
+
+	f.fd = open_proc(pid, "timers");
+	if (f.fd < 0) {
+		pr_perror("Can't open posix timers file!");
+		return -1;
+	}
+
+	if (bfdopenr(&f))
+		return -1;
+
+	while (1) {
+		s = breadline(&f);
+		if (!s)
+			break;
+		if (IS_ERR(s))
+			goto err;
+
+		switch (i % 4) {
+		case 0:
+			timer = xzalloc(sizeof(struct proc_posix_timer));
+			if (timer == NULL)
+				goto err;
+
+			if (sscanf(s, "ID: %ld",
+				   &timer->spt.it_id) != 1)
+				goto err;
+			break;
+		case 1:
+			if (sscanf(s, "signal: %d/%16s",
+				   &timer->spt.si_signo, pbuf) != 2)
+				goto err;
+			break;
+		case 2:
+			if (sscanf(s, "notify: %6[a-z]/%3[a-z].%d\n",
+				   sigpid, tidpid, &notify_pid) != 3)
+				goto err;
+			break;
+		case 3:
+			if (sscanf(s, "ClockID: %d\n",
+				   &timer->spt.clock_id) != 1)
+				goto err;
+
+			timer->spt.sival_ptr = NULL;
+			if (sscanf(pbuf, "%p", &timer->spt.sival_ptr) != 1 &&
+			    strcmp(pbuf, "(null)")) {
+				pr_err("Unable to parse '%s'\n", pbuf);
+				goto err;
+			}
+
+			if (tidpid[0] == 't') {
+				timer->spt.it_sigev_notify = SIGEV_THREAD_ID;
+			} else {
+				switch (sigpid[0]) {
+				case 's':
+					timer->spt.it_sigev_notify = SIGEV_SIGNAL;
+					break;
+				case 't':
+					timer->spt.it_sigev_notify = SIGEV_THREAD;
+					break;
+				default:
+					timer->spt.it_sigev_notify = SIGEV_NONE;
+					break;
+				}
+			}
+
+			list_add(&timer->list, &args->timers);
+			timer = NULL;
+			args->timer_n++;
+			break;
+		}
+		i++;
+	}
+
+	/*
+	 * The file ended in the middle of a record: the half-filled
+	 * timer was never queued, so drop it here (no-op for NULL).
+	 */
+	xfree(timer);
+
+	exit_code = 0;
+out:
+	bclose(&f);
+	return exit_code;
+err:
+	xfree(timer);
+	free_posix_timers(args);
+	pr_perror("Parse error in posix timers proc file!");
+	goto out;
+}
+
+/*
+ * Read the thread ids of task @pid from its "task" proc directory.
+ *
+ * Two modes, selected by *_t on entry:
+ *  - *_t == NULL: an array is allocated here, every .virt is set to
+ *    -1, and the array and its length are returned via @_t and @_n;
+ *  - *_t != NULL: the .real members of the caller's array are
+ *    refreshed and the number of threads found must equal *_n,
+ *    otherwise BUG_ON() triggers.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+int parse_threads(int pid, struct pid **_t, int *_n)
+{
+	struct dirent *de;
+	DIR *dir;
+	struct pid *t = NULL;
+	int nr = 1;
+
+	if (*_t)
+		t = *_t;
+
+	dir = opendir_proc(pid, "task");
+	if (!dir)
+		return -1;
+
+	while ((de = readdir(dir))) {
+		struct pid *tmp;
+
+		/* We expect numbers only here */
+		if (de->d_name[0] == '.')
+			continue;
+
+		if (*_t == NULL) {
+			tmp = xrealloc(t, nr * sizeof(struct pid));
+			if (!tmp) {
+				xfree(t);
+				closedir(dir);
+				return -1;
+			}
+			t = tmp;
+			t[nr - 1].virt = -1;
+		} else if (nr > *_n) {
+			/*
+			 * More threads than the caller's array can hold:
+			 * keep counting so the check below fires, but do
+			 * not write past the end of the array.
+			 */
+			nr++;
+			continue;
+		}
+		t[nr - 1].real = atoi(de->d_name);
+		nr++;
+	}
+
+	closedir(dir);
+
+	if (*_t == NULL) {
+		*_t = t;
+		*_n = nr - 1;
+	} else
+		BUG_ON(nr - 1 != *_n);
+
+	return 0;
+}
+
+/*
+ * Parse the "cgroup" proc file of task @pid.
+ *
+ * For every line a struct cg_ctl with a private copy of the
+ * controller name and path is inserted into @retl, keeping the list
+ * ordered by name (strcmp order), and *@n is incremented.
+ *
+ * Returns 0 on success. On error -1 is returned after put_ctls() has
+ * been called on @retl.
+ */
+int parse_task_cgroup(int pid, struct list_head *retl, unsigned int *n)
+{
+ FILE *f;
+
+ f = fopen_proc(pid, "cgroup");
+ if (f == NULL)
+ return -1;
+ while (fgets(buf, BUF_SIZE, f)) {
+ struct cg_ctl *ncc, *cc;
+ char *name, *path = NULL, *e;
+
+ ncc = xmalloc(sizeof(*cc));
+ if (!ncc)
+ goto err;
+
+ /*
+ * Typical output (':' is a separator here)
+ *
+ * 4:cpu,cpuacct:/
+ * 3:cpuset:/
+ * 2:name=systemd:/user.slice/user-1000.slice/session-1.scope
+ */
+ name = strchr(buf, ':');
+ if (name)
+ path = strchr(++name, ':');
+ if (!name || !path) {
+ pr_err("Failed parsing cgroup %s\n", buf);
+ xfree(ncc);
+ goto err;
+ }
+ /* cut the line in place: terminate the name, strip the newline */
+ e = strchr(name, '\n');
+ *path++ = '\0';
+ if (e)
+ *e = '\0';
+
+ ncc->name = xstrdup(name);
+ ncc->path = xstrdup(path);
+ if (!ncc->name || !ncc->path) {
+ xfree(ncc->name);
+ xfree(ncc->path);
+ xfree(ncc);
+ goto err;
+ }
+
+ /* stop at the first entry that does not sort before the new one */
+ list_for_each_entry(cc, retl, l)
+ if (strcmp(cc->name, name) >= 0)
+ break;
+
+ /* link the new entry right before the position found above */
+ list_add_tail(&ncc->l, &cc->l);
+ (*n)++;
+ }
+
+ fclose(f);
+ return 0;
+
+err:
+ put_ctls(retl);
+ fclose(f);
+ return -1;
+}
+
+/*
+ * Free every struct cg_ctl on list @l together with its name and
+ * path strings. The entries are not unlinked and the list head is
+ * not re-initialized here.
+ */
+void put_ctls(struct list_head *l)
+{
+ struct cg_ctl *c, *n;
+
+ list_for_each_entry_safe(c, n, l, l) {
+ xfree(c->name);
+ xfree(c->path);
+ xfree(c);
+ }
+}
+
+/*
+ * Parse and create all the real controllers. This does not include
+ * things with the "name=" prefix, e.g. systemd.
+ *
+ * Our own "cgroup" proc file is read line by line. The first
+ * controller of a line creates a new entry on @cgroups (counted in
+ * *@n_cgroups); further comma separated controllers of the same line
+ * are appended to that entry.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+int collect_controllers(struct list_head *cgroups, unsigned int *n_cgroups)
+{
+	int exit_code = -1;
+	FILE *f;
+
+	f = fopen_proc(PROC_SELF, "cgroup");
+	if (f == NULL)
+		return -1;
+
+	while (fgets(buf, BUF_SIZE, f)) {
+		struct cg_controller *nc = NULL;
+		char *controllers, *off;
+
+		controllers = strchr(buf, ':');
+		if (!controllers) {
+			pr_err("Unable to parse \"%s\"\n", buf);
+			goto err;
+		}
+		controllers++;
+
+		off = strchr(controllers, ':');
+		if (!off) {
+			pr_err("Unable to parse \"%s\"\n", buf);
+			goto err;
+		}
+		*off = '\0';
+		while (1) {
+			off = strchr(controllers, ',');
+			if (off)
+				*off = '\0';
+
+			if (!strncmp("name=", controllers, 5))
+				goto skip;
+
+			if (!nc) {
+				nc = new_controller(controllers);
+				if (!nc)
+					goto err;
+				list_add_tail(&nc->l, cgroups);
+				(*n_cgroups)++;
+			} else {
+				void *m;
+				char *n;
+
+				m = xrealloc(nc->controllers,
+					     sizeof(char *) * (nc->n_controllers + 1));
+				if (!m)
+					goto err;
+
+				nc->controllers = m;
+
+				n = xstrdup(controllers);
+				if (!n)
+					goto err;
+
+				/*
+				 * Account for the new name only once it is
+				 * stored, so that a failure above never
+				 * leaves an uninitialized slot behind.
+				 */
+				nc->controllers[nc->n_controllers++] = n;
+			}
+
+skip:
+			if (!off)
+				break;
+			controllers = off + 1;
+		}
+	}
+
+	exit_code = 0;
+err:
+	fclose(f);
+	return exit_code;
+}
+
+/*
+ * If an OverlayFS mountpoint is found in the mountinfo table,
+ * we enable opts.overlayfs, which is a workaround for the
+ * OverlayFS kernel bug.
+ *
+ * The mount entry @new itself is not looked at. Always returns 0.
+ *
+ * See fixup_overlayfs for details.
+ */
+int overlayfs_parse(struct mount_info *new)
+{
+ opts.overlayfs = true;
+ return 0;
+}
+
+/*
+ * AUFS callback used to "fix up" the root pathname.
+ *
+ * Only the mount whose mountpoint is "./" is of interest: for it
+ * opts.aufs is turned on and the result of parse_aufs_branches()
+ * is returned. Any other mount yields 0.
+ *
+ * See sysfs_parse.c for details.
+ */
+int aufs_parse(struct mount_info *new)
+{
+	if (strcmp(new->mountpoint, "./") != 0)
+		return 0;
+
+	opts.aufs = true;
+	return parse_aufs_branches(new);
+}
+
+/*
+ * Tell whether the credentials of @child are close enough to those
+ * of @parent: the first @size bytes of both structures are compared
+ * with memcmp(). On mismatch the individual fields are logged at
+ * debug level and false is returned.
+ *
+ * NOTE(review): @size is computed as sizeof(struct) minus the offset
+ * of cap_inh, i.e. the length of the part that starts at cap_inh,
+ * yet the comparison starts at the beginning of the structure. If
+ * the capabilities sit at the end of struct proc_status_creds, the
+ * intended length is presumably offsetof(..., cap_inh) - confirm
+ * against the structure definition.
+ */
+bool proc_status_creds_dumpable(struct proc_status_creds *parent,
+ struct proc_status_creds *child)
+{
+ const size_t size = sizeof(struct proc_status_creds) -
+ offsetof(struct proc_status_creds, cap_inh);
+
+ /*
+ * The comparison rules are the following
+ *
+ * - CAPs can be different
+ * - seccomp filters should be passed via
+ * semantic comparison (FIXME) but for
+ * now we require them to be exactly
+ * identical
+ * - the rest of members must match
+ */
+
+ if (memcmp(parent, child, size)) {
+ if (!pr_quelled(LOG_DEBUG)) {
+ pr_debug("Creds undumpable (parent:child)\n"
+ " uids: %d:%d %d:%d %d:%d %d:%d\n"
+ " gids: %d:%d %d:%d %d:%d %d:%d\n"
+ " state: %d:%d"
+ " ppid: %d:%d\n"
+ " sigpnd: %llu:%llu\n"
+ " shdpnd: %llu:%llu\n"
+ " seccomp_mode: %d:%d\n"
+ " last_filter: %u:%u\n",
+ parent->uids[0], child->uids[0],
+ parent->uids[1], child->uids[1],
+ parent->uids[2], child->uids[2],
+ parent->uids[3], child->uids[3],
+ parent->gids[0], child->gids[0],
+ parent->gids[1], child->gids[1],
+ parent->gids[2], child->gids[2],
+ parent->gids[3], child->gids[3],
+ parent->state, child->state,
+ parent->ppid, child->ppid,
+ parent->sigpnd, child->sigpnd,
+ parent->shdpnd, child->shdpnd,
+ parent->seccomp_mode, child->seccomp_mode,
+ parent->last_filter, child->last_filter);
+ }
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * Collect the pids of all children of task @pid.
+ *
+ * The "children" file of every thread found in the "task" proc
+ * directory of @pid is read and the space separated numbers in it
+ * are appended to one array.
+ *
+ * On success 0 is returned, the array is handed over via @_c and its
+ * length via @_n (the array is NULL when there are no children).
+ * On error -1 is returned, the array is freed and neither @_c nor
+ * @_n is touched.
+ */
+int parse_children(pid_t pid, pid_t **_c, int *_n)
+{
+	struct dirent *ent;
+	struct bfd f;
+	pid_t *kids = NULL;
+	int count = 0;
+	DIR *task_dir;
+
+	task_dir = opendir_proc(pid, "task");
+	if (task_dir == NULL)
+		return -1;
+
+	for (ent = readdir(task_dir); ent; ent = readdir(task_dir)) {
+		if (dir_dots(ent))
+			continue;
+
+		f.fd = open_proc(pid, "task/%s/children", ent->d_name);
+		if (f.fd < 0)
+			goto err;
+
+		if (bfdopenr(&f))
+			goto err;
+
+		for (;;) {
+			char *tok, *rest;
+			pid_t child, *grown;
+
+			tok = breadchr(&f, ' ');
+			if (IS_ERR(tok))
+				goto err_close;
+			if (tok == NULL)
+				break;
+
+			child = strtol(tok, &rest, 0);
+			if (*rest != 0 && *rest != ' ') {
+				pr_err("Unable to parse %s\n", rest);
+				goto err_close;
+			}
+
+			grown = xrealloc(kids, (count + 1) * sizeof(pid_t));
+			if (!grown)
+				goto err_close;
+
+			kids = grown;
+			kids[count++] = child;
+		}
+		bclose(&f);
+	}
+
+	*_c = kids;
+	*_n = count;
+
+	closedir(task_dir);
+	return 0;
+
+err_close:
+	bclose(&f);
+err:
+	closedir(task_dir);
+	xfree(kids);
+	return -1;
+}
+
diff --git a/criu/protobuf-desc.c b/criu/protobuf-desc.c
new file mode 100644
index 000000000000..c80ebb794671
--- /dev/null
+++ b/criu/protobuf-desc.c
@@ -0,0 +1,104 @@
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include <fcntl.h>
+#include <arpa/inet.h>
+#include <ctype.h>
+
+#include "asm/types.h"
+
+#include "compiler.h"
+#include "log.h"
+
+#include "protobuf-desc.h"
+
+#include "protobuf/inventory.pb-c.h"
+#include "protobuf/stats.pb-c.h"
+#include "protobuf/regfile.pb-c.h"
+#include "protobuf/ext-file.pb-c.h"
+#include "protobuf/ns.pb-c.h"
+#include "protobuf/eventfd.pb-c.h"
+#include "protobuf/eventpoll.pb-c.h"
+#include "protobuf/signalfd.pb-c.h"
+#include "protobuf/fsnotify.pb-c.h"
+#include "protobuf/core.pb-c.h"
+#include "protobuf/mm.pb-c.h"
+#include "protobuf/pipe.pb-c.h"
+#include "protobuf/fifo.pb-c.h"
+#include "protobuf/fdinfo.pb-c.h"
+#include "protobuf/pipe-data.pb-c.h"
+#include "protobuf/pstree.pb-c.h"
+#include "protobuf/sa.pb-c.h"
+#include "protobuf/sk-unix.pb-c.h"
+#include "protobuf/sk-inet.pb-c.h"
+#include "protobuf/packet-sock.pb-c.h"
+#include "protobuf/sk-packet.pb-c.h"
+#include "protobuf/creds.pb-c.h"
+#include "protobuf/timer.pb-c.h"
+#include "protobuf/utsns.pb-c.h"
+#include "protobuf/ipc-var.pb-c.h"
+#include "protobuf/ipc-shm.pb-c.h"
+#include "protobuf/ipc-msg.pb-c.h"
+#include "protobuf/ipc-sem.pb-c.h"
+#include "protobuf/fs.pb-c.h"
+#include "protobuf/remap-file-path.pb-c.h"
+#include "protobuf/ghost-file.pb-c.h"
+#include "protobuf/mnt.pb-c.h"
+#include "protobuf/netdev.pb-c.h"
+#include "protobuf/tcp-stream.pb-c.h"
+#include "protobuf/tty.pb-c.h"
+#include "protobuf/file-lock.pb-c.h"
+#include "protobuf/rlimit.pb-c.h"
+#include "protobuf/pagemap.pb-c.h"
+#include "protobuf/siginfo.pb-c.h"
+#include "protobuf/sk-netlink.pb-c.h"
+#include "protobuf/vma.pb-c.h"
+#include "protobuf/tun.pb-c.h"
+#include "protobuf/cgroup.pb-c.h"
+#include "protobuf/timerfd.pb-c.h"
+#include "protobuf/cpuinfo.pb-c.h"
+#include "protobuf/userns.pb-c.h"
+#include "protobuf/seccomp.pb-c.h"
+#include "protobuf/binfmt-misc.pb-c.h"
+
+struct cr_pb_message_desc cr_pb_descs[PB_MAX];
+
+/*
+ * Fill cr_pb_descs[PB_<__type>] for a message whose C type is
+ * <__vtype>Entry and whose generated functions are prefixed with
+ * <__ftype>_entry.
+ */
+#define CR_PB_DESC(__type, __vtype, __ftype) \
+ CR_PB_MDESC_INIT(cr_pb_descs[PB_##__type], \
+ __vtype##Entry, \
+ __ftype##_entry)
+
+/*
+ * Each helper below yields the address of the generated function
+ * cast to the generic callback type. The never executed "if (0)"
+ * call is there only to make the compiler check that the function
+ * accepts (or, for unpack, returns) the given object type.
+ */
+#define PB_PACK_TYPECHECK(__o, __fn) ({ if (0) __fn##__pack(__o, NULL); (pb_pack_t)&__fn##__pack; })
+#define PB_GPS_TYPECHECK(__o, __fn) ({ if (0) __fn##__get_packed_size(__o); (pb_getpksize_t)&__fn##__get_packed_size; })
+#define PB_UNPACK_TYPECHECK(__op, __fn) ({ if (0) *__op = __fn##__unpack(NULL, 0, NULL); (pb_unpack_t)&__fn##__unpack; })
+#define PB_FREE_TYPECHECK(__o, __fn) ({ if (0) __fn##__free_unpacked(__o, NULL); (pb_free_t)&__fn##__free_unpacked; })
+
+/*
+ * This should be explicitly "called" to do type-checking.
+ *
+ * Sets the getpksize, pack, unpack and free callbacks of @__var and
+ * points its pb_desc at the <__name>__descriptor object.
+ */
+
+#define CR_PB_MDESC_INIT(__var, __type, __name) \
+ do { \
+ __var.getpksize = PB_GPS_TYPECHECK((__type *)NULL, __name); \
+ __var.pack = PB_PACK_TYPECHECK((__type *)NULL, __name); \
+ __var.unpack = PB_UNPACK_TYPECHECK((__type **)NULL, __name); \
+ __var.free = PB_FREE_TYPECHECK((__type *)NULL, __name); \
+ __var.pb_desc = &__name##__descriptor; \
+ } while (0)
+
+/*
+ * Populate the cr_pb_descs[] table.
+ *
+ * The entries listed by hand here are those whose type or function
+ * prefix does not follow the <Name>Entry / <name>_entry pattern, or
+ * that share a message type with another entry. The statements from
+ * protobuf-desc-gen.h are then included in place.
+ */
+void cr_pb_init(void)
+{
+ CR_PB_DESC(IDS, TaskKobjIds, task_kobj_ids);
+ CR_PB_DESC(SIGACT, Sa, sa);
+ CR_PB_DESC(SK_QUEUES, SkPacket, sk_packet);
+ CR_PB_MDESC_INIT(cr_pb_descs[PB_IPCNS_MSG], IpcMsg, ipc_msg);
+ CR_PB_DESC(IPCNS_MSG_ENT, IpcMsg, ipc_msg);
+ CR_PB_DESC(REMAP_FPATH, RemapFilePath, remap_file_path);
+ CR_PB_DESC(NETDEV, NetDevice, net_device);
+ CR_PB_MDESC_INIT(cr_pb_descs[PB_PAGEMAP_HEAD], PagemapHead, pagemap_head);
+
+#include "protobuf-desc-gen.h"
+}
diff --git a/criu/protobuf.c b/criu/protobuf.c
new file mode 100644
index 000000000000..ae003da44158
--- /dev/null
+++ b/criu/protobuf.c
@@ -0,0 +1,692 @@
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <arpa/inet.h>
+#include <ctype.h>
+
+#include <google/protobuf-c/protobuf-c.h>
+
+#include "image.h"
+#include "servicefd.h"
+#include "compiler.h"
+#include "asm/types.h"
+#include "log.h"
+#include "util.h"
+#include "string.h"
+#include "sockets.h"
+#include "cr_options.h"
+#include "bfd.h"
+#include "protobuf.h"
+
+/*
+ * To speed up reading of packed objects
+ * by providing space on stack, this should
+ * be more than enough for most objects.
+ */
+#define PB_PKOBJ_LOCAL_SIZE 1024
+
+#define INET_ADDR_LEN 40
+
+/*
+ * State of the field that is being printed at the moment.
+ */
+typedef struct {
+ void *data; /* where the value(s) of the field live */
+ int number; /* 1-based position of the field in its message */
+ int depth; /* nesting level, bumped around nested messages */
+ int count; /* number of values (1 unless the field is repeated) */
+ char fmt[32]; /* pretty format picked for the field, if any */
+} pb_pr_field_t;
+
+/*
+ * Printing context shared by all the show helpers.
+ */
+typedef struct {
+ void *arg; /* descriptor of the message/enum being shown */
+ int single_entry; /* non-zero: one field per line, tab indented */
+ const char *pretty_fmt; /* space separated "<field>:<fmt>" list or NULL */
+ pb_pr_field_t cur; /* the field in progress */
+} pb_pr_ctl_t;
+
+/*
+ * Field printer. pb_show_repeated() stops walking a repeated field
+ * as soon as the printer returns non-zero.
+ */
+typedef int (*pb_pr_show_t)(pb_pr_field_t *field);
+
+/*
+ * This one describes how fields should be shown
+ * @fsize is the size of the field entry
+ * @show is the callback to print the entry
+ */
+struct pb_shower {
+ size_t fsize;
+ pb_pr_show_t show;
+};
+
+/* Print a 32-bit field as hex with the 0x prefix. Always returns 0. */
+static int pb_msg_int32x(pb_pr_field_t *field)
+{
+	int value = *(int *)field->data;
+
+	pr_msg("%#x", value);
+	return 0;
+}
+
+/*
+ * Print a 64-bit field as zero padded hex. Always returns 0.
+ *
+ * The value is copied out as a full 8 bytes: reading it through
+ * "long" would drop the upper half on targets where long is 32 bits.
+ */
+static int pb_msg_int64x(pb_pr_field_t *field)
+{
+	unsigned long long val;
+
+	memcpy(&val, field->data, sizeof(val));
+	pr_msg("%#016llx", val);
+	return 0;
+}
+
+/*
+ * Print one element of a repeated 64-bit field: zero is shown as a
+ * plain "0", anything else as zero padded hex. Always returns 0.
+ *
+ * The value is copied out as a full 8 bytes: reading it through
+ * "long" would drop the upper half on targets where long is 32 bits.
+ */
+static int pb_msg_int64x_r(pb_pr_field_t *field)
+{
+	unsigned long long val;
+
+	memcpy(&val, field->data, sizeof(val));
+	if (val)
+		pr_msg("%#016llx", val);
+	else
+		pr_msg("0");
+	return 0;
+}
+
+/* Print a string field in double quotes. Always returns 0. */
+static int pb_msg_string(pb_pr_field_t *field)
+{
+ pr_msg("\"%s\"", *(char **)field->data);
+ return 0;
+}
+
+/*
+ * Fallback printer for fields without a dedicated one: only the
+ * address of the data is shown. Always returns 0.
+ */
+static int pb_msg_unk(pb_pr_field_t *field)
+{
+ pr_msg("unknown object %p", field->data);
+ return 0;
+}
+
+/*
+ * Indent the output by one tab per nesting level. Nothing is printed
+ * unless the single-entry mode is on.
+ */
+static inline void print_tabs(pb_pr_ctl_t *ctl)
+{
+	int left;
+
+	if (!ctl->single_entry)
+		return;
+
+	for (left = ctl->cur.depth; left; left--)
+		pr_msg("\t");
+}
+
+/*
+ * Print the opening (@right_brace == 0) or closing brace of a nested
+ * message, followed by a newline in single-entry mode and by a space
+ * otherwise.
+ */
+static void print_nested_message_braces(pb_pr_ctl_t *ctl, int right_brace)
+{
+	const char *brace = right_brace ? "}" : "{";
+	const char *tail = ctl->single_entry ? "\n" : " ";
+
+	print_tabs(ctl);
+	pr_msg("%s%s", brace, tail);
+}
+
+static void pb_show_msg(const void *msg, pb_pr_ctl_t *ctl);
+
+/*
+ * Print a nested message in braces, one nesting level deeper.
+ *
+ * @field is the "cur" member of a pb_pr_ctl_t, which is how the
+ * control block is found. ctl->arg is saved before the nested
+ * message is shown and put back afterwards. Always returns 0.
+ */
+static int show_nested_message(pb_pr_field_t *field)
+{
+ pb_pr_ctl_t *ctl = container_of(field, pb_pr_ctl_t, cur);
+ void *arg = ctl->arg;
+
+ print_nested_message_braces(ctl, 0);
+ field->depth++;
+ pb_show_msg(field->data, ctl);
+ field->depth--;
+ print_nested_message_braces(ctl, 1);
+ ctl->arg = arg;
+ return 0;
+}
+
+/*
+ * Print an enum field by the name of its value. The enum descriptor
+ * is taken from ctl->arg. If no value of the descriptor matches (or
+ * the matching one has no name) the number itself is printed.
+ * Always returns 0.
+ */
+static int show_enum(pb_pr_field_t *field)
+{
+	pb_pr_ctl_t *ctl = container_of(field, pb_pr_ctl_t, cur);
+	ProtobufCEnumDescriptor *d = ctl->arg;
+	int val = *(int *)field->data;
+	const char *label = NULL;
+	int idx = 0;
+
+	while (idx < d->n_values && d->values[idx].value != val)
+		idx++;
+
+	if (idx < d->n_values)
+		label = d->values[idx].name;
+
+	if (label == NULL)
+		pr_msg("%d", val);
+	else
+		pr_msg("%s", label);
+	return 0;
+}
+
+/* Print a boolean field as "True" or "False". Always returns 0. */
+static int show_bool(pb_pr_field_t *field)
+{
+	protobuf_c_boolean *flag = (protobuf_c_boolean *)field->data;
+
+	if (!*flag)
+		pr_msg("False");
+	else
+		pr_msg("True");
+	return 0;
+}
+
+/*
+ * Print a bytes field as two-digit hex numbers, each followed by a
+ * space. Always returns 0.
+ */
+static int show_bytes(pb_pr_field_t *field)
+{
+	ProtobufCBinaryData *blob = (ProtobufCBinaryData *)field->data;
+	int pos;
+
+	for (pos = 0; pos < blob->len; pos++)
+		pr_msg("%02x ", blob->data[pos]);
+	return 0;
+}
+
+/*
+ * Print a field according to the pretty format stored in field->fmt:
+ *
+ *  '%...' - the format is given to pr_msg() as is, together with the
+ *           value read as a long;
+ *  'S'    - bytes shown as text: printable characters verbatim, NUL
+ *           bytes dropped, everything else as '.';
+ *  'A'    - an IP address: IPv4 if the field holds a single element,
+ *           IPv6 otherwise.
+ *
+ * Returns 1 for 'A', which makes pb_show_repeated() stop after the
+ * first element since the whole array has been consumed at once, and
+ * 0 in all other cases.
+ *
+ * NOTE(review): the '%' case uses a format string that comes from the
+ * pretty_fmt description - presumably only from trusted, built-in
+ * strings; confirm against the callers.
+ */
+static int pb_show_pretty(pb_pr_field_t *field)
+{
+	switch (field->fmt[0]) {
+	case '%':
+		pr_msg(field->fmt, *(long *)field->data);
+		break;
+	case 'S':
+	{
+		ProtobufCBinaryData *name = (ProtobufCBinaryData *)field->data;
+		int i;
+
+		for (i = 0; i < name->len; i++) {
+			char c = (char)name->data[i];
+
+			/*
+			 * isprint() takes an unsigned char value (or EOF);
+			 * a plain char may be negative for bytes >= 0x80.
+			 */
+			if (isprint((unsigned char)c))
+				pr_msg("%c", c);
+			else if (c != 0)
+				pr_msg(".");
+		}
+		break;
+	}
+	case 'A':
+	{
+		char addr[INET_ADDR_LEN] = "<unknown>";
+		int family = (field->count == 1) ? AF_INET : AF_INET6;
+
+		if (inet_ntop(family, (void *)field->data, addr,
+			      INET_ADDR_LEN) == NULL)
+			pr_msg("failed to translate");
+		else
+			pr_msg("%s", addr);
+	}
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * Copy the leading token of @fmt - everything up to the first space
+ * or the end of the string - into @to and NUL terminate it.
+ * The caller must provide a large enough buffer.
+ */
+static void pb_copy_fmt(const char *fmt, char *to)
+{
+	size_t n;
+
+	for (n = 0; fmt[n] != '\0' && fmt[n] != ' '; n++)
+		to[n] = fmt[n];
+
+	to[n] = '\0';
+}
+
+/*
+ * Step to the next token of a space separated format list: find the
+ * first space at or after @pfmt and skip the whole run of spaces.
+ * Returns the start of the next token, or NULL if there is none.
+ */
+static const char *pb_next_pretty(const char *pfmt)
+{
+	const char *next = strchr(pfmt, ' ');
+
+	if (next == NULL)
+		return NULL;
+
+	while (*next == ' ')
+		next++;
+
+	return *next != '\0' ? next : NULL;
+}
+
+/*
+ * Look for a token that starts with @what (e.g. "name:") in the
+ * space separated list ctl->pretty_fmt. When found, the rest of that
+ * token is copied into ctl->cur.fmt and 1 is returned; otherwise 0.
+ *
+ * The list is walked token by token starting from the beginning of
+ * the current token. Skipping strlen(@what) characters first, as was
+ * done before, could step past the terminating NUL when the rest of
+ * the list is shorter than @what, and could jump over the start of
+ * the next token when the current one is shorter than @what.
+ */
+static int pb_find_fmt(char *what, pb_pr_ctl_t *ctl)
+{
+	int len;
+	const char *pretty = ctl->pretty_fmt;
+
+	len = strlen(what);
+	while (1) {
+		if (!strncmp(pretty, what, len)) {
+			pb_copy_fmt(pretty + len, ctl->cur.fmt);
+			return 1;
+		}
+
+		pretty = pb_next_pretty(pretty);
+		if (!pretty)
+			return 0;
+	}
+}
+
+/*
+ * Find out whether the field @fd has a pretty format assigned.
+ *
+ * The format list is searched by, in this order: the field name
+ * ("name:"), the field position ("N:" at the top level, "D.N:" with
+ * D being the nesting depth otherwise) and the "*:" wildcard.
+ * The match, if any, is left in ctl->cur.fmt.
+ *
+ * Returns 1 if a format was found and 0 if not (or if there is no
+ * format list at all).
+ */
+static int pb_field_show_pretty(const ProtobufCFieldDescriptor *fd, pb_pr_ctl_t *ctl)
+{
+	char cookie[32];
+	int n;
+
+	if (!ctl->pretty_fmt)
+		return 0;
+
+	/*
+	 * A name that does not fit into the cookie is not looked up at
+	 * all: a truncated cookie would lack the ':' and could match
+	 * the wrong token.
+	 */
+	n = snprintf(cookie, sizeof(cookie), "%s:", fd->name);
+	if (n > 0 && n < (int)sizeof(cookie) && pb_find_fmt(cookie, ctl))
+		return 1;
+
+	if (!ctl->cur.depth)
+		snprintf(cookie, sizeof(cookie), "%d:", ctl->cur.number);
+	else
+		snprintf(cookie, sizeof(cookie), "%d.%d:",
+			 ctl->cur.depth, ctl->cur.number);
+
+	if (pb_find_fmt(cookie, ctl))
+		return 1;
+
+	snprintf(cookie, sizeof(cookie), "*:");
+	if (pb_find_fmt(cookie, ctl))
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Pick the element size and the printer for the field @fd and store
+ * them in @sh. For message and enum fields the descriptor of the
+ * field type is placed into ctl->arg for the printer to use.
+ * A pretty format, when one is assigned to the field, overrides the
+ * default printer.
+ *
+ * Float and double fields only get their size set and are shown by
+ * the pb_msg_unk() fallback.
+ */
+static void pb_prepare_shower(const ProtobufCFieldDescriptor *fd,
+			      pb_pr_ctl_t *ctl, struct pb_shower *sh)
+{
+	sh->fsize = 0;
+	sh->show = pb_msg_unk;
+
+	switch (fd->type) {
+	case PROTOBUF_C_TYPE_INT32:
+	case PROTOBUF_C_TYPE_SINT32:
+	case PROTOBUF_C_TYPE_UINT32:
+	case PROTOBUF_C_TYPE_SFIXED32:
+	case PROTOBUF_C_TYPE_FIXED32:
+		/* fixed32 is a 4 byte value just like the rest here */
+		sh->fsize = 4;
+		sh->show = pb_msg_int32x;
+		break;
+
+	case PROTOBUF_C_TYPE_INT64:
+	case PROTOBUF_C_TYPE_SINT64:
+	case PROTOBUF_C_TYPE_SFIXED64:
+	case PROTOBUF_C_TYPE_UINT64:
+	case PROTOBUF_C_TYPE_FIXED64:
+		sh->fsize = 8;
+		sh->show = (fd->label == PROTOBUF_C_LABEL_REPEATED ?
+			    pb_msg_int64x_r : pb_msg_int64x);
+		break;
+
+	case PROTOBUF_C_TYPE_STRING:
+		sh->fsize = sizeof (void *);
+		sh->show = pb_msg_string;
+		break;
+	case PROTOBUF_C_TYPE_MESSAGE:
+		sh->fsize = sizeof (void *);
+		sh->show = show_nested_message;
+		ctl->arg = (void *)fd->descriptor;
+		break;
+	case PROTOBUF_C_TYPE_ENUM:
+		sh->fsize = 4;
+		sh->show = show_enum;
+		ctl->arg = (void *)fd->descriptor;
+		break;
+
+	case PROTOBUF_C_TYPE_BOOL:
+		sh->fsize = sizeof (protobuf_c_boolean);
+		sh->show = show_bool;
+		break;
+	case PROTOBUF_C_TYPE_BYTES:
+		sh->fsize = sizeof (ProtobufCBinaryData);
+		sh->show = show_bytes;
+		break;
+	case PROTOBUF_C_TYPE_FLOAT:
+		sh->fsize = 4;
+		break;
+	case PROTOBUF_C_TYPE_DOUBLE:
+		sh->fsize = 8;
+		break;
+
+	default:
+		BUG();
+	}
+
+	if (pb_field_show_pretty(fd, ctl))
+		sh->show = pb_show_pretty;
+}
+
+/*
+ * Print all ctl->cur.count values of the current field with the
+ * printer from @sh.
+ *
+ * Nested messages are stored as an array of pointers, so each
+ * pointer is loaded and handed to the printer in turn. Values of
+ * other types are laid out back to back, sh->fsize bytes apart, and
+ * are printed separated by ':'; the walk stops early if the printer
+ * returns non-zero.
+ */
+static void pb_show_repeated(const ProtobufCFieldDescriptor *fd,
+ pb_pr_ctl_t *ctl, struct pb_shower *sh)
+{
+ pb_pr_field_t *field = &ctl->cur;
+ unsigned long i, nr_fields = field->count;
+
+ if (nr_fields == 0) {
+ pr_msg("<empty>");
+ return;
+ }
+
+ if (fd->type == PROTOBUF_C_TYPE_MESSAGE) {
+ void *p = field->data;
+
+ for (i = 0; i < nr_fields; i++) {
+ /* load the i-th message pointer from the array */
+ field->data = (void *)(*(long *)p);
+ sh->show(field);
+ p += sh->fsize;
+ }
+
+ return;
+ }
+
+ for (i = 0; i < nr_fields; i++) {
+ if (i)
+ pr_msg(":");
+ if (sh->show(field))
+ break;
+ field->data += sh->fsize;
+ }
+}
+
+/*
+ * Print one field as "<name>: <value(s)>", indented as needed and
+ * terminated by a newline in single-entry mode or by a space
+ * otherwise.
+ */
+static void pb_show_field(const ProtobufCFieldDescriptor *fd, pb_pr_ctl_t *ctl)
+{
+ struct pb_shower sh;
+
+ print_tabs(ctl);
+ pr_msg("%s: ", fd->name);
+
+ pb_prepare_shower(fd, ctl, &sh);
+ pb_show_repeated(fd, ctl, &sh);
+
+ if (ctl->single_entry)
+ pr_msg("\n");
+ else
+ pr_msg(" ");
+}
+
+/*
+ * Tell whether the optional @field is set in the message @msg.
+ *
+ * Message and string fields are pointers: they count as set unless
+ * the pointer is NULL or equals the default value of the field.
+ * All other types have a separate boolean "has" flag located at the
+ * quantifier offset of the field.
+ *
+ * Returns 1 if the field is present and 0 if it is not.
+ */
+static int pb_optional_field_present(const ProtobufCFieldDescriptor *field,
+		const void *msg)
+{
+	const protobuf_c_boolean *has;
+
+	if (field->type == PROTOBUF_C_TYPE_MESSAGE ||
+	    field->type == PROTOBUF_C_TYPE_STRING) {
+		const void *ptr = * (const void * const *)(msg + field->offset);
+
+		return ptr != NULL && ptr != field->default_value;
+	}
+
+	has = msg + field->quantifier_offset;
+	return *has ? 1 : 0;
+}
+
+/*
+ * Check the field @name against opts.show_fmt, a comma separated
+ * list of field names. Every field is shown when the option is not
+ * set; otherwise only fields whose name equals one of the list items
+ * exactly.
+ */
+static bool should_show_field(const char *name)
+{
+ char *s, *e;
+ int len;
+
+ if (!opts.show_fmt)
+ return true;
+
+ len = strlen(name);
+ s = opts.show_fmt;
+
+ while (1) {
+ /* e is the next ',' or the terminating NUL of the list */
+ e = strchrnul(s, ',');
+ if (e - s == len) {
+ if (!strncmp(name, s, len))
+ return true;
+ }
+ if (*e == '\0')
+ return false;
+ s = e + 1;
+ }
+}
+
+/*
+ * Print all fields of the message @msg, whose descriptor is expected
+ * in ctl->arg. Optional fields that are not set and fields filtered
+ * out by should_show_field() are skipped.
+ */
+static void pb_show_msg(const void *msg, pb_pr_ctl_t *ctl)
+{
+ int i;
+ const ProtobufCMessageDescriptor *md = ctl->arg;
+
+ BUG_ON(md == NULL);
+
+ for (i = 0; i < md->n_fields; i++) {
+ const ProtobufCFieldDescriptor fd = md->fields[i];
+ unsigned long *data;
+ size_t nr_fields;
+
+ nr_fields = 1;
+ data = (unsigned long *)(msg + fd.offset);
+
+ if (fd.label == PROTOBUF_C_LABEL_OPTIONAL) {
+ if (!pb_optional_field_present(&fd, msg))
+ continue;
+ }
+
+ if (!should_show_field(fd.name))
+ continue;
+
+ if (fd.label == PROTOBUF_C_LABEL_REPEATED) {
+ /*
+ * A repeated field keeps its element count at the
+ * quantifier offset and a pointer to the elements
+ * at the field offset.
+ */
+ nr_fields = *(size_t *)(msg + fd.quantifier_offset);
+ data = (unsigned long *)*data;
+ }
+
+ ctl->cur.data = data;
+ ctl->cur.number = i + 1;
+ ctl->cur.count = nr_fields;
+
+ pb_show_field(&fd, ctl);
+ }
+}
+
+/* Payload handler that does nothing; used when the caller gave none. */
+static inline void pb_no_payload(struct cr_img *i, void *obj) { }
+
+/*
+ * Print the objects of type @type stored in image @img.
+ *
+ * Objects are read one by one until pb_read_one_eof() returns a
+ * value <= 0. After an object is printed, @payload_hadler (if given)
+ * is called with it and the object is freed. With @single_entry set
+ * only the first object is shown; otherwise objects are separated
+ * by a newline. @pretty_fmt is the optional list of per-field
+ * formats.
+ */
+void do_pb_show_plain(struct cr_img *img, int type, int single_entry,
+ void (*payload_hadler)(struct cr_img *, void *obj),
+ const char *pretty_fmt)
+{
+ pb_pr_ctl_t ctl = {NULL, single_entry, pretty_fmt};
+ void (*handle_payload)(struct cr_img *, void *obj);
+
+ if (!cr_pb_descs[type].pb_desc) {
+ pr_err("Wrong object requested %d\n", type);
+ return;
+ }
+
+ handle_payload = (payload_hadler) ? : pb_no_payload;
+
+ while (1) {
+ void *obj;
+
+ if (pb_read_one_eof(img, &obj, type) <= 0)
+ break;
+
+ /* nested fields overwrite ctl.arg, so set it for every object */
+ ctl.arg = (void *)cr_pb_descs[type].pb_desc;
+ pb_show_msg(obj, &ctl);
+ handle_payload(img, obj);
+ cr_pb_descs[type].free(obj, NULL);
+ if (single_entry)
+ break;
+ pr_msg("\n");
+ }
+}
+
+/*
+ * Resolve the path behind the image's file descriptor for diagnostics.
+ *
+ * The result lives in a static buffer, so it is overwritten by the
+ * next call. NULL is returned when read_fd_link() does not report a
+ * positive length.
+ */
+static char *image_name(struct cr_img *img)
+{
+	static char image_path[PATH_MAX];
+
+	if (read_fd_link(img->_x.fd, image_path, sizeof(image_path)) <= 0)
+		return NULL;
+
+	return image_path;
+}
+
+/*
+ * Reads one PB record (u32 size header + packed object) from image @img
+ * and unpacks it with the unpack routine registered for @type in
+ * cr_pb_descs[]. The unpacked object is returned via @pobj (set to
+ * NULL on any failure).
+ *
+ *  1 on success
+ * -1 on error (or EOF met and @eof set to false)
+ *  0 on EOF and @eof set to true
+ *
+ * Don't forget to free memory granted to unpacked object in calling code if needed
+ */
+
+int do_pb_read_one(struct cr_img *img, void **pobj, int type, bool eof)
+{
+	u8 local[PB_PKOBJ_LOCAL_SIZE];
+	void *buf = (void *)&local;
+	u32 size;
+	int ret;
+
+	if (!cr_pb_descs[type].pb_desc) {
+		pr_err("Wrong object requested %d on %s\n",
+			type, image_name(img));
+		return -1;
+	}
+
+	*pobj = NULL;
+
+	if (unlikely(empty_image(img)))
+		ret = 0;
+	else
+		ret = bread(&img->_x, &size, sizeof(size));
+	if (ret == 0) {
+		if (eof) {
+			return 0;
+		} else {
+			pr_err("Unexpected EOF on %s\n",
+				image_name(img));
+			return -1;
+		}
+	} else if (ret < (int)sizeof(size)) {
+		/*
+		 * The comparison has to be signed: against a bare
+		 * sizeof() a negative return is converted to a huge
+		 * unsigned value, passes the check and @size is then
+		 * used uninitialised.
+		 */
+		pr_perror("Read %d bytes while %d expected on %s",
+			  ret, (int)sizeof(size),
+			  image_name(img));
+		return -1;
+	}
+
+	/* Small objects are unpacked from the on-stack buffer */
+	if (size > sizeof(local)) {
+		ret = -1;
+		buf = xmalloc(size);
+		if (!buf)
+			goto err;
+	}
+
+	ret = bread(&img->_x, buf, size);
+	if (ret < 0) {
+		pr_perror("Can't read %d bytes from file %s",
+			  size, image_name(img));
+		goto err;
+	} else if (ret != size) {
+		pr_perror("Read %d bytes while %d expected from %s",
+			  ret, size, image_name(img));
+		ret = -1;
+		goto err;
+	}
+
+	*pobj = cr_pb_descs[type].unpack(NULL, size, buf);
+	if (!*pobj) {
+		ret = -1;
+		pr_err("Failed unpacking object %p from %s\n",
+				pobj, image_name(img));
+		goto err;
+	}
+
+	ret = 1;
+err:
+	if (buf != (void *)&local)
+		xfree(buf);
+
+	return ret;
+}
+
+/*
+ * Writes one PB record (u32 size header + packed object pointed by
+ * @obj) to image @img, using the getpksize and pack routines
+ * registered for @type in cr_pb_descs[].
+ *
+ *  0 on success
+ * -1 on error
+ */
+int pb_write_one(struct cr_img *img, void *obj, int type)
+{
+	u8 local[PB_PKOBJ_LOCAL_SIZE];
+	void *buf = (void *)&local;
+	u32 size, packed;
+	int ret = -1;
+	struct iovec iov[2];
+
+	if (!cr_pb_descs[type].pb_desc) {
+		pr_err("Wrong object requested %d\n", type);
+		return -1;
+	}
+
+	if (lazy_image(img) && open_image_lazy(img))
+		return -1;
+
+	/* Small objects are packed into the on-stack buffer */
+	size = cr_pb_descs[type].getpksize(obj);
+	if (size > (u32)sizeof(local)) {
+		buf = xmalloc(size);
+		if (!buf)
+			goto err;
+	}
+
+	packed = cr_pb_descs[type].pack(obj, buf);
+	if (packed != size) {
+		pr_err("Failed packing PB object %p\n", obj);
+		goto err;
+	}
+
+	/* Header and body go out in a single vectored write */
+	iov[0].iov_base = &size;
+	iov[0].iov_len = sizeof(size);
+	iov[1].iov_base = buf;
+	iov[1].iov_len = size;
+
+	ret = bwritev(&img->_x, iov, 2);
+	if (ret != size + sizeof(size)) {
+		pr_perror("Can't write %d bytes", (int)(size + sizeof(size)));
+		/*
+		 * Don't leak the raw bwritev() result to the caller:
+		 * after a short write it would be a positive number,
+		 * while the contract here is -1 on error.
+		 */
+		ret = -1;
+		goto err;
+	}
+
+	ret = 0;
+err:
+	if (buf != (void *)&local)
+		xfree(buf);
+	return ret;
+}
+
+/*
+ * Read every record from the image described by @cinfo and feed it to
+ * @cinfo->collect().
+ *
+ * When @cinfo->priv_size is non-zero a private object of that size is
+ * allocated per record (from shared memory if COLLECT_SHARED is set)
+ * and passed to the callback together with the unpacked message; the
+ * message is then left for the callback to keep. Without a private
+ * object the message is freed right after the callback returns.
+ *
+ * Returns 0 once the image is exhausted, a negative value on error.
+ */
+int collect_image(struct collect_image_info *cinfo)
+{
+	void *(*alloc_priv)(size_t size) = malloc;
+	void (*free_priv)(void *ptr) = free;
+	struct cr_img *img;
+	int ret;
+
+	pr_info("Collecting %d/%d (flags %x)\n",
+			cinfo->fd_type, cinfo->pb_type, cinfo->flags);
+
+	img = open_image(cinfo->fd_type, O_RSTR);
+	if (!img)
+		return -1;
+
+	cinfo->flags |= COLLECT_HAPPENED;
+	if (cinfo->flags & COLLECT_SHARED) {
+		alloc_priv = shmalloc;
+		free_priv = shfree_last;
+	}
+
+	for (;;) {
+		ProtobufCMessage *msg;
+		void *priv = NULL;
+
+		if (cinfo->priv_size) {
+			priv = alloc_priv(cinfo->priv_size);
+			if (!priv) {
+				ret = -1;
+				break;
+			}
+		}
+
+		ret = pb_read_one_eof(img, &msg, cinfo->pb_type);
+		if (ret <= 0) {
+			/* EOF or read error: the private object was not consumed */
+			free_priv(priv);
+			break;
+		}
+
+		ret = cinfo->collect(priv, msg);
+		if (ret < 0) {
+			free_priv(priv);
+			cr_pb_descs[cinfo->pb_type].free(msg, NULL);
+			break;
+		}
+
+		if (!cinfo->priv_size)
+			cr_pb_descs[cinfo->pb_type].free(msg, NULL);
+	}
+
+	close_image(img);
+	pr_debug(" `- ... done\n");
+	return ret;
+}
diff --git a/criu/pstree.c b/criu/pstree.c
new file mode 100644
index 000000000000..06bc5f84b5be
--- /dev/null
+++ b/criu/pstree.c
@@ -0,0 +1,846 @@
+#include <sys/mman.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sched.h>
+
+#include "cr_options.h"
+#include "pstree.h"
+#include "util.h"
+#include "lock.h"
+#include "namespaces.h"
+#include "files.h"
+#include "tty.h"
+#include "mount.h"
+#include "asm/dump.h"
+
+#include "protobuf.h"
+#include "protobuf/pstree.pb-c.h"
+
+struct pstree_item *root_item;
+
+#define CLONE_ALLNS (CLONE_NEWPID | CLONE_NEWNET | CLONE_NEWIPC | CLONE_NEWUTS | CLONE_NEWNS | CLONE_NEWUSER)
+
+/*
+ * Release a core entry.
+ *
+ * Besides the entry itself only the separately allocated parts are
+ * freed here: the posix timers array, the creds' groups array and
+ * whatever arch_free_thread_info() takes care of.
+ */
+void core_entry_free(CoreEntry *core)
+{
+	TaskCoreEntry *tc = core->tc;
+	ThreadCoreEntry *thread = core->thread_core;
+
+	if (tc != NULL && tc->timers != NULL)
+		xfree(tc->timers->posix);
+
+	if (thread != NULL)
+		xfree(thread->creds->groups);
+
+	arch_free_thread_info(core);
+	xfree(core);
+}
+
+#ifndef RLIM_NLIMITS
+# define RLIM_NLIMITS 16
+#endif
+
+/*
+ * Allocate and initialise a core entry with its sub-entries, all
+ * obtained from one xmalloc() call.
+ *
+ * @th  - non-zero to include the per-thread part: thread core, sas and
+ *        creds entries plus four capability arrays of CR_CAP_SIZE
+ *        elements; arch_alloc_thread_info() is called as well.
+ * @tsk - non-zero to include the task core entry and a TASK_COMM_LEN
+ *        bytes comm buffer; when @th is set too, also the rlimits
+ *        (RLIM_NLIMITS entries) and timers (real, virt, prof).
+ *
+ * Returns the entry, or NULL when xmalloc() or
+ * arch_alloc_thread_info() fails.
+ */
+CoreEntry *core_entry_alloc(int th, int tsk)
+{
+ size_t sz;
+ CoreEntry *core = NULL;
+ void *m;
+
+ /*
+ * Step one: sum up the sizes of everything that is pulled
+ * from the chunk below. Keep both parts in sync.
+ */
+ sz = sizeof(CoreEntry);
+ if (tsk) {
+ sz += sizeof(TaskCoreEntry) + TASK_COMM_LEN;
+ if (th) {
+ sz += sizeof(TaskRlimitsEntry);
+ sz += RLIM_NLIMITS * sizeof(RlimitEntry *);
+ sz += RLIM_NLIMITS * sizeof(RlimitEntry);
+ sz += sizeof(TaskTimersEntry);
+ sz += 3 * sizeof(ItimerEntry); /* 3 for real, virt and prof */
+ }
+ }
+ if (th) {
+ CredsEntry *ce = NULL;
+
+ sz += sizeof(ThreadCoreEntry) + sizeof(ThreadSasEntry) + sizeof(CredsEntry);
+
+ /* ce is only used for sizeof() here, it's never dereferenced */
+ sz += CR_CAP_SIZE * sizeof(ce->cap_inh[0]);
+ sz += CR_CAP_SIZE * sizeof(ce->cap_prm[0]);
+ sz += CR_CAP_SIZE * sizeof(ce->cap_eff[0]);
+ sz += CR_CAP_SIZE * sizeof(ce->cap_bnd[0]);
+ /*
+ * @groups are dynamic and allocated
+ * on demand.
+ */
+ }
+
+ /*
+ * Step two: carve the pieces out of the chunk.
+ * NOTE(review): xptr_pull()/xptr_pull_s() presumably return the
+ * current position and advance @m -- confirm in their definition.
+ */
+ m = xmalloc(sz);
+ if (m) {
+ core = xptr_pull(&m, CoreEntry);
+ core_entry__init(core);
+ core->mtype = CORE_ENTRY__MARCH;
+
+ if (tsk) {
+ core->tc = xptr_pull(&m, TaskCoreEntry);
+ task_core_entry__init(core->tc);
+ core->tc->comm = xptr_pull_s(&m, TASK_COMM_LEN);
+ memzero(core->tc->comm, TASK_COMM_LEN);
+
+ if (th) {
+ TaskRlimitsEntry *rls;
+ TaskTimersEntry *tte;
+ int i;
+
+ rls = core->tc->rlimits = xptr_pull(&m, TaskRlimitsEntry);
+ task_rlimits_entry__init(rls);
+
+ rls->n_rlimits = RLIM_NLIMITS;
+ rls->rlimits = xptr_pull_s(&m, sizeof(RlimitEntry *) * RLIM_NLIMITS);
+
+ for (i = 0; i < RLIM_NLIMITS; i++) {
+ rls->rlimits[i] = xptr_pull(&m, RlimitEntry);
+ rlimit_entry__init(rls->rlimits[i]);
+ }
+
+ tte = core->tc->timers = xptr_pull(&m, TaskTimersEntry);
+ task_timers_entry__init(tte);
+ tte->real = xptr_pull(&m, ItimerEntry);
+ itimer_entry__init(tte->real);
+ tte->virt = xptr_pull(&m, ItimerEntry);
+ itimer_entry__init(tte->virt);
+ tte->prof = xptr_pull(&m, ItimerEntry);
+ itimer_entry__init(tte->prof);
+ }
+ }
+
+ if (th) {
+ CredsEntry *ce;
+
+ core->thread_core = xptr_pull(&m, ThreadCoreEntry);
+ thread_core_entry__init(core->thread_core);
+ core->thread_core->sas = xptr_pull(&m, ThreadSasEntry);
+ thread_sas_entry__init(core->thread_core->sas);
+ ce = core->thread_core->creds = xptr_pull(&m, CredsEntry);
+ creds_entry__init(ce);
+
+ ce->n_cap_inh = CR_CAP_SIZE;
+ ce->n_cap_prm = CR_CAP_SIZE;
+ ce->n_cap_eff = CR_CAP_SIZE;
+ ce->n_cap_bnd = CR_CAP_SIZE;
+ ce->cap_inh = xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_inh[0]));
+ ce->cap_prm = xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_prm[0]));
+ ce->cap_eff = xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_eff[0]));
+ ce->cap_bnd = xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_bnd[0]));
+
+ if (arch_alloc_thread_info(core)) {
+ /* core was pulled first, so this presumably releases the whole chunk */
+ xfree(core);
+ core = NULL;
+ }
+ }
+ }
+
+ return core;
+}
+
+/*
+ * Allocate the array of core entries for @item, one per thread.
+ *
+ * The thread whose real pid equals the item's real pid gets a core
+ * with both the thread and the task parts, all others only the thread
+ * part. On any failure everything is released via pstree_free_cores().
+ *
+ * Returns 0 on success, -1 on error.
+ */
+int pstree_alloc_cores(struct pstree_item *item)
+{
+	unsigned int t;
+
+	item->core = xzalloc(sizeof(*item->core) * item->nr_threads);
+	if (!item->core)
+		return -1;
+
+	for (t = 0; t < item->nr_threads; t++) {
+		int with_task = (item->threads[t].real == item->pid.real);
+
+		item->core[t] = core_entry_alloc(1, with_task);
+		if (!item->core[t]) {
+			pstree_free_cores(item);
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Release the core entries array of @item and reset the pointer.
+ *
+ * Slots may legitimately be NULL: pstree_alloc_cores() calls this from
+ * its error path with a zero-filled array that is only partially
+ * populated, and core_entry_free() dereferences its argument -- so
+ * empty slots have to be skipped.
+ */
+void pstree_free_cores(struct pstree_item *item)
+{
+	unsigned int i;
+
+	if (!item->core)
+		return;
+
+	/*
+	 * NOTE(review): slot 0 is not freed here, same as before this
+	 * change -- presumably it is released elsewhere; confirm.
+	 */
+	for (i = 1; i < item->nr_threads; i++) {
+		if (item->core[i])
+			core_entry_free(item->core[i]);
+	}
+
+	xfree(item->core);
+	item->core = NULL;
+}
+
+/*
+ * Free the whole process tree hanging off @root_item.
+ *
+ * The walk is iterative: descend to a childless item, unlink and free
+ * it (cores, threads array, the item itself), then continue from its
+ * parent until the parent chain ends.
+ */
+void free_pstree(struct pstree_item *root_item)
+{
+	struct pstree_item *cur = root_item;
+
+	while (cur != NULL) {
+		struct pstree_item *up;
+
+		if (!list_empty(&cur->children)) {
+			cur = list_first_entry(&cur->children, struct pstree_item, sibling);
+			continue;
+		}
+
+		up = cur->parent;
+		list_del(&cur->sibling);
+		pstree_free_cores(cur);
+		xfree(cur->threads);
+		xfree(cur);
+		cur = up;
+	}
+}
+
+/*
+ * Allocate a zeroed pstree item followed by its mode-specific info.
+ *
+ * @rst - false: the item is followed by struct dmp_info and comes from
+ *        xzalloc(); true: it is followed by struct rst_info, comes from
+ *        shmalloc() and gets its vma list initialised.
+ *
+ * The list heads are initialised and the pids and born_sid are set to
+ * -1. Returns NULL if the allocation fails.
+ */
+struct pstree_item *__alloc_pstree_item(bool rst)
+{
+	struct pstree_item *item;
+	int sz;
+
+	if (rst) {
+		sz = sizeof(*item) + sizeof(struct rst_info);
+		item = shmalloc(sz);
+		if (item) {
+			memset(item, 0, sz);
+			vm_area_list_init(&rsti(item)->vmas);
+		}
+	} else {
+		sz = sizeof(*item) + sizeof(struct dmp_info);
+		item = xzalloc(sz);
+	}
+
+	if (!item)
+		return NULL;
+
+	INIT_LIST_HEAD(&item->children);
+	INIT_LIST_HEAD(&item->sibling);
+
+	item->pid.virt = -1;
+	item->pid.real = -1;
+	item->born_sid = -1;
+
+	return item;
+}
+
+/*
+ * Allocate a restore-side item marked as TASK_HELPER, to be cloned
+ * with CLONE_FILES | CLONE_FS, and account it in
+ * task_entries->nr_helpers. Returns NULL if the allocation fails.
+ */
+struct pstree_item *alloc_pstree_helper(void)
+{
+	struct pstree_item *helper = alloc_pstree_item_with_rst();
+
+	if (!helper)
+		return NULL;
+
+	helper->state = TASK_HELPER;
+	rsti(helper)->clone_flags = CLONE_FILES | CLONE_FS;
+	task_entries->nr_helpers++;
+
+	return helper;
+}
+
+/*
+ * Depth-first walk over the tree: return the item that follows @item
+ * (its first child, else the next sibling of the closest ancestor
+ * that has one), or NULL when the walk is over.
+ */
+struct pstree_item *pstree_item_next(struct pstree_item *item)
+{
+	struct pstree_item *cur;
+
+	if (!list_empty(&item->children))
+		return list_first_entry(&item->children, struct pstree_item, sibling);
+
+	for (cur = item; cur->parent; cur = cur->parent) {
+		/* reaching the parent's list head means cur was the last child */
+		if (cur->sibling.next == &cur->parent->children)
+			continue;
+
+		return list_entry(cur->sibling.next, struct pstree_item, sibling);
+	}
+
+	return NULL;
+}
+
+/*
+ * Call @f on @item and then, recursively, on each of its children
+ * (preorder). The walk stops at the first negative result.
+ *
+ * Returns 0 if every call succeeded, -1 otherwise.
+ */
+int preorder_pstree_traversal(struct pstree_item *item, int (*f)(struct pstree_item *))
+{
+	struct pstree_item *child;
+	int ret;
+
+	ret = f(item);
+	if (ret < 0)
+		return -1;
+
+	list_for_each_entry(child, &item->children, sibling) {
+		ret = preorder_pstree_traversal(child, f);
+		if (ret < 0)
+			return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Write the process tree into the CR_FD_PSTREE image, one PstreeEntry
+ * (pid, ppid, pgid, sid and the thread ids, all virtual) per item.
+ *
+ * Returns 0 on success and a non-zero value on error.
+ */
+int dump_pstree(struct pstree_item *root_item)
+{
+	struct pstree_item *item = root_item;
+	PstreeEntry e = PSTREE_ENTRY__INIT;
+	int ret = -1, i;
+	struct cr_img *img;
+
+	pr_info("\n");
+	pr_info("Dumping pstree (pid: %d)\n", root_item->pid.real);
+	pr_info("----------------------------------------\n");
+
+	/*
+	 * Make sure we're dumping session leader, if not an
+	 * appropriate option must be passed.
+	 *
+	 * Also note that if we're not a session leader we
+	 * can't get the situation where the leader sits somewhere
+	 * deeper in process tree, thus top-level checking for
+	 * leader is enough.
+	 */
+	if (root_item->pid.virt != root_item->sid) {
+		if (!opts.shell_job) {
+			pr_err("The root process %d is not a session leader. "
+			       "Consider using --" OPT_SHELL_JOB " option\n", item->pid.virt);
+			return -1;
+		}
+	}
+
+	img = open_image(CR_FD_PSTREE, O_DUMP);
+	if (!img)
+		return -1;
+
+	for_each_pstree_item(item) {
+		pr_info("Process: %d(%d)\n", item->pid.virt, item->pid.real);
+
+		e.pid		= item->pid.virt;
+		e.ppid		= item->parent ? item->parent->pid.virt : 0;
+		e.pgid		= item->pgid;
+		e.sid		= item->sid;
+		e.n_threads	= item->nr_threads;
+
+		e.threads = xmalloc(sizeof(e.threads[0]) * e.n_threads);
+		if (!e.threads) {
+			/*
+			 * ret holds 0 from the previous item's write
+			 * here, so without resetting it the failure
+			 * would be reported as success.
+			 */
+			ret = -1;
+			goto err;
+		}
+
+		for (i = 0; i < item->nr_threads; i++)
+			e.threads[i] = item->threads[i].virt;
+
+		ret = pb_write_one(img, &e, PB_PSTREE);
+		xfree(e.threads);
+
+		if (ret)
+			goto err;
+	}
+	ret = 0;
+
+err:
+	pr_info("----------------------------------------\n");
+	close_image(img);
+	return ret;
+}
+
+static int max_pid = 0;
+
+/*
+ * With opts.shell_job set and a root task that is not a session
+ * leader, rewrite the root's session and process group ids all over
+ * the tree to the ones of the current process, and make max_pid cover
+ * them. Always returns 0.
+ */
+static int prepare_pstree_for_shell_job(void)
+{
+	pid_t our_sid = getsid(getpid());
+	pid_t our_pgid = getpgid(getpid());
+	pid_t img_sid, img_pgid;
+	struct pstree_item *pi;
+
+	if (!opts.shell_job || root_item->sid == root_item->pid.virt)
+		return 0;
+
+	/*
+	 * Migration of a root task group leader is a bit tricky.
+	 * When a task yields SIGSTOP, the kernel notifies the parent
+	 * with SIGCHLD. This means when task is running in a
+	 * shell, the shell obtains SIGCHLD and sends a task to
+	 * the background.
+	 *
+	 * The situation gets changed once we restore the
+	 * program -- our tool become an additional stub between
+	 * the restored program and the shell. So to be able to
+	 * notify the shell with SIGCHLD from our restored
+	 * program -- we make the root task to inherit the
+	 * process group from us.
+	 *
+	 * Not that clever solution but at least it works.
+	 */
+
+	img_sid = root_item->sid;
+	img_pgid = root_item->pgid;
+
+	pr_info("Migrating process tree (GID %d->%d SID %d->%d)\n",
+		img_pgid, our_pgid, img_sid, our_sid);
+
+	for_each_pstree_item(pi) {
+		if (pi->pgid == img_pgid)
+			pi->pgid = our_pgid;
+		if (pi->sid == img_sid)
+			pi->sid = our_sid;
+	}
+
+	max_pid = max((int)our_sid, max_pid);
+	max_pid = max((int)our_pgid, max_pid);
+
+	return 0;
+}
+
+/*
+ * Build the process tree from the CR_FD_PSTREE image.
+ *
+ * For every entry an item is allocated, linked under its parent (or
+ * made the root when ppid is 0), its threads array is filled in and
+ * the CR_FD_IDS image of the task is read into pi->ids; max_pid and
+ * the task_entries counters are updated on the way.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+static int read_pstree_image(void)
+{
+	int ret = 0, i;
+	struct cr_img *img;
+	struct pstree_item *pi, *parent = NULL;
+
+	pr_info("Reading image tree\n");
+
+	img = open_image(CR_FD_PSTREE, O_RSTR);
+	if (!img)
+		return -1;
+
+	while (1) {
+		PstreeEntry *e;
+		struct cr_img *ids_img;
+
+		ret = pb_read_one_eof(img, &e, PB_PSTREE);
+		if (ret <= 0)
+			break;
+
+		/* From here on @e has to be freed on every exit path */
+		ret = -1;
+		pi = alloc_pstree_item_with_rst();
+		if (pi == NULL) {
+			pstree_entry__free_unpacked(e, NULL);
+			break;
+		}
+
+		pi->pid.virt = e->pid;
+		max_pid = max((int)e->pid, max_pid);
+
+		pi->pgid = e->pgid;
+		max_pid = max((int)e->pgid, max_pid);
+
+		pi->sid = e->sid;
+		max_pid = max((int)e->sid, max_pid);
+
+		if (e->ppid == 0) {
+			if (root_item) {
+				pr_err("Parent missed on non-root task "
+				       "with pid %d, image corruption!\n", e->pid);
+				pstree_entry__free_unpacked(e, NULL);
+				goto err;
+			}
+			root_item = pi;
+			pi->parent = NULL;
+		} else {
+			/*
+			 * Fast path -- if the pstree image is not edited, the
+			 * parent of any item should have already being restored
+			 * and sit among the last item's ancestors.
+			 */
+			while (parent) {
+				if (parent->pid.virt == e->ppid)
+					break;
+				parent = parent->parent;
+			}
+
+			if (parent == NULL) {
+				/* Slow path -- look through the whole tree */
+				for_each_pstree_item(parent) {
+					if (parent->pid.virt == e->ppid)
+						break;
+				}
+
+				if (parent == NULL) {
+					pr_err("Can't find a parent for %d\n", pi->pid.virt);
+					pstree_entry__free_unpacked(e, NULL);
+					/*
+					 * NOTE(review): confirm xfree() is the
+					 * matching release for an item obtained
+					 * from alloc_pstree_item_with_rst().
+					 */
+					xfree(pi);
+					goto err;
+				}
+			}
+
+			pi->parent = parent;
+			list_add(&pi->sibling, &parent->children);
+		}
+
+		parent = pi;
+
+		pi->nr_threads = e->n_threads;
+		pi->threads = xmalloc(e->n_threads * sizeof(struct pid));
+		if (!pi->threads) {
+			pstree_entry__free_unpacked(e, NULL);
+			break;
+		}
+
+		for (i = 0; i < e->n_threads; i++) {
+			pi->threads[i].real = -1;
+			pi->threads[i].virt = e->threads[i];
+			max_pid = max((int)e->threads[i], max_pid);
+		}
+
+		task_entries->nr_threads += e->n_threads;
+		task_entries->nr_tasks++;
+
+		pstree_entry__free_unpacked(e, NULL);
+
+		/* ret is still -1 here, so a failed open is reported as an error */
+		ids_img = open_image(CR_FD_IDS, O_RSTR, pi->pid.virt);
+		if (!ids_img)
+			goto err;
+		ret = pb_read_one_eof(ids_img, &pi->ids, PB_IDS);
+		close_image(ids_img);
+
+		/* No ids record for this task, go on with the next one */
+		if (ret == 0)
+			continue;
+		if (ret < 0)
+			goto err;
+
+		if (pi->ids->has_mnt_ns_id) {
+			if (rst_add_ns_id(pi->ids->mnt_ns_id, pi, &mnt_ns_desc)) {
+				/*
+				 * ret is 1 after the successful read above;
+				 * don't hand that positive value out as the
+				 * result of a failure.
+				 */
+				ret = -1;
+				goto err;
+			}
+		}
+	}
+err:
+	close_image(img);
+	return ret;
+}
+
+/*
+ * Inject helper tasks into the tree so that sessions and process
+ * groups whose leaders are not in the image can still be restored.
+ *
+ * The work is done in four passes:
+ *  1. For every child of the root that is neither in the root's
+ *     session nor a session leader, a helper with pid == sid is
+ *     created and the root's children of that session are re-parented
+ *     to it. Helpers are kept on the local @helpers list meanwhile.
+ *  2. All items are walked. For a non-leader the chain of ancestors up
+ *     to its session leader is marked with born_sid; for a session
+ *     leader the first pending helper with the same sid is attached
+ *     to it and given a fresh pid (++max_pid).
+ *  3. Helpers left on the list are spliced into the root's children.
+ *  4. For every item whose process group leader is not in the tree a
+ *     helper with pid == pgid is added as its child, unless the pgid
+ *     is the one of the current process.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+static int prepare_pstree_ids(void)
+{
+ struct pstree_item *item, *child, *helper, *tmp;
+ LIST_HEAD(helpers);
+
+ pid_t current_pgid = getpgid(getpid());
+
+ /*
+ * Some task can be reparented to init. A helper task should be added
+ * for restoring sid of such tasks. The helper tasks will be exited
+ * immediately after forking children and all children will be
+ * reparented to init.
+ */
+ list_for_each_entry(item, &root_item->children, sibling) {
+
+ /*
+ * If a child belongs to the root task's session or it's
+ * a session leader himself -- this is a simple case, we
+ * just proceed in a normal way.
+ */
+ if (item->sid == root_item->sid || item->sid == item->pid.virt)
+ continue;
+
+ helper = alloc_pstree_helper();
+ if (helper == NULL)
+ return -1;
+ helper->sid = item->sid;
+ helper->pgid = item->sid;
+ helper->pid.virt = item->sid;
+ helper->parent = root_item;
+ helper->ids = root_item->ids;
+ list_add_tail(&helper->sibling, &helpers);
+
+ pr_info("Add a helper %d for restoring SID %d\n",
+ helper->pid.virt, helper->sid);
+
+ /*
+ * Step back to the previous entry: @item matches the helper's sid
+ * and is not a leader, so the loop below moves it off this list.
+ */
+ child = list_entry(item->sibling.prev, struct pstree_item, sibling);
+ item = child;
+
+ /*
+ * Stack on helper task all children with target sid.
+ */
+ list_for_each_entry_safe_continue(child, tmp, &root_item->children, sibling) {
+ if (child->sid != helper->sid)
+ continue;
+ if (child->sid == child->pid.virt)
+ continue;
+
+ pr_info("Attach %d to the temporary task %d\n",
+ child->pid.virt, helper->pid.virt);
+
+ child->parent = helper;
+ list_move(&child->sibling, &helper->children);
+ }
+ }
+
+ /* Try to connect helpers to session leaders */
+ for_each_pstree_item(item) {
+ if (!item->parent) /* skip the root task */
+ continue;
+
+ if (item->state == TASK_HELPER)
+ continue;
+
+ if (item->sid != item->pid.virt) {
+ struct pstree_item *parent;
+
+ if (item->parent->sid == item->sid)
+ continue;
+
+ /* the task could fork a child before and after setsid() */
+ parent = item->parent;
+ while (parent && parent->pid.virt != item->sid) {
+ /* an ancestor can't have been born in two different sessions */
+ if (parent->born_sid != -1 && parent->born_sid != item->sid) {
+ pr_err("Can't determinate with which sid (%d or %d)"
+ "the process %d was born\n",
+ parent->born_sid, item->sid, parent->pid.virt);
+ return -1;
+ }
+ parent->born_sid = item->sid;
+ pr_info("%d was born with sid %d\n", parent->pid.virt, item->sid);
+ parent = parent->parent;
+ }
+
+ if (parent == NULL) {
+ pr_err("Can't find a session leader for %d\n", item->sid);
+ return -1;
+ }
+
+ continue;
+ }
+
+ pr_info("Session leader %d\n", item->sid);
+
+ /* Try to find helpers, who should be connected to the leader */
+ list_for_each_entry(child, &helpers, sibling) {
+ if (child->state != TASK_HELPER)
+ continue;
+
+ if (child->sid != item->sid)
+ continue;
+
+ /* the leader itself owns the sid as its pid, the helper gets a new one */
+ child->pgid = item->pgid;
+ child->pid.virt = ++max_pid;
+ child->parent = item;
+ list_move(&child->sibling, &item->children);
+
+ pr_info("Attach %d to the task %d\n",
+ child->pid.virt, item->pid.virt);
+
+ break;
+ }
+ }
+
+ /* All other helpers are session leaders for own sessions */
+ list_splice(&helpers, &root_item->children);
+
+ /* Add a process group leader if it is absent */
+ for_each_pstree_item(item) {
+ struct pstree_item *gleader;
+
+ if (!item->pgid || item->pid.virt == item->pgid)
+ continue;
+
+ /* gleader is left NULL when nobody in the tree has pid == pgid */
+ for_each_pstree_item(gleader) {
+ if (gleader->pid.virt == item->pgid)
+ break;
+ }
+
+ if (gleader) {
+ rsti(item)->pgrp_leader = gleader;
+ continue;
+ }
+
+ /*
+ * If the PGID is eq to current one -- this
+ * means we're inheriting group from the current
+ * task so we need to escape creating a helper here.
+ */
+ if (current_pgid == item->pgid)
+ continue;
+
+ helper = alloc_pstree_helper();
+ if (helper == NULL)
+ return -1;
+ helper->sid = item->sid;
+ helper->pgid = item->pgid;
+ helper->pid.virt = item->pgid;
+ helper->parent = item;
+ helper->ids = item->ids;
+ list_add(&helper->sibling, &item->children);
+ rsti(item)->pgrp_leader = helper;
+
+ pr_info("Add a helper %d for restoring PGID %d\n",
+ helper->pid.virt, helper->pgid);
+ }
+
+ return 0;
+}
+
+/*
+ * Compare the kernel object ids of a task (@i) with those of its
+ * parent (@p) and build the clone flags out of the result:
+ * CLONE_FILES when the files ids are equal, and the CLONE_NEW* flag
+ * of every namespace whose id differs.
+ */
+static unsigned long get_clone_mask(TaskKobjIdsEntry *i,
+		TaskKobjIdsEntry *p)
+{
+	unsigned long mask = 0;
+
+	/* fd table: equal ids mean it is shared */
+	mask |= (i->files_id == p->files_id) ? CLONE_FILES : 0;
+
+	/* namespaces: different ids mean a new one */
+	mask |= (i->pid_ns_id != p->pid_ns_id) ? CLONE_NEWPID : 0;
+	mask |= (i->net_ns_id != p->net_ns_id) ? CLONE_NEWNET : 0;
+	mask |= (i->ipc_ns_id != p->ipc_ns_id) ? CLONE_NEWIPC : 0;
+	mask |= (i->uts_ns_id != p->uts_ns_id) ? CLONE_NEWUTS : 0;
+	mask |= (i->mnt_ns_id != p->mnt_ns_id) ? CLONE_NEWNS : 0;
+	mask |= (i->user_ns_id != p->user_ns_id) ? CLONE_NEWUSER : 0;
+
+	return mask;
+}
+
+/*
+ * Walk the tree and work out the clone flags for every task from its
+ * kernel object ids.
+ *
+ * The ids of an item are compared with its parent's ones (root_ids
+ * for the item without a parent); for a shared fd table
+ * shared_fdt_prepare() is called. The root task sets root_ns_mask,
+ * any other task asking for namespaces outside of
+ * (root_ns_mask & CLONE_SUBNS) is an error. Items without ids are
+ * skipped, except for the root which takes opts.rst_namespaces_flags.
+ *
+ * Returns 0 on success, -1 (or the shared_fdt_prepare() error) on
+ * failure.
+ */
+static int prepare_pstree_kobj_ids(void)
+{
+ struct pstree_item *item;
+
+ /* Find a process with minimal pid for shared fd tables */
+ for_each_pstree_item(item) {
+ struct pstree_item *parent = item->parent;
+ TaskKobjIdsEntry *ids;
+ unsigned long cflags;
+
+ if (!item->ids) {
+ if (item == root_item) {
+ cflags = opts.rst_namespaces_flags;
+ goto set_mask;
+ }
+
+ continue;
+ }
+
+ if (parent)
+ ids = parent->ids;
+ else
+ ids = root_ids;
+
+ /*
+ * Add some sanity check on image data.
+ */
+ if (unlikely(!ids)) {
+ pr_err("No kIDs provided, image corruption\n");
+ return -1;
+ }
+
+ cflags = get_clone_mask(item->ids, ids);
+
+ if (cflags & CLONE_FILES) {
+ int ret;
+
+ /*
+ * There might be a case when kIDs for
+ * root task are the same as in root_ids,
+ * thus it's image corruption and we should
+ * exit out.
+ */
+ if (unlikely(!item->parent)) {
+ pr_err("Image corruption on kIDs data\n");
+ return -1;
+ }
+
+ ret = shared_fdt_prepare(item);
+ if (ret)
+ return ret;
+ }
+
+set_mask:
+ /* Reached both by falling through and by the goto for a root without ids */
+ rsti(item)->clone_flags = cflags;
+ if (parent)
+ /*
+ * Mount namespaces are setns()-ed at
+ * restore_task_mnt_ns() explicitly,
+ * no need in creating it with its own
+ * temporary namespace.
+ *
+ * Root task is exceptional -- it will
+ * be born in a fresh new mount namespace
+ * which will be populated with all other
+ * namespaces' entries.
+ */
+ rsti(item)->clone_flags &= ~CLONE_NEWNS;
+
+ /* Only the namespace bits matter for the checks below */
+ cflags &= CLONE_ALLNS;
+
+ if (item == root_item) {
+ pr_info("Will restore in %lx namespaces\n", cflags);
+ root_ns_mask = cflags;
+ } else if (cflags & ~(root_ns_mask & CLONE_SUBNS)) {
+ /*
+ * Namespaces from CLONE_SUBNS can be nested, but in
+ * this case nobody can share external namespaces of
+ * these types.
+ *
+ * Workaround for all other namespaces --
+ * all tasks should be in one namespace. And
+ * this namespace is either inherited from the
+ * criu or is created for the init task (only)
+ */
+ pr_err("Can't restore sub-task in NS\n");
+ return -1;
+ }
+ }
+
+ pr_debug("NS mask to use %lx\n", root_ns_mask);
+ return 0;
+}
+
+/*
+ * Read the process tree from the image and get it ready for restore.
+ * The steps run in order and the first non-zero result is returned
+ * as is; 0 means everything went fine.
+ */
+int prepare_pstree(void)
+{
+	int ret;
+
+	ret = read_pstree_image();
+	if (ret)
+		return ret;
+
+	/*
+	 * Shell job may inherit sid/pgid from the current
+	 * shell, not from image. Set things up for this.
+	 */
+	ret = prepare_pstree_for_shell_job();
+	if (ret)
+		return ret;
+
+	/*
+	 * Walk the collected tree and prepare for restoring
+	 * of shared objects at clone time
+	 */
+	ret = prepare_pstree_kobj_ids();
+	if (ret)
+		return ret;
+
+	/*
+	 * Session/Group leaders might be dead. Need to fix
+	 * pstree with properly injected helper tasks.
+	 */
+	return prepare_pstree_ids();
+}
+
+/*
+ * Check whether the session @child was born in (its born_sid, or its
+ * sid when born_sid is not set) is the one its parent was born in.
+ */
+bool restore_before_setsid(struct pstree_item *child)
+{
+	int csid = child->born_sid;
+
+	if (csid == -1)
+		csid = child->sid;
+
+	return child->parent->born_sid == csid;
+}
+
+/* Find the item with the given virtual pid, NULL if there's none */
+struct pstree_item *pstree_item_by_virt(pid_t virt)
+{
+	struct pstree_item *pi;
+
+	for_each_pstree_item(pi)
+		if (pi->pid.virt == virt)
+			break;
+
+	/* the iterator is left NULL once the tree is exhausted */
+	return pi;
+}
+
+/* Find the item with the given real pid, NULL if there's none */
+struct pstree_item *pstree_item_by_real(pid_t real)
+{
+	struct pstree_item *pi;
+
+	for_each_pstree_item(pi)
+		if (pi->pid.real == real)
+			break;
+
+	/* the iterator is left NULL once the tree is exhausted */
+	return pi;
+}
+
+/* Map a real pid to the virtual one; 0 if the task is not in the tree */
+int pid_to_virt(pid_t real)
+{
+	struct pstree_item *found = pstree_item_by_real(real);
+
+	return found ? found->pid.virt : 0;
+}
+
+/* Tell whether a task with the real pid @pid is in the tree */
+bool pid_in_pstree(pid_t pid)
+{
+	if (pstree_item_by_real(pid))
+		return true;
+
+	return false;
+}
diff --git a/criu/ptrace.c b/criu/ptrace.c
new file mode 100644
index 000000000000..25970fc4eb57
--- /dev/null
+++ b/criu/ptrace.c
@@ -0,0 +1,331 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdarg.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <stdbool.h>
+#include <limits.h>
+#include <signal.h>
+
+#include <sys/ptrace.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <sys/wait.h>
+
+#include "compiler.h"
+#include "asm/types.h"
+#include "util.h"
+#include "ptrace.h"
+#include "proc_parse.h"
+#include "crtools.h"
+#include "seccomp.h"
+#include "cr_options.h"
+
+/*
+ * Let go of the seized task @pid, leaving it in the state @st.
+ *
+ * @orig_st is the state the task was found in. A TASK_DEAD target is
+ * killed and 0 is returned right away; otherwise SIGSTOP is sent when
+ * needed and the task is detached.
+ *
+ * Returns 0 on success, -1 if detaching fails.
+ */
+int unseize_task(pid_t pid, int orig_st, int st)
+{
+	bool need_stop = false;
+
+	pr_debug("\tUnseizing %d into %d\n", pid, st);
+
+	if (st == TASK_DEAD) {
+		kill(pid, SIGKILL);
+		return 0;
+	}
+
+	if (st == TASK_STOPPED) {
+		/*
+		 * Task might have had STOP in queue. We detected such
+		 * guy as TASK_STOPPED, but cleared signal to run the
+		 * parasite code. Thus after detach the task will become
+		 * running. That said -- STOP everyone regardless of
+		 * the initial state.
+		 */
+		need_stop = true;
+	} else if (st == TASK_ALIVE) {
+		/*
+		 * Same as in the comment above -- there might be a
+		 * task with STOP in queue that would get lost after
+		 * detach, so stop it again.
+		 */
+		need_stop = (orig_st == TASK_STOPPED);
+	} else
+		pr_err("Unknown final state %d\n", st);
+
+	if (need_stop)
+		kill(pid, SIGSTOP);
+
+	if (ptrace(PTRACE_DETACH, pid, NULL, NULL)) {
+		pr_perror("Unable to detach from %d", pid);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Set the PTRACE_O_SUSPEND_SECCOMP option on the traced task @pid.
+ * Returns 0 on success, -1 on error.
+ */
+int suspend_seccomp(pid_t pid)
+{
+	long ret;
+
+	ret = ptrace(PTRACE_SETOPTIONS, pid, NULL, PTRACE_O_SUSPEND_SECCOMP);
+	if (ret >= 0)
+		return 0;
+
+	pr_perror("suspending seccomp failed");
+	return -1;
+}
+
+/*
+ * Seize the task @pid and ask it to stop (PTRACE_SEIZE followed by
+ * PTRACE_INTERRUPT). If the interrupt fails the task is detached
+ * again.
+ *
+ * Returns 0 on success, the non-zero ptrace() result otherwise.
+ */
+int seize_catch_task(pid_t pid)
+{
+	int ret;
+
+	ret = ptrace(PTRACE_SEIZE, pid, NULL, 0);
+	if (ret) {
+		/*
+		 * ptrace API doesn't allow to distinguish
+		 * attaching to zombie from other errors.
+		 * All errors will be handled in seize_wait_task().
+		 */
+		pr_warn("Unable to interrupt task: %d (%s)\n", pid, strerror(errno));
+		return ret;
+	}
+
+	/*
+	 * If we SEIZE-d the task stop it before going
+	 * and reading its stat from proc. Otherwise task
+	 * may die _while_ we're doing it and we'll have
+	 * inconsistent seize/state pair.
+	 *
+	 * If task dies after we seize it but before we
+	 * do this interrupt, we'll notice it via proc.
+	 */
+	ret = ptrace(PTRACE_INTERRUPT, pid, NULL, NULL);
+	if (ret < 0) {
+		/* the message used to lack the trailing newline */
+		pr_warn("SEIZE %d: can't interrupt task: %s\n", pid, strerror(errno));
+		if (ptrace(PTRACE_DETACH, pid, NULL, NULL))
+			pr_perror("Unable to detach from %d", pid);
+	}
+
+	return ret;
+}
+
+/*
+ * Resume the traced task @pid and wait for it to stop again, doing so
+ * @nr_signals times in a row.
+ *
+ * Returns 0 on success, -1 if the task can't be continued or waited
+ * for, or if it is found not stopped.
+ */
+static int skip_sigstop(int pid, int nr_signals)
+{
+	int left, status;
+
+	/*
+	 * 1) SIGSTOP is queued, but isn't handled yet:
+	 * SIGSTOP can't be blocked, so we need to wait when the kernel
+	 * handles this signal.
+	 *
+	 * Otherwise the process will be stopped immediately after
+	 * starting it.
+	 *
+	 * 2) A seized task was stopped:
+	 * PTRACE_SEIZE doesn't affect signal or group stop state.
+	 * Currently ptrace reported that task is in stopped state.
+	 * We need to start task again, and it will be trapped
+	 * immediately, because we sent PTRACE_INTERRUPT to it.
+	 */
+	for (left = nr_signals; left > 0; left--) {
+		if (ptrace(PTRACE_CONT, pid, 0, 0)) {
+			pr_perror("Unable to start process");
+			return -1;
+		}
+
+		if (wait4(pid, &status, __WALL, NULL) < 0) {
+			pr_perror("SEIZE %d: can't wait task", pid);
+			return -1;
+		}
+
+		if (!WIFSTOPPED(status)) {
+			pr_err("SEIZE %d: task not stopped after seize\n", pid);
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * This routine seizes task putting it into a special
+ * state where we can manipulate the task via ptrace
+ * interface, and finally we can detach ptrace out
+ * of it so the task would not know if it was saddled
+ * up with someone else.
+ *
+ * @pid   - the task to wait for
+ * @ppid  - the expected parent pid, or -1 to skip the check
+ * @creds - in/out: when *creds is NULL the task's creds are stored
+ *          there in a newly allocated object, otherwise the task's
+ *          creds are checked against them with
+ *          proc_status_creds_dumpable()
+ *
+ * Returns TASK_ALIVE, TASK_STOPPED or TASK_DEAD on success and -1 on
+ * error.
+ */
+int seize_wait_task(pid_t pid, pid_t ppid, struct proc_status_creds **creds)
+{
+ siginfo_t si;
+ int status, nr_sigstop;
+ int ret = 0, ret2, wait_errno = 0;
+ struct proc_status_creds cr;
+
+ /*
+ * For the comparison below, let's zero out any padding.
+ */
+ memzero(&cr, sizeof(struct proc_status_creds));
+
+ /*
+ * It's ugly, but the ptrace API doesn't allow to distinguish
+ * attaching to zombie from other errors. Thus we have to parse
+ * the target's /proc/pid/stat. Sad, but parse whatever else
+ * we might need at that early point.
+ */
+
+ processes_to_wait--;
+try_again:
+
+ ret = wait4(pid, &status, __WALL, NULL);
+ if (ret < 0) {
+ /*
+ * wait4() can expectedly fail only in a first time
+ * if a task is zombie. If we are here from try_again,
+ * this means that we are tracing this task.
+ *
+ * processes_to_wait should be decremented only once in this
+ * function if a first wait was success.
+ */
+ processes_to_wait++;
+ wait_errno = errno;
+ }
+
+ ret2 = parse_pid_status(pid, &cr);
+ if (ret2)
+ goto err;
+
+ /* status is only looked at when wait4() succeeded */
+ if (ret < 0 || WIFEXITED(status) || WIFSIGNALED(status)) {
+ if (cr.state != 'Z') {
+ if (pid == getpid())
+ pr_err("The criu itself is within dumped tree.\n");
+ else
+ pr_err("Unseizable non-zombie %d found, state %c, err %d/%d\n",
+ pid, cr.state, ret, wait_errno);
+ return -1;
+ }
+
+ return TASK_DEAD;
+ }
+
+ if ((ppid != -1) && (cr.ppid != ppid)) {
+ pr_err("Task pid reused while suspending (%d: %d -> %d)\n",
+ pid, ppid, cr.ppid);
+ goto err;
+ }
+
+ if (!WIFSTOPPED(status)) {
+ pr_err("SEIZE %d: task not stopped after seize\n", pid);
+ goto err;
+ }
+
+ ret = ptrace(PTRACE_GETSIGINFO, pid, NULL, &si);
+ if (ret < 0) {
+ pr_perror("SEIZE %d: can't read signfo", pid);
+ goto err;
+ }
+
+ if (SI_EVENT(si.si_code) != PTRACE_EVENT_STOP) {
+ /*
+ * Kernel notifies us about the task being seized received some
+ * event other than the STOP, i.e. -- a signal. Let the task
+ * handle one and repeat.
+ */
+
+ if (ptrace(PTRACE_CONT, pid, NULL,
+ (void *)(unsigned long)si.si_signo)) {
+ pr_perror("Can't continue signal handling, aborting");
+ goto err;
+ }
+
+ ret = 0;
+ goto try_again;
+ }
+
+ if (*creds == NULL) {
+ *creds = xzalloc(sizeof(struct proc_status_creds));
+ if (!*creds)
+ goto err;
+
+ **creds = cr;
+
+ } else if (!proc_status_creds_dumpable(*creds, &cr)) {
+ pr_err("creds don't match %d %d\n", pid, ppid);
+ goto err;
+ }
+
+ if (cr.seccomp_mode != SECCOMP_MODE_DISABLED && suspend_seccomp(pid) < 0)
+ goto err;
+
+ /*
+ * Count the stops to be consumed by skip_sigstop(): a SIGSTOP
+ * pending in each of the two masks plus the one being reported
+ * right now.
+ */
+ nr_sigstop = 0;
+ if (cr.sigpnd & (1 << (SIGSTOP - 1)))
+ nr_sigstop++;
+ if (cr.shdpnd & (1 << (SIGSTOP - 1)))
+ nr_sigstop++;
+ if (si.si_signo == SIGSTOP)
+ nr_sigstop++;
+
+ if (nr_sigstop) {
+ if (skip_sigstop(pid, nr_sigstop))
+ goto err_stop;
+
+ return TASK_STOPPED;
+ }
+
+ if (si.si_signo == SIGTRAP)
+ return TASK_ALIVE;
+ else {
+ pr_err("SEIZE %d: unsupported stop signal %d\n", pid, si.si_signo);
+ goto err;
+ }
+
+err_stop:
+ /* send a SIGSTOP again before detaching, the task was stopped when seized */
+ kill(pid, SIGSTOP);
+err:
+ if (ptrace(PTRACE_DETACH, pid, NULL, NULL))
+ pr_perror("Unable to detach from %d", pid);
+ return -1;
+}
+
+/*
+ * Copy @bytes bytes from address @addr of the traced task @pid into
+ * the local buffer @dst, one word at a time with PTRACE_PEEKDATA.
+ *
+ * Returns 0 on success, -1 if @bytes is not a multiple of
+ * sizeof(long) and -2 if a peek fails.
+ */
+int ptrace_peek_area(pid_t pid, void *dst, void *addr, long bytes)
+{
+	unsigned long *d = dst, *a = addr;
+	unsigned long w;
+
+	if (bytes & (sizeof(long) - 1))
+		return -1;
+
+	for (w = 0; w < bytes / sizeof(long); w++) {
+		/*
+		 * A peeked word may legitimately be all ones, so errno
+		 * is the only reliable error indicator: clear it before
+		 * the call. And compare against -1UL -- an unsigned
+		 * long holding -1 never equals the 32-bit -1U when
+		 * long is 64 bits wide.
+		 */
+		errno = 0;
+		d[w] = ptrace(PTRACE_PEEKDATA, pid, a + w, NULL);
+		if (d[w] == -1UL && errno)
+			return -2;
+	}
+
+	return 0;
+}
+
+/*
+ * Copy @bytes bytes from the local buffer @src to address @addr of
+ * the traced task @pid, one word at a time with PTRACE_POKEDATA.
+ *
+ * Returns 0 on success, -1 if @bytes is not a multiple of
+ * sizeof(long) and -2 if a poke fails.
+ */
+int ptrace_poke_area(pid_t pid, void *src, void *addr, long bytes)
+{
+	unsigned long *from = src, *to = addr;
+	unsigned long idx, nr_words;
+
+	if (bytes & (sizeof(long) - 1))
+		return -1;
+
+	nr_words = bytes / sizeof(long);
+	for (idx = 0; idx < nr_words; idx++)
+		if (ptrace(PTRACE_POKEDATA, pid, to + idx, from[idx]))
+			return -2;
+
+	return 0;
+}
+
+/*
+ * Exchange @bytes bytes between the local buffer @src and the area at
+ * @dst in the traced task @pid: the task gets what was in @src, @src
+ * gets what the task had.
+ *
+ * Don't swap big space, it might overflow the stack -- the temporary
+ * copy is made with alloca().
+ *
+ * Returns 0 on success, -1 if peeking or poking fails, and -2 if
+ * writing the original content back fails as well.
+ */
+int ptrace_swap_area(pid_t pid, void *dst, void *src, long bytes)
+{
+	void *saved = alloca(bytes);
+
+	if (ptrace_peek_area(pid, saved, dst, bytes))
+		return -1;
+
+	if (!ptrace_poke_area(pid, src, dst, bytes)) {
+		memcpy(src, saved, bytes);
+		return 0;
+	}
+
+	/* The poke failed half-way: try to put the original content back */
+	return ptrace_poke_area(pid, saved, dst, bytes) ? -2 : -1;
+}
diff --git a/criu/rbtree.c b/criu/rbtree.c
new file mode 100644
index 000000000000..64a38ea76a48
--- /dev/null
+++ b/criu/rbtree.c
@@ -0,0 +1,357 @@
+/*
+ * RBtree implementation adopted from the Linux kernel sources.
+ */
+
+#include <sys/types.h>
+#include "rbtree.h"
+
+/*
+ * Left-rotate the subtree at @node: its right child takes @node's place
+ * under @node's parent (or becomes the tree root in @root when there is
+ * no parent), @node becomes that child's left child, and the child's
+ * former left subtree is re-hung as @node's right subtree.
+ */
+static void __rb_rotate_left(struct rb_node *node, struct rb_root *root)
+{
+ struct rb_node *right = node->rb_right;
+ struct rb_node *parent = rb_parent(node);
+
+ /* the right child's left subtree moves under node */
+ node->rb_right = right->rb_left;
+ if (node->rb_right)
+ rb_set_parent(right->rb_left, node);
+ right->rb_left = node;
+
+ rb_set_parent(right, parent);
+
+ /* hook the promoted child where node used to hang */
+ if (parent) {
+ if (node == parent->rb_left)
+ parent->rb_left = right;
+ else
+ parent->rb_right = right;
+ } else
+ root->rb_node = right;
+ rb_set_parent(node, right);
+}
+
+/*
+ * Right-rotate the subtree at @node: its left child takes @node's place
+ * under @node's parent (or becomes the tree root in @root when there is
+ * no parent), @node becomes that child's right child, and the child's
+ * former right subtree is re-hung as @node's left subtree.
+ */
+static void __rb_rotate_right(struct rb_node *node, struct rb_root *root)
+{
+ struct rb_node *left = node->rb_left;
+ struct rb_node *parent = rb_parent(node);
+
+ /* the left child's right subtree moves under node */
+ node->rb_left = left->rb_right;
+ if (node->rb_left)
+ rb_set_parent(left->rb_right, node);
+ left->rb_right = node;
+
+ rb_set_parent(left, parent);
+
+ /* hook the promoted child where node used to hang */
+ if (parent) {
+ if (node == parent->rb_right)
+ parent->rb_right = left;
+ else
+ parent->rb_left = left;
+ } else
+ root->rb_node = left;
+ rb_set_parent(node, left);
+}
+
+/*
+ * Recolour and rotate the tree after @node has been linked in.
+ *
+ * The loop runs for as long as @node has a red parent. If the uncle is
+ * red as well, parent and uncle are painted black, the grandparent red,
+ * and the walk continues from the grandparent. Otherwise one or two
+ * rotations around parent/grandparent are performed. The root is always
+ * painted black on the way out.
+ */
+void rb_insert_color(struct rb_node *node, struct rb_root *root)
+{
+ struct rb_node *parent, *gparent;
+
+ while ((parent = rb_parent(node)) && rb_is_red(parent)) {
+ gparent = rb_parent(parent);
+
+ if (parent == gparent->rb_left) {
+ {
+ register struct rb_node *uncle = gparent->rb_right;
+ if (uncle && rb_is_red(uncle)) {
+ rb_set_black(uncle);
+ rb_set_black(parent);
+ rb_set_red(gparent);
+ node = gparent;
+ continue;
+ }
+ }
+
+ /* node is an inner grandchild: rotate it to the outside first */
+ if (parent->rb_right == node) {
+ register struct rb_node *tmp;
+ __rb_rotate_left(parent, root);
+ tmp = parent;
+ parent = node;
+ node = tmp;
+ }
+
+ rb_set_black(parent);
+ rb_set_red(gparent);
+ __rb_rotate_right(gparent, root);
+ } else {
+ /* mirror image of the branch above */
+ {
+ register struct rb_node *uncle = gparent->rb_left;
+ if (uncle && rb_is_red(uncle)) {
+ rb_set_black(uncle);
+ rb_set_black(parent);
+ rb_set_red(gparent);
+ node = gparent;
+ continue;
+ }
+ }
+
+ if (parent->rb_left == node) {
+ register struct rb_node *tmp;
+ __rb_rotate_right(parent, root);
+ tmp = parent;
+ parent = node;
+ node = tmp;
+ }
+
+ rb_set_black(parent);
+ rb_set_red(gparent);
+ __rb_rotate_left(gparent, root);
+ }
+ }
+
+ rb_set_black(root->rb_node);
+}
+
+/*
+ * Rebalance the tree after rb_erase() has unlinked a black node.
+ *
+ * @node is the child that took the removed position (may be NULL) and
+ * @parent is its parent. The loop runs while @node is NULL or black and
+ * is not the root; each pass inspects the sibling ("other") and either
+ * recolours and moves one level up, or rotates and terminates. On exit
+ * @node, if there is one, is painted black.
+ */
+static void __rb_erase_color(struct rb_node *node, struct rb_node *parent,
+ struct rb_root *root)
+{
+ struct rb_node *other;
+
+ while ((!node || rb_is_black(node)) && node != root->rb_node) {
+ if (parent->rb_left == node) {
+ other = parent->rb_right;
+ /* red sibling: rotate so that the sibling becomes black */
+ if (rb_is_red(other)) {
+ rb_set_black(other);
+ rb_set_red(parent);
+ __rb_rotate_left(parent, root);
+ other = parent->rb_right;
+ }
+ /* sibling has no red child: recolour and go one level up */
+ if ((!other->rb_left || rb_is_black(other->rb_left)) &&
+ (!other->rb_right || rb_is_black(other->rb_right))) {
+ rb_set_red(other);
+ node = parent;
+ parent = rb_parent(node);
+ } else {
+ if (!other->rb_right || rb_is_black(other->rb_right)) {
+ rb_set_black(other->rb_left);
+ rb_set_red(other);
+ __rb_rotate_right(other, root);
+ other = parent->rb_right;
+ }
+ rb_set_color(other, rb_color(parent));
+ rb_set_black(parent);
+ rb_set_black(other->rb_right);
+ __rb_rotate_left(parent, root);
+ /* done: point node at the root so the loop condition ends */
+ node = root->rb_node;
+ break;
+ }
+ } else {
+ /* mirror image of the branch above */
+ other = parent->rb_left;
+ if (rb_is_red(other)) {
+ rb_set_black(other);
+ rb_set_red(parent);
+ __rb_rotate_right(parent, root);
+ other = parent->rb_left;
+ }
+ if ((!other->rb_left || rb_is_black(other->rb_left)) &&
+ (!other->rb_right || rb_is_black(other->rb_right))) {
+ rb_set_red(other);
+ node = parent;
+ parent = rb_parent(node);
+ } else {
+ if (!other->rb_left || rb_is_black(other->rb_left)) {
+ rb_set_black(other->rb_right);
+ rb_set_red(other);
+ __rb_rotate_left(other, root);
+ other = parent->rb_left;
+ }
+ rb_set_color(other, rb_color(parent));
+ rb_set_black(parent);
+ rb_set_black(other->rb_left);
+ __rb_rotate_right(parent, root);
+ node = root->rb_node;
+ break;
+ }
+ }
+ }
+
+ if (node)
+ rb_set_black(node);
+}
+
+/*
+ * Unlink @node from the tree rooted at @root.
+ *
+ * A node with at most one child is replaced by that child (possibly
+ * NULL). A node with two children is replaced by the leftmost node of
+ * its right subtree, which takes over the victim's links and its
+ * parent/colour word. When the colour recorded for the vacated position
+ * is black, __rb_erase_color() is called to rebalance.
+ */
+void rb_erase(struct rb_node *node, struct rb_root *root)
+{
+ struct rb_node *child, *parent;
+ int color;
+
+ if (!node->rb_left)
+ child = node->rb_right;
+ else if (!node->rb_right)
+ child = node->rb_left;
+ else {
+ struct rb_node *old = node, *left;
+
+ /* from here on 'node' is the successor, 'old' is the victim */
+ node = node->rb_right;
+ while ((left = node->rb_left))
+ node = left;
+
+ if (rb_parent(old)) {
+ if (rb_parent(old)->rb_left == old)
+ rb_parent(old)->rb_left = node;
+ else
+ rb_parent(old)->rb_right = node;
+ } else
+ root->rb_node = node;
+
+ /* remember the successor's own position before it is moved */
+ child = node->rb_right;
+ parent = rb_parent(node);
+ color = rb_color(node);
+
+ if (parent == old) {
+ parent = node;
+ } else {
+ if (child)
+ rb_set_parent(child, parent);
+ parent->rb_left = child;
+
+ node->rb_right = old->rb_right;
+ rb_set_parent(old->rb_right, node);
+ }
+
+ /* the successor inherits the victim's parent, colour and left subtree */
+ node->rb_parent_color = old->rb_parent_color;
+ node->rb_left = old->rb_left;
+ rb_set_parent(old->rb_left, node);
+
+ goto color;
+ }
+
+ parent = rb_parent(node);
+ color = rb_color(node);
+
+ if (child)
+ rb_set_parent(child, parent);
+
+ if (parent) {
+ if (parent->rb_left == node)
+ parent->rb_left = child;
+ else
+ parent->rb_right = child;
+ } else
+ root->rb_node = child;
+
+color:
+ if (color == RB_BLACK)
+ __rb_erase_color(child, parent, root);
+}
+
+/*
+ * Return the first node (in sort order) of the tree, i.e. the leftmost
+ * one, or NULL when the tree is empty.
+ */
+struct rb_node *rb_first(const struct rb_root *root)
+{
+	struct rb_node *cur = root->rb_node;
+
+	if (cur)
+		while (cur->rb_left)
+			cur = cur->rb_left;
+
+	return cur;
+}
+
+/*
+ * Return the rightmost node of the tree, or NULL when the tree is
+ * empty.
+ */
+struct rb_node *rb_last(const struct rb_root *root)
+{
+	struct rb_node *cur = root->rb_node;
+
+	if (cur)
+		while (cur->rb_right)
+			cur = cur->rb_right;
+
+	return cur;
+}
+
+/*
+ * Return the in-order successor of @node, or NULL when there is none.
+ * A node whose parent pointer refers to itself yields NULL as well.
+ */
+struct rb_node *rb_next(const struct rb_node *node)
+{
+	struct rb_node *up;
+
+	if (rb_parent(node) == node)
+		return NULL;
+
+	/*
+	 * With a right-hand child the successor is the leftmost node of
+	 * the right subtree.
+	 */
+	if (node->rb_right) {
+		for (node = node->rb_right; node->rb_left; node = node->rb_left)
+			;
+		return (struct rb_node *)node;
+	}
+
+	/*
+	 * Otherwise climb for as long as we arrive from a right-hand
+	 * child; the first ancestor reached from its left-hand child (or
+	 * NULL past the root) is the answer.
+	 */
+	for (up = rb_parent(node); up && node == up->rb_right; up = rb_parent(node))
+		node = up;
+
+	return up;
+}
+
+/*
+ * Return the in-order predecessor of @node, or NULL when there is none.
+ * A node whose parent pointer refers to itself yields NULL as well.
+ */
+struct rb_node *rb_prev(const struct rb_node *node)
+{
+	struct rb_node *up;
+
+	if (rb_parent(node) == node)
+		return NULL;
+
+	/*
+	 * With a left-hand child the predecessor is the rightmost node of
+	 * the left subtree.
+	 */
+	if (node->rb_left) {
+		for (node = node->rb_left; node->rb_right; node = node->rb_right)
+			;
+		return (struct rb_node *)node;
+	}
+
+	/*
+	 * Otherwise climb for as long as we arrive from a left-hand
+	 * child; the first ancestor reached from its right-hand child (or
+	 * NULL past the root) is the answer.
+	 */
+	for (up = rb_parent(node); up && node == up->rb_left; up = rb_parent(node))
+		node = up;
+
+	return up;
+}
+
+/*
+ * Put @new into the exact position of @victim: the parent (or @root)
+ * and both children are re-pointed at @new, and @new receives a copy
+ * of @victim's links and colour. No rebalancing is performed.
+ */
+void rb_replace_node(struct rb_node *victim,
+		     struct rb_node *new,
+		     struct rb_root *root)
+{
+	struct rb_node *up = rb_parent(victim);
+
+	/* Downward link: from the parent, or from the root handle */
+	if (!up)
+		root->rb_node = new;
+	else if (victim == up->rb_left)
+		up->rb_left = new;
+	else
+		up->rb_right = new;
+
+	/* Upward links: both children now report to the replacement */
+	if (victim->rb_left)
+		rb_set_parent(victim->rb_left, new);
+	if (victim->rb_right)
+		rb_set_parent(victim->rb_right, new);
+
+	/* Finally take over the victim's pointers and colour */
+	*new = *victim;
+}
diff --git a/criu/rst-malloc.c b/criu/rst-malloc.c
new file mode 100644
index 000000000000..d39499729112
--- /dev/null
+++ b/criu/rst-malloc.c
@@ -0,0 +1,223 @@
+#include <stdio.h>
+#include <stdbool.h>
+#include <sys/mman.h>
+
+#include "rst-malloc.h"
+#include "bug.h"
+#include "asm/types.h"
+
+/*
+ * Bookkeeping for one kind of restore-time memory arena.
+ */
+struct rst_mem_type_s {
+ /* may be used with rst_mem_align_cpos()/rst_mem_remap_ptr()/rst_mem_remap_one() */
+ bool remapable;
+ /* allocations from this arena are currently allowed */
+ bool enabled;
+ /* bytes still available past free_mem */
+ unsigned long free_bytes;
+ /* where the next allocation will be placed */
+ void *free_mem;
+ /* obtains at least 'size' more bytes; returns 0 on success */
+ int (*grow)(struct rst_mem_type_s *, unsigned long size);
+ /* size of the most recent allocation, consumed by rst_mem_free_last() */
+ unsigned long last;
+
+ /* start and total size of the buffer; maintained by grow_remap() only */
+ void *buf;
+ unsigned long size;
+};
+
+/*
+ * Work out how many bytes an arena should grow by to satisfy a request
+ * of @need_size bytes: the request rounded up to whole pages, but never
+ * less than a batch of two pages.
+ */
+static inline unsigned long rst_mem_grow(unsigned long need_size)
+{
+	int batch = 2 * page_size();
+	unsigned long rounded = round_up(need_size, page_size());
+
+	if (likely(rounded < batch))
+		return batch;
+
+	pr_debug("Growing rst memory %lu pages\n", rounded / page_size());
+	return rounded;
+}
+
+/*
+ * Grow callback for RM_SHARED: map a fresh shared anonymous chunk big
+ * enough for @size bytes and make it the current allocation area.
+ *
+ * Returns 0 on success, -1 when mmap() fails.
+ */
+static int grow_shared(struct rst_mem_type_s *t, unsigned long size)
+{
+	unsigned long chunk_size = rst_mem_grow(size);
+	void *chunk;
+
+	/*
+	 * This arena never gets remapped into the restorer, so the
+	 * location of the previous chunk can simply be forgotten and a
+	 * brand new one started.
+	 */
+	chunk = mmap(NULL, chunk_size, PROT_READ | PROT_WRITE,
+		     MAP_SHARED | MAP_ANON, 0, 0);
+	if (chunk == MAP_FAILED)
+		return -1;
+
+	t->last = 0;
+	t->free_bytes = chunk_size;
+	t->free_mem = chunk;
+
+	return 0;
+}
+
+/*
+ * Common grow routine for the remappable arenas: extend the single
+ * linear buffer of @t by enough pages for @size bytes. @flag is the
+ * MAP_SHARED/MAP_PRIVATE bit used when the buffer is first created.
+ *
+ * Returns 0 on success, -1 when the mapping call fails.
+ */
+static int grow_remap(struct rst_mem_type_s *t, int flag, unsigned long size)
+{
+	unsigned long delta = rst_mem_grow(size);
+	void *region;
+
+	if (t->buf)
+		/*
+		 * All objects will have to be remapped into the restorer
+		 * address space and their new addresses obtained. As many
+		 * objects are allocated as one linear array, it is simpler
+		 * to grow the buffer and let callers find out the new
+		 * array addresses than to allocate a completely new one
+		 * and force callers to use objects' cpos-s.
+		 */
+		region = mremap(t->buf, t->size,
+				t->size + delta, MREMAP_MAYMOVE);
+	else
+		/*
+		 * mremap can't be called with a NULL address, so the very
+		 * first chunk has to come from mmap.
+		 */
+		region = mmap(NULL, delta, PROT_READ | PROT_WRITE,
+			      flag | MAP_ANON, 0, 0);
+	if (region == MAP_FAILED)
+		return -1;
+
+	/* the buffer may have moved: shift the allocation cursor with it */
+	t->free_mem += (region - t->buf);
+	t->free_bytes += delta;
+	t->size += delta;
+	t->buf = region;
+
+	return 0;
+}
+
+/* Grow callback for RM_SHREMAP: grow_remap() with a MAP_SHARED buffer. */
+static int grow_shremap(struct rst_mem_type_s *t, unsigned long size)
+{
+ return grow_remap(t, MAP_SHARED, size);
+}
+
+/* Grow callback for RM_PRIVATE: grow_remap() with a MAP_PRIVATE buffer. */
+static int grow_private(struct rst_mem_type_s *t, unsigned long size)
+{
+ return grow_remap(t, MAP_PRIVATE, size);
+}
+
+/*
+ * The arenas, indexed by RM_* type. The two shared arenas start out
+ * enabled and the private one disabled; rst_mem_switch_to_private()
+ * flips that.
+ */
+static struct rst_mem_type_s rst_mems[RST_MEM_TYPES] = {
+ [RM_SHARED] = {
+ .grow = grow_shared,
+ .remapable = false,
+ .enabled = true,
+ },
+ [RM_SHREMAP] = {
+ .grow = grow_shremap,
+ .remapable = true,
+ .enabled = true,
+ },
+ [RM_PRIVATE] = {
+ .grow = grow_private,
+ .remapable = true,
+ .enabled = false,
+ },
+};
+
+/*
+ * Close both shared arenas for allocation and open the private one.
+ */
+void rst_mem_switch_to_private(void)
+{
+	struct rst_mem_type_s *shared = &rst_mems[RM_SHARED];
+	struct rst_mem_type_s *shremap = &rst_mems[RM_SHREMAP];
+	struct rst_mem_type_s *private = &rst_mems[RM_PRIVATE];
+
+	private->enabled = true;
+	shremap->enabled = false;
+	shared->enabled = false;
+}
+
+/*
+ * Round the allocation cursor of arena @type up to pointer alignment
+ * and return its offset from the start of the arena buffer. The arena
+ * must be remappable and enabled.
+ */
+unsigned long rst_mem_align_cpos(int type)
+{
+	struct rst_mem_type_s *t = &rst_mems[type];
+	unsigned long aligned;
+
+	BUG_ON(!(t->remapable && t->enabled));
+
+	aligned = round_up((unsigned long)t->free_mem, sizeof(void *));
+	t->free_mem = (void *) aligned;
+
+	return t->free_mem - t->buf;
+}
+
+/*
+ * Translate offset @pos inside the remappable arena @type into a
+ * pointer based on the arena's current buffer address.
+ */
+void *rst_mem_remap_ptr(unsigned long pos, int type)
+{
+	BUG_ON(!rst_mems[type].remapable);
+
+	return rst_mems[type].buf + pos;
+}
+
+/*
+ * Carve @size bytes out of the enabled arena @type, growing the arena
+ * first when it doesn't have enough room left.
+ *
+ * Returns the start of the new object, or NULL when growing failed.
+ */
+void *rst_mem_alloc(unsigned long size, int type)
+{
+	struct rst_mem_type_s *t = &rst_mems[type];
+	void *chunk;
+
+	BUG_ON(!t->enabled);
+
+	if (t->free_bytes < size) {
+		if (t->grow(t, size)) {
+			pr_perror("Can't grow rst mem");
+			return NULL;
+		}
+	}
+
+	chunk = t->free_mem;
+
+	/* advance the cursor and remember the size for rst_mem_free_last() */
+	t->last = size;
+	t->free_bytes -= size;
+	t->free_mem += size;
+
+	return chunk;
+}
+
+/*
+ * Give back the most recent allocation made from the enabled arena
+ * @type. Calling it again without a new allocation changes nothing.
+ */
+void rst_mem_free_last(int type)
+{
+	struct rst_mem_type_s *t = &rst_mems[type];
+	unsigned long undo;
+
+	BUG_ON(!t->enabled);
+
+	undo = t->last;
+	t->last = 0; /* next free_last would be no-op */
+	t->free_bytes += undo;
+	t->free_mem -= undo;
+}
+
+/*
+ * Disable the private arena and return the combined size of the
+ * RM_PRIVATE and RM_SHREMAP buffers.
+ */
+unsigned long rst_mem_lock(void)
+{
+	struct rst_mem_type_s *private = &rst_mems[RM_PRIVATE];
+	struct rst_mem_type_s *shremap = &rst_mems[RM_SHREMAP];
+
+	/*
+	 * No further allocations from rst_mem are allowed: the bootstrap
+	 * area is about to be obtained and all the stuff remapped into
+	 * it. The SHREMAP and SHARED arenas should already have been
+	 * locked by rst_mem_switch_to_private().
+	 */
+	private->enabled = false;
+
+	return private->size + shremap->size;
+}
+
+/*
+ * Move the whole buffer of the remappable, already disabled arena @t
+ * to the fixed address @to.
+ *
+ * Returns 0 on success (including the case of an arena that was never
+ * used), -1 when mremap() fails.
+ */
+static int rst_mem_remap_one(struct rst_mem_type_s *t, void *to)
+{
+	void *moved;
+
+	BUG_ON(!t->remapable || t->enabled);
+
+	/*
+	 * No allocations happened from this buffer.
+	 * It's safe just to do nothing.
+	 */
+	if (t->buf == NULL)
+		return 0;
+
+	pr_debug("\tcall mremap(%p, %lu, %lu, MAYMOVE | FIXED, %p)\n",
+			t->buf, t->size, t->size, to);
+	moved = mremap(t->buf, t->size, t->size, MREMAP_MAYMOVE | MREMAP_FIXED, to);
+	if (moved != MAP_FAILED) {
+		t->buf = moved;
+		return 0;
+	}
+
+	pr_perror("Can't mremap rst mem");
+	return -1;
+}
+
+/*
+ * Remap the private arena to @to and the shared-remappable arena right
+ * behind it. Stops at, and returns, the first failure.
+ */
+int rst_mem_remap(void *to)
+{
+	int ret;
+
+	ret = rst_mem_remap_one(&rst_mems[RM_PRIVATE], to);
+	if (ret)
+		return ret;
+
+	return rst_mem_remap_one(&rst_mems[RM_SHREMAP],
+				 to + rst_mems[RM_PRIVATE].size);
+}
diff --git a/criu/seccomp.c b/criu/seccomp.c
new file mode 100644
index 000000000000..9fd545d677b6
--- /dev/null
+++ b/criu/seccomp.c
@@ -0,0 +1,272 @@
+#include <linux/filter.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#include "config.h"
+#include "imgset.h"
+#include "kcmp.h"
+#include "pstree.h"
+#include "ptrace.h"
+#include "proc_parse.h"
+#include "seccomp.h"
+#include "servicefd.h"
+#include "util.h"
+#include "rst-malloc.h"
+
+#include "protobuf.h"
+#include "protobuf/seccomp.pb-c.h"
+
+/* populated on dump during collect_seccomp_filters() */
+static int next_filter_id = 0;
+static struct seccomp_info **filters = NULL;
+
+/*
+ * Look for a filter equal to @filter among the ones already collected
+ * for @parent, walking the ->prev chain that starts at the parent's
+ * last_filter entry. Returns the matching entry or NULL.
+ *
+ * NOTE(review): the only caller passes @len as the value returned by
+ * PTRACE_SECCOMP_GET_FILTER and later stores
+ * len * sizeof(struct sock_filter) into filter.filter.len, so the
+ * length comparison below looks like it compares an instruction count
+ * with a byte count, and memcmp() covers only @len bytes -- confirm
+ * the intended unit.
+ * NOTE(review): @parent is dereferenced without a NULL check -- confirm
+ * it can't be reached for an item without a parent once filters is set.
+ */
+static struct seccomp_info *find_inherited(struct pstree_item *parent,
+ struct sock_filter *filter, int len)
+{
+ struct seccomp_info *info;
+
+ /* if we have no filters yet, this one has no parent */
+ if (!filters)
+ return NULL;
+
+ for (info = filters[dmpi(parent)->pi_creds->last_filter]; info; info = info->prev) {
+
+ if (len != info->filter.filter.len)
+ continue;
+ if (!memcmp(filter, info->filter.filter.data, len))
+ return info;
+ }
+
+ return NULL;
+}
+
+/*
+ * Collect the seccomp filters of one task (callback for the pstree
+ * traversal in collect_seccomp_filters()).
+ *
+ * Tasks that are dead or not in SECCOMP_MODE_FILTER are skipped. For
+ * the others the filters are fetched one by one with
+ * PTRACE_SECCOMP_GET_FILTER until the kernel answers ENOENT; each new
+ * filter is copied into a seccomp_info and chained through ->prev. The
+ * collected entries are then appended to the global filters[] array,
+ * given ids, and the task's last_filter is set to the id of the chain
+ * head.
+ *
+ * Returns 0 on success (or when there is nothing to do), -1 on error.
+ */
+static int collect_filter_for_pstree(struct pstree_item *item)
+{
+ struct seccomp_info *infos = NULL, *cursor;
+ int info_count, i, ret = -1;
+ struct sock_filter buf[BPF_MAXINSNS];
+ void *m;
+
+ if (item->state == TASK_DEAD ||
+ dmpi(item)->pi_creds->seccomp_mode != SECCOMP_MODE_FILTER)
+ return 0;
+
+ for (i = 0; true; i++) {
+ int len;
+ struct seccomp_info *info, *inherited = NULL;
+
+ len = ptrace(PTRACE_SECCOMP_GET_FILTER, item->pid.real, i, buf);
+ if (len < 0) {
+ if (errno == ENOENT) {
+ /* end of the search */
+ BUG_ON(i == 0);
+ goto save_infos;
+ } else if (errno == EINVAL) {
+ pr_err("dumping seccomp infos not supported\n");
+ goto out;
+ } else {
+ pr_perror("couldn't dump seccomp filter");
+ goto out;
+ }
+ }
+
+ /*
+ * NOTE(review): when a match is found and infos is still NULL the
+ * loop below never runs, so BUG_ON(!found) would fire; and
+ * 'infos = inherited' drops entries already collected for this
+ * task without freeing them -- confirm this branch is exercised
+ * and behaves as intended.
+ */
+ inherited = find_inherited(item->parent, buf, len);
+ if (inherited) {
+ bool found = false;
+
+ /* Small sanity check: if infos is already populated,
+ * we should have inherited that filter too. */
+ for (cursor = infos; cursor; cursor = cursor->prev) {
+ if (inherited->prev== cursor) {
+ found = true;
+ break;
+ }
+ }
+
+ BUG_ON(!found);
+
+ infos = inherited;
+ continue;
+ }
+
+ info = xmalloc(sizeof(*info));
+ if (!info)
+ goto out;
+ seccomp_filter__init(&info->filter);
+
+ /* len is an instruction count; the image stores the size in bytes */
+ info->filter.filter.len = len * sizeof(struct sock_filter);
+ info->filter.filter.data = xmalloc(info->filter.filter.len);
+ if (!info->filter.filter.data) {
+ xfree(info);
+ goto out;
+ }
+
+ memcpy(info->filter.filter.data, buf, info->filter.filter.len);
+
+ info->prev = infos;
+ infos = info;
+ }
+
+save_infos:
+ info_count = i;
+
+ m = xrealloc(filters, sizeof(*filters) * (next_filter_id + info_count));
+ if (!m)
+ goto out;
+ filters = m;
+
+ /* hand out ids from the top of the new range down, following ->prev */
+ for (cursor = infos, i = info_count + next_filter_id - 1;
+ i >= next_filter_id; i--) {
+ BUG_ON(!cursor);
+ cursor->id = i;
+ filters[i] = cursor;
+ cursor = cursor->prev;
+ }
+
+ next_filter_id += info_count;
+
+ dmpi(item)->pi_creds->last_filter = infos->id;
+
+ /* Don't free the part of the tree we just successfully acquired */
+ infos = NULL;
+ ret = 0;
+out:
+ while (infos) {
+ struct seccomp_info *freeme = infos;
+ infos = infos->prev;
+ xfree(freeme->filter.filter.data);
+ xfree(freeme);
+ }
+
+ return ret;
+}
+
+/*
+ * Write every collected filter into the seccomp image as a single
+ * SeccompEntry, then release the collected filters.
+ *
+ * Returns 0 when there is nothing to write, -1 when the pointer table
+ * can't be allocated, otherwise the result of pb_write_one().
+ */
+static int dump_seccomp_filters(void)
+{
+	SeccompEntry entry = SECCOMP_ENTRY__INIT;
+	int idx, res;
+
+	/* Without any collected filters no seccomp image is created at all. */
+	if (!next_filter_id)
+		return 0;
+
+	entry.seccomp_filters = xzalloc(sizeof(*entry.seccomp_filters) * next_filter_id);
+	if (entry.seccomp_filters == NULL)
+		return -1;
+
+	entry.n_seccomp_filters = next_filter_id;
+
+	for (idx = 0; idx < next_filter_id; idx++) {
+		struct seccomp_info *cur = filters[idx];
+		SeccompFilter *sf = &cur->filter;
+
+		entry.seccomp_filters[cur->id] = sf;
+		if (!cur->prev)
+			continue;
+
+		sf->has_prev = true;
+		sf->prev = cur->prev->id;
+	}
+
+	res = pb_write_one(img_from_set(glob_imgset, CR_FD_SECCOMP), &entry, PB_SECCOMP);
+
+	xfree(entry.seccomp_filters);
+
+	for (idx = 0; idx < next_filter_id; idx++) {
+		struct seccomp_info *done = filters[idx];
+
+		xfree(done->filter.filter.data);
+		xfree(done);
+	}
+	xfree(filters);
+
+	return res;
+}
+
+/*
+ * Collect the seccomp filters of every task in the process tree and
+ * write them to the image. Returns 0 on success, -1 on failure.
+ */
+int collect_seccomp_filters(void)
+{
+	if (preorder_pstree_traversal(root_item, collect_filter_for_pstree) < 0 ||
+	    dump_seccomp_filters())
+		return -1;
+
+	return 0;
+}
+
+/* Populated on restore by prepare_seccomp_filters */
+static SeccompEntry *se;
+
+/*
+ * Read the seccomp image into the file-scope 'se' entry.
+ *
+ * Returns -1 only when the image can't be opened. A non-positive read
+ * result is treated as "there were no filters" and reported as success.
+ */
+int prepare_seccomp_filters(void)
+{
+	struct cr_img *img = open_image(CR_FD_SECCOMP, O_RSTR);
+	int nread;
+
+	if (img == NULL)
+		return -1;
+
+	nread = pb_read_one_eof(img, &se, PB_SECCOMP);
+	close_image(img);
+
+	/* nread <= 0: there were no filters */
+	if (nread > 0)
+		BUG_ON(!se);
+
+	return 0;
+}
+
+/*
+ * Lay out the seccomp filter chain of the task described by @core in
+ * RM_PRIVATE restore memory.
+ *
+ * The chain starts at the filter indexed by core->tc->seccomp_filter
+ * and follows the ->prev indices. The allocated area holds *@count
+ * struct sock_fprog headers followed by the filter bodies in the same
+ * order; only the ->len field of each header is filled in here.
+ *
+ * @count receives the number of filters (0 when the task has none) and
+ * @pos the offset of the area inside the RM_PRIVATE arena.
+ *
+ * Returns 0 on success, -1 when the allocation fails. Unless the task
+ * has no filter, the unpacked seccomp entry is freed before returning.
+ */
+int seccomp_filters_get_rst_pos(CoreEntry *core, int *count, unsigned long *pos)
+{
+	SeccompFilter *sf = NULL;
+	struct sock_fprog *arr = NULL;
+	void *filter_data = NULL;
+	int ret = -1, i;
+	size_t filter_size = 0;
+
+	if (!core->tc->has_seccomp_filter) {
+		*count = 0;
+		return 0;
+	}
+
+	*count = 0;
+	*pos = rst_mem_align_cpos(RM_PRIVATE);
+
+	/*
+	 * The index must lie strictly below the number of entries (an
+	 * index equal to it is already out of bounds), and the entry
+	 * itself must have been loaded.
+	 */
+	BUG_ON(!se || core->tc->seccomp_filter >= se->n_seccomp_filters);
+	sf = se->seccomp_filters[core->tc->seccomp_filter];
+
+	/* First pass: count the filters and sum up their sizes */
+	while (1) {
+		(*count)++;
+
+		filter_size += sf->filter.len;
+
+		if (!sf->has_prev)
+			break;
+
+		sf = se->seccomp_filters[sf->prev];
+	}
+
+	arr = rst_mem_alloc(sizeof(struct sock_fprog) * (*count) + filter_size, RM_PRIVATE);
+	if (!arr)
+		goto out;
+
+	/* Second pass: headers go first, the bodies right behind them */
+	filter_data = &arr[*count];
+	sf = se->seccomp_filters[core->tc->seccomp_filter];
+	for (i = 0; i < *count; i++) {
+		struct sock_fprog *fprog = &arr[i];
+
+		BUG_ON(sf->filter.len % sizeof(struct sock_filter));
+		fprog->len = sf->filter.len / sizeof(struct sock_filter);
+
+		memcpy(filter_data, sf->filter.data, sf->filter.len);
+
+		filter_data += sf->filter.len;
+		sf = se->seccomp_filters[sf->prev];
+	}
+
+	ret = 0;
+
+out:
+	seccomp_entry__free_unpacked(se, NULL);
+	return ret;
+}
diff --git a/criu/seize.c b/criu/seize.c
new file mode 100644
index 000000000000..7d1f77c46dab
--- /dev/null
+++ b/criu/seize.c
@@ -0,0 +1,688 @@
+#include <stdbool.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <sys/wait.h>
+#include <time.h>
+
+#include "compiler.h"
+#include "cr_options.h"
+#include "cr-errno.h"
+#include "pstree.h"
+#include "ptrace.h"
+#include "seize.h"
+#include "stats.h"
+#include "xmalloc.h"
+#include "util.h"
+
+#define NR_ATTEMPTS 5
+
+static const char frozen[] = "FROZEN";
+static const char freezing[] = "FREEZING";
+static const char thawed[] = "THAWED";
+
+/*
+ * Read the freezer state from the already opened freezer.state file
+ * @fd and map it to one of the frozen/freezing/thawed string constants,
+ * so that callers can compare the result by pointer.
+ *
+ * Returns NULL when the file can't be read or holds an unknown state.
+ */
+static const char *get_freezer_state(int fd)
+{
+	const char *known[] = { frozen, freezing, thawed };
+	char state[32];
+	int nread, end;
+	unsigned int k;
+
+	BUILD_BUG_ON((sizeof(state) < sizeof(frozen))	||
+		     (sizeof(state) < sizeof(freezing))	||
+		     (sizeof(state) < sizeof(thawed)));
+
+	lseek(fd, 0, SEEK_SET);
+	nread = read(fd, state, sizeof(state) - 1);
+	if (nread <= 0) {
+		pr_perror("Unable to get a current state");
+		return NULL;
+	}
+
+	/* terminate the string, dropping a trailing newline if present */
+	end = (state[nread - 1] == '\n') ? nread - 1 : nread;
+	state[end] = 0;
+
+	pr_debug("freezer.state=%s\n", state);
+	for (k = 0; k < sizeof(known) / sizeof(known[0]); k++)
+		if (strcmp(state, known[k]) == 0)
+			return known[k];
+
+	pr_err("Unknown freezer state: %s\n", state);
+	return NULL;
+}
+
+static bool freezer_thawed;
+
+/*
+ * Report the state recorded by freeze_processes(): "THAWED" when the
+ * cgroup was found thawed there, "FROZEN" otherwise.
+ */
+const char *get_real_freezer_state(void)
+{
+	if (freezer_thawed)
+		return thawed;
+
+	return frozen;
+}
+
+/*
+ * Write "FROZEN" back into the freezer cgroup, unless no freezer cgroup
+ * is in use or the cgroup was found thawed by freeze_processes().
+ *
+ * Returns 0 on success or when nothing has to be done, -1 on error.
+ */
+static int freezer_restore_state(void)
+{
+	char path[PATH_MAX];
+	int fd, ret = 0;
+
+	if (freezer_thawed || !opts.freeze_cgroup)
+		return 0;
+
+	snprintf(path, sizeof(path), "%s/freezer.state", opts.freeze_cgroup);
+	fd = open(path, O_RDWR);
+	if (fd < 0) {
+		pr_perror("Unable to open %s", path);
+		return -1;
+	}
+
+	if (write(fd, frozen, sizeof(frozen)) != sizeof(frozen)) {
+		pr_perror("Unable to freeze tasks");
+		ret = -1;
+	}
+
+	close(fd);
+	return ret;
+}
+
+/*
+ * Seize every task listed in @root_path/tasks that is not traced by us
+ * yet, then recurse into every sub-directory of @root_path.
+ *
+ * @state is the freezer state observed by the caller (one of the
+ * frozen/freezing/thawed constants, compared by pointer). Only when it
+ * is 'frozen' is a failure to seize a task fatal, unless the task has
+ * no /proc/<pid>/exe and is therefore skipped as a kernel thread.
+ *
+ * Every successfully seized task is counted in processes_to_wait.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+static int seize_cgroup_tree(char *root_path, const char *state)
+{
+	DIR *dir;
+	struct dirent *de;
+	char path[PATH_MAX];
+	FILE *f;
+
+	/*
+	 * New tasks can appear while a freezer state isn't
+	 * frozen, so we need to catch all new tasks.
+	 */
+	snprintf(path, sizeof(path), "%s/tasks", root_path);
+	f = fopen(path, "r");
+	if (f == NULL) {
+		pr_perror("Unable to open %s", path);
+		return -1;
+	}
+	while (fgets(path, sizeof(path), f)) {
+		pid_t pid;
+		int ret;
+
+		pid = atoi(path);
+
+		/* Here we are going to skip tasks which are already traced. */
+		ret = ptrace(PTRACE_INTERRUPT, pid, NULL, NULL);
+		if (ret == 0)
+			continue;
+		if (errno != ESRCH) {
+			pr_perror("Unexpected error");
+			fclose(f);
+			return -1;
+		}
+
+		if (!seize_catch_task(pid)) {
+			pr_debug("SEIZE %d: success\n", pid);
+			processes_to_wait++;
+		} else if (state == frozen) {
+			char buf[] = "/proc/XXXXXXXXXX/exe";
+			struct stat st;
+
+			/* skip kernel threads */
+			snprintf(buf, sizeof(buf), "/proc/%d/exe", pid);
+			if (stat(buf, &st) == -1 && errno == ENOENT)
+				continue;
+
+			/* fails when meets a zombie */
+			pr_err("zombie found while seizing\n");
+			fclose(f);
+			return -1;
+		}
+	}
+	fclose(f);
+
+	dir = opendir(root_path);
+	if (!dir) {
+		pr_perror("Unable to open %s", root_path);
+		return -1;
+	}
+
+	while ((de = readdir(dir))) {
+		struct stat st;
+		int len;
+
+		if (dir_dots(de))
+			continue;
+
+		/*
+		 * The path gets one component longer on every level of
+		 * recursion, so it must be bounded: an unchecked sprintf()
+		 * here could write past the end of the buffer.
+		 */
+		len = snprintf(path, sizeof(path), "%s/%s", root_path, de->d_name);
+		if (len < 0 || (size_t)len >= sizeof(path)) {
+			pr_err("Path %s/%s is too long\n", root_path, de->d_name);
+			closedir(dir);
+			return -1;
+		}
+
+		if (fstatat(dirfd(dir), de->d_name, &st, 0) < 0) {
+			pr_perror("stat of %s failed", path);
+			closedir(dir);
+			return -1;
+		}
+
+		if (!S_ISDIR(st.st_mode))
+			continue;
+
+		if (seize_cgroup_tree(path, state) < 0) {
+			closedir(dir);
+			return -1;
+		}
+	}
+	closedir(dir);
+
+	return 0;
+}
+
+/* A number of tasks in a freezer cgroup which are not going to be dumped */
+int processes_to_wait;
+static pid_t *processes_to_wait_pids;
+
+/*
+ * A freezer cgroup can contain tasks which will not be dumped. They
+ * have to be waited for as well, because they were interrupted by
+ * ptrace when the cgroup was seized.
+ *
+ * Collects processes_to_wait pids with waitpid() into
+ * processes_to_wait_pids for freezer_detach().
+ *
+ * Returns 0 on success, -1 on allocation or waitpid() failure. After a
+ * waitpid() failure processes_to_wait is cut down to the number of
+ * pids actually collected, so the array and the counter stay in step.
+ */
+static int freezer_wait_processes(void)
+{
+	int i;
+
+	/*
+	 * Nothing was seized from the cgroup. Bail out early: a
+	 * zero-sized allocation may legitimately return NULL, which
+	 * would be mistaken for an out-of-memory condition.
+	 */
+	if (processes_to_wait == 0)
+		return 0;
+
+	processes_to_wait_pids = xmalloc(sizeof(pid_t) * processes_to_wait);
+	if (processes_to_wait_pids == NULL)
+		return -1;
+
+	for (i = 0; i < processes_to_wait; i++) {
+		int status;
+		pid_t pid;
+
+		/*
+		 * Here we are going to skip tasks which are already traced.
+		 * Ptraced tasks look like children to us, so if
+		 * a task isn't ptraced yet, waitpid() will return an error.
+		 */
+		pid = waitpid(-1, &status, 0);
+		if (pid < 0) {
+			pr_perror("Unable to wait processes");
+			/*
+			 * Don't free the array: freezer_detach() still
+			 * walks it on the cleanup path. Only the first i
+			 * slots are valid, so shrink the counter to match.
+			 */
+			processes_to_wait = i;
+			return -1;
+		}
+		pr_warn("Unexpected process %d in the freezer cgroup (status 0x%x)\n", pid, status);
+
+		processes_to_wait_pids[i] = pid;
+	}
+
+	return 0;
+}
+
+/*
+ * Detach from the tasks that were seized in the freezer cgroup but are
+ * not part of the dumped tree (the pids recorded in
+ * processes_to_wait_pids). A failed detach is only reported; a task
+ * that can be reaped instead is assumed to have been killed.
+ *
+ * Always returns 0.
+ */
+static int freezer_detach(void)
+{
+	int i;
+
+	if (!opts.freeze_cgroup)
+		return 0;
+
+	/*
+	 * processes_to_wait is bumped while the cgroup is being seized,
+	 * but the pid array only exists once freezer_wait_processes() has
+	 * run. If it hasn't, there is nothing recorded to detach from.
+	 */
+	if (!processes_to_wait_pids)
+		return 0;
+
+	for (i = 0; i < processes_to_wait; i++) {
+		pid_t pid = processes_to_wait_pids[i];
+		int status, save_errno;
+
+		if (ptrace(PTRACE_DETACH, pid, NULL, NULL) == 0)
+			continue;
+
+		/* wait4() below may clobber errno; keep it for the report */
+		save_errno = errno;
+
+		/* A process may be killed by SIGKILL */
+		if (wait4(pid, &status, __WALL, NULL) == pid) {
+			pr_warn("The %d process returned 0x %x\n", pid, status);
+			continue;
+		}
+		errno = save_errno;
+		pr_perror("Unable to detach from %d", pid);
+	}
+
+	return 0;
+}
+
+/*
+ * Freeze the cgroup named by opts.freeze_cgroup and seize every task
+ * in it.
+ *
+ * If the cgroup is found thawed, that fact is remembered in
+ * freezer_thawed and "FROZEN" is written to freezer.state. The tasks
+ * are then enumerated with seize_cgroup_tree() while freezer.state is
+ * polled with growing delays, at most NR_ATTEMPTS + 1 rounds. On the
+ * way out "THAWED" is written back when everything succeeded or when
+ * the cgroup was thawed to begin with.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+static int freeze_processes(void)
+{
+ int i, fd, exit_code = -1;
+ char path[PATH_MAX];
+ const char *state = thawed;
+
+ snprintf(path, sizeof(path), "%s/freezer.state", opts.freeze_cgroup);
+ fd = open(path, O_RDWR);
+ if (fd < 0) {
+ pr_perror("Unable to open %s", path);
+ return -1;
+ }
+ state = get_freezer_state(fd);
+ if (!state) {
+ close(fd);
+ return -1;
+ }
+ /* get_freezer_state() returns the constants themselves, so pointers can be compared */
+ if (state == thawed) {
+ freezer_thawed = true;
+
+ lseek(fd, 0, SEEK_SET);
+ if (write(fd, frozen, sizeof(frozen)) != sizeof(frozen)) {
+ pr_perror("Unable to freeze tasks");
+ close(fd);
+ return -1;
+ }
+ }
+
+ /*
+ * There is not way to wait a specified state, so we need to poll the
+ * freezer.state.
+ * Here is one extra attempt to check that everything are frozen.
+ */
+ for (i = 0; i <= NR_ATTEMPTS; i++) {
+ struct timespec req = {};
+ u64 timeout;
+
+ if (seize_cgroup_tree(opts.freeze_cgroup, state) < 0)
+ goto err;
+
+ /* the enumeration above ran with the cgroup already frozen: done */
+ if (state == frozen)
+ break;
+
+ state = get_freezer_state(fd);
+ if (!state)
+ goto err;
+
+ if (state == frozen) {
+ /*
+ * Enumerate all tasks one more time to collect all new
+ * tasks, which can be born while the cgroup is being frozen.
+ */
+
+ continue;
+ }
+
+ /* not frozen yet: back off a little longer on every round */
+ timeout = 100000000 * (i + 1); /* 100 msec */
+ req.tv_nsec = timeout % 1000000000;
+ req.tv_sec = timeout / 1000000000;
+ nanosleep(&req, NULL);
+ }
+
+ /* the loop ran out of rounds without reaching the break above */
+ if (i > NR_ATTEMPTS) {
+ pr_err("Unable to freeze cgroup %s\n", opts.freeze_cgroup);
+ goto err;
+ }
+
+ exit_code = 0;
+err:
+ if (exit_code == 0 || freezer_thawed) {
+ lseek(fd, 0, SEEK_SET);
+ if (write(fd, thawed, sizeof(thawed)) != sizeof(thawed)) {
+ pr_perror("Unable to thaw tasks");
+ exit_code = -1;
+ }
+ }
+ if (close(fd)) {
+ pr_perror("Unable to thaw tasks");
+ return -1;
+ }
+
+ return exit_code;
+}
+
+/*
+ * Tell whether a task with real pid @pid is already on the children
+ * list of @i.
+ */
+static inline bool child_collected(struct pstree_item *i, pid_t pid)
+{
+	struct pstree_item *child;
+	bool found = false;
+
+	list_for_each_entry(child, &i->children, sibling) {
+		if (child->pid.real == pid) {
+			found = true;
+			break;
+		}
+	}
+
+	return found;
+}
+
+static int collect_task(struct pstree_item *item);
+
+/*
+ * Seize and collect the children of @item that aren't on its children
+ * list yet, descending into each of them with collect_task().
+ *
+ * Returns the number of children that were not collected before this
+ * call (so 0 means "nothing new showed up"), or a negative value on
+ * error.
+ */
+static int collect_children(struct pstree_item *item)
+{
+	pid_t *ch;
+	int ret, i, nr_children, nr_inprogress;
+
+	ret = parse_children(item->pid.real, &ch, &nr_children);
+	if (ret < 0)
+		return ret;
+
+	nr_inprogress = 0;
+	for (i = 0; i < nr_children; i++) {
+		struct pstree_item *c;
+		pid_t pid = ch[i];
+
+		/* Is it already frozen? */
+		if (child_collected(item, pid))
+			continue;
+
+		nr_inprogress++;
+
+		c = alloc_pstree_item();
+		if (c == NULL) {
+			ret = -1;
+			goto free;
+		}
+
+		if (!opts.freeze_cgroup)
+			/* fails when meets a zombie */
+			seize_catch_task(pid);
+
+		ret = seize_wait_task(pid, item->pid.real, &dmpi(c)->pi_creds);
+		if (ret < 0) {
+			/*
+			 * There is a race window between parse_children() and
+			 * seize(), so the task could have died in the meantime.
+			 * Don't worry, it will be retried on the next attempt.
+			 * The number of attempts is restricted, so we will bail
+			 * out if something is really wrong.
+			 */
+			ret = 0;
+			xfree(c);
+			continue;
+		}
+
+		/*
+		 * Report only now: this is the first point where the task is
+		 * actually seized and ret holds its state.
+		 */
+		pr_info("Seized task %d, state %d\n", pid, ret);
+
+		c->pid.real = pid;
+		c->parent = item;
+		c->state = ret;
+		list_add_tail(&c->sibling, &item->children);
+
+		/* Here is a recursive call (Depth-first search) */
+		ret = collect_task(c);
+		if (ret < 0)
+			goto free;
+	}
+free:
+	xfree(ch);
+	return ret < 0 ? ret : nr_inprogress;
+}
+
+/*
+ * Release the task @item and, unless it is being killed, all of its
+ * threads except the first one.
+ *
+ * @st is the state the task is to be switched into; item->state is the
+ * state it was in when it was seized. Dead tasks are left alone.
+ */
+static void unseize_task_and_threads(const struct pstree_item *item, int st)
+{
+	int t;
+
+	if (item->state == TASK_DEAD)
+		return;
+
+	unseize_task(item->pid.real, item->state, st);
+
+	if (st == TASK_DEAD)
+		return;
+
+	/* slot 0 is the task itself, handled by unseize_task() above */
+	for (t = 1; t < item->nr_threads; t++) {
+		if (!ptrace(PTRACE_DETACH, item->threads[t].real, NULL, NULL))
+			continue;
+
+		pr_perror("Unable to detach from %d", item->threads[t].real);
+	}
+}
+
+/*
+ * Reap the whole tree after it has been killed: one wait4() per thread
+ * of every task that wasn't already dead. Every reaped child must have
+ * been terminated by SIGKILL, and no further child may be left once
+ * the tree has been walked; anything else is a BUG.
+ */
+static void pstree_wait(struct pstree_item *root_item)
+{
+	struct pstree_item *item = root_item;
+	int pid, status, i;
+
+	for_each_pstree_item(item) {
+		if (item->state == TASK_DEAD)
+			continue;
+
+		for (i = 0; i < item->nr_threads; i++) {
+			pid = wait4(-1, &status, __WALL, NULL);
+			if (pid < 0) {
+				pr_perror("wait4 failed");
+				break;
+			}
+
+			if (WIFSIGNALED(status) && WTERMSIG(status) == SIGKILL)
+				continue;
+
+			pr_err("Unexpected exit code %d of %d\n", status, pid);
+			BUG();
+		}
+	}
+
+	/* nothing else is supposed to be waitable at this point */
+	pid = wait4(-1, &status, __WALL, NULL);
+	if (pid > 0) {
+		pr_err("Unexpected child %d\n", pid);
+		BUG();
+	}
+}
+
+/*
+ * Release every task of the tree into state @st. Unless the tree is
+ * being killed, the freezer cgroup state is restored first. When @st
+ * is TASK_DEAD, the killed tasks are reaped afterwards.
+ */
+void pstree_switch_state(struct pstree_item *root_item, int st)
+{
+	struct pstree_item *item = root_item;
+	bool killing = (st == TASK_DEAD);
+
+	if (!killing)
+		freezer_restore_state();
+
+	/*
+	 * All foreign processes have to be detached from before the init
+	 * process is waited for, because one of them may be collecting
+	 * processes from the target pid namespace. The pid namespace is
+	 * destroyed only when all of its processes have been killed and
+	 * collected.
+	 */
+	freezer_detach();
+
+	pr_info("Unfreezing tasks into %d\n", st);
+	for_each_pstree_item(item) {
+		unseize_task_and_threads(item, st);
+	}
+
+	if (killing)
+		pstree_wait(root_item);
+}
+
+/*
+ * Return the real pid of @item's parent, or -1 when it has no parent.
+ */
+static pid_t item_ppid(const struct pstree_item *item)
+{
+	const struct pstree_item *parent = item->parent;
+
+	if (!parent)
+		return -1;
+
+	return parent->pid.real;
+}
+
+/*
+ * Tell whether the thread @tid is already known for @i: either it is
+ * the task itself or it is recorded in i->threads[].
+ */
+static inline bool thread_collected(struct pstree_item *i, pid_t tid)
+{
+	int idx = 0;
+
+	/* thread leader is collected as task */
+	if (tid == i->pid.real)
+		return true;
+
+	while (idx < i->nr_threads) {
+		if (i->threads[idx].real == tid)
+			return true;
+		idx++;
+	}
+
+	return false;
+}
+
+/*
+ * Seize the threads of @item that aren't recorded in item->threads[]
+ * yet and append them there.
+ *
+ * Returns the number of threads that were not collected before this
+ * call (so 0 means "nothing new showed up"), or -1 on error.
+ */
+static int collect_threads(struct pstree_item *item)
+{
+	struct pid *threads = NULL, *grown;
+	int nr_threads = 0, i = 0, ret, nr_inprogress, nr_stopped = 0;
+
+	ret = parse_threads(item->pid.real, &threads, &nr_threads);
+	if (ret < 0)
+		goto err;
+
+	if (nr_threads < 1) {
+		/* slot 0 of the array is written unconditionally below */
+		pr_err("No threads found for %d\n", item->pid.real);
+		goto err;
+	}
+
+	if ((item->state == TASK_DEAD) && (nr_threads > 1)) {
+		pr_err("Zombies with threads are not supported\n");
+		goto err;
+	}
+
+	/*
+	 * The number of threads can't be less than already frozen.
+	 *
+	 * Resize through a temporary: on failure the old array stays
+	 * attached to the item, and the freshly parsed list is released
+	 * on the common error path instead of being leaked.
+	 */
+	grown = xrealloc(item->threads, nr_threads * sizeof(struct pid));
+	if (grown == NULL)
+		goto err;
+	item->threads = grown;
+
+	if (item->nr_threads == 0) {
+		item->threads[0].real = item->pid.real;
+		item->nr_threads = 1;
+	}
+
+	nr_inprogress = 0;
+	for (i = 0; i < nr_threads; i++) {
+		pid_t pid = threads[i].real;
+
+		if (thread_collected(item, pid))
+			continue;
+
+		nr_inprogress++;
+
+		pr_info("\tSeizing %d's %d thread\n",
+				item->pid.real, pid);
+
+		if (!opts.freeze_cgroup && seize_catch_task(pid))
+			continue;
+
+		ret = seize_wait_task(pid, item_ppid(item), &dmpi(item)->pi_creds);
+		if (ret < 0) {
+			/*
+			 * There is a race window between parse_threads() and
+			 * seize(), so the thread could have died in the
+			 * meantime. Don't worry, it will be retried on the
+			 * next attempt. The number of attempts is restricted,
+			 * so we will bail out if something is really wrong.
+			 */
+			continue;
+		}
+
+		BUG_ON(item->nr_threads + 1 > nr_threads);
+		item->threads[item->nr_threads].real = pid;
+		item->nr_threads++;
+
+		if (ret == TASK_DEAD) {
+			pr_err("Zombie thread not supported\n");
+			goto err;
+		}
+
+		if (ret == TASK_STOPPED) {
+			nr_stopped++;
+		}
+	}
+
+	if (nr_stopped && nr_stopped != nr_inprogress) {
+		pr_err("Individually stopped threads not supported\n");
+		goto err;
+	}
+
+	xfree(threads);
+	return nr_inprogress;
+
+err:
+	xfree(threads);
+	return -1;
+}
+
+/*
+ * Call @collect on @item repeatedly until it reports that nothing new
+ * showed up, it fails, or the attempts run out.
+ *
+ * Returns 0 when the last pass found nothing new, -1 otherwise.
+ */
+static int collect_loop(struct pstree_item *item,
+		int (*collect)(struct pstree_item *))
+{
+	int attempts = opts.freeze_cgroup ? 1 : NR_ATTEMPTS;
+	int nr_inprogress;
+
+	/*
+	 * While /proc is scanned and the children/threads are seized,
+	 * new ones can appear (with clone(CLONE_PARENT) or with
+	 * pthread_create). Thus, after one go, the scan-and-freeze has to
+	 * be repeated to pick up the new arrivals. As new ones may keep
+	 * appearing, the number of passes is limited, and seizing the
+	 * item fails if tasks/threads still appear after that.
+	 */
+	do {
+		attempts--;
+		nr_inprogress = collect(item);
+	} while (nr_inprogress > 0 && attempts >= 0);
+
+	pr_info("Collected (%d attempts, %d in_progress)\n", attempts, nr_inprogress);
+
+	/*
+	 * Either collecting failed (negative nr_inprogress) or the
+	 * attempts ran out (positive). So checking for "no more new
+	 * stuff" is all it takes to tell success from failure.
+	 */
+	return nr_inprogress ? -1 : 0;
+}
+
+/*
+ * Collect everything below the already seized task @item: its threads
+ * first, then its children (and, through them, the whole subtree), and
+ * finally allocate its cores.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int collect_task(struct pstree_item *item)
+{
+	if (collect_loop(item, collect_threads) < 0)
+		goto fail;
+
+	/* Depth-first search (DFS) is used for traversing a process tree. */
+	if (collect_loop(item, collect_children) < 0)
+		goto fail;
+
+	if (item->state == TASK_DEAD && !list_empty(&item->children)) {
+		pr_err("Zombie with children?! O_o Run, run, run!\n");
+		goto fail;
+	}
+
+	if (pstree_alloc_cores(item))
+		goto fail;
+
+	pr_info("Collected %d in %d state\n", item->pid.real, item->state);
+	return 0;
+
+fail:
+	close_pid_proc();
+	return -1;
+}
+
+/*
+ * Seize the task @pid and the whole process tree below it, building
+ * the pstree rooted at root_item.
+ *
+ * With opts.freeze_cgroup set, the tasks are frozen and seized through
+ * the freezer cgroup first, and the foreign tasks found there are
+ * waited for at the end.
+ *
+ * Returns 0 on success, a negative value on failure.
+ */
+int collect_pstree(pid_t pid)
+{
+	int ret = -1;
+
+	timing_start(TIME_FREEZING);
+
+	if (opts.freeze_cgroup && freeze_processes())
+		goto err;
+
+	root_item = alloc_pstree_item();
+	if (root_item == NULL)
+		goto err;
+
+	root_item->pid.real = pid;
+
+	if (!opts.freeze_cgroup && seize_catch_task(pid)) {
+		set_cr_errno(ESRCH);
+		goto err;
+	}
+
+	/*
+	 * wait4() may hang for some reason. Enable timer and fire SIGALRM
+	 * if timeout reached. SIGALRM handler will do the necessary
+	 * cleanups and terminate current process.
+	 */
+	alarm(opts.timeout);
+
+	ret = seize_wait_task(pid, -1, &dmpi(root_item)->pi_creds);
+	if (ret < 0)
+		goto err;
+	pr_info("Seized task %d, state %d\n", pid, ret);
+	root_item->state = ret;
+
+	ret = collect_task(root_item);
+	if (ret < 0)
+		goto err;
+
+	if (opts.freeze_cgroup && freezer_wait_processes()) {
+		/*
+		 * ret still holds the success value of collect_task()
+		 * here; without resetting it this failure would be
+		 * reported to the caller as success.
+		 */
+		ret = -1;
+		goto err;
+	}
+
+	ret = 0;
+	timing_stop(TIME_FREEZING);
+	timing_start(TIME_FROZEN);
+
+err:
+	/* Freezing stage finished in time - disable timer. */
+	alarm(0);
+	return ret;
+}
+
diff --git a/criu/shmem.c b/criu/shmem.c
new file mode 100644
index 000000000000..ad3cdbbdeedd
--- /dev/null
+++ b/criu/shmem.c
@@ -0,0 +1,449 @@
+#include <unistd.h>
+#include <sys/mman.h>
+#include <stdlib.h>
+#include <fcntl.h>
+
+#include "pid.h"
+#include "shmem.h"
+#include "image.h"
+#include "cr_options.h"
+#include "kerndat.h"
+#include "page-pipe.h"
+#include "page-xfer.h"
+#include "rst-malloc.h"
+#include "vma.h"
+#include "config.h"
+
+#include "protobuf.h"
+#include "protobuf/pagemap.pb-c.h"
+
+/*
+ * Restore-side description of one shared memory object.
+ *
+ * pid is the pid of the creator
+ * fd is a file descriptor, which is valid for the creator only;
+ * it's opened at restore time, because pgoff may be non-zero
+ *
+ * NOTE(review): the original comment also mentioned "start, end are
+ * used for open mapping", but this struct has no such fields.
+ */
+struct shmem_info {
+ unsigned long shmid;
+ unsigned long size;
+ int pid;
+ int fd;
+
+ /*
+ * 0. lock is initialized to zero
+ * 1. the master opens a descriptor and sets lock to 1
+ * 2. slaves open their descriptors and increment lock
+ * 3. the master waits for all slaves on lock. After that
+ * it can close the descriptor.
+ */
+ futex_t lock;
+
+ /*
+ * The problem here is that we don't know which process will
+ * restore a region. Each time we find a process with a smaller
+ * pid we reset self_count, so a single counter is not enough.
+ */
+ int count; /* the number of regions */
+ int self_count; /* the number of regions, which belongs to "pid" */
+
+ struct list_head l;
+};
+
+/*
+ * This list is filled with shared objects before we fork
+ * any tasks. Thus the head is private (COW-ed) and the
+ * entries are all in shmem.
+ */
+static LIST_HEAD(shmems); /* XXX hash? tree? */
+
+/* Log the shmid and owner pid of every entry on the shmems list. */
+void show_saved_shmems(void)
+{
+	struct shmem_info *entry;
+
+	pr_info("\tSaved shmems:\n");
+
+	list_for_each_entry(entry, &shmems, l) {
+		pr_info("\t\tshmid: 0x%lx pid: %d\n", entry->shmid, entry->pid);
+	}
+}
+
+/*
+ * Linear search of the shmems list.
+ * Returns the entry whose shmid equals @shmid, or NULL if there is none.
+ */
+static struct shmem_info *find_shmem_by_id(unsigned long shmid)
+{
+	struct shmem_info *cur;
+
+	list_for_each_entry(cur, &shmems, l) {
+		if (cur->shmid != shmid)
+			continue;
+
+		return cur;
+	}
+
+	return NULL;
+}
+
+/*
+ * Register the shared mapping described by @vi as used by task @pid.
+ *
+ * A new shmem_info is allocated with shmalloc() the first time a shmid
+ * is seen; later calls only update size, the region counters and the
+ * owner pid of the existing entry.
+ *
+ * Returns 0 on success, -1 if the allocation fails.
+ */
+int collect_shmem(int pid, VmaEntry *vi)
+{
+	unsigned long size = vi->pgoff + vi->end - vi->start;
+	struct shmem_info *si = find_shmem_by_id(vi->shmid);
+
+	if (!si) {
+		/* First region of this object: create the descriptor. */
+		si = shmalloc(sizeof(struct shmem_info));
+		if (!si)
+			return -1;
+
+		pr_info("Add new shmem 0x%"PRIx64" (0x%016"PRIx64"-0x%016"PRIx64")\n",
+			vi->shmid, vi->start, vi->end);
+
+		si->shmid = vi->shmid;
+		si->pid = pid;
+		si->size = size;
+		si->fd = -1;
+		si->count = 1;
+		si->self_count = 1;
+		futex_init(&si->lock);
+		list_add_tail(&si->l, &shmems);
+
+		return 0;
+	}
+
+	/* Known object: keep the largest size seen and count the region. */
+	if (si->size < size)
+		si->size = size;
+	si->count++;
+
+	/*
+	 * Only the shared mapping with a lowest
+	 * pid will be created in real, other processes
+	 * will wait until the kernel propagate this mapping
+	 * into /proc
+	 */
+	if (pid_rst_prio(pid, si->pid)) {
+		/* New owner: its own region counter starts over. */
+		si->pid = pid;
+		si->self_count = 1;
+	} else if (si->pid == pid) {
+		si->self_count++;
+	}
+
+	return 0;
+}
+
+/*
+ * Slave side of the shmem handshake: wait while si->lock is still 0,
+ * then open the owner's descriptor through /proc/<si->pid>/fd/<si->fd>.
+ *
+ * si->lock is incremented (and waiters woken) whether or not the open
+ * succeeded. Returns the result of open_proc_rw(). @pid is not used.
+ */
+static int shmem_wait_and_open(int pid, struct shmem_info *si)
+{
+	char path[128];
+	int fd;
+
+	pr_info("Waiting for the %lx shmem to appear\n", si->shmid);
+	futex_wait_while(&si->lock, 0);
+
+	/* path is built for the log messages only */
+	snprintf(path, sizeof(path), "/proc/%d/fd/%d", si->pid, si->fd);
+	pr_info("Opening shmem [%s] \n", path);
+
+	fd = open_proc_rw(si->pid, "fd/%d", si->fd);
+	if (fd < 0) {
+		pr_perror(" %d: Can't stat shmem at %s", si->pid, path);
+	}
+
+	futex_inc_and_wake(&si->lock);
+
+	return fd;
+}
+
+/*
+ * Fill the mapping at @addr with the pages saved for si->shmid.
+ *
+ * The page image is opened with open_page_read(PR_SHMEM); every pagemap
+ * entry is read straight from the raw image fd into @addr at the offset
+ * given by the entry. With opts.auto_dedup set, the just-read range is
+ * punched out of the image.
+ *
+ * Returns -1 if the image can't be opened, on a short read or when
+ * punch_hole() fails; otherwise the last value of ret when the loop
+ * stopped (see the note at the range check below).
+ */
+static int restore_shmem_content(void *addr, struct shmem_info *si)
+{
+ int ret = 0, fd_pg;
+ struct page_read pr;
+ unsigned long off_real;
+
+ ret = open_page_read(si->shmid, &pr, PR_SHMEM);
+ if (ret <= 0)
+ return -1;
+
+ fd_pg = img_raw_fd(pr.pi);
+ while (1) {
+ unsigned long vaddr;
+ unsigned nr_pages;
+ struct iovec iov;
+
+ /* <= 0 ends the loop; that value is what gets returned */
+ ret = pr.get_pagemap(&pr, &iov);
+ if (ret <= 0)
+ break;
+
+ /* iov_base is used as an offset into the object here */
+ vaddr = (unsigned long)iov.iov_base;
+ nr_pages = iov.iov_len / PAGE_SIZE;
+
+ /*
+ * NOTE(review): this break leaves ret at the positive value
+ * returned by get_pagemap(), and the caller only treats
+ * ret < 0 as failure, so an out-of-range entry looks like
+ * success -- confirm this is intended.
+ */
+ if (vaddr + nr_pages * PAGE_SIZE > si->size)
+ break;
+
+ /* remember where the data starts in the image for punch_hole() */
+ off_real = lseek(fd_pg, 0, SEEK_CUR);
+
+ ret = read(fd_pg, addr + vaddr, nr_pages * PAGE_SIZE);
+ if (ret != nr_pages * PAGE_SIZE) {
+ ret = -1;
+ break;
+ }
+
+ if (opts.auto_dedup) {
+ ret = punch_hole(&pr, off_real, nr_pages * PAGE_SIZE, false);
+ if (ret == -1) {
+ break;
+ }
+ }
+
+ /* put_pagemap is optional; it is skipped on the break paths above */
+ if (pr.put_pagemap)
+ pr.put_pagemap(&pr);
+ }
+
+ pr.close(&pr);
+ return ret;
+}
+
+/*
+ * Get a file descriptor for the shared memory object behind @vi on
+ * behalf of task @pid.
+ *
+ * - a task that is not the owner (si->pid) waits for the owner and
+ *   opens its descriptor via shmem_wait_and_open();
+ * - the owner, once si->fd is set, simply dup()s it;
+ * - the owner's first call creates the object (memfd or anonymous
+ *   shared mapping), restores its content, publishes si->fd and waits
+ *   until the other regions have taken their descriptors.
+ *
+ * Returns a descriptor, or -1 on error.
+ */
+int get_shmem_fd(int pid, VmaEntry *vi)
+{
+ struct shmem_info *si;
+ void *addr = MAP_FAILED;
+ int f = -1;
+ int flags;
+
+ si = find_shmem_by_id(vi->shmid);
+ pr_info("Search for 0x%016"PRIx64" shmem 0x%"PRIx64" %p/%d\n", vi->start, vi->shmid, si, si ? si->pid : -1);
+ if (!si) {
+ pr_err("Can't find my shmem 0x%016"PRIx64"\n", vi->start);
+ return -1;
+ }
+
+ if (si->pid != pid)
+ return shmem_wait_and_open(pid, si);
+
+ if (si->fd != -1)
+ return dup(si->fd);
+
+ flags = MAP_SHARED;
+#ifdef CONFIG_HAS_MEMFD
+ if (kdat.has_memfd) {
+ f = syscall(SYS_memfd_create, "", 0);
+ if (f < 0) {
+ pr_perror("Unable to create memfd");
+ goto err;
+ }
+
+ if (ftruncate(f, si->size)) {
+ pr_perror("Unable to truncate memfd");
+ goto err;
+ }
+ flags |= MAP_FILE;
+ } else
+#endif
+ flags |= MAP_ANONYMOUS;
+
+ /*
+ * The following hack solves problems:
+ * vi->pgoff may be not zero in a target process.
+ * This mapping may be mapped more than once.
+ * The restorer doesn't have snprintf.
+ * Here is a good place to restore content
+ */
+ addr = mmap(NULL, si->size, PROT_WRITE | PROT_READ, flags, f, 0);
+ if (addr == MAP_FAILED) {
+ pr_err("Can't mmap shmid=0x%"PRIx64" size=%ld\n",
+ vi->shmid, si->size);
+ goto err;
+ }
+
+ if (restore_shmem_content(addr, si) < 0) {
+ pr_err("Can't restore shmem content\n");
+ goto err;
+ }
+
+ /* no memfd was created: get a descriptor via our own map_files link */
+ if (f == -1) {
+ f = open_proc_rw(getpid(), "map_files/%lx-%lx",
+ (unsigned long) addr,
+ (unsigned long) addr + si->size);
+ if (f < 0)
+ goto err;
+ }
+ munmap(addr, si->size);
+
+ si->fd = f;
+
+ /* Send signal to slaves, that they can open fd for this shmem */
+ futex_inc_and_wake(&si->lock);
+ /*
+ * All other regions in this process will duplicate
+ * the file descriptor, so we don't wait them.
+ */
+ futex_wait_until(&si->lock, si->count - si->self_count + 1);
+
+ return f;
+err:
+ if (addr != MAP_FAILED)
+ munmap(addr, si->size);
+ close_safe(&f);
+ return -1;
+}
+
+/*
+ * Dump-side description of one shared memory object. Entries are
+ * chained through ->next in a bucket of shmems_hash.
+ */
+struct shmem_info_dump {
+ unsigned long size;
+ unsigned long shmid;
+ unsigned long start;
+ unsigned long end;
+ int pid;
+
+ struct shmem_info_dump *next;
+};
+
+/* Hash table of dumped objects, indexed by shmid % SHMEM_HASH_SIZE. */
+#define SHMEM_HASH_SIZE 32
+static struct shmem_info_dump *shmems_hash[SHMEM_HASH_SIZE];
+
+/*
+ * Walk the bucket @chain looking for @shmid.
+ * Returns the matching entry or NULL.
+ */
+static struct shmem_info_dump *shmem_find(struct shmem_info_dump **chain,
+					  unsigned long shmid)
+{
+	struct shmem_info_dump *cur = *chain;
+
+	while (cur) {
+		if (cur->shmid == shmid)
+			return cur;
+		cur = cur->next;
+	}
+
+	return NULL;
+}
+
+/*
+ * Remember the shared mapping @vma of task @pid for dumping.
+ *
+ * If the shmid is already known only the recorded size may grow;
+ * otherwise a new entry is put at the head of its hash bucket.
+ *
+ * Returns 0 on success, -1 if the allocation fails.
+ */
+int add_shmem_area(pid_t pid, VmaEntry *vma)
+{
+	unsigned long size = vma->pgoff + (vma->end - vma->start);
+	struct shmem_info_dump **bucket = &shmems_hash[vma->shmid % SHMEM_HASH_SIZE];
+	struct shmem_info_dump *entry;
+
+	entry = shmem_find(bucket, vma->shmid);
+	if (entry) {
+		if (entry->size < size)
+			entry->size = size;
+		return 0;
+	}
+
+	entry = xmalloc(sizeof(*entry));
+	if (!entry)
+		return -1;
+
+	entry->size = size;
+	entry->pid = pid;
+	entry->start = vma->start;
+	entry->end = vma->end;
+	entry->shmid = vma->shmid;
+
+	/* link at the head of the bucket */
+	entry->next = *bucket;
+	*bucket = entry;
+
+	return 0;
+}
+
+/*
+ * vmsplice() the iovecs of every buffer in @pp into that buffer's pipe
+ * and then hand the pipe to page_xfer_dump_pages(), with @addr passed
+ * as the offset argument.
+ *
+ * Returns -1 if a vmsplice() call doesn't move exactly
+ * pages_in * PAGE_SIZE bytes, otherwise the page_xfer_dump_pages()
+ * result.
+ */
+static int dump_pages(struct page_pipe *pp, struct page_xfer *xfer, void *addr)
+{
+	struct page_pipe_buf *buf;
+
+	list_for_each_entry(buf, &pp->bufs, l) {
+		if (vmsplice(buf->p[1], buf->iov, buf->nr_segs,
+			     SPLICE_F_GIFT | SPLICE_F_NONBLOCK) ==
+		    buf->pages_in * PAGE_SIZE)
+			continue;
+
+		pr_perror("Can't get shmem into page-pipe");
+		return -1;
+	}
+
+	return page_xfer_dump_pages(xfer, pp, (unsigned long)addr);
+}
+
+/*
+ * Dump the content of one shared memory object.
+ *
+ * The object is mapped through /proc/<si->pid>/map_files/<start>-<end>,
+ * mincore() tells which pages are present, and those pages are pushed
+ * through a page-pipe into the CR_FD_SHMEM_PAGEMAP image.
+ *
+ * Returns 0 on success, non-zero on failure. ret starts at -1, so the
+ * early "goto err*" paths report an error without setting it.
+ */
+static int dump_one_shmem(struct shmem_info_dump *si)
+{
+ struct iovec *iovs;
+ struct page_pipe *pp;
+ struct page_xfer xfer;
+ int err, ret = -1, fd;
+ unsigned char *map = NULL;
+ void *addr = NULL;
+ unsigned long pfn, nrpages;
+
+ pr_info("Dumping shared memory %ld\n", si->shmid);
+
+ /* one mincore() status byte per page, size rounded up */
+ nrpages = (si->size + PAGE_SIZE - 1) / PAGE_SIZE;
+ map = xmalloc(nrpages * sizeof(*map));
+ if (!map)
+ goto err;
+
+ fd = open_proc(si->pid, "map_files/%lx-%lx", si->start, si->end);
+ if (fd < 0)
+ goto err;
+
+ /* the mapping keeps the object reachable, fd can go right away */
+ addr = mmap(NULL, si->size, PROT_READ, MAP_SHARED, fd, 0);
+ close(fd);
+ if (addr == MAP_FAILED) {
+ pr_err("Can't map shmem 0x%lx (0x%lx-0x%lx)\n",
+ si->shmid, si->start, si->end);
+ goto err;
+ }
+
+ /*
+ * We can't use pagemap here, because this vma is
+ * not mapped to us at all, but mincore reports the
+ * pagecache status of a file, which is correct in
+ * this case.
+ */
+
+ err = mincore(addr, si->size, map);
+ if (err)
+ goto err_unmap;
+
+ iovs = xmalloc(((nrpages + 1) / 2) * sizeof(struct iovec));
+ if (!iovs)
+ goto err_unmap;
+
+ pp = create_page_pipe((nrpages + 1) / 2, iovs, true);
+ if (!pp)
+ goto err_iovs;
+
+ err = open_page_xfer(&xfer, CR_FD_SHMEM_PAGEMAP, si->shmid);
+ if (err)
+ goto err_pp;
+
+ for (pfn = 0; pfn < nrpages; pfn++) {
+ /* skip pages mincore() didn't flag with PAGE_RSS */
+ if (!(map[pfn] & PAGE_RSS))
+ continue;
+again:
+ ret = page_pipe_add_page(pp, (unsigned long)addr + pfn * PAGE_SIZE);
+ if (ret == -EAGAIN) {
+ /* flush what is queued, reset the pipe, retry this page */
+ ret = dump_pages(pp, &xfer, addr);
+ if (ret)
+ goto err_xfer;
+ page_pipe_reinit(pp);
+ goto again;
+ } else if (ret)
+ goto err_xfer;
+ }
+
+ /* flush whatever is still queued */
+ ret = dump_pages(pp, &xfer, addr);
+
+err_xfer:
+ xfer.close(&xfer);
+err_pp:
+ destroy_page_pipe(pp);
+err_iovs:
+ xfree(iovs);
+err_unmap:
+ munmap(addr, si->size);
+err:
+ xfree(map);
+ return ret;
+}
+
+/*
+ * Iterate over every entry of shmems_hash: _i is the bucket index
+ * (an integer lvalue), _si the current entry pointer.
+ *
+ * This expands to two nested loops, so a "break" in the body only
+ * leaves the inner (per-bucket) loop.
+ */
+#define for_each_shmem_dump(_i, _si)					\
+	for ((_i) = 0; (_i) < SHMEM_HASH_SIZE; (_i)++)			\
+		for ((_si) = shmems_hash[(_i)]; (_si); (_si) = (_si)->next)
+
+/*
+ * Dump every shared memory object collected by add_shmem_area().
+ *
+ * Stops at the first object that fails and returns its error code;
+ * returns 0 when all objects were dumped.
+ */
+int cr_dump_shmem(void)
+{
+	int i;
+	struct shmem_info_dump *si;
+
+	for_each_shmem_dump (i, si) {
+		int ret = dump_one_shmem(si);
+
+		/*
+		 * Return directly: "break" would only leave the inner
+		 * loop and a later bucket could overwrite the error.
+		 */
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
diff --git a/criu/sigframe.c b/criu/sigframe.c
new file mode 100644
index 000000000000..448749320231
--- /dev/null
+++ b/criu/sigframe.c
@@ -0,0 +1,36 @@
+#include <unistd.h>
+#include <string.h>
+
+#include "asm/restore.h"
+#include "asm/restorer.h"
+
+#include "protobuf/core.pb-c.h"
+
+/*
+ * Fill @sigframe from @core: blocked signal mask, FPU state, general
+ * purpose registers and the alternate signal stack.
+ *
+ * The mask comes from core->tc when present, else from
+ * core->thread_core if it has one, else it is zeroed.
+ *
+ * Returns 0 on success, -1 if restoring FPU or registers fails.
+ */
+int construct_sigframe(struct rt_sigframe *sigframe,
+		       struct rt_sigframe *rsigframe,
+		       CoreEntry *core)
+{
+	k_rtsigset_t *blk_sigset = (k_rtsigset_t*)&RT_SIGFRAME_UC(sigframe).uc_sigmask;
+	const void *saved_mask = NULL;
+
+	if (core->tc)
+		saved_mask = &core->tc->blk_sigset;
+	else if (core->thread_core->has_blk_sigset)
+		saved_mask = &core->thread_core->blk_sigset;
+
+	if (saved_mask)
+		memcpy(blk_sigset, saved_mask, sizeof(k_rtsigset_t));
+	else
+		memset(blk_sigset, 0, sizeof(k_rtsigset_t));
+
+	if (restore_fpu(sigframe, core))
+		return -1;
+
+	if (RT_SIGFRAME_HAS_FPU(sigframe)) {
+		if (sigreturn_prep_fpu_frame(sigframe, &RT_SIGFRAME_FPU(rsigframe)))
+			return -1;
+	}
+
+	if (restore_gpregs(sigframe, CORE_THREAD_ARCH_INFO(core)->gpregs))
+		return -1;
+
+	setup_sas(sigframe, core->thread_core->sas);
+
+	return 0;
+}
diff --git a/criu/signalfd.c b/criu/signalfd.c
new file mode 100644
index 000000000000..6d686d44c443
--- /dev/null
+++ b/criu/signalfd.c
@@ -0,0 +1,123 @@
+#include <unistd.h>
+#include <signal.h>
+#include <sys/signalfd.h>
+
+#include "compiler.h"
+#include "asm/types.h"
+#include "signalfd.h"
+#include "proc_parse.h"
+#include "imgset.h"
+#include "image.h"
+#include "util.h"
+#include "log.h"
+#include "files.h"
+
+#include "protobuf.h"
+#include "protobuf/signalfd.pb-c.h"
+
+/* Restore-side signalfd: the image entry plus its file descriptor object. */
+struct signalfd_info {
+ SignalfdEntry *sfe;
+ struct file_desc d;
+};
+
+/* Check @link against the "[signalfd]" anon link type. */
+int is_signalfd_link(char *link)
+{
+	int matches = is_anon_link_type(link, "[signalfd]");
+
+	return matches;
+}
+
+/*
+ * Argument passed through parse_fdinfo() to dump_signalfd_entry():
+ * the id and parameters of the file being dumped, plus a flag that is
+ * set once an entry has been written.
+ */
+struct signalfd_dump_arg {
+ u32 id;
+ const struct fd_parms *p;
+ bool dumped;
+};
+
+/*
+ * parse_fdinfo() callback: complete the parsed signalfd entry with the
+ * id, flags and owner from @arg and write it to the CR_FD_SIGNALFD
+ * image. A second call with the same @arg is an error.
+ *
+ * Returns the pb_write_one() result, or -1 on a repeated call.
+ */
+static int dump_signalfd_entry(union fdinfo_entries *e, void *arg)
+{
+	struct signalfd_dump_arg *ctx = arg;
+	const struct fd_parms *parms;
+
+	if (ctx->dumped) {
+		pr_err("Several counters in a file?\n");
+		return -1;
+	}
+	ctx->dumped = true;
+
+	parms = ctx->p;
+	e->sfd.id = ctx->id;
+	e->sfd.flags = parms->flags;
+	e->sfd.fown = (FownEntry *)&parms->fown;
+
+	return pb_write_one(img_from_set(glob_imgset, CR_FD_SIGNALFD),
+			    &e->sfd, PB_SIGNALFD);
+}
+
+/*
+ * Dump hook for signalfd files: parse the fdinfo of @lfd and let
+ * dump_signalfd_entry() write the image entry.
+ */
+static int dump_one_signalfd(int lfd, u32 id, const struct fd_parms *p)
+{
+	struct signalfd_dump_arg da = {
+		.id	= id,
+		.p	= p,
+		.dumped	= false,
+	};
+
+	return parse_fdinfo(lfd, FD_TYPES__SIGNALFD, dump_signalfd_entry, &da);
+}
+
+/* Dump operations for files of type FD_TYPES__SIGNALFD. */
+const struct fdtype_ops signalfd_dump_ops = {
+ .type = FD_TYPES__SIGNALFD,
+ .dump = dump_one_signalfd,
+};
+
+/*
+ * Convert the bit mask @from into the sigset_t @to: bit (sig - 1) of
+ * @from stands for signal number sig.
+ *
+ * Only as many signals as @from has bits can be described. The loop is
+ * bounded by that width as well as by NSIG, because shifting by the
+ * width of the type or more is undefined behaviour and NSIG - 1 may
+ * exceed 64 on some platforms.
+ */
+static void sigset_fill(sigset_t *to, unsigned long long from)
+{
+	const int mask_bits = 8 * sizeof(from);
+	int sig;
+
+	pr_info("\tCalculating sigmask for %Lx\n", from);
+	sigemptyset(to);
+	for (sig = 1; sig < NSIG && sig <= mask_bits; sig++) {
+		if (!(from & (1ULL << (sig - 1))))
+			continue;
+
+		pr_debug("\t\tAdd %d signal to mask\n", sig);
+		sigaddset(to, sig);
+	}
+}
+
+/*
+ * Open hook for a restored signalfd: create a fresh signalfd with the
+ * mask taken from the image entry and apply the saved owner and flags.
+ *
+ * Returns the new descriptor, or -1 on failure (the descriptor is
+ * closed if it was already created).
+ */
+static int signalfd_open(struct file_desc *d)
+{
+	struct signalfd_info *info = container_of(d, struct signalfd_info, d);
+	sigset_t mask;
+	int fd;
+
+	pr_info("Restoring signalfd %#x\n", info->sfe->id);
+
+	sigset_fill(&mask, info->sfe->sigmask);
+
+	fd = signalfd(-1, &mask, 0);
+	if (fd < 0) {
+		pr_perror("Can't create signalfd %#08x", info->sfe->id);
+		return -1;
+	}
+
+	if (rst_file_params(fd, info->sfe->fown, info->sfe->flags)) {
+		pr_perror("Can't restore params on signalfd %#08x",
+			  info->sfe->id);
+		close(fd);
+		return -1;
+	}
+
+	return fd;
+}
+
+/* Restore operations for files of type FD_TYPES__SIGNALFD. */
+static struct file_desc_ops signalfd_desc_ops = {
+ .type = FD_TYPES__SIGNALFD,
+ .open = signalfd_open,
+};
+
+/*
+ * Image collect callback: bind the decoded SignalfdEntry to the
+ * signalfd_info in @o and register its file descriptor object.
+ */
+static int collect_one_sigfd(void *o, ProtobufCMessage *msg)
+{
+	struct signalfd_info *info = o;
+	SignalfdEntry *entry = pb_msg(msg, SignalfdEntry);
+
+	info->sfe = entry;
+
+	return file_desc_add(&info->d, entry->id, &signalfd_desc_ops);
+}
+
+/*
+ * Collect descriptor for the signalfd image: each entry gets a
+ * signalfd_info as private data and goes through collect_one_sigfd().
+ */
+struct collect_image_info signalfd_cinfo = {
+ .fd_type = CR_FD_SIGNALFD,
+ .pb_type = PB_SIGNALFD,
+ .priv_size = sizeof(struct signalfd_info),
+ .collect = collect_one_sigfd,
+};
diff --git a/criu/sk-inet.c b/criu/sk-inet.c
new file mode 100644
index 000000000000..4d1110767c33
--- /dev/null
+++ b/criu/sk-inet.c
@@ -0,0 +1,758 @@
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <net/if.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <netinet/tcp.h>
+#include <arpa/inet.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include "asm/types.h"
+#include "libnetlink.h"
+#include "cr_options.h"
+#include "imgset.h"
+#include "inet_diag.h"
+#include "files.h"
+#include "image.h"
+#include "log.h"
+#include "util.h"
+#include "sockets.h"
+#include "sk-inet.h"
+
+/* Number of u32 words used for an address in the image: IPv4 and IPv6. */
+#define PB_ALEN_INET 1
+#define PB_ALEN_INET6 4
+
+/* All (type, port) pairs seen while collecting inet sockets. */
+static LIST_HEAD(inet_ports);
+
+/*
+ * One (socket type, source port) pair shared by the sockets bound to it.
+ * users counts the sockets registered via port_add(); open_inet_sk()
+ * decrements it and post_open_inet_sk() waits for it to reach zero.
+ */
+struct inet_port {
+ int port;
+ int type;
+ futex_t users;
+ mutex_t reuseaddr_lock;
+ struct list_head list;
+};
+
+/*
+ * Take a reference on the inet_port for (@type, @port), creating it
+ * with shmalloc() on first use.
+ *
+ * Returns the entry with its users counter incremented, or NULL if the
+ * allocation fails.
+ */
+static struct inet_port *port_add(int type, int port)
+{
+	struct inet_port *ip;
+
+	list_for_each_entry(ip, &inet_ports, list) {
+		if (ip->type != type || ip->port != port)
+			continue;
+
+		futex_inc(&ip->users);
+		return ip;
+	}
+
+	ip = shmalloc(sizeof(*ip));
+	if (!ip) {
+		pr_err("Not enough memory\n");
+		return NULL;
+	}
+
+	ip->port = port;
+	ip->type = type;
+
+	/* a fresh entry starts with a single user */
+	futex_init(&ip->users);
+	futex_inc(&ip->users);
+	mutex_init(&ip->reuseaddr_lock);
+
+	list_add(&ip->list, &inet_ports);
+
+	return ip;
+}
+
+/*
+ * Debug-print a collected inet socket; @act is a prefix naming the
+ * action. The address reads "<unknown>" if it can't be translated.
+ */
+static void show_one_inet(const char *act, const struct inet_sk_desc *sk)
+{
+	char src_addr[INET_ADDR_LEN] = "<unknown>";
+	const char *res;
+
+	res = inet_ntop(sk->sd.family, (void *)sk->src_addr, src_addr,
+			INET_ADDR_LEN);
+	if (!res)
+		pr_perror("Failed to translate address");
+
+	pr_debug("\t%s: ino 0x%8x family %4d type %4d port %8d "
+		"state %2d src_addr %s\n",
+		act, sk->sd.ino, sk->sd.family, sk->type, sk->src_port,
+		sk->state, src_addr);
+}
+
+/*
+ * Debug-print an inet socket image entry; @act is a prefix naming the
+ * action. The address reads "<unknown>" if it can't be translated.
+ */
+static void show_one_inet_img(const char *act, const InetSkEntry *e)
+{
+	char src_addr[INET_ADDR_LEN] = "<unknown>";
+	const char *res;
+
+	res = inet_ntop(e->family, (void *)e->src_addr, src_addr,
+			INET_ADDR_LEN);
+	if (!res)
+		pr_perror("Failed to translate address");
+
+	pr_debug("\t%s: family %d type %d proto %d port %d "
+		"state %d src_addr %s\n",
+		act, e->family, e->type, e->proto, e->src_port,
+		e->state, src_addr);
+}
+
+/*
+ * Tell whether sockets of protocol @proto can be dumped.
+ * Returns 1 for IP, TCP, UDP and UDP-Lite; logs an error mentioning
+ * @ino and returns 0 for anything else.
+ */
+static int can_dump_ipproto(int ino, int proto)
+{
+	/* Make sure it's a proto we support */
+	if (proto == IPPROTO_IP || proto == IPPROTO_TCP ||
+	    proto == IPPROTO_UDP || proto == IPPROTO_UDPLITE)
+		return 1;
+
+	pr_err("Unsupported proto %d for socket %x\n", proto, ino);
+	return 0;
+}
+
+/*
+ * Decide whether the collected inet socket @sk is in a state that can
+ * be dumped. Returns 1 if so, 0 (after logging the reason) otherwise.
+ */
+static int can_dump_inet_sk(const struct inet_sk_desc *sk)
+{
+	BUG_ON((sk->sd.family != AF_INET) && (sk->sd.family != AF_INET6));
+
+	if (sk->shutdown) {
+		pr_err("Can't dump shutdown inet socket %x\n",
+				sk->sd.ino);
+		return 0;
+	}
+
+	switch (sk->type) {
+	case SOCK_DGRAM:
+		if (sk->wqlen != 0) {
+			pr_err("Can't dump corked dgram socket %x\n",
+					sk->sd.ino);
+			return 0;
+		}
+
+		if (sk->rqlen)
+			pr_warn("Read queue is dropped for socket %x\n",
+					sk->sd.ino);
+
+		return 1;
+	case SOCK_STREAM:
+		break;
+	default:
+		pr_err("Can't dump %d inet socket %x. "
+				"Only can stream and dgram.\n",
+				sk->type, sk->sd.ino);
+		return 0;
+	}
+
+	/* Stream sockets: the verdict depends on the TCP state. */
+	if (sk->state == TCP_LISTEN) {
+		if (sk->rqlen != 0) {
+			/*
+			 * Currently the ICONS nla reports the conn
+			 * requests for listen sockets. Need to pick
+			 * those up and fix the connect job respectively
+			 */
+			pr_err("In-flight connection (l) for %x\n",
+					sk->sd.ino);
+			return 0;
+		}
+	} else if (sk->state == TCP_ESTABLISHED) {
+		if (!opts.tcp_established_ok) {
+			pr_err("Connected TCP socket, consider using --%s option.\n",
+					SK_EST_PARAM);
+			return 0;
+		}
+	} else if (sk->state != TCP_CLOSE) {
+		/* TCP_CLOSE is trivial: just create a socket on restore */
+		pr_err("Unknown inet socket %x state %d\n", sk->sd.ino, sk->state);
+		return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * Build a descriptor for an inet socket that lookup_socket() did not
+ * find (see the caller). The socket must have no peer and, for TCP, be
+ * in TCP_CLOSE state.
+ *
+ * Returns the new descriptor, already passed to sk_collect_one(), or
+ * NULL on any failure.
+ */
+static struct inet_sk_desc *gen_uncon_sk(int lfd, const struct fd_parms *p, int proto)
+{
+ struct inet_sk_desc *sk;
+ char address;
+ socklen_t aux;
+ int ret;
+
+ sk = xzalloc(sizeof(*sk));
+ if (!sk)
+ goto err;
+
+ /* It should have no peer name */
+ aux = sizeof(address);
+ ret = getsockopt(lfd, SOL_SOCKET, SO_PEERNAME, &address, &aux);
+ if (ret < 0) {
+ /* ENOTCONN is the expected outcome, anything else is an error */
+ if (errno != ENOTCONN) {
+ pr_perror("Unexpected error returned from unconnected socket");
+ goto err;
+ }
+ } else if (ret == 0) {
+ /* the call succeeded, so the socket does have a peer */
+ pr_err("Name resolved on unconnected socket\n");
+ goto err;
+ }
+
+ sk->sd.ino = p->stat.st_ino;
+
+ ret = do_dump_opt(lfd, SOL_SOCKET, SO_DOMAIN, &sk->sd.family, sizeof(sk->sd.family));
+ ret |= do_dump_opt(lfd, SOL_SOCKET, SO_TYPE, &sk->type, sizeof(sk->type));
+ if (ret)
+ goto err;
+
+ if (proto == IPPROTO_TCP) {
+ struct tcp_info info;
+
+ aux = sizeof(info);
+ ret = getsockopt(lfd, SOL_TCP, TCP_INFO, &info, &aux);
+ if (ret) {
+ pr_perror("Failed to obtain TCP_INFO");
+ goto err;
+ }
+
+ if (info.tcpi_state != TCP_CLOSE) {
+ pr_err("Socket state %d obtained but expected %d\n",
+ info.tcpi_state, TCP_CLOSE);
+ goto err;
+ }
+
+ /* NOTE(review): wqlen is taken from tcpi_backoff here -- confirm this is intended */
+ sk->wqlen = info.tcpi_backoff;
+ }
+
+ sk->state = TCP_CLOSE;
+
+ sk_collect_one(sk->sd.ino, sk->sd.family, &sk->sd);
+
+ return sk;
+err:
+ xfree(sk);
+ return NULL;
+}
+
+/*
+ * Read the IP-level options of socket @sk into @ioe. Only IP_FREEBIND
+ * is handled; has_freebind is set only when the option is non-zero.
+ *
+ * Returns 0 on success, non-zero if the option can't be read.
+ */
+static int dump_ip_opts(int sk, IpOptsEntry *ioe)
+{
+	int err;
+
+	err = dump_opt(sk, SOL_IP, IP_FREEBIND, &ioe->freebind);
+	ioe->has_freebind = ioe->freebind;
+
+	return err;
+}
+
+/* Stolen from the kernel's __ipv6_addr_type/__ipv6_addr_needs_scopeid;
+ * link local and (multicast + loopback + linklocal) addrs require a
+ * scope id.
+ *
+ * @src_addr is an IPv6 address as four 32-bit words in network byte
+ * order. For a multicast address (first byte 0xff) the scope is the
+ * low nibble of the second address byte, i.e. it lives in the first
+ * word, not in src_addr[1].
+ */
+#define IPV6_ADDR_SCOPE_NODELOCAL 0x01
+#define IPV6_ADDR_SCOPE_LINKLOCAL 0x02
+static bool needs_scope_id(uint32_t *src_addr)
+{
+	/* ff00::/8 -- multicast */
+	if ((src_addr[0] & htonl(0xFF000000)) == htonl(0xFF000000)) {
+		uint32_t mc_scope = (ntohl(src_addr[0]) >> 16) & 0x0F;
+
+		if (mc_scope == IPV6_ADDR_SCOPE_NODELOCAL ||
+		    mc_scope == IPV6_ADDR_SCOPE_LINKLOCAL)
+			return true;
+	}
+
+	/* fe80::/10 -- link-local unicast */
+	if ((src_addr[0] & htonl(0xFFC00000)) == htonl(0xFE800000))
+		return true;
+
+	return false;
+}
+
+/*
+ * Dump the inet socket behind @lfd (AF_INET or AF_INET6, per @family)
+ * into the CR_FD_INETSK image under the id @id.
+ *
+ * The socket is looked up among the collected ones; if it isn't there
+ * a descriptor is generated with gen_uncon_sk(). For TCP sockets
+ * dump_one_tcp() is called after the entry has been written.
+ *
+ * Returns 0 on success, non-zero on failure. Everything allocated for
+ * the image entry, including the interface name, is released on every
+ * path.
+ */
+static int do_dump_one_inet_fd(int lfd, u32 id, const struct fd_parms *p, int family)
+{
+	struct inet_sk_desc *sk;
+	InetSkEntry ie = INET_SK_ENTRY__INIT;
+	IpOptsEntry ipopts = IP_OPTS_ENTRY__INIT;
+	SkOptsEntry skopts = SK_OPTS_ENTRY__INIT;
+	int ret = -1, err = -1, proto;
+
+	ret = do_dump_opt(lfd, SOL_SOCKET, SO_PROTOCOL,
+					&proto, sizeof(proto));
+	if (ret)
+		goto err;
+
+	if (!can_dump_ipproto(p->stat.st_ino, proto))
+		goto err;
+
+	sk = (struct inet_sk_desc *)lookup_socket(p->stat.st_ino, family, proto);
+	if (IS_ERR(sk))
+		goto err;
+	if (!sk) {
+		sk = gen_uncon_sk(lfd, p, proto);
+		if (!sk)
+			goto err;
+	}
+
+	if (!can_dump_inet_sk(sk))
+		goto err;
+
+	BUG_ON(sk->sd.already_dumped);
+
+	ie.id		= id;
+	ie.ino		= sk->sd.ino;
+	ie.family	= family;
+	ie.proto	= proto;
+	ie.type		= sk->type;
+	ie.state	= sk->state;
+	ie.src_port	= sk->src_port;
+	ie.dst_port	= sk->dst_port;
+	ie.backlog	= sk->wqlen;
+	ie.flags	= p->flags;
+
+	ie.fown		= (FownEntry *)&p->fown;
+	ie.opts		= &skopts;
+	ie.ip_opts	= &ipopts;
+
+	ie.n_src_addr = PB_ALEN_INET;
+	ie.n_dst_addr = PB_ALEN_INET;
+	if (ie.family == AF_INET6) {
+		int val;
+		char device[IFNAMSIZ];
+		socklen_t len = sizeof(device);
+
+		ie.n_src_addr = PB_ALEN_INET6;
+		ie.n_dst_addr = PB_ALEN_INET6;
+
+		ret = dump_opt(lfd, SOL_IPV6, IPV6_V6ONLY, &val);
+		if (ret < 0)
+			goto err;
+
+		ie.v6only = val ? true : false;
+		ie.has_v6only = true;
+
+		/* ifindex only matters on source ports for bind, so let's
+		 * find only that ifindex. */
+		if (sk->src_port && needs_scope_id(sk->src_addr)) {
+			if (getsockopt(lfd, SOL_SOCKET, SO_BINDTODEVICE, device, &len) < 0) {
+				pr_perror("can't get ifname");
+				goto err;
+			}
+
+			if (len > 0) {
+				/* heap copy, released at the err label */
+				ie.ifname = xstrdup(device);
+				if (!ie.ifname)
+					goto err;
+			} else {
+				pr_err("couldn't find ifname for %d, can't bind\n", id);
+				goto err;
+			}
+		}
+	}
+
+	ie.src_addr = xmalloc(pb_repeated_size(&ie, src_addr));
+	ie.dst_addr = xmalloc(pb_repeated_size(&ie, dst_addr));
+
+	if (!ie.src_addr || !ie.dst_addr)
+		goto err;
+
+	memcpy(ie.src_addr, sk->src_addr, pb_repeated_size(&ie, src_addr));
+	memcpy(ie.dst_addr, sk->dst_addr, pb_repeated_size(&ie, dst_addr));
+
+	if (dump_ip_opts(lfd, &ipopts))
+		goto err;
+
+	if (dump_socket_opts(lfd, &skopts))
+		goto err;
+
+	if (pb_write_one(img_from_set(glob_imgset, CR_FD_INETSK), &ie, PB_INET_SK))
+		goto err;
+
+	pr_info("Dumping inet socket at %d\n", p->fd);
+	show_one_inet("Dumping", sk);
+	show_one_inet_img("Dumped", &ie);
+	sk->sd.already_dumped = 1;
+	sk->cpt_reuseaddr = skopts.reuseaddr;
+
+	switch (proto) {
+	case IPPROTO_TCP:
+		err = dump_one_tcp(lfd, sk);
+		break;
+	default:
+		err = 0;
+		break;
+	}
+err:
+	/* shared exit path: err is still -1 unless the dump completed */
+	release_skopts(&skopts);
+	/*
+	 * ie.ifname is only ever set from xstrdup() above.
+	 * NOTE(review): assumes INET_SK_ENTRY__INIT leaves ifname NULL
+	 * and that xfree(NULL) is a no-op -- confirm.
+	 */
+	xfree(ie.ifname);
+	xfree(ie.src_addr);
+	xfree(ie.dst_addr);
+	return err;
+}
+
+/* Dump hook for IPv4 sockets: do_dump_one_inet_fd() with PF_INET. */
+static int dump_one_inet_fd(int lfd, u32 id, const struct fd_parms *p)
+{
+ return do_dump_one_inet_fd(lfd, id, p, PF_INET);
+}
+
+/* Dump operations for IPv4 sockets (file type FD_TYPES__INETSK). */
+const struct fdtype_ops inet_dump_ops = {
+ .type = FD_TYPES__INETSK,
+ .dump = dump_one_inet_fd,
+};
+
+/* Dump hook for IPv6 sockets: do_dump_one_inet_fd() with PF_INET6. */
+static int dump_one_inet6_fd(int lfd, u32 id, const struct fd_parms *p)
+{
+ return do_dump_one_inet_fd(lfd, id, p, PF_INET6);
+}
+
+/* Dump operations for IPv6 sockets; same file type as the IPv4 ones. */
+const struct fdtype_ops inet6_dump_ops = {
+ .type = FD_TYPES__INETSK,
+ .dump = dump_one_inet6_fd,
+};
+
+/*
+ * Turn one inet_diag netlink message into an inet_sk_desc and register
+ * it with sk_collect_one() under the inode reported in the message.
+ *
+ * Returns -1 if the descriptor can't be allocated, otherwise the
+ * sk_collect_one() result.
+ */
+int inet_collect_one(struct nlmsghdr *h, int family, int type)
+{
+	struct inet_diag_msg *msg = NLMSG_DATA(h);
+	struct rtattr *tb[INET_DIAG_MAX+1];
+	struct inet_sk_desc *desc;
+	int ret;
+
+	parse_rtattr(tb, INET_DIAG_MAX, (struct rtattr *)(msg + 1),
+		     h->nlmsg_len - NLMSG_LENGTH(sizeof(*msg)));
+
+	desc = xzalloc(sizeof(*desc));
+	if (!desc)
+		return -1;
+
+	desc->type = type;
+	desc->state = msg->idiag_state;
+	desc->rqlen = msg->idiag_rqueue;
+	desc->wqlen = msg->idiag_wqueue;
+
+	/* ports arrive in network byte order */
+	desc->src_port = ntohs(msg->id.idiag_sport);
+	desc->dst_port = ntohs(msg->id.idiag_dport);
+	memcpy(desc->src_addr, msg->id.idiag_src, sizeof(u32) * 4);
+	memcpy(desc->dst_addr, msg->id.idiag_dst, sizeof(u32) * 4);
+
+	if (!tb[INET_DIAG_SHUTDOWN]) {
+		pr_err_once("Can't check shutdown state of inet socket\n");
+	} else {
+		desc->shutdown = *(u8 *)RTA_DATA(tb[INET_DIAG_SHUTDOWN]);
+	}
+
+	ret = sk_collect_one(msg->idiag_inode, family, &desc->sd);
+
+	show_one_inet("Collected", desc);
+
+	return ret;
+}
+
+/* Defined further down; declared here for the ops table below. */
+static int open_inet_sk(struct file_desc *d);
+static int post_open_inet_sk(struct file_desc *d, int sk);
+
+/* Restore operations for inet sockets (file type FD_TYPES__INETSK). */
+static struct file_desc_ops inet_desc_ops = {
+ .type = FD_TYPES__INETSK,
+ .open = open_inet_sk,
+ .post_open = post_open_inet_sk,
+};
+
+/* Non-zero iff @ie describes a TCP socket in the ESTABLISHED state. */
+static inline int tcp_connection(InetSkEntry *ie)
+{
+	if (ie->proto != IPPROTO_TCP)
+		return 0;
+
+	return ie->state == TCP_ESTABLISHED;
+}
+
+/*
+ * Image collect callback for inet sockets: bind the decoded entry to
+ * the inet_sk_info in @o, note established TCP connections, take a
+ * reference on the (type, source port) record and register the file
+ * descriptor object.
+ *
+ * Returns -1 if the port record can't be obtained, otherwise the
+ * file_desc_add() result.
+ */
+static int collect_one_inetsk(void *o, ProtobufCMessage *base)
+{
+	struct inet_sk_info *ii = o;
+	InetSkEntry *ie = pb_msg(base, InetSkEntry);
+
+	ii->ie = ie;
+	if (tcp_connection(ie))
+		tcp_locked_conn_add(ii);
+
+	/*
+	 * A socket can reuse addr only if all previous sockets allow that,
+	 * so a value of SO_REUSEADDR can be restored after restoring all
+	 * sockets.
+	 */
+	ii->port = port_add(ie->type, ie->src_port);
+	if (!ii->port)
+		return -1;
+
+	return file_desc_add(&ii->d, ie->id, &inet_desc_ops);
+}
+
+/*
+ * Collect descriptor for the inet socket image: each entry gets an
+ * inet_sk_info as private data and goes through collect_one_inetsk().
+ */
+struct collect_image_info inet_sk_cinfo = {
+ .fd_type = CR_FD_INETSK,
+ .pb_type = PB_INET_SK,
+ .priv_size = sizeof(struct inet_sk_info),
+ .collect = collect_one_inetsk,
+};
+
+/* Collect the inet socket image; returns the collect_image() result. */
+int collect_inet_sockets(void)
+{
+ return collect_image(&inet_sk_cinfo);
+}
+
+/*
+ * Check that the address arrays in @ie have a length that fits the
+ * socket family: at least PB_ALEN_INET words for IPv4, exactly
+ * PB_ALEN_INET6 words for IPv6.
+ *
+ * Returns 0 if the lengths are fine, -1 (after logging) otherwise.
+ */
+static int inet_validate_address(InetSkEntry *ie)
+{
+	switch (ie->family) {
+	case AF_INET:
+		/* v0.1 had 4 in ipv4 addr len */
+		if (ie->n_src_addr >= PB_ALEN_INET &&
+		    ie->n_dst_addr >= PB_ALEN_INET)
+			return 0;
+		break;
+	case AF_INET6:
+		if (ie->n_src_addr == PB_ALEN_INET6 &&
+		    ie->n_dst_addr == PB_ALEN_INET6)
+			return 0;
+		break;
+	default:
+		break;
+	}
+
+	pr_err("Addr len mismatch f %d ss %zu ds %zu\n", ie->family,
+	       pb_repeated_size(ie, src_addr),
+	       pb_repeated_size(ie, dst_addr));
+
+	return -1;
+}
+
+/*
+ * Post-open hook for inet sockets: put SO_REUSEADDR back to the dumped
+ * value.
+ *
+ * Established TCP connections are only remembered (sk_fd) and handled
+ * later. Sockets dumped with reuseaddr set need nothing, since
+ * open_inet_sk() sets the option on every socket. The rest wait until
+ * the port has no users left and then clear the option.
+ *
+ * Returns 0 on success, -1 if the option can't be set.
+ */
+static int post_open_inet_sk(struct file_desc *d, int sk)
+{
+	struct inet_sk_info *ii = container_of(d, struct inet_sk_info, d);
+	int val;
+
+	/*
+	 * TCP sockets are handled at the last moment
+	 * after unlocking connections.
+	 */
+	if (tcp_connection(ii->ie)) {
+		pr_debug("Schedule %d socket for repair off\n", sk);
+		BUG_ON(ii->sk_fd != -1);
+		ii->sk_fd = sk;
+		return 0;
+	}
+
+	/* SO_REUSEADDR is set for all sockets */
+	if (ii->ie->opts->reuseaddr)
+		return 0;
+
+	futex_wait_until(&ii->port->users, 0);
+
+	val = ii->ie->opts->reuseaddr;
+
+	return restore_opt(sk, SOL_SOCKET, SO_REUSEADDR, &val) ? -1 : 0;
+}
+
+/*
+ * Apply the IP-level options saved in @ioe to socket @sk. IP_FREEBIND
+ * is restored only if the entry carries it.
+ *
+ * Returns 0 on success, non-zero if setting the option fails.
+ */
+int restore_ip_opts(int sk, IpOptsEntry *ioe)
+{
+	if (!ioe->has_freebind)
+		return 0;
+
+	return restore_opt(sk, SOL_IP, IP_FREEBIND, &ioe->freebind);
+}
+/*
+ * Open hook for inet sockets: create the socket described by the image
+ * entry and bring it to the dumped state (bound, listening, connected
+ * or a repaired TCP connection), then restore file parameters and
+ * socket options.
+ *
+ * Returns the new descriptor, or -1 on failure (the socket is closed).
+ */
+static int open_inet_sk(struct file_desc *d)
+{
+ struct inet_sk_info *ii;
+ InetSkEntry *ie;
+ int sk, yes = 1;
+
+ ii = container_of(d, struct inet_sk_info, d);
+ ie = ii->ie;
+
+ show_one_inet_img("Restore", ie);
+
+ if (ie->family != AF_INET && ie->family != AF_INET6) {
+ pr_err("Unsupported socket family: %d\n", ie->family);
+ return -1;
+ }
+
+ if ((ie->type != SOCK_STREAM) && (ie->type != SOCK_DGRAM)) {
+ pr_err("Unsupported socket type: %d\n", ie->type);
+ return -1;
+ }
+
+ if (inet_validate_address(ie))
+ return -1;
+
+ sk = socket(ie->family, ie->type, ie->proto);
+ if (sk < 0) {
+ pr_perror("Can't create inet socket");
+ return -1;
+ }
+
+ if (ie->v6only) {
+ if (restore_opt(sk, SOL_IPV6, IPV6_V6ONLY, &yes) == -1)
+ goto err;
+ }
+
+ /*
+ * Set SO_REUSEADDR, because some sockets can be bound to one addr.
+ * The origin value of SO_REUSEADDR will be restored in post_open.
+ */
+ if (restore_opt(sk, SOL_SOCKET, SO_REUSEADDR, &yes))
+ goto err;
+
+ if (tcp_connection(ie)) {
+ if (!opts.tcp_established_ok) {
+ pr_err("Connected TCP socket in image\n");
+ goto err;
+ }
+
+ if (restore_one_tcp(sk, ii))
+ goto err;
+
+ goto done;
+ }
+
+ /*
+ * Listen sockets are easiest ones -- simply
+ * bind() and listen(), and that's all.
+ */
+
+ if (ie->src_port) {
+ if (inet_bind(sk, ii))
+ goto err;
+ }
+
+ if (ie->state == TCP_LISTEN) {
+ if (ie->proto != IPPROTO_TCP) {
+ pr_err("Wrong socket in listen state %d\n", ie->proto);
+ goto err;
+ }
+
+ /* listen() is done under the per-port reuseaddr lock */
+ mutex_lock(&ii->port->reuseaddr_lock);
+ if (listen(sk, ie->backlog) == -1) {
+ pr_perror("Can't listen on a socket");
+ mutex_unlock(&ii->port->reuseaddr_lock);
+ goto err;
+ }
+ mutex_unlock(&ii->port->reuseaddr_lock);
+ }
+
+ if (ie->state == TCP_ESTABLISHED &&
+ inet_connect(sk, ii))
+ goto err;
+done:
+ /*
+ * This socket is done with its port; post_open_inet_sk() waits for
+ * the counter to reach zero.
+ * NOTE(review): the "goto err" paths above skip this decrement --
+ * confirm nothing can be left waiting after a failed open.
+ */
+ futex_dec_and_wake(&ii->port->users);
+
+ if (rst_file_params(sk, ie->fown, ie->flags))
+ goto err;
+
+ if (ie->ip_opts && restore_ip_opts(sk, ie->ip_opts))
+ goto err;
+
+ if (restore_socket_opts(sk, ie->opts))
+ goto err;
+
+ return sk;
+
+err:
+ close(sk);
+ return -1;
+}
+
+/* Storage big enough for either an IPv4 or an IPv6 socket address. */
+union sockaddr_inet {
+	struct sockaddr_in v4;
+	struct sockaddr_in6 v6;
+};
+
+/*
+ * Build a socket address in @sa from image data: @pb_port in host byte
+ * order, @pb_addr as stored in the image and, for IPv6, @ifindex as
+ * the scope id.
+ *
+ * Returns the size of the address that was filled in. Any @family
+ * other than AF_INET/AF_INET6 hits BUG().
+ */
+static int restore_sockaddr(union sockaddr_inet *sa,
+		int family, u32 pb_port, u32 *pb_addr, u32 ifindex)
+{
+	BUILD_BUG_ON(sizeof(sa->v4.sin_addr.s_addr) > PB_ALEN_INET * sizeof(u32));
+	BUILD_BUG_ON(sizeof(sa->v6.sin6_addr.s6_addr) > PB_ALEN_INET6 * sizeof(u32));
+
+	memzero(sa, sizeof(*sa));
+
+	switch (family) {
+	case AF_INET:
+		sa->v4.sin_family = AF_INET;
+		sa->v4.sin_port = htons(pb_port);
+		memcpy(&sa->v4.sin_addr.s_addr, pb_addr, sizeof(sa->v4.sin_addr.s_addr));
+		return sizeof(sa->v4);
+
+	case AF_INET6:
+		sa->v6.sin6_family = AF_INET6;
+		sa->v6.sin6_port = htons(pb_port);
+		memcpy(sa->v6.sin6_addr.s6_addr, pb_addr, sizeof(sa->v6.sin6_addr.s6_addr));
+
+		/* Here although the struct member is called scope_id, the
+		 * kernel really wants ifindex. See
+		 * /net/ipv6/af_inet6.c:inet6_bind for details.
+		 */
+		sa->v6.sin6_scope_id = ifindex;
+		return sizeof(sa->v6);
+
+	default:
+		break;
+	}
+
+	BUG();
+	return -1;
+}
+
+/*
+ * Bind @sk to the source address and port recorded in @ii.
+ *
+ * If the image names an interface, its index is resolved and used as
+ * the IPv6 scope id. For IPv6 sockets IP_FREEBIND is switched on
+ * around bind() and switched off again afterwards unless the image
+ * says it was set.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int inet_bind(int sk, struct inet_sk_info *ii)
+{
+	InetSkEntry *ie = ii->ie;
+	union sockaddr_inet addr;
+	bool rst_freebind = false;
+	int addr_size, ifindex = 0;
+
+	if (ie->ifname) {
+		ifindex = if_nametoindex(ie->ifname);
+		if (!ifindex) {
+			pr_err("couldn't find ifindex for %s\n", ie->ifname);
+			return -1;
+		}
+	}
+
+	addr_size = restore_sockaddr(&addr, ie->family,
+				     ie->src_port, ie->src_addr, ifindex);
+
+	/*
+	 * ipv6 addresses go through a "tentative" phase and
+	 * sockets could not be bound to them in this moment
+	 * without setting IP_FREEBIND.
+	 */
+	if (ie->family == AF_INET6) {
+		int yes = 1;
+
+		if (restore_opt(sk, SOL_IP, IP_FREEBIND, &yes))
+			return -1;
+
+		if (ie->ip_opts && ie->ip_opts->freebind) {
+			/*
+			 * The right value is already set, so
+			 * don't need to restore it in restore_ip_opts()
+			 */
+			ie->ip_opts->has_freebind = false;
+		} else {
+			rst_freebind = true;
+		}
+	}
+
+	if (bind(sk, (struct sockaddr *)&addr, addr_size) == -1) {
+		pr_perror("Can't bind inet socket (id %d)", ie->id);
+		return -1;
+	}
+
+	if (!rst_freebind)
+		return 0;
+
+	{
+		int no = 0;
+
+		/*
+		 * The "no" value is default, so it will not be
+		 * restore in restore_ip_opts()
+		 */
+		if (restore_opt(sk, SOL_IP, IP_FREEBIND, &no))
+			return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Connect @sk to the destination address and port recorded in @ii.
+ * Returns 0 on success, -1 if connect() fails.
+ */
+int inet_connect(int sk, struct inet_sk_info *ii)
+{
+	InetSkEntry *ie = ii->ie;
+	union sockaddr_inet peer;
+	int peer_len;
+
+	peer_len = restore_sockaddr(&peer, ie->family,
+				    ie->dst_port, ie->dst_addr, 0);
+
+	if (connect(sk, (struct sockaddr *)&peer, peer_len) != -1)
+		return 0;
+
+	pr_perror("Can't connect inet socket back");
+	return -1;
+}
+
+/* Return the reuseaddr lock of the port record @ii is attached to. */
+mutex_t *inet_get_reuseaddr_lock(struct inet_sk_info *ii)
+{
+	struct inet_port *port = ii->port;
+
+	return &port->reuseaddr_lock;
+}
diff --git a/criu/sk-netlink.c b/criu/sk-netlink.c
new file mode 100644
index 000000000000..a98b26dc82b4
--- /dev/null
+++ b/criu/sk-netlink.c
@@ -0,0 +1,233 @@
+#include <unistd.h>
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+
+#include "imgset.h"
+#include "files.h"
+#include "sockets.h"
+#include "util.h"
+
+#include "protobuf.h"
+#include "protobuf/sk-netlink.pb-c.h"
+#include "netlink_diag.h"
+#include "libnetlink.h"
+
+struct netlink_sk_desc { /* netlink socket description filled by netlink_receive_one() */
+ struct socket_desc sd;
+ u32 portid; /* copied from ndiag_portid */
+ u32 *groups; /* copy of the NETLINK_DIAG_GROUPS payload, NULL when the attribute is absent */
+ u32 gsize; /* size of the groups payload in bytes */
+ u32 dst_portid; /* copied from ndiag_dst_portid */
+ u32 dst_group; /* copied from ndiag_dst_group */
+ u8 state; /* copied from ndiag_state */
+ u8 protocol; /* copied from ndiag_protocol */
+};
+
+/*
+ * Turn one netlink diag message into a netlink_sk_desc and hand it to
+ * sk_collect_one(). Returns -1 on allocation failure, otherwise the
+ * result of sk_collect_one().
+ */
+int netlink_receive_one(struct nlmsghdr *hdr, void *arg)
+{
+	struct rtattr *tb[NETLINK_DIAG_MAX+1];
+	struct netlink_diag_msg *msg = NLMSG_DATA(hdr);
+	struct netlink_sk_desc *desc;
+	struct rtattr *grp;
+
+	pr_debug("Collect netlink sock 0x%x\n", msg->ndiag_ino);
+
+	desc = xmalloc(sizeof(*desc));
+	if (desc == NULL)
+		return -1;
+
+	desc->protocol = msg->ndiag_protocol;
+	desc->portid = msg->ndiag_portid;
+	desc->dst_portid = msg->ndiag_dst_portid;
+	desc->dst_group = msg->ndiag_dst_group;
+	desc->state = msg->ndiag_state;
+
+	/* No groups until the attribute below says otherwise. */
+	desc->groups = NULL;
+	desc->gsize = 0;
+
+	parse_rtattr(tb, NETLINK_DIAG_MAX, (struct rtattr *)(msg + 1),
+		     hdr->nlmsg_len - NLMSG_LENGTH(sizeof(*msg)));
+
+	grp = tb[NETLINK_DIAG_GROUPS];
+	if (grp) {
+		desc->gsize = RTA_PAYLOAD(grp);
+
+		desc->groups = xmalloc(desc->gsize);
+		if (desc->groups == NULL) {
+			xfree(desc);
+			return -1;
+		}
+		memcpy(desc->groups, RTA_DATA(grp), desc->gsize);
+	}
+
+	return sk_collect_one(msg->ndiag_ino, PF_NETLINK, &desc->sd);
+}
+
+/*
+ * A netlink socket can be dumped only when fd_has_data() reports 0
+ * for it; queued data is reported as an error.
+ */
+static bool can_dump_netlink_sk(int lfd)
+{
+	switch (fd_has_data(lfd)) {
+	case 0:
+		return true;
+	case 1:
+		pr_err("The socket has data to read\n");
+		return false;
+	default:
+		return false;
+	}
+}
+
+/*
+ * Dump one netlink socket file into the CR_FD_NETLINK_SK image.
+ *
+ * @lfd: local descriptor of the socket being dumped
+ * @id:  file id to store in the image entry
+ * @p:   file parameters (inode, flags, owner) of the descriptor
+ *
+ * Returns 0 on success, -1 on error.
+ */
+static int dump_one_netlink_fd(int lfd, u32 id, const struct fd_parms *p)
+{
+	struct netlink_sk_desc *sk;
+	NetlinkSkEntry ne = NETLINK_SK_ENTRY__INIT;
+	SkOptsEntry skopts = SK_OPTS_ENTRY__INIT;
+	int ret;
+
+	sk = (struct netlink_sk_desc *)lookup_socket(p->stat.st_ino, PF_NETLINK, 0);
+	if (IS_ERR(sk))
+		return -1;
+
+	ne.id = id;
+	ne.ino = p->stat.st_ino;
+	/*
+	 * open_netlink_sk() feeds nse->flags to rst_file_params(), so the
+	 * file flags have to be saved here (the packet socket dump does
+	 * the same with psk.flags).
+	 */
+	ne.flags = p->flags;
+
+	if (!can_dump_netlink_sk(lfd))
+		return -1;
+
+	if (sk) {
+		BUG_ON(sk->sd.already_dumped);
+
+		ne.protocol = sk->protocol;
+		ne.portid = sk->portid;
+		ne.groups = sk->groups;
+
+		ne.n_groups = sk->gsize / sizeof(ne.groups[0]);
+		/*
+		 * On 64-bit sk->gsize is multiple to 8 bytes (sizeof(long)),
+		 * so remove the last 4 bytes if they are empty.
+		 */
+		if (ne.n_groups && sk->groups[ne.n_groups - 1] == 0)
+			ne.n_groups -= 1;
+
+		if (ne.n_groups > 1) {
+			pr_err("%d %x\n", sk->gsize, sk->groups[1]);
+			pr_err("The netlink socket 0x%x has more than 32 groups\n", ne.ino);
+			return -1;
+		}
+		if (sk->groups && !sk->portid) {
+			pr_err("The netlink socket 0x%x is bound to groups but not to portid\n", ne.ino);
+			return -1;
+		}
+		ne.state = sk->state;
+		ne.dst_portid = sk->dst_portid;
+		ne.dst_group = sk->dst_group;
+	} else { /* unconnected and unbound socket */
+		int val;
+		socklen_t aux = sizeof(val);
+
+		if (getsockopt(lfd, SOL_SOCKET, SO_PROTOCOL, &val, &aux) < 0) {
+			pr_perror("Unable to get protocol for netlink socket");
+			return -1;
+		}
+
+		ne.protocol = val;
+	}
+
+	ne.fown = (FownEntry *)&p->fown;
+	ne.opts = &skopts;
+
+	if (dump_socket_opts(lfd, &skopts))
+		return -1;
+
+	ret = pb_write_one(img_from_set(glob_imgset, CR_FD_NETLINK_SK), &ne, PB_NETLINK_SK) ? -1 : 0;
+
+	/* Drop whatever dump_socket_opts() attached to skopts. */
+	release_skopts(&skopts);
+	return ret;
+}
+
+const struct fdtype_ops netlink_dump_ops = { /* dump callback table for FD_TYPES__NETLINKSK files */
+ .type = FD_TYPES__NETLINKSK,
+ .dump = dump_one_netlink_fd,
+};
+
+struct netlink_sock_info { /* restore-side record for one netlink socket image entry */
+ NetlinkSkEntry *nse; /* set by collect_one_netlink_sk() */
+ struct file_desc d; /* registered with file_desc_add(); open_netlink_sk() maps it back via container_of() */
+};
+
+/*
+ * Re-create a netlink socket from its image entry: create it, bind it
+ * to the saved portid/groups, reconnect it to the saved peer and
+ * restore file parameters and socket options.
+ *
+ * Returns the new socket descriptor, or -1 on error.
+ */
+static int open_netlink_sk(struct file_desc *d)
+{
+	struct netlink_sock_info *nsi;
+	NetlinkSkEntry *nse;
+	struct sockaddr_nl addr;
+	int sk = -1;
+
+	nsi = container_of(d, struct netlink_sock_info, d);
+	nse = nsi->nse;
+
+	pr_info("Opening netlink socket id %#x\n", nse->id);
+
+	sk = socket(PF_NETLINK, SOCK_RAW, nse->protocol);
+	if (sk < 0) {
+		pr_perror("Can't create netlink sock");
+		return -1;
+	}
+
+	/*
+	 * Zero the address once up front: the connect() branch below may
+	 * run without the bind() branch and must not pass stack garbage
+	 * (e.g. in nl_pad) to the kernel.
+	 */
+	memset(&addr, 0, sizeof(addr));
+
+	if (nse->portid) {
+		addr.nl_family = AF_NETLINK;
+		if (nse->n_groups > 1) {
+			pr_err("Groups above 32 are not supported yet\n");
+			goto err;
+		}
+		if (nse->n_groups)
+			addr.nl_groups = nse->groups[0];
+		addr.nl_pid = nse->portid;
+
+		if (bind(sk, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
+			pr_perror("Can't bind netlink socket");
+			goto err;
+		}
+	}
+
+	if (nse->state == NETLINK_CONNECTED) {
+		if (nse->dst_group > 32) {
+			pr_err("Unsupported netlink dst group %u\n", nse->dst_group);
+			goto err;
+		}
+
+		addr.nl_family = AF_NETLINK;
+		/*
+		 * dst_group is a 1-based bit number, 0 means "no group".
+		 * Shifting by (0 - 1) would be undefined, so map it to an
+		 * empty mask, and shift an unsigned one to reach bit 31.
+		 */
+		addr.nl_groups = nse->dst_group ? 1U << (nse->dst_group - 1) : 0;
+		addr.nl_pid = nse->dst_portid;
+		if (connect(sk, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
+			pr_perror("Can't connect netlink socket");
+			goto err;
+		}
+	}
+
+	if (rst_file_params(sk, nse->fown, nse->flags))
+		goto err;
+
+	if (restore_socket_opts(sk, nse->opts))
+		goto err;
+
+	return sk;
+err:
+	close(sk);
+	return -1;
+}
+
+static struct file_desc_ops netlink_sock_desc_ops = { /* restore callbacks passed to file_desc_add() for netlink sockets */
+ .type = FD_TYPES__NETLINKSK,
+ .open = open_netlink_sk,
+};
+
+/*
+ * Image collect callback: remember the decoded NetlinkSkEntry in the
+ * per-entry private area @o and register its file descriptor object.
+ */
+static int collect_one_netlink_sk(void *o, ProtobufCMessage *base)
+{
+	struct netlink_sock_info *info = o;
+	NetlinkSkEntry *entry = pb_msg(base, NetlinkSkEntry);
+
+	info->nse = entry;
+
+	return file_desc_add(&info->d, entry->id, &netlink_sock_desc_ops);
+}
+
+struct collect_image_info netlink_sk_cinfo = { /* describes how CR_FD_NETLINK_SK entries are collected */
+ .fd_type = CR_FD_NETLINK_SK,
+ .pb_type = PB_NETLINK_SK,
+ .priv_size = sizeof(struct netlink_sock_info), /* collect_one_netlink_sk() treats its first argument as this struct */
+ .collect = collect_one_netlink_sk,
+};
diff --git a/criu/sk-packet.c b/criu/sk-packet.c
new file mode 100644
index 000000000000..a296dfa16a7f
--- /dev/null
+++ b/criu/sk-packet.c
@@ -0,0 +1,504 @@
+#include <linux/if_packet.h>
+#include <sys/socket.h>
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <unistd.h>
+#include <string.h>
+#include "asm/types.h"
+#include "imgset.h"
+#include "files.h"
+#include "sockets.h"
+#include "libnetlink.h"
+#include "sk-packet.h"
+#include "packet_diag.h"
+#include "vma.h"
+#include <arpa/inet.h>
+
+#include "protobuf.h"
+#include "protobuf/packet-sock.pb-c.h"
+#include "protobuf/fdinfo.pb-c.h"
+
+struct packet_sock_info { /* restore-side record for one packet socket image entry */
+ PacketSockEntry *pse; /* set by collect_one_packet_sk() */
+ struct file_desc d; /* registered with file_desc_add(); open_packet_sk() maps it back via container_of() */
+};
+
+struct packet_mreq_max { /* option value handed to PACKET_ADD_MEMBERSHIP by restore_mreqs() */
+ int mr_ifindex;
+ unsigned short mr_type;
+ unsigned short mr_alen;
+ unsigned char mr_address[MAX_ADDR_LEN]; /* restore_mreqs() rejects saved addresses longer than this */
+};
+
+struct packet_sock_desc { /* packet socket description filled by packet_receive_one() */
+ struct socket_desc sd;
+ unsigned int file_id; /* 0 until dump_one_packet_fd() stores the file id here */
+ unsigned int type; /* copied from pdiag_type */
+ unsigned short proto; /* htons(pdiag_num) */
+ struct packet_diag_info nli; /* copy of the PACKET_DIAG_INFO payload */
+ int mreq_n; /* number of entries in mreqs */
+ struct packet_diag_mclist *mreqs; /* copy of the PACKET_DIAG_MCLIST payload */
+ unsigned int fanout; /* PACKET_DIAG_FANOUT value, or NO_FANOUT */
+ struct packet_diag_ring *rx, *tx; /* ring descriptions, NULL when the attribute is absent */
+};
+
+#define NO_FANOUT ((unsigned int)-1) /* stored in packet_sock_desc.fanout when no PACKET_DIAG_FANOUT attribute was received */
+
+/*
+ * Convert the collected multicast memberships of @sd into PacketMclist
+ * messages attached to @psk.
+ *
+ * psk->n_mclist counts only the entries that were really allocated, so
+ * the caller can release a partially built list after a failure.
+ * Returns 0 on success, -1 on error.
+ */
+static int dump_mreqs(PacketSockEntry *psk, struct packet_sock_desc *sd)
+{
+	int idx = 0;
+
+	if (sd->mreq_n == 0)
+		return 0;
+
+	pr_debug("\tdumping %d mreqs\n", sd->mreq_n);
+	psk->mclist = xmalloc(sd->mreq_n * sizeof(psk->mclist[0]));
+	if (psk->mclist == NULL)
+		return -1;
+
+	while (idx < sd->mreq_n) {
+		struct packet_diag_mclist *src = &sd->mreqs[idx];
+		PacketMclist *dst;
+
+		if (src->pdmc_count != 1) {
+			pr_err("Multiple MC membership not supported (but can be)\n");
+			return -1;
+		}
+
+		pr_debug("\tmr%d: idx %d type %d\n", idx,
+			 src->pdmc_index, src->pdmc_type);
+
+		dst = xmalloc(sizeof(*dst));
+		if (dst == NULL)
+			return -1;
+
+		packet_mclist__init(dst);
+		psk->mclist[idx] = dst;
+		psk->n_mclist++;
+
+		dst->index = src->pdmc_index;
+		dst->type = src->pdmc_type;
+
+		if (src->pdmc_type == PACKET_MR_MULTICAST ||
+		    src->pdmc_type == PACKET_MR_UNICAST) {
+			/* Only these two kinds carry an address. */
+			dst->addr.len = src->pdmc_alen;
+			dst->addr.data = xmalloc(src->pdmc_alen);
+			if (dst->addr.data == NULL)
+				return -1;
+
+			memcpy(dst->addr.data, src->pdmc_addr, src->pdmc_alen);
+		} else if (src->pdmc_type != PACKET_MR_PROMISC &&
+			   src->pdmc_type != PACKET_MR_ALLMULTI) {
+			pr_err("Unknown mc membership type %d\n", src->pdmc_type);
+			return -1;
+		}
+
+		idx++;
+	}
+
+	return 0;
+}
+
+/*
+ * Allocate a PacketRing message and fill it from the diag ring
+ * description @dr. Returns NULL when the allocation fails.
+ */
+static PacketRing *dump_ring(struct packet_diag_ring *dr)
+{
+	PacketRing *out = xmalloc(sizeof(*out));
+
+	if (out == NULL)
+		return NULL;
+
+	packet_ring__init(out);
+
+	out->block_size		= dr->pdr_block_size;
+	out->block_nr		= dr->pdr_block_nr;
+	out->frame_size		= dr->pdr_frame_size;
+	out->frame_nr		= dr->pdr_frame_nr;
+	out->retire_tmo		= dr->pdr_retire_tmo;
+	out->sizeof_priv	= dr->pdr_sizeof_priv;
+	out->features		= dr->pdr_features;
+
+	return out;
+}
+
+/*
+ * Attach RX and TX ring messages to @psk for every ring that was
+ * collected for @sd. Returns 0 on success, -1 on allocation failure.
+ */
+static int dump_rings(PacketSockEntry *psk, struct packet_sock_desc *sd)
+{
+	if (sd->rx != NULL) {
+		psk->rx_ring = dump_ring(sd->rx);
+		if (psk->rx_ring == NULL)
+			return -1;
+	}
+
+	if (sd->tx != NULL) {
+		psk->tx_ring = dump_ring(sd->tx);
+		if (psk->tx_ring == NULL)
+			return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Dump one packet socket file into the CR_FD_PACKETSK image.
+ *
+ * @lfd: local descriptor of the socket being dumped
+ * @id:  file id to store in the image entry (also remembered in the
+ *       socket description for dump_socket_map())
+ * @p:   file parameters (inode, flags, owner) of the descriptor
+ *
+ * Returns 0 on success, non-zero on error.
+ */
+static int dump_one_packet_fd(int lfd, u32 id, const struct fd_parms *p)
+{
+	PacketSockEntry psk = PACKET_SOCK_ENTRY__INIT;
+	SkOptsEntry skopts = SK_OPTS_ENTRY__INIT;
+	struct packet_sock_desc *sd;
+	int i, ret;
+
+	sd = (struct packet_sock_desc *)lookup_socket(p->stat.st_ino, PF_PACKET, 0);
+	if (IS_ERR_OR_NULL(sd)) {
+		pr_err("Can't find packet socket %"PRIu64"\n", p->stat.st_ino);
+		return -1;
+	}
+
+	pr_info("Dumping packet socket fd %d id %#x\n", lfd, id);
+	BUG_ON(sd->sd.already_dumped);
+	sd->sd.already_dumped = 1;
+
+	psk.id = sd->file_id = id;
+	psk.type = sd->type;
+	psk.flags = p->flags;
+	psk.fown = (FownEntry *)&p->fown;
+	psk.opts = &skopts;
+
+	if (dump_socket_opts(lfd, &skopts))
+		return -1;
+
+	psk.protocol = sd->proto;
+	psk.ifindex = sd->nli.pdi_index;
+	psk.version = sd->nli.pdi_version;
+	psk.reserve = sd->nli.pdi_reserve;
+	psk.timestamp = sd->nli.pdi_tstamp;
+	psk.copy_thresh = sd->nli.pdi_copy_thresh;
+	psk.aux_data = (sd->nli.pdi_flags & PDI_AUXDATA ? true : false);
+	psk.orig_dev = (sd->nli.pdi_flags & PDI_ORIGDEV ? true : false);
+	psk.vnet_hdr = (sd->nli.pdi_flags & PDI_VNETHDR ? true : false);
+	psk.loss = (sd->nli.pdi_flags & PDI_LOSS ? true : false);
+
+	ret = dump_mreqs(&psk, sd);
+	if (ret)
+		goto out;
+
+	if (sd->fanout != NO_FANOUT) {
+		psk.has_fanout = true;
+		psk.fanout = sd->fanout;
+	}
+
+	ret = dump_rings(&psk, sd);
+	if (ret)
+		goto out;
+
+	ret = pb_write_one(img_from_set(glob_imgset, CR_FD_PACKETSK), &psk, PB_PACKET_SOCK);
+out:
+	release_skopts(&skopts);
+	xfree(psk.rx_ring);
+	xfree(psk.tx_ring);
+	/*
+	 * dump_mreqs() allocates every list element separately, so both
+	 * the address buffer and the element itself have to be released;
+	 * n_mclist covers exactly the elements that were allocated.
+	 */
+	for (i = 0; i < psk.n_mclist; i++) {
+		xfree(psk.mclist[i]->addr.data);
+		xfree(psk.mclist[i]);
+	}
+	xfree(psk.mclist);
+	return ret;
+}
+
+const struct fdtype_ops packet_dump_ops = { /* dump callback table for FD_TYPES__PACKETSK files */
+ .type = FD_TYPES__PACKETSK,
+ .dump = dump_one_packet_fd,
+};
+
+/*
+ * Record in the VMA entry which dumped packet socket file a
+ * socket-backed mapping belongs to, by storing the file id in shmid.
+ *
+ * Fails with -1 when the socket is unknown or has not been given a
+ * file id yet.
+ */
+int dump_socket_map(struct vma_area *vma)
+{
+	struct packet_sock_desc *desc;
+
+	desc = (struct packet_sock_desc *)lookup_socket(vma->vm_socket_id, PF_PACKET, 0);
+	if (IS_ERR_OR_NULL(desc)) {
+		pr_err("Can't find packet socket %u to mmap\n", vma->vm_socket_id);
+		return -1;
+	}
+
+	if (desc->file_id == 0) {
+		pr_err("Mmap-ed socket %u not open\n", vma->vm_socket_id);
+		return -1;
+	}
+
+	pr_info("Dumping socket map %x -> %"PRIx64"\n", desc->file_id, vma->e->start);
+	vma->e->shmid = desc->file_id;
+
+	return 0;
+}
+
+/*
+ * Keep a private copy of the multicast list attribute @mc in @sd.
+ * Returns 0 on success, -1 when the copy cannot be allocated.
+ */
+static int packet_save_mreqs(struct packet_sock_desc *sd, struct rtattr *mc)
+{
+	size_t payload = RTA_PAYLOAD(mc);
+
+	sd->mreq_n = payload / sizeof(struct packet_diag_mclist);
+	pr_debug("\tGot %d mreqs\n", sd->mreq_n);
+
+	sd->mreqs = xmalloc(payload);
+	if (sd->mreqs == NULL)
+		return -1;
+
+	memcpy(sd->mreqs, RTA_DATA(mc), payload);
+
+	return 0;
+}
+
+/*
+ * Turn one packet diag message into a packet_sock_desc and hand it to
+ * sk_collect_one().
+ *
+ * The INFO and MCLIST attributes are mandatory; FANOUT and the two ring
+ * attributes are optional. Returns -1 on a malformed message or on
+ * allocation failure, otherwise the result of sk_collect_one().
+ */
+int packet_receive_one(struct nlmsghdr *hdr, void *arg)
+{
+	struct packet_diag_msg *m;
+	struct rtattr *tb[PACKET_DIAG_MAX + 1];
+	struct packet_sock_desc *sd;
+
+	m = NLMSG_DATA(hdr);
+	parse_rtattr(tb, PACKET_DIAG_MAX, (struct rtattr *)(m + 1),
+			hdr->nlmsg_len - NLMSG_LENGTH(sizeof(*m)));
+	pr_info("Collect packet sock %u %u\n", m->pdiag_ino, (unsigned int)m->pdiag_num);
+
+	if (!tb[PACKET_DIAG_INFO]) {
+		pr_err("No packet sock info in nlm\n");
+		return -1;
+	}
+
+	if (!tb[PACKET_DIAG_MCLIST]) {
+		pr_err("No packet sock mclist in nlm\n");
+		return -1;
+	}
+
+	sd = xmalloc(sizeof(*sd));
+	if (!sd)
+		return -1;
+
+	sd->file_id = 0;
+	sd->type = m->pdiag_type;
+	sd->proto = htons(m->pdiag_num);
+	/* Everything the error path releases starts out empty. */
+	sd->rx = NULL;
+	sd->tx = NULL;
+	sd->mreqs = NULL;
+	sd->mreq_n = 0;
+	memcpy(&sd->nli, RTA_DATA(tb[PACKET_DIAG_INFO]), sizeof(sd->nli));
+
+	if (packet_save_mreqs(sd, tb[PACKET_DIAG_MCLIST]))
+		goto err;
+
+	if (tb[PACKET_DIAG_FANOUT])
+		sd->fanout = *(__u32 *)RTA_DATA(tb[PACKET_DIAG_FANOUT]);
+	else
+		sd->fanout = NO_FANOUT;
+
+	if (tb[PACKET_DIAG_RX_RING]) {
+		sd->rx = xmalloc(sizeof(*sd->rx));
+		if (sd->rx == NULL)
+			goto err;
+		memcpy(sd->rx, RTA_DATA(tb[PACKET_DIAG_RX_RING]), sizeof(*sd->rx));
+	}
+
+	if (tb[PACKET_DIAG_TX_RING]) {
+		sd->tx = xmalloc(sizeof(*sd->tx));
+		if (sd->tx == NULL)
+			goto err;
+		memcpy(sd->tx, RTA_DATA(tb[PACKET_DIAG_TX_RING]), sizeof(*sd->tx));
+	}
+
+	return sk_collect_one(m->pdiag_ino, PF_PACKET, &sd->sd);
+err:
+	/* The membership copy made by packet_save_mreqs() must go too. */
+	xfree(sd->mreqs);
+	xfree(sd->tx);
+	xfree(sd->rx);
+	xfree(sd);
+	return -1;
+}
+
+/*
+ * Find the packet socket that backs the mapping @vma (its file id is
+ * kept in vma->shmid) among the files of task @pid and return a
+ * duplicate of its descriptor.
+ *
+ * Returns the duplicated descriptor, or -1 when the socket is unknown,
+ * not open in @pid, or dup() fails.
+ */
+int get_socket_fd(int pid, VmaEntry *vma)
+{
+	struct file_desc *desc;
+	struct fdinfo_list_entry *le;
+
+	pr_info("Getting packet socket fd for %d:%x\n",
+			pid, (int)vma->shmid);
+	desc = find_file_desc_raw(FD_TYPES__PACKETSK, vma->shmid);
+	if (!desc) {
+		pr_err("No packet socket %x\n", (int)vma->shmid);
+		return -1;
+	}
+
+	list_for_each_entry(le, &desc->fd_info_head, desc_list) {
+		int new_fd;
+
+		if (le->pid != pid)
+			continue;
+
+		/*
+		 * Restorer will close the mmap-ed fd
+		 */
+
+		new_fd = dup(le->fe->fd);
+		if (new_fd < 0) {
+			/* dup() reports failure with -1, 0 is a valid fd */
+			pr_perror("Can't dup packet sk");
+			return -1;
+		}
+
+		return new_fd;
+	}
+
+	pr_err("No open packet socket %x by %d\n", (int)vma->shmid, pid);
+	return -1;
+}
+
+/*
+ * Re-add every saved multicast membership to socket @sk with
+ * PACKET_ADD_MEMBERSHIP. Returns 0 on success, -1 on the first failure.
+ */
+static int restore_mreqs(int sk, PacketSockEntry *pse)
+{
+	int idx = 0;
+
+	while (idx < pse->n_mclist) {
+		PacketMclist *saved = pse->mclist[idx];
+		struct packet_mreq_max mreq;
+
+		pr_info("Restoring mreq type %d\n", saved->type);
+
+		/* The saved address has to fit into the request buffer. */
+		if (saved->addr.len > sizeof(mreq.mr_address)) {
+			pr_err("To big mcaddr %zu\n", saved->addr.len);
+			return -1;
+		}
+
+		mreq.mr_ifindex = saved->index;
+		mreq.mr_type = saved->type;
+		mreq.mr_alen = saved->addr.len;
+		memcpy(mreq.mr_address, saved->addr.data, saved->addr.len);
+
+		if (restore_opt(sk, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq))
+			return -1;
+
+		idx++;
+	}
+
+	return 0;
+}
+
+/*
+ * Set up one ring on @sk from the saved description @ring; @type is the
+ * socket option to use (the caller passes PACKET_RX_RING or
+ * PACKET_TX_RING). A NULL @ring means there is nothing to restore.
+ */
+static int restore_ring(int sk, int type, PacketRing *ring)
+{
+	struct tpacket_req3 request;
+
+	if (ring == NULL)
+		return 0;
+
+	pr_debug("\tRestoring %d ring\n", type);
+
+	request.tp_block_size		= ring->block_size;
+	request.tp_block_nr		= ring->block_nr;
+	request.tp_frame_size		= ring->frame_size;
+	request.tp_frame_nr		= ring->frame_nr;
+	request.tp_retire_blk_tov	= ring->retire_tmo;
+	request.tp_sizeof_priv		= ring->sizeof_priv;
+	request.tp_feature_req_word	= ring->features;
+
+	return restore_opt(sk, SOL_PACKET, type, &request);
+}
+
+/* Restore the RX ring first, then the TX ring; -1 if either fails. */
+static int restore_rings(int sk, PacketSockEntry *psk)
+{
+	if (restore_ring(sk, PACKET_RX_RING, psk->rx_ring) ||
+	    restore_ring(sk, PACKET_TX_RING, psk->tx_ring))
+		return -1;
+
+	return 0;
+}
+
+static int open_packet_sk(struct file_desc *d) /* re-creates a packet socket from its image entry; returns the new fd or -1 */
+{
+ struct packet_sock_info *psi;
+ PacketSockEntry *pse;
+ struct sockaddr_ll addr;
+ int sk, yes;
+
+ psi = container_of(d, struct packet_sock_info, d);
+ pse = psi->pse;
+
+ pr_info("Opening packet socket id %#x\n", pse->id);
+
+ sk = socket(PF_PACKET, pse->type, pse->protocol);
+ if (sk < 0) {
+ pr_perror("Can't create packet sock");
+ goto err; /* nothing to close yet */
+ }
+
+ memset(&addr, 0, sizeof(addr));
+ addr.sll_family = AF_PACKET;
+ addr.sll_ifindex = pse->ifindex; /* only family and interface index are filled in for bind() */
+
+ if (bind(sk, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
+ pr_perror("Can't bind packet socket");
+ goto err_cl;
+ }
+
+ if (restore_opt(sk, SOL_PACKET, PACKET_VERSION, &pse->version))
+ goto err_cl;
+
+ if (restore_opt(sk, SOL_PACKET, PACKET_RESERVE, &pse->reserve))
+ goto err_cl;
+
+ if (restore_opt(sk, SOL_PACKET, PACKET_TIMESTAMP, &pse->timestamp))
+ goto err_cl;
+
+ if (restore_opt(sk, SOL_PACKET, PACKET_COPY_THRESH, &pse->copy_thresh))
+ goto err_cl;
+
+ if (pse->aux_data) { /* boolean options are only written when they were set at dump time */
+ yes = 1;
+ if (restore_opt(sk, SOL_PACKET, PACKET_AUXDATA, &yes))
+ goto err_cl;
+ }
+
+ if (pse->orig_dev) {
+ yes = 1;
+ if (restore_opt(sk, SOL_PACKET, PACKET_ORIGDEV, &yes))
+ goto err_cl;
+ }
+
+ if (pse->vnet_hdr) {
+ yes = 1;
+ if (restore_opt(sk, SOL_PACKET, PACKET_VNET_HDR, &yes))
+ goto err_cl;
+ }
+
+ if (pse->loss) {
+ yes = 1;
+ if (restore_opt(sk, SOL_PACKET, PACKET_LOSS, &yes))
+ goto err_cl;
+ }
+
+ if (restore_mreqs(sk, pse))
+ goto err_cl;
+
+ if (restore_rings(sk, pse))
+ goto err_cl;
+
+ if (pse->has_fanout) { /* fanout is optional in the image */
+ pr_info("Restoring fanout %x\n", pse->fanout);
+ if (restore_opt(sk, SOL_PACKET, PACKET_FANOUT, &pse->fanout))
+ goto err_cl;
+ }
+
+ if (rst_file_params(sk, pse->fown, pse->flags))
+ goto err_cl;
+
+ if (restore_socket_opts(sk, pse->opts))
+ goto err_cl;
+
+ return sk;
+
+err_cl:
+ close(sk);
+err:
+ return -1;
+}
+
+static struct file_desc_ops packet_sock_desc_ops = { /* restore callbacks passed to file_desc_add() for packet sockets */
+ .type = FD_TYPES__PACKETSK,
+ .open = open_packet_sk,
+};
+
+/*
+ * Image collect callback: remember the decoded PacketSockEntry in the
+ * per-entry private area @o and register its file descriptor object.
+ */
+static int collect_one_packet_sk(void *o, ProtobufCMessage *base)
+{
+	struct packet_sock_info *info = o;
+	PacketSockEntry *entry = pb_msg(base, PacketSockEntry);
+
+	info->pse = entry;
+
+	return file_desc_add(&info->d, entry->id, &packet_sock_desc_ops);
+}
+
+struct collect_image_info packet_sk_cinfo = { /* describes how CR_FD_PACKETSK entries are collected */
+ .fd_type = CR_FD_PACKETSK,
+ .pb_type = PB_PACKET_SOCK,
+ .priv_size = sizeof(struct packet_sock_info), /* collect_one_packet_sk() treats its first argument as this struct */
+ .collect = collect_one_packet_sk,
+};
diff --git a/criu/sk-queue.c b/criu/sk-queue.c
new file mode 100644
index 000000000000..6a39c4b35a8c
--- /dev/null
+++ b/criu/sk-queue.c
@@ -0,0 +1,256 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <limits.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <sys/sendfile.h>
+
+#include "asm/types.h"
+#include "list.h"
+#include "imgset.h"
+#include "image.h"
+#include "servicefd.h"
+#include "cr_options.h"
+#include "util.h"
+#include "util-pie.h"
+#include "sockets.h"
+
+#include "sk-queue.h"
+
+#include "protobuf.h"
+#include "protobuf/sk-packet.pb-c.h"
+
+struct sk_packet { /* one queued packet found in the socket queues image */
+ struct list_head list; /* link in packets_list */
+ SkPacketEntry *entry; /* header decoded by pb_read_one_eof() in read_sk_queues() */
+ off_t img_off; /* image offset right after the header; restore_sk_queue() seeks here to read the payload */
+};
+
+static LIST_HEAD(packets_list); /* filled by read_sk_queues() in image order, drained by restore_sk_queue() */
+
+/*
+ * Read the headers of all queued packets from the socket queues image
+ * and put them on packets_list, remembering where each payload starts.
+ * Payloads themselves are skipped here and read later by
+ * restore_sk_queue().
+ *
+ * Returns 0 when the whole image was read, a negative value on error.
+ */
+int read_sk_queues(void)
+{
+	struct sk_packet *pkt;
+	int ret;
+	struct cr_img *img;
+
+	pr_info("Trying to read socket queues image\n");
+
+	img = open_image(CR_FD_SK_QUEUES, O_RSTR);
+	if (!img)
+		return -1;
+
+	while (1) {
+		ret = -1;
+		pkt = xmalloc(sizeof(*pkt));
+		if (!pkt) {
+			pr_err("Failed to allocate packet header\n");
+			break;
+		}
+		ret = pb_read_one_eof(img, &pkt->entry, PB_SK_QUEUES);
+		if (ret <= 0)
+			break;
+
+		/*
+		 * Remember where the payload starts and step over it. If
+		 * either seek fails the next header would be parsed from
+		 * the middle of a payload, so stop right away. The packet
+		 * is not on the list yet and gets released below.
+		 */
+		pkt->img_off = lseek(img_raw_fd(img), 0, SEEK_CUR);
+		if (pkt->img_off == (off_t)-1 ||
+		    lseek(img_raw_fd(img), pkt->entry->length, SEEK_CUR) == (off_t)-1) {
+			pr_perror("Can't seek in socket queues image");
+			sk_packet_entry__free_unpacked(pkt->entry, NULL);
+			ret = -1;
+			break;
+		}
+
+		/*
+		 * NOTE: packet must be added to the tail. Otherwise sequence
+		 * will be broken.
+		 */
+		list_add_tail(&pkt->list, &packets_list);
+	}
+	close_image(img);
+	/* Only the last, unlisted allocation (or NULL) is left in pkt. */
+	xfree(pkt);
+
+	return ret;
+}
+
+/*
+ * Dump the receive queue of socket @sock_fd into the socket queues
+ * image, tagging every packet with @sock_id.
+ *
+ * The queue is only peeked at: SO_PEEK_OFF is switched on for the
+ * duration of the dump and the original peek offset is put back
+ * afterwards. Returns 0 on success, a negative value on error.
+ */
+int dump_sk_queue(int sock_fd, int sock_id)
+{
+	SkPacketEntry pe = SK_PACKET_ENTRY__INIT;
+	int ret, size, orig_peek_off;
+	int peek_off = 0;
+	void *data;
+	socklen_t tmp;
+
+	/*
+	 * Save original peek offset.
+	 */
+	tmp = sizeof(orig_peek_off);
+	orig_peek_off = 0;
+	ret = getsockopt(sock_fd, SOL_SOCKET, SO_PEEK_OFF, &orig_peek_off, &tmp);
+	if (ret < 0) {
+		pr_perror("getsockopt failed");
+		return ret;
+	}
+	/*
+	 * Discover max DGRAM size
+	 */
+	tmp = sizeof(size);
+	size = 0;
+	ret = getsockopt(sock_fd, SOL_SOCKET, SO_SNDBUF, &size, &tmp);
+	if (ret < 0) {
+		pr_perror("getsockopt failed");
+		return ret;
+	}
+
+	/* Note: 32 bytes will be used by kernel for protocol header. */
+	size -= 32;
+
+	/*
+	 * Allocate data for a stream.
+	 */
+	data = xmalloc(size);
+	if (!data)
+		return -1;
+
+	/*
+	 * Enable peek offset incrementation. The offset starts from an
+	 * explicit zero rather than from whatever an earlier call left
+	 * in a scratch variable.
+	 */
+	ret = setsockopt(sock_fd, SOL_SOCKET, SO_PEEK_OFF, &peek_off, sizeof(peek_off));
+	if (ret < 0) {
+		pr_perror("setsockopt fail");
+		goto err_brk;
+	}
+
+	pe.id_for = sock_id;
+
+	while (1) {
+		struct iovec iov = {
+			.iov_base	= data,
+			.iov_len	= size,
+		};
+		struct msghdr msg = {
+			.msg_iov	= &iov,
+			.msg_iovlen	= 1,
+		};
+
+		ret = pe.length = recvmsg(sock_fd, &msg, MSG_DONTWAIT | MSG_PEEK);
+		if (!ret)
+			/*
+			 * It means, that peer has performed an
+			 * orderly shutdown, so we're done.
+			 */
+			break;
+		else if (ret < 0) {
+			if (errno == EAGAIN)
+				break; /* we're done */
+			pr_perror("recvmsg fail: error");
+			goto err_set_sock;
+		}
+		if (msg.msg_flags & MSG_TRUNC) {
+			/*
+			 * DGRAM truncated. This should not happen. But we have
+			 * to check...
+			 */
+			pr_err("sys_recvmsg failed: truncated\n");
+			ret = -E2BIG;
+			goto err_set_sock;
+		}
+
+		ret = pb_write_one(img_from_set(glob_imgset, CR_FD_SK_QUEUES), &pe, PB_SK_QUEUES);
+		if (ret < 0) {
+			ret = -EIO;
+			goto err_set_sock;
+		}
+
+		ret = write_img_buf(img_from_set(glob_imgset, CR_FD_SK_QUEUES), data, pe.length);
+		if (ret < 0) {
+			ret = -EIO;
+			goto err_set_sock;
+		}
+	}
+	ret = 0;
+
+err_set_sock:
+	/*
+	 * Restore original peek offset.
+	 */
+	if (setsockopt(sock_fd, SOL_SOCKET, SO_PEEK_OFF, &orig_peek_off, sizeof(int))) {
+		pr_perror("setsockopt failed on restore");
+		ret = -1;
+	}
+err_brk:
+	xfree(data);
+	return ret;
+}
+
+/*
+ * Show-time hook: print the payload that follows the packet header
+ * @obj in @img; the payload size is taken from the header.
+ */
+void sk_queue_data_handler(struct cr_img *img, void *obj)
+{
+	SkPacketEntry *hdr = obj;
+
+	print_image_data(img, hdr->length, opts.show_pages_content);
+}
+
+/*
+ * Read the payload of @pkt from @img and write it into @fd as one
+ * message. Returns 0 on success, -1 on error.
+ */
+static int restore_one_skb(int fd, struct cr_img *img, struct sk_packet *pkt,
+			   unsigned int peer_id)
+{
+	SkPacketEntry *entry = pkt->entry;
+	char *buf;
+	int sent;
+
+	pr_info("\tRestoring %d-bytes skb for %u\n",
+		(unsigned int)entry->length, peer_id);
+
+	/*
+	 * sendfile() is deliberately avoided: it goes through sendpage(),
+	 * which splits the data into pages and allocates a separate skb
+	 * for each of them, putting a big overhead on SNDBUF. It does not
+	 * fit DGRAM sockets either, where message boundaries have to be
+	 * kept.
+	 */
+	buf = xmalloc(entry->length);
+	if (buf == NULL)
+		return -1;
+
+	if (lseek(img_raw_fd(img), pkt->img_off, SEEK_SET) == -1) {
+		pr_perror("lseek() failed");
+		xfree(buf);
+		return -1;
+	}
+	if (read_img_buf(img, buf, entry->length) != 1) {
+		xfree(buf);
+		return -1;
+	}
+
+	sent = write(fd, buf, entry->length);
+	xfree(buf);
+	if (sent < 0) {
+		pr_perror("Failed to send packet");
+		return -1;
+	}
+	if (sent != entry->length) {
+		pr_err("Restored skb trimmed to %d/%d\n",
+		       sent, (unsigned int)entry->length);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Push every packet collected for @peer_id back into socket @fd, in
+ * list order, dropping each packet from packets_list once it is sent.
+ * Returns 0 on success, -1 on error.
+ */
+int restore_sk_queue(int fd, unsigned int peer_id)
+{
+	struct sk_packet *pkt, *next;
+	struct cr_img *img;
+
+	pr_info("Trying to restore recv queue for %u\n", peer_id);
+
+	if (restore_prepare_socket(fd))
+		return -1;
+
+	img = open_image(CR_FD_SK_QUEUES, O_RSTR);
+	if (!img)
+		return -1;
+
+	list_for_each_entry_safe(pkt, next, &packets_list, list) {
+		if (pkt->entry->id_for != peer_id)
+			continue;
+
+		if (restore_one_skb(fd, img, pkt, peer_id)) {
+			close_image(img);
+			return -1;
+		}
+
+		list_del(&pkt->list);
+		sk_packet_entry__free_unpacked(pkt->entry, NULL);
+		xfree(pkt);
+	}
+
+	close_image(img);
+	return 0;
+}
diff --git a/criu/sk-tcp.c b/criu/sk-tcp.c
new file mode 100644
index 000000000000..f653446a3604
--- /dev/null
+++ b/criu/sk-tcp.c
@@ -0,0 +1,771 @@
+#include <netinet/tcp.h>
+#include <sys/ioctl.h>
+#include <linux/sockios.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <string.h>
+#include <sched.h>
+#include <netinet/in.h>
+
+#include "cr_options.h"
+#include "util.h"
+#include "list.h"
+#include "log.h"
+#include "asm/types.h"
+#include "files.h"
+#include "sockets.h"
+#include "sk-inet.h"
+#include "netfilter.h"
+#include "image.h"
+#include "namespaces.h"
+#include "xmalloc.h"
+#include "config.h"
+#include "cr-show.h"
+#include "kerndat.h"
+#include "rst-malloc.h"
+
+#include "protobuf.h"
+#include "protobuf/tcp-stream.pb-c.h"
+
+#ifndef SIOCOUTQNSD
+/* MAO - Define SIOCOUTQNSD ioctl if we don't have it */
+#define SIOCOUTQNSD 0x894B
+#endif
+
+#ifndef CONFIG_HAS_TCP_REPAIR
+/*
+ * It's been reported that both tcp_repair_opt
+ * and TCP_ enum already shipped in netinet/tcp.h
+ * system header by some distros thus we need a
+ * test if we can use predefined ones or provide
+ * our own.
+ */
+struct tcp_repair_opt { /* fallback definition, used only when CONFIG_HAS_TCP_REPAIR is not defined */
+ u32 opt_code;
+ u32 opt_val;
+};
+
+enum { /* queue ids; this file passes TCP_RECV_QUEUE and TCP_SEND_QUEUE as the TCP_REPAIR_QUEUE option value */
+ TCP_NO_QUEUE,
+ TCP_RECV_QUEUE,
+ TCP_SEND_QUEUE,
+ TCP_QUEUES_NR,
+};
+#endif
+
+#ifndef TCP_TIMESTAMP
+#define TCP_TIMESTAMP 24
+#endif
+
+#ifndef TCPOPT_SACK_PERM
+#define TCPOPT_SACK_PERM TCPOPT_SACK_PERMITTED
+#endif
+
+static LIST_HEAD(cpt_tcp_repair_sockets);
+static LIST_HEAD(rst_tcp_repair_sockets);
+
+/*
+ * Switch TCP repair mode on for @fd. Returns what setsockopt()
+ * returned; a failure is logged.
+ */
+static int tcp_repair_on(int fd)
+{
+	int on = 1;
+	int rc;
+
+	rc = setsockopt(fd, SOL_TCP, TCP_REPAIR, &on, sizeof(on));
+	if (rc >= 0)
+		return rc;
+
+	pr_perror("Can't turn TCP repair mode ON");
+	return rc;
+}
+
+/*
+ * Re-read the state and queue sizes of the socket kept in sk->rfd.
+ *
+ * Only sockets that TCP_INFO reports as established or closed are
+ * accepted. On success sk->wqlen, sk->uwqlen and sk->rqlen hold the
+ * send queue size, the unsent part of it and the receive queue size.
+ * Returns 0 on success, -1 on error.
+ */
+static int refresh_inet_sk(struct inet_sk_desc *sk)
+{
+	int size;
+	struct tcp_info info;
+
+	if (dump_opt(sk->rfd, SOL_TCP, TCP_INFO, &info)) {
+		pr_perror("Failed to obtain TCP_INFO");
+		return -1;
+	}
+
+	switch (info.tcpi_state) {
+	case TCP_ESTABLISHED:
+	case TCP_CLOSE:
+		break;
+	default:
+		/* Report the state that was actually examined. */
+		pr_err("Unknown state %d\n", info.tcpi_state);
+		return -1;
+	}
+
+	if (ioctl(sk->rfd, SIOCOUTQ, &size) == -1) {
+		pr_perror("Unable to get size of snd queue");
+		return -1;
+	}
+
+	sk->wqlen = size;
+
+	if (ioctl(sk->rfd, SIOCOUTQNSD, &size) == -1) {
+		pr_perror("Unable to get size of unsent data");
+		return -1;
+	}
+
+	sk->uwqlen = size;
+
+	if (ioctl(sk->rfd, SIOCINQ, &size) == -1) {
+		pr_perror("Unable to get size of recv queue");
+		return -1;
+	}
+
+	sk->rqlen = size;
+
+	return 0;
+}
+
+static int tcp_repair_establised(int fd, struct inet_sk_desc *sk) /* puts an established socket into repair mode; 0 on success, -1 on error */
+{
+ int ret;
+
+ pr_info("\tTurning repair on for socket %x\n", sk->sd.ino);
+ /*
+ * Keep the socket open in criu till the very end. In
+ * case we close this fd after one task fd dumping and
+ * fail we'll have to turn repair mode off
+ */
+ sk->rfd = dup(fd);
+ if (sk->rfd < 0) {
+ pr_perror("Can't save socket fd for repair");
+ goto err1;
+ }
+
+ if (!(root_ns_mask & CLONE_NEWNET)) { /* the connection is locked only when CLONE_NEWNET is not set in root_ns_mask */
+ ret = nf_lock_connection(sk);
+ if (ret < 0)
+ goto err2;
+ }
+
+ ret = tcp_repair_on(sk->rfd);
+ if (ret < 0)
+ goto err3;
+
+ list_add_tail(&sk->rlist, &cpt_tcp_repair_sockets); /* from here on cpt_unlock_tcp_connections() will find this socket */
+
+ ret = refresh_inet_sk(sk);
+ if (ret < 0)
+ goto err1; /* no local unwinding: the socket is already on cpt_tcp_repair_sockets */
+
+ return 0;
+
+err3:
+ if (!(root_ns_mask & CLONE_NEWNET))
+ nf_unlock_connection(sk);
+err2:
+ close(sk->rfd);
+err1:
+ return -1;
+}
+
+/*
+ * Undo tcp_repair_establised() for one socket: take it off the repair
+ * list, unlock the connection, leave repair mode and close the saved
+ * descriptor.
+ */
+static void tcp_unlock_one(struct inet_sk_desc *sk)
+{
+	list_del(&sk->rlist);
+
+	if (!(root_ns_mask & CLONE_NEWNET) && nf_unlock_connection(sk) < 0)
+		pr_perror("Failed to unlock TCP connection");
+
+	tcp_repair_off(sk->rfd);
+
+	/*
+	 * Leaving repair mode changes SO_REUSEADDR, so put the
+	 * value seen at dump time back.
+	 */
+	restore_opt(sk->rfd, SOL_SOCKET, SO_REUSEADDR, &sk->cpt_reuseaddr);
+
+	close(sk->rfd);
+}
+
+/* Release every socket that was put on the dump-time repair list. */
+void cpt_unlock_tcp_connections(void)
+{
+	struct inet_sk_desc *cur, *next;
+
+	/* tcp_unlock_one() unlinks the entry, hence the _safe walk. */
+	list_for_each_entry_safe(cur, next, &cpt_tcp_repair_sockets, rlist) {
+		tcp_unlock_one(cur);
+	}
+}
+
+/*
+ * TCP queues sequences and their relations to the code below
+ *
+ * output queue
+ * net <----------------------------- sk
+ * ^ ^ ^ seq >>
+ * snd_una snd_nxt write_seq
+ *
+ * input queue
+ * net -----------------------------> sk
+ * << seq ^ ^
+ * rcv_nxt copied_seq
+ *
+ *
+ * inq_len = rcv_nxt - copied_seq = SIOCINQ
+ * outq_len = write_seq - snd_una = SIOCOUTQ
+ * inq_seq = rcv_nxt
+ * outq_seq = write_seq
+ *
+ * On restore kernel moves the option we configure with setsockopt,
+ * thus we should advance them on the _len value in restore_tcp_seqs.
+ *
+ */
+
+/*
+ * Select repair queue @queue_id on @sk, read its sequence number into
+ * @seq and, when @len is non-zero, peek @len bytes of queued data.
+ *
+ * On success *@bufp points to a freshly allocated copy of the data (or
+ * is NULL for an empty queue) and 0 is returned; -1 on error.
+ */
+static int tcp_stream_get_queue(int sk, int queue_id,
+		u32 *seq, u32 len, char **bufp)
+{
+	char *data = NULL;
+	socklen_t optlen;
+	int qid = queue_id;
+	int got;
+
+	pr_debug("\tSet repair queue %d\n", queue_id);
+	if (setsockopt(sk, SOL_TCP, TCP_REPAIR_QUEUE, &qid, sizeof(qid)) < 0)
+		goto err_sopt;
+
+	pr_debug("\tGet queue seq\n");
+	optlen = sizeof(*seq);
+	if (getsockopt(sk, SOL_TCP, TCP_QUEUE_SEQ, seq, &optlen) < 0)
+		goto err_sopt;
+
+	pr_info("\t`- seq %u len %u\n", *seq, len);
+
+	if (len) {
+		/*
+		 * Ask for one byte more than expected: getting exactly
+		 * len bytes back proves the queue holds len bytes and
+		 * nothing else.
+		 */
+		data = xmalloc(len + 1);
+		if (data == NULL)
+			return -1;
+
+		pr_debug("\tReading queue (%d bytes)\n", len);
+		got = recv(sk, data, len + 1, MSG_PEEK | MSG_DONTWAIT);
+		if (got != len) {
+			pr_perror("\trecv failed (%d, want %d, errno %d)", got, len, errno);
+			xfree(data);
+			return -1;
+		}
+	}
+
+	*bufp = data;
+	return 0;
+
+err_sopt:
+	pr_perror("\tsockopt failed");
+	return -1;
+}
+
+/*
+ * Collect the negotiated TCP options of @sk into @tse: the option
+ * mask, MSS clamp, window scales (when scaling is on) and the
+ * timestamp value (when timestamps are on).
+ * Returns 0 on success, -1 when a getsockopt() fails.
+ */
+static int tcp_stream_get_options(int sk, TcpStreamEntry *tse)
+{
+	struct tcp_info info;
+	socklen_t optlen;
+	int has_wscale, has_tstamp;
+	int tstamp;
+
+	optlen = sizeof(info);
+	if (getsockopt(sk, SOL_TCP, TCP_INFO, &info, &optlen) < 0)
+		goto err_sopt;
+
+	optlen = sizeof(tse->mss_clamp);
+	if (getsockopt(sk, SOL_TCP, TCP_MAXSEG, &tse->mss_clamp, &optlen) < 0)
+		goto err_sopt;
+
+	has_wscale = !!(info.tcpi_options & TCPI_OPT_WSCALE);
+	has_tstamp = !!(info.tcpi_options & TCPI_OPT_TIMESTAMPS);
+
+	tse->opt_mask = info.tcpi_options;
+	if (has_wscale) {
+		tse->snd_wscale = info.tcpi_snd_wscale;
+		tse->rcv_wscale = info.tcpi_rcv_wscale;
+		tse->has_rcv_wscale = true;
+	}
+
+	if (has_tstamp) {
+		optlen = sizeof(tstamp);
+		if (getsockopt(sk, SOL_TCP, TCP_TIMESTAMP, &tstamp, &optlen) < 0)
+			goto err_sopt;
+
+		tse->has_timestamp = true;
+		tse->timestamp = tstamp;
+	}
+
+	pr_info("\toptions: mss_clamp %x wscale %x tstamp %d sack %d\n",
+			(int)tse->mss_clamp,
+			has_wscale ? (int)tse->snd_wscale : -1,
+			has_tstamp ? 1 : 0,
+			info.tcpi_options & TCPI_OPT_SACK ? 1 : 0);
+
+	return 0;
+
+err_sopt:
+	pr_perror("\tsockopt failed");
+	return -1;
+}
+
+/*
+ * Dump the state of a TCP connection that is already in repair mode:
+ * both queues with their sequence numbers, the negotiated options and
+ * the TCP_NODELAY / TCP_CORK flags. Everything goes into a per-socket
+ * CR_FD_TCP_STREAM image, the entry first, then the receive queue
+ * data, then the send queue data.
+ *
+ * Returns a negative value on error.
+ */
+static int dump_tcp_conn_state(struct inet_sk_desc *sk)
+{
+	int ret, aux;
+	struct cr_img *img;
+	TcpStreamEntry tse = TCP_STREAM_ENTRY__INIT;
+	char *in_buf, *out_buf;
+
+	/*
+	 * Read queue
+	 */
+
+	pr_info("Reading inq for socket\n");
+	tse.inq_len = sk->rqlen;
+	ret = tcp_stream_get_queue(sk->rfd, TCP_RECV_QUEUE,
+			&tse.inq_seq, tse.inq_len, &in_buf);
+	if (ret < 0)
+		goto err_in;
+
+	/*
+	 * Write queue
+	 */
+
+	pr_info("Reading outq for socket\n");
+	tse.outq_len = sk->wqlen;
+	tse.unsq_len = sk->uwqlen;
+	tse.has_unsq_len = true;
+	ret = tcp_stream_get_queue(sk->rfd, TCP_SEND_QUEUE,
+			&tse.outq_seq, tse.outq_len, &out_buf);
+	if (ret < 0)
+		goto err_out;
+
+	/*
+	 * Initial options
+	 */
+
+	pr_info("Reading options for socket\n");
+	ret = tcp_stream_get_options(sk->rfd, &tse);
+	if (ret < 0)
+		goto err_opt;
+
+	/*
+	 * The checks below jump to the error labels without touching
+	 * ret, which would still hold the success value of the call
+	 * above. Make such exits report a failure.
+	 */
+	ret = -1;
+
+	/*
+	 * TCP socket options
+	 */
+
+	if (dump_opt(sk->rfd, SOL_TCP, TCP_NODELAY, &aux))
+		goto err_opt;
+
+	if (aux) {
+		tse.has_nodelay = true;
+		tse.nodelay = true;
+	}
+
+	if (dump_opt(sk->rfd, SOL_TCP, TCP_CORK, &aux))
+		goto err_opt;
+
+	if (aux) {
+		tse.has_cork = true;
+		tse.cork = true;
+	}
+
+	/*
+	 * Push the stuff to image
+	 */
+
+	img = open_image(CR_FD_TCP_STREAM, O_DUMP, sk->sd.ino);
+	if (!img)
+		goto err_img;
+
+	ret = pb_write_one(img, &tse, PB_TCP_STREAM);
+	if (ret < 0)
+		goto err_iw;
+
+	if (in_buf) {
+		ret = write_img_buf(img, in_buf, tse.inq_len);
+		if (ret < 0)
+			goto err_iw;
+	}
+
+	if (out_buf) {
+		ret = write_img_buf(img, out_buf, tse.outq_len);
+		if (ret < 0)
+			goto err_iw;
+	}
+
+	pr_info("Done\n");
+err_iw:
+	close_image(img);
+err_img:
+err_opt:
+	xfree(out_buf);
+err_out:
+	xfree(in_buf);
+err_in:
+	return ret;
+}
+
+/*
+ * Dump a TCP socket. Only established connections carry state worth
+ * saving; any other state is a successful no-op.
+ * Returns 0 on success, -1 on error.
+ */
+int dump_one_tcp(int fd, struct inet_sk_desc *sk)
+{
+	if (sk->state == TCP_ESTABLISHED) {
+		pr_info("Dumping TCP connection\n");
+
+		if (tcp_repair_establised(fd, sk) || dump_tcp_conn_state(sk))
+			return -1;
+
+		/*
+		 * The socket stays in repair mode on purpose: when it
+		 * is finally closed the connection goes away silently.
+		 */
+	}
+
+	return 0;
+}
+
+/*
+ * Select repair queue @queue on @sk and set its sequence number to
+ * @seq. Returns 0 on success, -1 when either setsockopt() fails.
+ */
+static int set_tcp_queue_seq(int sk, int queue, u32 seq)
+{
+	int rc;
+
+	pr_debug("\tSetting %d queue seq to %u\n", queue, seq);
+
+	rc = setsockopt(sk, SOL_TCP, TCP_REPAIR_QUEUE, &queue, sizeof(queue));
+	if (rc < 0) {
+		pr_perror("Can't set repair queue");
+		return -1;
+	}
+
+	rc = setsockopt(sk, SOL_TCP, TCP_QUEUE_SEQ, &seq, sizeof(seq));
+	if (rc < 0) {
+		pr_perror("Can't set queue seq");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Set the sequence numbers of both queues to the saved value minus the
+ * saved queue length, receive queue first.
+ * Returns 0 on success, -1 on error.
+ */
+static int restore_tcp_seqs(int sk, TcpStreamEntry *tse)
+{
+	u32 inq_start = tse->inq_seq - tse->inq_len;
+	u32 outq_start = tse->outq_seq - tse->outq_len;
+
+	if (set_tcp_queue_seq(sk, TCP_RECV_QUEUE, inq_start) ||
+	    set_tcp_queue_seq(sk, TCP_SEND_QUEUE, outq_start))
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Read @len bytes of queue data from @img and push them into @sk with
+ * send(), in chunks of at most max_chunk bytes. For the receive queue
+ * the chunk limit starts at kdat.tcp_max_rshare and is halved on
+ * ENOMEM while it is above 1024. Returns 0 on success, -1 on error.
+ */
+static int __send_tcp_queue(int sk, int queue, u32 len, struct cr_img *img)
+{
+	int ret, err = -1, max_chunk;
+	int off = 0;
+	char *buf;
+
+	buf = xmalloc(len);
+	if (buf == NULL)
+		return -1;
+
+	if (read_img_buf(img, buf, len) < 0)
+		goto err;
+
+	if (queue == TCP_RECV_QUEUE)
+		max_chunk = kdat.tcp_max_rshare;
+	else
+		max_chunk = len;
+
+	while (len) {
+		int chunk = len;
+
+		if (chunk > max_chunk)
+			chunk = max_chunk;
+
+		ret = send(sk, buf + off, chunk, 0);
+		if (ret > 0) {
+			off += ret;
+			len -= ret;
+			continue;
+		}
+
+		if ((queue == TCP_RECV_QUEUE) && (max_chunk > 1024) && (errno == ENOMEM)) {
+			/*
+			 * In repair mode the kernel restores the recv
+			 * queue by allocating one linear skb of exactly
+			 * the size handed to the system call and does
+			 * not try any harder. A size the slab allocator
+			 * cannot serve makes send fail with ENOMEM, so
+			 * retry with a smaller chunk and hope there is
+			 * still enough memory in the system.
+			 */
+			max_chunk >>= 1;
+			continue;
+		}
+
+		pr_perror("Can't restore %d queue data (%d), want (%d:%d:%d)",
+			  queue, ret, chunk, len, max_chunk);
+		goto err;
+	}
+
+	err = 0;
+err:
+	xfree(buf);
+
+	return err;
+}
+
+/*
+ * Switch the socket to the given repair queue, then fill that queue with
+ * @len bytes taken from @img. Returns 0 on success, -1 on failure.
+ */
+static int send_tcp_queue(int sk, int queue, u32 len, struct cr_img *img)
+{
+	int err;
+
+	pr_debug("\tRestoring TCP %d queue data %u bytes\n", queue, len);
+
+	err = setsockopt(sk, SOL_TCP, TCP_REPAIR_QUEUE, &queue, sizeof(queue));
+	if (err < 0) {
+		pr_perror("Can't set repair queue");
+		return -1;
+	}
+
+	return __send_tcp_queue(sk, queue, len, img);
+}
+
+/*
+ * Refill the receive and send queues of a socket from the image.
+ *
+ * The send queue is restored in two steps: the part that was sent but not
+ * yet acknowledged goes in under repair mode, the never-sent part is
+ * written with repair mode switched off while @reuse_lock is held.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int restore_tcp_queues(int sk, TcpStreamEntry *tse, struct cr_img *img, mutex_t *reuse_lock)
+{
+	u32 sent_len, unsent_len;
+	int ret = 0;
+
+	if (restore_prepare_socket(sk))
+		return -1;
+
+	if (tse->inq_len && send_tcp_queue(sk, TCP_RECV_QUEUE, tse->inq_len, img))
+		return -1;
+
+	/*
+	 * The write buffer holds two kinds of data: sent but not yet
+	 * acknowledged, and unsent. The TCP stack must know about the
+	 * former because acknowledgments may still arrive for it, hence
+	 * it is restored in repair mode.
+	 */
+	sent_len = tse->outq_len - tse->unsq_len;
+	if (sent_len && send_tcp_queue(sk, TCP_SEND_QUEUE, sent_len, img))
+		return -1;
+
+	/*
+	 * The rest has never left the host, so it is restored without
+	 * any tricks, with repair mode temporarily off.
+	 */
+	unsent_len = tse->unsq_len;
+
+	mutex_lock(reuse_lock);
+	tcp_repair_off(sk);
+	if (unsent_len && __send_tcp_queue(sk, TCP_SEND_QUEUE, unsent_len, img))
+		ret = -1;
+	else if (tcp_repair_on(sk))
+		ret = -1;
+	mutex_unlock(reuse_lock);
+
+	return ret;
+}
+
+/*
+ * Re-apply the negotiated TCP options recorded in the image (SACK, window
+ * scaling, timestamps, MSS clamp) through TCP_REPAIR_OPTIONS and, when
+ * present, the saved TCP_TIMESTAMP value.
+ *
+ * Returns 0 on success, -1 if a setsockopt() call fails.
+ */
+static int restore_tcp_opts(int sk, TcpStreamEntry *tse)
+{
+	struct tcp_repair_opt ropts[4];
+	struct tcp_repair_opt *cur = ropts;
+
+	pr_debug("\tRestoring TCP options\n");
+
+	if (tse->opt_mask & TCPI_OPT_SACK) {
+		pr_debug("\t\tWill turn SAK on\n");
+		cur->opt_code = TCPOPT_SACK_PERM;
+		cur->opt_val = 0;
+		cur++;
+	}
+
+	if (tse->opt_mask & TCPI_OPT_WSCALE) {
+		pr_debug("\t\tWill set snd_wscale to %u\n", tse->snd_wscale);
+		pr_debug("\t\tWill set rcv_wscale to %u\n", tse->rcv_wscale);
+		cur->opt_code = TCPOPT_WINDOW;
+		/* Send scale in the low half, receive scale shifted up by 16. */
+		cur->opt_val = tse->snd_wscale + (tse->rcv_wscale << 16);
+		cur++;
+	}
+
+	if (tse->opt_mask & TCPI_OPT_TIMESTAMPS) {
+		pr_debug("\t\tWill turn timestamps on\n");
+		cur->opt_code = TCPOPT_TIMESTAMP;
+		cur->opt_val = 0;
+		cur++;
+	}
+
+	/* The MSS clamp is always restored. */
+	pr_debug("Will set mss clamp to %u\n", tse->mss_clamp);
+	cur->opt_code = TCPOPT_MAXSEG;
+	cur->opt_val = tse->mss_clamp;
+	cur++;
+
+	if (setsockopt(sk, SOL_TCP, TCP_REPAIR_OPTIONS,
+		       ropts, (cur - ropts) * sizeof(struct tcp_repair_opt)) < 0) {
+		pr_perror("Can't repair options");
+		return -1;
+	}
+
+	if (tse->has_timestamp &&
+	    setsockopt(sk, SOL_TCP, TCP_TIMESTAMP,
+		       &tse->timestamp, sizeof(tse->timestamp)) < 0) {
+		pr_perror("Can't set timestamp");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Restore one TCP connection from its CR_FD_TCP_STREAM image: sequence
+ * numbers, bind/connect, negotiated options, queue contents and finally
+ * the TCP_NODELAY / TCP_CORK flags.
+ *
+ * Returns 0 on success, -1 on failure. The image is always closed and
+ * the unpacked entry is released only if it was actually read.
+ */
+static int restore_tcp_conn_state(int sk, struct inet_sk_info *ii)
+{
+	int aux, ret = -1;
+	struct cr_img *img;
+	TcpStreamEntry *tse = NULL;
+
+	pr_info("Restoring TCP connection id %x ino %x\n", ii->ie->id, ii->ie->ino);
+
+	img = open_image(CR_FD_TCP_STREAM, O_RSTR, ii->ie->ino);
+	if (!img)
+		return -1;
+
+	/*
+	 * After a failed read tse must not be handed to the free routine,
+	 * so skip straight to closing the image.
+	 */
+	if (pb_read_one(img, &tse, PB_TCP_STREAM) < 0)
+		goto err_img;
+
+	if (restore_tcp_seqs(sk, tse))
+		goto err_c;
+
+	if (inet_bind(sk, ii))
+		goto err_c;
+
+	if (inet_connect(sk, ii))
+		goto err_c;
+
+	if (restore_tcp_opts(sk, tse))
+		goto err_c;
+
+	if (restore_tcp_queues(sk, tse, img, inet_get_reuseaddr_lock(ii)))
+		goto err_c;
+
+	if (tse->has_nodelay && tse->nodelay) {
+		aux = 1;
+		if (restore_opt(sk, SOL_TCP, TCP_NODELAY, &aux))
+			goto err_c;
+	}
+
+	if (tse->has_cork && tse->cork) {
+		aux = 1;
+		if (restore_opt(sk, SOL_TCP, TCP_CORK, &aux))
+			goto err_c;
+	}
+
+	ret = 0;
+err_c:
+	tcp_stream_entry__free_unpacked(tse, NULL);
+err_img:
+	close_image(img);
+	return ret;
+}
+
+/*
+ * Filled in by rst_tcp_socks_prep(): the RM_PRIVATE position returned by
+ * rst_mem_align_cpos() before the rst_tcp_sock records are allocated, and
+ * the number of such records.
+ */
+unsigned long rst_tcp_socks_cpos;
+unsigned int rst_tcp_socks_nr = 0;
+
+/*
+ * Record, in RM_PRIVATE restorer memory, one rst_tcp_sock entry (fd and
+ * reuseaddr flag) for every repaired TCP socket owned by this process.
+ *
+ * Returns 0 on success, -1 if an allocation fails.
+ */
+int rst_tcp_socks_prep(void)
+{
+	struct inet_sk_info *info;
+
+	rst_tcp_socks_cpos = rst_mem_align_cpos(RM_PRIVATE);
+
+	list_for_each_entry(info, &rst_tcp_repair_sockets, rlist) {
+		struct rst_tcp_sock *slot;
+
+		/*
+		 * The list holds all sockets; only those restored in the
+		 * current process have a valid sk_fd.
+		 */
+		if (info->sk_fd != -1) {
+			slot = rst_mem_alloc(sizeof(*slot), RM_PRIVATE);
+			if (!slot)
+				return -1;
+
+			slot->sk = info->sk_fd;
+			slot->reuseaddr = info->ie->opts->reuseaddr;
+			rst_tcp_socks_nr++;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Restore one TCP connection on @fd: enter repair mode, then rebuild the
+ * connection state from the image. Returns 0 on success, -1 on failure.
+ */
+int restore_one_tcp(int fd, struct inet_sk_info *ii)
+{
+	pr_info("Restoring TCP connection\n");
+
+	/* The state is restored only if repair mode could be enabled. */
+	if (tcp_repair_on(fd) || restore_tcp_conn_state(fd, ii))
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Queue a connection on rst_tcp_repair_sockets and mark it as not having
+ * a socket fd yet (sk_fd == -1).
+ */
+void tcp_locked_conn_add(struct inet_sk_info *ii)
+{
+	ii->sk_fd = -1;
+	list_add_tail(&ii->rlist, &rst_tcp_repair_sockets);
+}
+
+/*
+ * Unlock every connection on rst_tcp_repair_sockets, unless the root
+ * task lives in its own net namespace.
+ */
+void rst_unlock_tcp_connections(void)
+{
+	struct inet_sk_info *info;
+
+	/* With CLONE_NEWNET the network-unlock scripts do the unlocking. */
+	if (!(root_ns_mask & CLONE_NEWNET)) {
+		list_for_each_entry(info, &rst_tcp_repair_sockets, rlist)
+			nf_unlock_connection_info(info);
+	}
+}
+
+/*
+ * Probe for the TCP features used here: create a throw-away TCP socket,
+ * put it in repair mode and read TCP_TIMESTAMP from it.
+ *
+ * Returns 0 when both steps work, non-zero otherwise.
+ */
+int check_tcp(void)
+{
+	int val, ret, sk;
+	socklen_t optlen = sizeof(val);
+
+	sk = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP);
+	if (sk < 0) {
+		pr_perror("Can't create TCP socket :(");
+		return -1;
+	}
+
+	ret = tcp_repair_on(sk);
+	if (!ret) {
+		ret = getsockopt(sk, SOL_TCP, TCP_TIMESTAMP, &val, &optlen);
+		if (ret)
+			pr_perror("Can't get TCP_TIMESTAMP");
+	}
+
+	close(sk);
+
+	return ret;
+}
+
+/*
+ * Print the in-queue and out-queue payloads stored in the image after a
+ * TcpStreamEntry. Nothing is printed unless opts.show_pages_content is
+ * set.
+ */
+void show_tcp_stream(struct cr_img *img, void *obj)
+{
+	TcpStreamEntry *entry = obj;
+
+	if (!opts.show_pages_content)
+		return;
+
+	pr_msg("In-queue:");
+	print_image_data(img, entry->inq_len, 1);
+	pr_msg("Out-queue:");
+	print_image_data(img, entry->outq_len, 1);
+}
diff --git a/criu/sk-unix.c b/criu/sk-unix.c
new file mode 100644
index 000000000000..608eb3eb3376
--- /dev/null
+++ b/criu/sk-unix.c
@@ -0,0 +1,1435 @@
+#include <sys/socket.h>
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <unistd.h>
+#include <netinet/tcp.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/un.h>
+#include <stdlib.h>
+#include <dlfcn.h>
+
+#include "asm/types.h"
+#include "libnetlink.h"
+#include "cr_options.h"
+#include "imgset.h"
+#include "unix_diag.h"
+#include "files.h"
+#include "file-ids.h"
+#include "image.h"
+#include "log.h"
+#include "util.h"
+#include "util-pie.h"
+#include "sockets.h"
+#include "sk-queue.h"
+#include "mount.h"
+#include "cr-service.h"
+#include "plugin.h"
+#include "namespaces.h"
+#include "pstree.h"
+
+#include "protobuf.h"
+#include "protobuf/sk-unix.pb-c.h"
+
+#undef LOG_PREFIX
+#define LOG_PREFIX "sk unix: "
+
+/*
+ * Bookkeeping for a socket bound to a relative path: the device/inode
+ * pair taken from the UNIX_DIAG_VFS attribute, plus the directory the
+ * name was resolved against (set by resolve_rel_name()).
+ */
+typedef struct {
+ char *dir; /* xstrdup()-ed directory, NULL until resolved */
+ unsigned int udiag_vfs_dev;
+ unsigned int udiag_vfs_ino;
+} rel_name_desc_t;
+
+/*
+ * Dump-time description of one unix socket, built by unix_collect_one()
+ * from a unix diag message.
+ */
+struct unix_sk_desc {
+ struct socket_desc sd;
+ unsigned int type; /* udiag_type */
+ unsigned int state; /* udiag_state */
+ unsigned int peer_ino; /* from UNIX_DIAG_PEER, 0 if absent */
+ unsigned int rqlen; /* udiag_rqueue from UNIX_DIAG_RQLEN */
+ unsigned int wqlen; /* udiag_wqueue, dumped as the backlog */
+ unsigned int namelen;
+ char *name; /* NULL when absent or dropped */
+ rel_name_desc_t *rel_name; /* set only for relative names */
+ unsigned int nr_icons; /* number of entries in icons[] */
+ unsigned int *icons; /* in-flight connections (UNIX_DIAG_ICONS) */
+ unsigned char shutdown; /* from UNIX_DIAG_SHUTDOWN */
+
+ /* Attributes of the bound socket file, as returned by fstatat() */
+ mode_t mode;
+ uid_t uid;
+ gid_t gid;
+
+ struct list_head list; /* link in unix_sockets */
+
+ int fd; /* dup() of the dumped fd while the entry write is postponed, else -1 */
+ struct list_head peer_list; /* sockets waiting for this one to be dumped */
+ struct list_head peer_node; /* link in the peer's peer_list */
+
+ UnixSkEntry *ue; /* image entry not yet written, NULL afterwards */
+};
+
+/*
+ * On dump: peers referenced by dumped sockets but not dumped themselves
+ * yet (see dump_one_unix_fd() and fix_external_unix_sockets()).
+ * find_unix_sk_by_ino() walks the same head over unix_sk_info entries.
+ */
+static LIST_HEAD(unix_sockets);
+
+/*
+ * Hash chain entry mapping the inode of an in-flight connection to the
+ * socket whose UNIX_DIAG_ICONS attribute listed it.
+ */
+struct unix_sk_listen_icon {
+ unsigned int peer_ino; /* inode of the in-flight connection */
+ struct unix_sk_desc *sk_desc; /* socket that reported it */
+ struct unix_sk_listen_icon *next; /* next entry in the same bucket */
+};
+
+/*
+ * One entry of opts.ext_unixsk_ids: an inode for which
+ * unix_sk_exception_lookup_id() reports a match.
+ */
+struct unix_sk_exception {
+ struct list_head unix_sk_list; /* link in opts.ext_unixsk_ids */
+ ino_t unix_sk_ino;
+};
+
+/* Number of buckets in the in-flight connection hash below */
+#define SK_HASH_SIZE 32
+
+/* In-flight connections, hashed by inode modulo SK_HASH_SIZE */
+static struct unix_sk_listen_icon *unix_listen_icons[SK_HASH_SIZE];
+
+/*
+ * Find the in-flight connection entry recorded for the given inode.
+ *
+ * Returns the entry, or NULL if the inode is not a known in-flight
+ * connection.
+ */
+static struct unix_sk_listen_icon *lookup_unix_listen_icons(int peer_ino)
+{
+	/*
+	 * Hash on the unsigned value: inodes above INT_MAX arrive here as
+	 * negative ints, and a negative remainder would index memory in
+	 * front of the table.
+	 */
+	unsigned int ino = peer_ino;
+	struct unix_sk_listen_icon *ic;
+
+	for (ic = unix_listen_icons[ino % SK_HASH_SIZE]; ic; ic = ic->next)
+		if (ic->peer_ino == ino)
+			return ic;
+
+	return NULL;
+}
+
+/*
+ * Debug-print one collected unix socket, prefixed with @act, followed by
+ * one line per in-flight connection it carries.
+ */
+static void show_one_unix(char *act, const struct unix_sk_desc *sk)
+{
+	unsigned int idx;
+
+	pr_debug("\t%s: ino %#x peer_ino %#x family %4d type %4d state %2d name %s\n",
+		act, sk->sd.ino, sk->peer_ino, sk->sd.family, sk->type, sk->state, sk->name);
+
+	for (idx = 0; idx < sk->nr_icons; idx++)
+		pr_debug("\t\ticon: %4d\n", sk->icons[idx]);
+}
+
+/* Log the key fields of a unix socket image entry, prefixed with @act. */
+static void show_one_unix_img(const char *act, const UnixSkEntry *e)
+{
+	int name_bytes = (int)e->name.len;
+
+	pr_info("\t%s: id %#x ino %#x peer %#x type %d state %d name %d bytes\n",
+		act, e->id, e->ino, e->peer, e->type, e->state, name_bytes);
+}
+
+/*
+ * Tell whether a unix socket is of a supported type and in a supported
+ * state. Returns 1 if it can be dumped, 0 (with an error logged) if not.
+ */
+static int can_dump_unix_sk(const struct unix_sk_desc *sk)
+{
+	/*
+	 * Seqpacket is accepted as well: such a socket may be connected
+	 * to cr_service and is handled by the caller.
+	 */
+	if (!(sk->type == SOCK_STREAM ||
+	      sk->type == SOCK_DGRAM ||
+	      sk->type == SOCK_SEQPACKET)) {
+		pr_err("Unsupported type (%d) on socket %x.\n"
+				"Only stream/dgram/seqpacket are supported.\n",
+				sk->type, sk->sd.ino);
+		return 0;
+	}
+
+	if (sk->state != TCP_LISTEN &&
+	    sk->state != TCP_ESTABLISHED &&
+	    sk->state != TCP_CLOSE) {
+		pr_err("Unknown state %d for unix socket %x\n",
+				sk->state, sk->sd.ino);
+		return 0;
+	}
+
+	return 1;
+}
+
+/* Return true if @ino is on the opts.ext_unixsk_ids exception list. */
+static bool unix_sk_exception_lookup_id(ino_t ino)
+{
+	struct unix_sk_exception *ex;
+
+	list_for_each_entry(ex, &opts.ext_unixsk_ids, unix_sk_list) {
+		if (ex->unix_sk_ino != ino)
+			continue;
+
+		pr_debug("Found ino %u in exception unix sk list\n", (unsigned int)ino);
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * Write the pending image entry of @sk to the CR_FD_UNIXSK image and
+ * release it. The entry is freed and sk->ue cleared whether or not the
+ * write succeeded. Returns the result of pb_write_one().
+ */
+static int write_unix_entry(struct unix_sk_desc *sk)
+{
+	UnixSkEntry *entry = sk->ue;
+	int ret;
+
+	ret = pb_write_one(img_from_set(glob_imgset, CR_FD_UNIXSK), entry, PB_UNIX_SK);
+
+	show_one_unix_img("Dumped", entry);
+
+	release_skopts(entry->opts);
+	xfree(entry);
+	sk->ue = NULL;
+
+	return ret;
+}
+
+/*
+ * Find the directory a relative socket name was bound in.
+ *
+ * The cwd and then the root of the task owning the fd (p->pid) are tried:
+ * "<dir>/<name>" is stat-ed relative to the root of the task's mount
+ * namespace and accepted when inode and device match the values saved in
+ * sk->rel_name. On a match rel_name->dir is set to a copy of the
+ * directory and the file's mode/uid/gid are stored in @sk.
+ *
+ * Returns 0 on success, -ENOENT if nothing matches or the task/namespace
+ * cannot be found, -ENOMEM on allocation failure, -1 if readlink() fails.
+ */
+static int resolve_rel_name(struct unix_sk_desc *sk, const struct fd_parms *p)
+{
+ rel_name_desc_t *rel_name = sk->rel_name;
+ const char *dirs[] = { "cwd", "root" };
+ struct pstree_item *task;
+ int mntns_root, i;
+ struct ns_id *ns;
+
+ for_each_pstree_item(task) {
+ if (task->pid.real == p->pid)
+ break;
+ }
+ if (!task) {
+ pr_err("Can't find task with pid %d\n", p->pid);
+ return -ENOENT;
+ }
+
+ ns = lookup_ns_by_id(task->ids->mnt_ns_id, &mnt_ns_desc);
+ if (!ns) {
+ pr_err("Can't resolve mount namespace for pid %d\n", p->pid);
+ return -ENOENT;
+ }
+
+ mntns_root = mntns_get_root_fd(ns);
+ if (mntns_root < 0) {
+ pr_err("Can't resolve fs root for pid %d\n", p->pid);
+ return -ENOENT;
+ }
+
+ pr_debug("Resolving relative name %s for socket %x\n",
+ sk->name, sk->sd.ino);
+
+ for (i = 0; i < ARRAY_SIZE(dirs); i++) {
+ char dir[PATH_MAX], path[PATH_MAX];
+ struct stat st;
+ int ret;
+
+ snprintf(path, sizeof(path), "/proc/%d/%s", p->pid, dirs[i]);
+ ret = readlink(path, dir, sizeof(dir));
+ /* A result filling the whole buffer leaves no room for the terminator */
+ if (ret < 0 || (size_t)ret == sizeof(dir)) {
+ pr_err("Can't readlink for %s\n", dirs[i]);
+ return -1;
+ }
+ dir[ret] = 0;
+
+ /* The leading dot makes the path relative to mntns_root */
+ snprintf(path, sizeof(path), ".%s/%s", dir, sk->name);
+ if (fstatat(mntns_root, path, &st, 0)) {
+ if (errno == ENOENT)
+ continue;
+ goto err;
+ }
+
+ if ((st.st_ino == rel_name->udiag_vfs_ino) &&
+ phys_stat_dev_match(st.st_dev, rel_name->udiag_vfs_dev, ns, &path[1])) {
+ rel_name->dir = xstrdup(dir);
+ if (!rel_name->dir)
+ return -ENOMEM;
+
+ pr_debug("Resolved relative socket name to dir %s\n", rel_name->dir);
+ sk->mode = st.st_mode;
+ sk->uid = st.st_uid;
+ sk->gid = st.st_gid;
+ return 0;
+ }
+ }
+
+err:
+ pr_err("Can't resolve name for socket %#x\n", rel_name->udiag_vfs_ino);
+ return -ENOENT;
+}
+
+/*
+ * Dump one unix socket referenced by file descriptor @lfd.
+ *
+ * Builds the UnixSkEntry (together with its options, permissions and
+ * fown sub-entries, all carved from a single allocation), validates the
+ * relation with the peer, dumps socket options and the receive queue, and
+ * writes the entry to the image. When the peer has not been dumped yet
+ * the write is postponed: the socket is linked on the peer's peer_list
+ * and a dup() of @lfd is kept in sk->fd.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int dump_one_unix_fd(int lfd, u32 id, const struct fd_parms *p)
+{
+ struct unix_sk_desc *sk, *peer;
+ UnixSkEntry *ue;
+ SkOptsEntry *skopts;
+ FilePermsEntry *perms;
+ FownEntry *fown;
+
+ /* One allocation holds the entry and its three sub-entries back to back */
+ ue = xmalloc(sizeof(UnixSkEntry) +
+ sizeof(SkOptsEntry) +
+ sizeof(FilePermsEntry) +
+ sizeof(FownEntry));
+ if (ue == NULL)
+ return -1;
+
+ skopts = (void *) ue + sizeof(UnixSkEntry);
+ perms = (void *) skopts + sizeof(SkOptsEntry);
+ fown = (void *) perms + sizeof(FilePermsEntry);
+
+ unix_sk_entry__init(ue);
+ sk_opts_entry__init(skopts);
+ file_perms_entry__init(perms);
+
+ *fown = p->fown;
+
+ sk = (struct unix_sk_desc *)lookup_socket(p->stat.st_ino, PF_UNIX, 0);
+ if (IS_ERR_OR_NULL(sk)) {
+ pr_err("Unix socket %#x not found\n", (int)p->stat.st_ino);
+ goto err;
+ }
+
+ if (!can_dump_unix_sk(sk))
+ goto err;
+
+ BUG_ON(sk->sd.already_dumped);
+
+ ue->name.len = (size_t)sk->namelen;
+ ue->name.data = (void *)sk->name;
+
+ ue->id = id;
+ ue->ino = sk->sd.ino;
+ ue->type = sk->type;
+ ue->state = sk->state;
+ ue->flags = p->flags;
+ ue->backlog = sk->wqlen;
+ ue->peer = sk->peer_ino;
+ ue->fown = fown;
+ ue->opts = skopts;
+ ue->uflags = 0;
+
+ if (sk->rel_name) {
+ if (resolve_rel_name(sk, p))
+ goto err;
+ ue->name_dir = sk->rel_name->dir;
+ }
+
+ /*
+ * Check if this socket is connected to criu service.
+ * Dump it like closed one and mark it for restore.
+ */
+ if (unlikely(ue->peer == service_sk_ino)) {
+ ue->state = TCP_CLOSE;
+ ue->peer = 0;
+ ue->uflags |= USK_SERVICE;
+ }
+
+ /* File permissions are recorded only for names with a non-zero first byte */
+ if (sk->namelen && *sk->name) {
+ ue->file_perms = perms;
+
+ perms->mode = sk->mode;
+ perms->uid = userns_uid(sk->uid);
+ perms->gid = userns_gid(sk->gid);
+ }
+
+ sk_encode_shutdown(ue, sk->shutdown);
+
+ if (ue->peer) {
+ peer = (struct unix_sk_desc *)lookup_socket(ue->peer, PF_UNIX, 0);
+ if (IS_ERR_OR_NULL(peer)) {
+ pr_err("Unix socket %#x without peer %#x\n",
+ ue->ino, ue->peer);
+ goto err;
+ }
+
+ /*
+ * Peer should have us as peer or have a name by which
+ * we can access one.
+ */
+ if (peer->peer_ino != ue->ino) {
+ if (!peer->name) {
+ pr_err("Unix socket %#x with unreachable peer %#x (%#x/%s)\n",
+ ue->ino, ue->peer, peer->peer_ino, peer->name);
+ goto err;
+ }
+ }
+
+ /*
+ * It can be external socket, so we defer dumping
+ * until all sockets the program owns are processed.
+ */
+ if (!peer->sd.already_dumped) {
+ if (list_empty(&peer->list)) {
+ show_one_unix("Add a peer", peer);
+ list_add_tail(&peer->list, &unix_sockets);
+ }
+
+ list_add(&sk->peer_node, &peer->peer_list);
+ sk->fd = dup(lfd);
+ if (sk->fd < 0) {
+ pr_perror("Unable to dup(%d)", lfd);
+ goto err;
+ }
+ }
+
+ if ((ue->type != SOCK_DGRAM) && (
+ ((ue->shutdown == SK_SHUTDOWN__READ) &&
+ (peer->shutdown != SK_SHUTDOWN__WRITE)) ||
+ ((ue->shutdown == SK_SHUTDOWN__WRITE) &&
+ (peer->shutdown != SK_SHUTDOWN__READ)) ||
+ ((ue->shutdown == SK_SHUTDOWN__BOTH) &&
+ (peer->shutdown != SK_SHUTDOWN__BOTH)) )) {
+ /*
+ * On restore we assume, that stream pairs must
+ * be shut down from one end only
+ */
+ pr_err("Shutdown mismatch %u:%d -> %u:%d\n",
+ ue->ino, ue->shutdown, peer->sd.ino, peer->shutdown);
+ goto err;
+ }
+ } else if (ue->state == TCP_ESTABLISHED) {
+ const struct unix_sk_listen_icon *e;
+
+ e = lookup_unix_listen_icons(ue->ino);
+ if (!e) {
+ /*
+ * ESTABLISHED socket without peer and without
+ * anyone waiting for it should be semi-closed
+ * connection.
+ */
+
+ if (ue->shutdown == SK_SHUTDOWN__BOTH) {
+ pr_info("Dumping semi-closed connection\n");
+ goto dump;
+ }
+
+ pr_err("Dangling connection %#x\n", ue->ino);
+ goto err;
+ }
+
+ /*
+ * If this is in-flight connection we need to figure
+ * out where to connect it on restore. Thus, tune up peer
+ * id by searching an existing listening socket.
+ *
+ * Note the socket name will be found at restore stage,
+ * not now, just to reduce size of dump files.
+ */
+
+ /* e->sk_desc is _never_ NULL */
+ if (e->sk_desc->state != TCP_LISTEN) {
+ pr_err("In-flight connection on "
+ "non-listening socket %d\n", ue->ino);
+ goto err;
+ }
+
+ ue->peer = e->sk_desc->sd.ino;
+
+ pr_debug("\t\tFixed inflight socket %#x peer %#x)\n",
+ ue->ino, ue->peer);
+ }
+dump:
+ if (dump_socket_opts(lfd, skopts))
+ goto err;
+
+ /*
+ * If a stream listening socket has non-zero rqueue, this
+ * means there are in-flight connections waiting to get
+ * accept()-ed. We handle them separately with the "icons"
+ * (i stands for in-flight, cons -- for connections) things.
+ */
+ if (sk->rqlen != 0 && !(sk->type == SOCK_STREAM &&
+ sk->state == TCP_LISTEN))
+ if (dump_sk_queue(lfd, id))
+ goto err;
+
+ pr_info("Dumping unix socket at %d\n", p->fd);
+ show_one_unix("Dumping", sk);
+
+ sk->ue = ue;
+ /*
+ * Postpone writing the entry if a peer isn't found yet.
+ * It's required, because we may need to modify the entry.
+ * For example, if a socket is external and is dumped by
+ * a callback, the USK_CALLBACK flag must be set.
+ */
+ /* write_unix_entry() frees ue itself, hence the plain return below */
+ if (list_empty(&sk->peer_node) && write_unix_entry(sk))
+ return -1;
+
+ list_del_init(&sk->list);
+ sk->sd.already_dumped = 1;
+
+ /* Flush the entries of sockets that were waiting for this one */
+ while (!list_empty(&sk->peer_list)) {
+ struct unix_sk_desc *psk;
+ psk = list_first_entry(&sk->peer_list, struct unix_sk_desc, peer_node);
+ close_safe(&psk->fd);
+ list_del_init(&psk->peer_node);
+
+ if (write_unix_entry(psk))
+ return -1;
+ }
+
+ return 0;
+
+err:
+ release_skopts(skopts);
+ xfree(ue);
+ return -1;
+}
+
+/* Dump operations for FD_TYPES__UNIXSK descriptors: dump_one_unix_fd() */
+const struct fdtype_ops unix_dump_ops = {
+ .type = FD_TYPES__UNIXSK,
+ .dump = dump_one_unix_fd,
+};
+
+/*
+ * Process the UNIX_DIAG_NAME attribute of a socket and store the name in
+ * @d.
+ *
+ * - A name starting with a zero byte is stored as is.
+ * - A relative name is stored together with a rel_name descriptor; it
+ *   is resolved later, at the first dump attempt.
+ * - An absolute name is stat-ed relative to the root task's mount
+ *   namespace; if the file is gone or is a different file the name is
+ *   dropped, otherwise the file's mode/uid/gid are recorded.
+ *
+ * Returns: < 0 on error, 0 if OK, 1 to skip the socket
+ */
+static int unix_process_name(struct unix_sk_desc *d, const struct unix_diag_msg *m, struct rtattr **tb)
+{
+	int len, ret;
+	char *name;
+
+	len = RTA_PAYLOAD(tb[UNIX_DIAG_NAME]);
+	name = xmalloc(len + 1);
+	if (!name)
+		return -ENOMEM;
+
+	memcpy(name, RTA_DATA(tb[UNIX_DIAG_NAME]), len);
+	name[len] = '\0';
+
+	if (name[0] != '\0') {
+		struct unix_diag_vfs *uv;
+		bool drop_path = false;
+		char rpath[PATH_MAX];
+		struct ns_id *ns;
+		struct stat st;
+		int mntns_root;
+
+		if (!tb[UNIX_DIAG_VFS]) {
+			pr_err("Bound socket w/o inode %#x\n", m->udiag_ino);
+			goto skip;
+		}
+
+		ns = lookup_ns_by_id(root_item->ids->mnt_ns_id, &mnt_ns_desc);
+		if (!ns) {
+			ret = -ENOENT;
+			goto out;
+		}
+
+		mntns_root = mntns_get_root_fd(ns);
+		if (mntns_root < 0) {
+			ret = -ENOENT;
+			goto out;
+		}
+
+		uv = RTA_DATA(tb[UNIX_DIAG_VFS]);
+		if (name[0] != '/') {
+			/*
+			 * Relative names are resolved later, at the first
+			 * dump attempt.
+			 */
+			rel_name_desc_t *rel_name = xzalloc(sizeof(*rel_name));
+			if (!rel_name) {
+				ret = -ENOMEM;
+				goto out;
+			}
+			rel_name->udiag_vfs_dev = uv->udiag_vfs_dev;
+			rel_name->udiag_vfs_ino = uv->udiag_vfs_ino;
+
+			d->rel_name = rel_name;
+			goto postpone;
+		}
+
+		/* The leading dot makes the path relative to mntns_root */
+		snprintf(rpath, sizeof(rpath), ".%s", name);
+		if (fstatat(mntns_root, rpath, &st, 0)) {
+			if (errno != ENOENT) {
+				pr_warn("Can't stat socket %#x(%s), skipping: %m (err %d)\n",
+					m->udiag_ino, rpath, errno);
+				goto skip;
+			}
+
+			pr_info("unix: Dropping path %s for unlinked sk %#x\n",
+				name, m->udiag_ino);
+			drop_path = true;
+		} else if ((st.st_ino != uv->udiag_vfs_ino) ||
+			   !phys_stat_dev_match(st.st_dev, uv->udiag_vfs_dev, ns, name)) {
+			pr_info("unix: Dropping path %s for unlinked bound "
+				"sk %#x.%#x real %#x.%#x\n",
+				name, (int)st.st_dev, (int)st.st_ino,
+				(int)uv->udiag_vfs_dev, (int)uv->udiag_vfs_ino);
+			drop_path = true;
+		}
+
+		if (drop_path) {
+			/*
+			 * When a socket is bound to an unlinked file we
+			 * just drop its name, since no one will access
+			 * it via one.
+			 */
+			xfree(name);
+			len = 0;
+			name = NULL;
+		} else {
+			/*
+			 * Only here is st known to describe the socket
+			 * file: when fstatat() failed it was never filled
+			 * in, and on a mismatch it describes another file.
+			 */
+			d->mode = st.st_mode;
+			d->uid = st.st_uid;
+			d->gid = st.st_gid;
+		}
+	}
+
+postpone:
+	d->namelen = len;
+	d->name = name;
+	return 0;
+
+out:
+	xfree(name);
+	return ret;
+skip:
+	ret = 1;
+	goto out;
+}
+
+/*
+ * Build a unix_sk_desc from one unix diag message and its parsed
+ * attributes, and register it with sk_collect_one().
+ *
+ * Returns 0 when the socket was collected, 1 when it was skipped (see
+ * unix_process_name()), -1 on error. In the latter two cases the
+ * descriptor is freed.
+ */
+static int unix_collect_one(const struct unix_diag_msg *m,
+		struct rtattr **tb)
+{
+	struct unix_sk_desc *d;
+	int ret = 0;
+
+	d = xzalloc(sizeof(*d));
+	if (!d)
+		return -1;
+
+	d->type = m->udiag_type;
+	d->state = m->udiag_state;
+	INIT_LIST_HEAD(&d->list);
+
+	INIT_LIST_HEAD(&d->peer_list);
+	INIT_LIST_HEAD(&d->peer_node);
+	d->fd = -1;
+
+	if (tb[UNIX_DIAG_SHUTDOWN])
+		d->shutdown = *(u8 *)RTA_DATA(tb[UNIX_DIAG_SHUTDOWN]);
+	else
+		pr_err_once("No socket shutdown info\n");
+
+	if (tb[UNIX_DIAG_PEER])
+		d->peer_ino = *(int *)RTA_DATA(tb[UNIX_DIAG_PEER]);
+
+	if (tb[UNIX_DIAG_NAME]) {
+		ret = unix_process_name(d, m, tb);
+		if (ret < 0)
+			goto err;
+		else if (ret == 1)
+			goto skip;
+		BUG_ON(ret != 0);
+	}
+
+	if (tb[UNIX_DIAG_ICONS]) {
+		int len = RTA_PAYLOAD(tb[UNIX_DIAG_ICONS]);
+		unsigned int i;
+
+		d->icons = xmalloc(len);
+		if (!d->icons)
+			goto err;
+
+		memcpy(d->icons, RTA_DATA(tb[UNIX_DIAG_ICONS]), len);
+		d->nr_icons = len / sizeof(u32);
+
+		/*
+		 * Remember these sockets, we will need them
+		 * to fix up in-flight sockets peers.
+		 */
+		for (i = 0; i < d->nr_icons; i++) {
+			struct unix_sk_listen_icon *e, **chain;
+			/*
+			 * Keep the inode unsigned: with a signed value an
+			 * inode above INT_MAX gives a negative remainder
+			 * and thus a slot in front of the table.
+			 */
+			unsigned int n = d->icons[i];
+
+			e = xzalloc(sizeof(*e));
+			if (!e)
+				goto err;
+
+			chain = &unix_listen_icons[n % SK_HASH_SIZE];
+			e->next = *chain;
+			*chain = e;
+
+			pr_debug("\t\tCollected icon %d\n", d->icons[i]);
+
+			e->peer_ino = n;
+			e->sk_desc = d;
+		}
+	}
+
+	if (tb[UNIX_DIAG_RQLEN]) {
+		struct unix_diag_rqlen *rq;
+
+		rq = (struct unix_diag_rqlen *)RTA_DATA(tb[UNIX_DIAG_RQLEN]);
+		d->rqlen = rq->udiag_rqueue;
+		d->wqlen = rq->udiag_wqueue;
+	}
+
+	sk_collect_one(m->udiag_ino, AF_UNIX, &d->sd);
+	show_one_unix("Collected", d);
+
+	return 0;
+err:
+	ret = -1;
+skip:
+	/*
+	 * NOTE(review): icon entries already linked into unix_listen_icons
+	 * keep pointing at d after this point; callers presumably abort
+	 * the dump on error -- confirm.
+	 */
+	xfree(d->icons);
+	xfree(d->name);
+	/* Set by unix_process_name() for relative names; its dir is still NULL here */
+	xfree(d->rel_name);
+	xfree(d);
+	return ret;
+}
+
+/*
+ * Callback for one unix diag message: parse the attributes that follow
+ * the unix_diag_msg header and hand them to unix_collect_one().
+ */
+int unix_receive_one(struct nlmsghdr *h, void *arg)
+{
+	struct rtattr *tb[UNIX_DIAG_MAX+1];
+	struct unix_diag_msg *msg = NLMSG_DATA(h);
+	struct rtattr *attrs = (struct rtattr *)(msg + 1);
+
+	parse_rtattr(tb, UNIX_DIAG_MAX, attrs,
+		     h->nlmsg_len - NLMSG_LENGTH(sizeof(*msg)));
+
+	return unix_collect_one(msg, tb);
+}
+
+/*
+ * Decide whether @sk, connected to the external socket @peer and not
+ * handled by any plugin, may be dumped. Returns 0 if so, -1 (with the
+ * reason logged) otherwise.
+ */
+static int ext_unix_sk_dumpable(struct unix_sk_desc *peer, struct unix_sk_desc *sk)
+{
+	if (!opts.ext_unix_sk) {
+		show_one_unix("Runaway socket", peer);
+		pr_err("External socket is used. "
+				"Consider using --" USK_EXT_PARAM " option.\n");
+		return -1;
+	}
+
+	if (unix_sk_exception_lookup_id(sk->sd.ino)) {
+		pr_debug("found exception for unix name-less external socket.\n");
+		return 0;
+	}
+
+	if (peer->type != SOCK_DGRAM) {
+		show_one_unix("Ext stream not supported", peer);
+		pr_err("Can't dump half of stream unix connection.\n");
+		return -1;
+	}
+
+	if (!peer->name) {
+		show_one_unix("Ext dgram w/o name", peer);
+		pr_err("Can't dump name-less external socket.\n");
+		pr_err("%d\n", sk->fd);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Write out the postponed entries of all sockets connected to the
+ * external socket @peer. Each socket is first offered to the plugins;
+ * when a plugin takes it the entry is marked with USK_CALLBACK.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int dump_external_sockets(struct unix_sk_desc *peer)
+{
+	struct unix_sk_desc *sk;
+	int ret;
+
+	while (!list_empty(&peer->peer_list)) {
+		sk = list_first_entry(&peer->peer_list, struct unix_sk_desc, peer_node);
+
+		ret = run_plugins(DUMP_UNIX_SK, sk->fd, sk->sd.ino);
+		if (ret == -ENOTSUP) {
+			if (ext_unix_sk_dumpable(peer, sk))
+				return -1;
+		} else if (ret < 0) {
+			return -1;
+		} else {
+			sk->ue->uflags |= USK_CALLBACK;
+		}
+
+		if (write_unix_entry(sk))
+			return -1;
+		close_safe(&sk->fd);
+		list_del_init(&sk->peer_node);
+	}
+
+	return 0;
+}
+
+/*
+ * Dump every socket left on unix_sockets as an external one: write a
+ * USK_EXTERN entry for it, then flush the postponed entries of the
+ * sockets connected to it.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int fix_external_unix_sockets(void)
+{
+	struct unix_sk_desc *ext;
+
+	pr_debug("Dumping external sockets\n");
+
+	list_for_each_entry(ext, &unix_sockets, list) {
+		FownEntry fown = FOWN_ENTRY__INIT;
+		SkOptsEntry skopts = SK_OPTS_ENTRY__INIT;
+		UnixSkEntry e = UNIX_SK_ENTRY__INIT;
+
+		show_one_unix("Dumping extern", ext);
+
+		BUG_ON(ext->sd.already_dumped);
+
+		fd_id_generate_special(NULL, &e.id);
+		e.ino		= ext->sd.ino;
+		e.type		= SOCK_DGRAM;
+		e.state		= TCP_LISTEN;
+		e.name.data	= (void *)ext->name;
+		e.name.len	= (size_t)ext->namelen;
+		e.uflags	= USK_EXTERN;
+		e.peer		= 0;
+		e.fown		= &fown;
+		e.opts		= &skopts;
+
+		if (pb_write_one(img_from_set(glob_imgset, CR_FD_UNIXSK), &e, PB_UNIX_SK))
+			return -1;
+
+		show_one_unix_img("Dumped extern", &e);
+
+		if (dump_external_sockets(ext))
+			return -1;
+	}
+
+	return 0;
+}
+
+/* Restore-time description of one unix socket read from the image. */
+struct unix_sk_info {
+ UnixSkEntry *ue;
+ struct list_head list; /* link in unix_sockets */
+ char *name; /* copied into sun_path, ue->name.len bytes */
+ char *name_dir; /* directory to chdir() into around bind/connect, or NULL */
+ unsigned flags; /* USK_PAIR_* bits */
+ struct unix_sk_info *peer;
+ struct file_desc d;
+
+ /*
+ * Futex to signal when the socket is prepared. In particular, we
+ * signal after bind()ing the socket if it is not in TCP_LISTEN, or
+ * after listen() if the socket is in TCP_LISTEN.
+ */
+ futex_t prepared;
+
+ /*
+ * For DGRAM sockets with queues, we should only restore the queue
+ * once although it may be open by more than one tid. This is the peer
+ * that should do the queueing.
+ */
+ u32 queuer;
+};
+
+/*
+ * Bits for unix_sk_info.flags. Sockets carrying either bit are left
+ * alone by post_open_unix_sk(); the slave bit is what
+ * unixsk_should_open_transport() reports.
+ */
+#define USK_PAIR_MASTER 0x1
+#define USK_PAIR_SLAVE 0x2
+
+/*
+ * Look up a restore-time unix socket by the inode stored in its image
+ * entry. Returns the socket info, or NULL when there is none.
+ */
+static struct unix_sk_info *find_unix_sk_by_ino(int ino)
+{
+	struct unix_sk_info *cur, *found = NULL;
+
+	list_for_each_entry(cur, &unix_sockets, list) {
+		if (cur->ue->ino == ino) {
+			found = cur;
+			break;
+		}
+	}
+
+	return found;
+}
+
+/*
+ * Re-apply the shutdown state saved in the image entry, if any.
+ * Returns 0 on success or when nothing has to be done, -1 if shutdown()
+ * fails.
+ */
+static int shutdown_unix_sk(int sk, struct unix_sk_info *ui)
+{
+	UnixSkEntry *ue = ui->ue;
+	int how;
+
+	if (ue->has_shutdown && ue->shutdown != SK_SHUTDOWN__NONE) {
+		how = sk_decode_shutdown(ue->shutdown);
+		if (shutdown(sk, how)) {
+			pr_perror("Can't shutdown unix socket");
+			return -1;
+		}
+
+		pr_debug("Socket %#x is shut down %d\n", ue->ino, how);
+	}
+
+	return 0;
+}
+
+/*
+ * Go back to the working directory saved by prep_unix_sk_cwd() and close
+ * the saved descriptor. Does nothing when no directory was saved.
+ */
+static void revert_unix_sk_cwd(int *prev_cwd_fd)
+{
+	if (!prev_cwd_fd || *prev_cwd_fd < 0)
+		return;
+
+	if (fchdir(*prev_cwd_fd) == 0)
+		pr_debug("Reverted working dir\n");
+	else
+		pr_perror("Can't revert working dir");
+
+	close(*prev_cwd_fd);
+	*prev_cwd_fd = -1;
+}
+
+/*
+ * If the socket name is relative to a directory, chdir() there and leave
+ * a descriptor of the previous working directory in *prev_cwd_fd;
+ * otherwise *prev_cwd_fd is set to -1.
+ *
+ * Returns 0 on success, -1 on failure (*prev_cwd_fd is -1 or negative
+ * then).
+ */
+static int prep_unix_sk_cwd(struct unix_sk_info *ui, int *prev_cwd_fd)
+{
+	if (!ui->name_dir) {
+		*prev_cwd_fd = -1;
+		return 0;
+	}
+
+	*prev_cwd_fd = open(".", O_RDONLY);
+	if (*prev_cwd_fd < 0) {
+		pr_err("Can't open current dir\n");
+		return -1;
+	}
+
+	if (chdir(ui->name_dir)) {
+		pr_perror("Can't change working dir %s",
+			  ui->name_dir);
+		close(*prev_cwd_fd);
+		*prev_cwd_fd = -1;
+		return -1;
+	}
+
+	pr_debug("Change working dir to %s\n", ui->name_dir);
+	return 0;
+}
+
+/*
+ * Post-open hook for a unix socket: connect it to its peer and finish
+ * the restore (queue, file parameters, socket options, shutdown state).
+ *
+ * Nothing is done for socketpair ends, for sockets without a peer, for
+ * sockets restored by a plugin (USK_CALLBACK) and, past waiting for the
+ * peer, for inherited ones (USK_INHERIT).
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int post_open_unix_sk(struct file_desc *d, int fd)
+{
+	struct unix_sk_info *ui;
+	struct unix_sk_info *peer;
+	struct sockaddr_un addr;
+	int cwd_fd = -1;
+
+	ui = container_of(d, struct unix_sk_info, d);
+	if (ui->flags & (USK_PAIR_MASTER | USK_PAIR_SLAVE))
+		return 0;
+
+	peer = ui->peer;
+
+	if (peer == NULL)
+		return 0;
+
+	if (ui->ue->uflags & USK_CALLBACK)
+		return 0;
+
+	/* Skip external sockets */
+	if (!list_empty(&peer->d.fd_info_head))
+		futex_wait_while(&peer->prepared, 0);
+
+	if (ui->ue->uflags & USK_INHERIT)
+		return 0;
+
+	/* The length comes from the image; never copy past sun_path */
+	if (peer->ue->name.len > sizeof(addr.sun_path)) {
+		pr_err("Name of peer %#x is too long (%d bytes)\n",
+		       peer->ue->ino, (int)peer->ue->name.len);
+		return -1;
+	}
+
+	memset(&addr, 0, sizeof(addr));
+	addr.sun_family = AF_UNIX;
+	memcpy(&addr.sun_path, peer->name, peer->ue->name.len);
+
+	pr_info("\tConnect %#x to %#x\n", ui->ue->ino, peer->ue->ino);
+
+	/* The peer name may be relative to the peer's directory */
+	if (prep_unix_sk_cwd(peer, &cwd_fd))
+		return -1;
+
+	if (connect(fd, (struct sockaddr *)&addr,
+				sizeof(addr.sun_family) +
+				peer->ue->name.len) < 0) {
+		revert_unix_sk_cwd(&cwd_fd);
+		pr_perror("Can't connect %#x socket", ui->ue->ino);
+		return -1;
+	}
+
+	revert_unix_sk_cwd(&cwd_fd);
+
+	/* Only the socket designated as queuer refills the peer's queue */
+	if (peer->queuer == ui->ue->ino && restore_sk_queue(fd, peer->ue->id))
+		return -1;
+
+	if (rst_file_params(fd, ui->ue->fown, ui->ue->flags))
+		return -1;
+
+	if (restore_socket_opts(fd, ui->ue->opts))
+		return -1;
+
+	if (shutdown_unix_sk(fd, ui))
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Bind a restored socket to its saved name and, for names with a
+ * non-zero first byte and saved permissions, restore owner and mode of
+ * the socket file. Established stream sockets are not bound at all.
+ *
+ * Sockets that are not in TCP_LISTEN state are reported as prepared
+ * (ui->prepared) once bound.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int bind_unix_sk(int sk, struct unix_sk_info *ui)
+{
+	struct sockaddr_un addr;
+	int cwd_fd = -1;
+	int ret = -1;
+
+	if ((ui->ue->type == SOCK_STREAM) && (ui->ue->state == TCP_ESTABLISHED)) {
+		/*
+		 * FIXME this can be done, but for doing this properly we
+		 * need to bind socket to its name, then rename one to
+		 * some temporary unique one and after all the sockets are
+		 * restored we should walk those temp names and rename
+		 * some of them back to real ones.
+		 */
+		ret = 0;
+		goto done;
+	}
+
+	/* The length comes from the image; never copy past sun_path */
+	if (ui->ue->name.len > sizeof(addr.sun_path)) {
+		pr_err("Name of socket %#x is too long (%d bytes)\n",
+		       ui->ue->ino, (int)ui->ue->name.len);
+		goto done;
+	}
+
+	memset(&addr, 0, sizeof(addr));
+	addr.sun_family = AF_UNIX;
+	memcpy(&addr.sun_path, ui->name, ui->ue->name.len);
+
+	if (prep_unix_sk_cwd(ui, &cwd_fd))
+		return -1;
+
+	if (bind(sk, (struct sockaddr *)&addr,
+				sizeof(addr.sun_family) + ui->ue->name.len)) {
+		pr_perror("Can't bind socket");
+		goto done;
+	}
+
+	if (ui->ue->name.len && *ui->name && ui->ue->file_perms) {
+		FilePermsEntry *perms = ui->ue->file_perms;
+		char fname[PATH_MAX];
+
+		if (ui->ue->name.len >= sizeof(fname)) {
+			pr_err("The file name is too long\n");
+			goto done;
+		}
+
+		/* The saved name is length-counted; make a terminated copy */
+		memcpy(fname, ui->name, ui->ue->name.len);
+		fname[ui->ue->name.len] = '\0';
+
+		if (fchownat(AT_FDCWD, fname, perms->uid, perms->gid, 0) == -1) {
+			pr_perror("Unable to change file owner and group");
+			goto done;
+		}
+
+		if (fchmodat(AT_FDCWD, fname, perms->mode, 0) == -1) {
+			pr_perror("Unable to change file mode bits");
+			goto done;
+		}
+	}
+
+	/* Listening sockets are reported as prepared by the caller, after listen() */
+	if (ui->ue->state != TCP_LISTEN)
+		futex_set_and_wake(&ui->prepared, 1);
+
+	ret = 0;
+done:
+	revert_unix_sk_cwd(&cwd_fd);
+	return ret;
+}
+
+/*
+ * Report whether the socket is the slave end of a pair: returns non-zero
+ * when USK_PAIR_SLAVE is set in its flags. @fe is not used.
+ */
+static int unixsk_should_open_transport(FdinfoEntry *fe,
+				struct file_desc *d)
+{
+	struct unix_sk_info *info = container_of(d, struct unix_sk_info, d);
+
+	return info->flags & USK_PAIR_SLAVE;
+}
+
+/*
+ * Open the master end of a socket pair: create the pair, refill both
+ * queues, finish the restore of the master end and send the slave end to
+ * the peer over a temporary transport socket.
+ *
+ * Returns the master descriptor on success. On failure returns -1 and
+ * closes every descriptor opened here.
+ */
+static int open_unixsk_pair_master(struct unix_sk_info *ui)
+{
+	int sk[2], tsk = -1;
+	struct unix_sk_info *peer = ui->peer;
+	struct fdinfo_list_entry *fle;
+
+	pr_info("Opening pair master (id %#x ino %#x peer %#x)\n",
+			ui->ue->id, ui->ue->ino, ui->ue->peer);
+
+	if (socketpair(PF_UNIX, ui->ue->type, 0, sk) < 0) {
+		pr_perror("Can't make socketpair");
+		return -1;
+	}
+
+	/* Data written into one end lands in the queue of the other one */
+	if (restore_sk_queue(sk[0], peer->ue->id))
+		goto err;
+	if (restore_sk_queue(sk[1], ui->ue->id))
+		goto err;
+
+	if (bind_unix_sk(sk[0], ui))
+		goto err;
+
+	if (rst_file_params(sk[0], ui->ue->fown, ui->ue->flags))
+		goto err;
+
+	if (restore_socket_opts(sk[0], ui->ue->opts))
+		goto err;
+
+	if (shutdown_unix_sk(sk[0], ui))
+		goto err;
+
+	tsk = socket(PF_UNIX, SOCK_DGRAM, 0);
+	if (tsk < 0) {
+		pr_perror("Can't make transport socket");
+		goto err;
+	}
+
+	fle = file_master(&peer->d);
+	if (send_fd_to_peer(sk[1], fle, tsk)) {
+		pr_err("Can't send pair slave\n");
+		goto err;
+	}
+
+	close(tsk);
+	close(sk[1]);
+
+	return sk[0];
+
+err:
+	if (tsk >= 0)
+		close(tsk);
+	close(sk[0]);
+	close(sk[1]);
+	return -1;
+}
+
+/*
+ * Open the slave end of a socket pair: receive the descriptor sent by
+ * the master over the transport fd, then bind it and restore its file
+ * parameters and socket options.
+ *
+ * Returns the socket descriptor on success. On failure returns -1 and
+ * closes the received descriptor.
+ */
+static int open_unixsk_pair_slave(struct unix_sk_info *ui)
+{
+	struct fdinfo_list_entry *fle;
+	int sk;
+
+	fle = file_master(&ui->d);
+
+	pr_info("Opening pair slave (id %#x ino %#x peer %#x) on %d\n",
+			ui->ue->id, ui->ue->ino, ui->ue->peer, fle->fe->fd);
+
+	sk = recv_fd(fle->fe->fd);
+	if (sk < 0) {
+		pr_err("Can't recv pair slave\n");
+		return -1;
+	}
+	close(fle->fe->fd);
+
+	if (bind_unix_sk(sk, ui))
+		goto err;
+
+	if (rst_file_params(sk, ui->ue->fown, ui->ue->flags))
+		goto err;
+
+	if (restore_socket_opts(sk, ui->ue->opts))
+		goto err;
+
+	/*
+	 * Stream socket's "slave" end will be shut down
+	 * together with master
+	 */
+	if (ui->ue->type == SOCK_DGRAM && shutdown_unix_sk(sk, ui))
+		goto err;
+
+	return sk;
+
+err:
+	close(sk);
+	return -1;
+}
+
+/*
+ * Open a socket that is not part of a restored pair. Depending on the
+ * image entry the descriptor comes from:
+ *  - a socketpair whose other end gets the dump response (USK_SERVICE);
+ *  - a socketpair used to refill the queue of an established stream
+ *    socket without a peer;
+ *  - a socketpair used to refill the queue of a dgram socket that has
+ *    no queuer;
+ *  - a plugin (USK_CALLBACK) or a plain socket() call.
+ *
+ * Except for plugin-provided sockets, the socket is then bound and, if
+ * it was listening, put into listen state. File parameters and socket
+ * options are restored in every case.
+ *
+ * Returns the descriptor on success, -1 on failure.
+ */
+static int open_unixsk_standalone(struct unix_sk_info *ui)
+{
+	int sk;
+
+	pr_info("Opening standalone socket (id %#x ino %#x peer %#x)\n",
+			ui->ue->id, ui->ue->ino, ui->ue->peer);
+
+	/*
+	 * Check if this socket was connected to criu service.
+	 * If so, put response, that dumping and restoring
+	 * was successful.
+	 */
+	if (ui->ue->uflags & USK_SERVICE) {
+		int sks[2];
+
+		if (socketpair(PF_UNIX, ui->ue->type, 0, sks)) {
+			pr_perror("Can't create socketpair");
+			return -1;
+		}
+
+		if (send_criu_dump_resp(sks[1], true, true) == -1)
+			return -1;
+
+		close(sks[1]);
+		sk = sks[0];
+	} else if ((ui->ue->state == TCP_ESTABLISHED) && !ui->ue->peer) {
+		int ret, sks[2];
+
+		if (ui->ue->type != SOCK_STREAM) {
+			pr_err("Non-stream socket %x in established state\n",
+					ui->ue->ino);
+			return -1;
+		}
+
+		if (ui->ue->shutdown != SK_SHUTDOWN__BOTH) {
+			pr_err("Wrong shutdown/peer state for %x\n",
+					ui->ue->ino);
+			return -1;
+		}
+
+		ret = socketpair(PF_UNIX, ui->ue->type, 0, sks);
+		if (ret < 0) {
+			pr_perror("Can't create socketpair");
+			return -1;
+		}
+
+		/*
+		 * Restore queue at the one end,
+		 * before closing the second one.
+		 */
+		if (restore_sk_queue(sks[1], ui->ue->id)) {
+			pr_perror("Can't restore socket queue");
+			return -1;
+		}
+
+		close(sks[1]);
+		sk = sks[0];
+	} else if (ui->ue->type == SOCK_DGRAM && !ui->queuer) {
+		struct sockaddr_un addr;
+		int sks[2];
+
+		if (socketpair(PF_UNIX, ui->ue->type, 0, sks) < 0) {
+			pr_perror("Can't create socketpair");
+			return -1;
+		}
+
+		sk = sks[0];
+		addr.sun_family = AF_UNSPEC;
+
+		/*
+		 * socketpair() assigns sks[1] as a peer of sks[0]
+		 * (and vice versa). But in this case (not zero peer)
+		 * it's impossible for other sockets to connect
+		 * to sks[0] (see unix_dgram_connect()->unix_may_send()).
+		 * The below is hack: we use that connect with AF_UNSPEC
+		 * clears socket's peer.
+		 */
+		if (connect(sk, (struct sockaddr *)&addr, sizeof(addr.sun_family))) {
+			pr_perror("Can't clear socket's peer");
+			return -1;
+		}
+
+		/*
+		 * This must be after the connect() hack, because
+		 * connect() flushes receive queue.
+		 */
+		if (restore_sk_queue(sks[1], ui->ue->id)) {
+			pr_perror("Can't restore socket queue");
+			return -1;
+		}
+		close(sks[1]);
+	} else {
+		if (ui->ue->uflags & USK_CALLBACK) {
+			/* A socket handed over by a plugin skips bind/listen */
+			sk = run_plugins(RESTORE_UNIX_SK, ui->ue->ino);
+			if (sk >= 0)
+				goto out;
+		}
+
+		/*
+		 * Connect to external sockets requires
+		 * special option to be passed.
+		 */
+		if (ui->peer && (ui->peer->ue->uflags & USK_EXTERN) &&
+				!(opts.ext_unix_sk)) {
+			pr_err("External socket found in image. "
+					"Consider using the --" USK_EXT_PARAM
+					" option to allow restoring it.\n");
+			return -1;
+		}
+
+		sk = socket(PF_UNIX, ui->ue->type, 0);
+		if (sk < 0) {
+			pr_perror("Can't make unix socket");
+			return -1;
+		}
+	}
+
+	if (bind_unix_sk(sk, ui))
+		return -1;
+
+	if (ui->ue->state == TCP_LISTEN) {
+		pr_info("\tPutting %#x into listen state\n", ui->ue->ino);
+		if (listen(sk, ui->ue->backlog) < 0) {
+			pr_perror("Can't make usk listen");
+			return -1;
+		}
+		/* Listening sockets count as prepared only after listen() */
+		futex_set_and_wake(&ui->prepared, 1);
+	}
+out:
+	if (rst_file_params(sk, ui->ue->fown, ui->ue->flags))
+		return -1;
+
+	if (restore_socket_opts(sk, ui->ue->opts))
+		return -1;
+
+	return sk;
+}
+
+/*
+ * ->open callback for unix sockets: pick the way the descriptor is
+ * obtained. An inherited fd wins; otherwise the pair master, pair slave
+ * or standalone opener is used according to ui->flags.
+ */
+static int open_unix_sk(struct file_desc *d)
+{
+	struct unix_sk_info *ui = container_of(d, struct unix_sk_info, d);
+	int unixsk_fd = -1;
+
+	if (inherited_fd(d, &unixsk_fd)) {
+		ui->ue->uflags |= USK_INHERIT;
+		return unixsk_fd;
+	}
+
+	if (ui->flags & USK_PAIR_MASTER)
+		return open_unixsk_pair_master(ui);
+
+	if (ui->flags & USK_PAIR_SLAVE)
+		return open_unixsk_pair_slave(ui);
+
+	return open_unixsk_standalone(ui);
+}
+
+/*
+ * ->name callback: format the "socket:[<ino>]" identifier of the socket
+ * into @buf (of size @s). Returns @buf, or NULL if it does not fit.
+ */
+static char *socket_d_name(struct file_desc *d, char *buf, size_t s)
+{
+	struct unix_sk_info *ui = container_of(d, struct unix_sk_info, d);
+	int written;
+
+	written = snprintf(buf, s, "socket:[%d]", ui->ue->ino);
+	if (written < s)
+		return buf;
+
+	pr_err("Not enough room for unixsk %d identifier string\n",
+			ui->ue->ino);
+	return NULL;
+}
+
+/*
+ * Callbacks for file descriptors of type FD_TYPES__UNIXSK; passed to
+ * file_desc_add() when a unix socket image entry is collected.
+ */
+static struct file_desc_ops unix_desc_ops = {
+ .type = FD_TYPES__UNIXSK,
+ .open = open_unix_sk,
+ .post_open = post_open_unix_sk,
+ .want_transport = unixsk_should_open_transport,
+ .name = socket_d_name,
+};
+
+/*
+ * Make the filesystem clean from the socket we're about to restore:
+ * unlink whatever currently carries the socket's name. Sockets whose
+ * name starts with a '\0' byte and external (USK_EXTERN) ones are left
+ * alone. The unlink runs between prep_unix_sk_cwd() and
+ * revert_unix_sk_cwd(); a failure to unlink is only warned about.
+ */
+static void unlink_stale(struct unix_sk_info *ui)
+{
+	int cwd_fd;
+
+	if (ui->name[0] == '\0')
+		return;
+	if (ui->ue->uflags & USK_EXTERN)
+		return;
+
+	if (prep_unix_sk_cwd(ui, &cwd_fd))
+		return;
+
+	if (unlinkat(AT_FDCWD, ui->name, 0)) {
+		pr_warn("Can't unlink stale socket %#x peer %#x (name %s dir %s)\n",
+			ui->ue->ino, ui->ue->peer,
+			ui->name ? (ui->name[0] ? ui->name : &ui->name[1]) : "-",
+			ui->name_dir ? ui->name_dir : "-");
+	}
+
+	revert_unix_sk_cwd(&cwd_fd);
+}
+
+/*
+ * Collect callback for one unix socket image entry: attach the decoded
+ * UnixSkEntry to the unix_sk_info in @o, validate and pick up the name
+ * (unlinking a stale filesystem entry when there is one), reset the
+ * restore-time state, put the socket on the unix_sockets list and
+ * register its file descriptor.
+ *
+ * Returns -1 on a bad name length, otherwise the file_desc_add() result.
+ */
+static int collect_one_unixsk(void *o, ProtobufCMessage *base)
+{
+	struct unix_sk_info *ui = o;
+
+	ui->ue = pb_msg(base, UnixSkEntry);
+	ui->name_dir = (void *)ui->ue->name_dir;
+
+	if (ui->ue->name.len > UNIX_PATH_MAX) {
+		pr_err("Bad unix name len %d\n", (int)ui->ue->name.len);
+		return -1;
+	}
+
+	if (!ui->ue->name.len) {
+		ui->name = NULL;
+	} else {
+		ui->name = (void *)ui->ue->name.data;
+		unlink_stale(ui);
+	}
+
+	futex_init(&ui->prepared);
+	ui->flags = 0;
+	ui->peer = NULL;
+	ui->queuer = 0;
+
+	pr_info(" `- Got %#x peer %#x (name %s dir %s)\n",
+		ui->ue->ino, ui->ue->peer,
+		ui->name ? (ui->name[0] ? ui->name : &ui->name[1]) : "-",
+		ui->name_dir ? ui->name_dir : "-");
+
+	list_add_tail(&ui->list, &unix_sockets);
+
+	return file_desc_add(&ui->d, ui->ue->id, &unix_desc_ops);
+}
+
+/*
+ * Image collection descriptor for unix sockets: entries of type
+ * PB_UNIX_SK from the CR_FD_UNIXSK image are handed to
+ * collect_one_unixsk() together with a struct unix_sk_info sized
+ * private area.
+ */
+struct collect_image_info unix_sk_cinfo = {
+ .fd_type = CR_FD_UNIXSK,
+ .pb_type = PB_UNIX_SK,
+ .priv_size = sizeof(struct unix_sk_info),
+ .collect = collect_one_unixsk,
+ .flags = COLLECT_SHARED,
+};
+
+/* Thin wrapper: returns whatever read_sk_queues() returns. */
+int collect_unix_sockets(void)
+{
+ return read_sk_queues();
+}
+
+/*
+ * Walk all collected unix sockets and turn the peer inode numbers from
+ * the image into pointers. For mutually connected sockets also decide
+ * which end is the pair master and which is the slave. Finally log the
+ * resulting topology.
+ *
+ * Returns 0 on success, -1 if some peer inode can't be found.
+ */
+int resolve_unix_peers(void)
+{
+ struct unix_sk_info *ui, *peer;
+ struct fdinfo_list_entry *fle, *fle_peer;
+
+ list_for_each_entry(ui, &unix_sockets, list) {
+ /* Already linked while handling the other end */
+ if (ui->peer)
+ continue;
+ /* Not connected to anybody */
+ if (!ui->ue->peer)
+ continue;
+
+ peer = find_unix_sk_by_ino(ui->ue->peer);
+
+ if (!peer) {
+ pr_err("FATAL: Peer %#x unresolved for %#x\n",
+ ui->ue->peer, ui->ue->ino);
+ return -1;
+ }
+
+ ui->peer = peer;
+ /* The first socket found pointing at the peer becomes its queuer */
+ if (!peer->queuer)
+ peer->queuer = ui->ue->ino;
+ if (ui == peer)
+ /* socket connected to self %) */
+ continue;
+ /* One-way connection: the peer doesn't point back at us */
+ if (peer->ue->peer != ui->ue->ino)
+ continue;
+
+ /* socketpair or interconnected sockets */
+ peer->peer = ui;
+
+ /*
+ * Select who will restore the pair. Check is identical to
+ * the one in pipes.c and makes sure tasks wait for each other
+ * in pids sorting order (ascending).
+ */
+
+ fle = file_master(&ui->d);
+ fle_peer = file_master(&peer->d);
+
+ if (fdinfo_rst_prio(fle, fle_peer)) {
+ ui->flags |= USK_PAIR_MASTER;
+ peer->flags |= USK_PAIR_SLAVE;
+ } else {
+ peer->flags |= USK_PAIR_MASTER;
+ ui->flags |= USK_PAIR_SLAVE;
+ }
+ }
+
+ pr_info("Unix sockets:\n");
+ list_for_each_entry(ui, &unix_sockets, list) {
+ struct fdinfo_list_entry *fle;
+
+ pr_info("\t%#x -> %#x (%#x) flags %#x\n", ui->ue->ino, ui->ue->peer,
+ ui->peer ? ui->peer->ue->ino : 0, ui->flags);
+ list_for_each_entry(fle, &ui->d.fd_info_head, desc_list)
+ pr_info("\t\tfd %d in pid %d\n",
+ fle->fe->fd, fle->pid);
+
+ }
+
+ return 0;
+}
+
+/*
+ * Append @ino to the opts.ext_unixsk_ids list.
+ * Returns 0 on success, -1 if the list entry can't be allocated.
+ */
+int unix_sk_id_add(ino_t ino)
+{
+	struct unix_sk_exception *entry = xmalloc(sizeof(*entry));
+
+	/* TODO: validate inode here? */
+
+	if (!entry)
+		return -1;
+
+	entry->unix_sk_ino = ino;
+	list_add_tail(&entry->unix_sk_list, &opts.ext_unixsk_ids);
+
+	return 0;
+}
+
+/*
+ * Parse the argument of --ext-unix-sk=<inode value>,<inode value>...
+ * (short form -x<inode>,<inode>...) and add every inode to the list of
+ * external unix sockets.
+ *
+ * Commas are skipped; anything that does not parse as a non-zero decimal
+ * number is an error. Returns 0 on success, -1 on failure.
+ */
+int unix_sk_ids_parse(char *optarg)
+{
+	char *pos;
+
+	for (pos = optarg; *pos != '\0'; ) {
+		ino_t ino;
+
+		if (*pos == ',') {
+			pos++;
+			continue;
+		}
+
+		/* strtoul() advances pos past the digits it consumed */
+		ino = (ino_t)strtoul(pos, &pos, 10);
+		if (ino == 0) {
+			pr_err("Can't parse unix socket inode from optarg: %s\n", optarg);
+			return -1;
+		}
+
+		if (unix_sk_id_add(ino) < 0) {
+			pr_err("Can't add unix socket inode in list: %s\n", optarg);
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
diff --git a/criu/sockets.c b/criu/sockets.c
new file mode 100644
index 000000000000..d8d09aae2d15
--- /dev/null
+++ b/criu/sockets.c
@@ -0,0 +1,731 @@
+#include <unistd.h>
+#include <sys/socket.h>
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <netinet/tcp.h>
+#include <errno.h>
+#include <linux/if.h>
+#include <linux/filter.h>
+#include <string.h>
+#include <netinet/in.h>
+
+#include "libnetlink.h"
+#include "sockets.h"
+#include "unix_diag.h"
+#include "inet_diag.h"
+#include "packet_diag.h"
+#include "netlink_diag.h"
+#include "files.h"
+#include "util-pie.h"
+#include "sk-packet.h"
+#include "namespaces.h"
+#include "net.h"
+#include "fs-magic.h"
+
+#ifndef SOCK_DIAG_BY_FAMILY
+#define SOCK_DIAG_BY_FAMILY 20
+#endif
+
+#define SK_HASH_SIZE 32
+
+#ifndef SO_GET_FILTER
+#define SO_GET_FILTER SO_ATTACH_FILTER
+#endif
+
+/*
+ * Shortest sock_diag request body: just family and protocol. Used by
+ * preload_socket_modules() to probe family-level diag handlers.
+ */
+struct sock_diag_greq {
+ u8 family;
+ u8 protocol;
+};
+
+/*
+ * Netlink sock_diag request: the message header followed by one of the
+ * family-specific request bodies sharing the same storage.
+ */
+struct sock_diag_req {
+ struct nlmsghdr hdr;
+ union {
+ struct unix_diag_req u;
+ struct inet_diag_req_v2 i;
+ struct packet_diag_req p;
+ struct netlink_diag_req n;
+ struct sock_diag_greq g;
+ } r;
+};
+
+/*
+ * One bit per (family, protocol) class of sockets; a bit gets set once
+ * sockets of that class have been successfully collected.
+ */
+enum socket_cl_bits
+{
+ NETLINK_CL_BIT,
+ INET_TCP_CL_BIT,
+ INET_UDP_CL_BIT,
+ INET_UDPLITE_CL_BIT,
+ INET6_TCP_CL_BIT,
+ INET6_UDP_CL_BIT,
+ INET6_UDPLITE_CL_BIT,
+ UNIX_CL_BIT,
+ PACKET_CL_BIT,
+ _MAX_CL_BIT,
+};
+
+/* Index of the highest valid bit (PACKET_CL_BIT) */
+#define MAX_CL_BIT (_MAX_CL_BIT - 1)
+
+/*
+ * NOTE(review): the bitmap is declared with MAX_CL_BIT, i.e. the highest
+ * bit index rather than the number of bits in use - presumably harmless
+ * if DECLARE_BITMAP() rounds up to whole longs, but confirm.
+ */
+static DECLARE_BITMAP(socket_cl_bits, MAX_CL_BIT);
+
+/*
+ * Map a (family, protocol) pair onto its bit in socket_cl_bits.
+ * The protocol only matters for AF_INET and AF_INET6. An unknown
+ * combination is reported and hits BUG().
+ */
+static inline
+enum socket_cl_bits get_collect_bit_nr(unsigned int family, unsigned int proto)
+{
+	switch (family) {
+	case AF_NETLINK:
+		return NETLINK_CL_BIT;
+	case AF_UNIX:
+		return UNIX_CL_BIT;
+	case AF_PACKET:
+		return PACKET_CL_BIT;
+	case AF_INET:
+		switch (proto) {
+		case IPPROTO_TCP:
+			return INET_TCP_CL_BIT;
+		case IPPROTO_UDP:
+			return INET_UDP_CL_BIT;
+		case IPPROTO_UDPLITE:
+			return INET_UDPLITE_CL_BIT;
+		default:
+			break;
+		}
+		break;
+	case AF_INET6:
+		switch (proto) {
+		case IPPROTO_TCP:
+			return INET6_TCP_CL_BIT;
+		case IPPROTO_UDP:
+			return INET6_UDP_CL_BIT;
+		case IPPROTO_UDPLITE:
+			return INET6_UDPLITE_CL_BIT;
+		default:
+			break;
+		}
+		break;
+	default:
+		break;
+	}
+
+	pr_err("Unknown pair family %d proto %d\n", family, proto);
+	BUG();
+	return -1;
+}
+
+/* Mark sockets of the given family/protocol as collected. */
+static void set_collect_bit(unsigned int family, unsigned int proto)
+{
+	set_bit(get_collect_bit_nr(family, proto), socket_cl_bits);
+}
+
+/* Tell whether sockets of the given family/protocol were collected. */
+bool socket_test_collect_bit(unsigned int family, unsigned int proto)
+{
+	return test_bit(get_collect_bit_nr(family, proto), socket_cl_bits) != 0;
+}
+
+/*
+ * Receive callback for diag probes: any data message is treated as an
+ * error (the probes are meant to be answered with an error code only).
+ */
+static int probe_recv_one(struct nlmsghdr *h, void *arg)
+{
+ pr_err("PROBE RECEIVED\n");
+ return -1;
+}
+
+/*
+ * Error callback for diag probes. @arg points to the error code the
+ * probe is expected to produce; that one means success (0), any other
+ * is logged and passed through.
+ */
+static int probe_err(int err, void *arg)
+{
+	int expected = *(int *)arg;
+
+	if (err != expected) {
+		pr_err("Diag module missing (%d)\n", err);
+		return err;
+	}
+
+	return 0;
+}
+
+/*
+ * Send one probe request over @nl. The outcome is not reported to the
+ * caller: the result of do_rtnl_req() is dropped.
+ */
+static inline void probe_diag(int nl, struct sock_diag_req *req, int expected_err)
+{
+ do_rtnl_req(nl, req, req->hdr.nlmsg_len, probe_recv_one, probe_err, &expected_err);
+}
+
+/*
+ * Poke the sock_diag netlink interface with requests for every socket
+ * family/protocol we dump, so that the corresponding _diag kernel
+ * modules are loaded before the real dump starts. Best effort: nothing
+ * is returned and a failure to open the netlink socket is silent.
+ */
+void preload_socket_modules()
+{
+ int nl;
+ struct sock_diag_req req;
+
+ /*
+ * If the task to dump (e.g. an LXC container) has any netlink
+ * KOBJECT_UEVENT socket open and the _diag modules aren't
+ * loaded when it is dumped, criu will freeze the task and then
+ * the kernel will send it messages on the socket, and then we will
+ * fail to dump because the socket has pending data. The Real
+ * Solution is to dump this pending data, but we just make sure
+ * modules are there beforehand for now so that the first dump
+ * doesn't fail.
+ */
+
+ nl = socket(PF_NETLINK, SOCK_RAW, NETLINK_SOCK_DIAG);
+ if (nl < 0)
+ return;
+
+ pr_info("Probing sock diag modules\n");
+
+ memset(&req, 0, sizeof(req));
+ req.hdr.nlmsg_type = SOCK_DIAG_BY_FAMILY;
+ req.hdr.nlmsg_seq = CR_NLMSG_SEQ;
+
+ /*
+ * Probe UNIX, netlink and packet diag-s by feeding
+ * to the kernel request that is shorter than they
+ * expect, but still containing the family to make
+ * sure the family handler is there. The family-level
+ * diag module would report EINVAL in this case.
+ */
+
+ req.hdr.nlmsg_len = sizeof(req.hdr) + sizeof(req.r.g);
+ req.hdr.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST;
+
+ req.r.g.family = AF_UNIX;
+ probe_diag(nl, &req, -EINVAL);
+
+ req.r.g.family = AF_PACKET;
+ probe_diag(nl, &req, -EINVAL);
+
+ req.r.g.family = AF_NETLINK;
+ probe_diag(nl, &req, -EINVAL);
+
+ /*
+ * TCP and UDP(LITE) diags do not support such trick, only
+ * inet_diag module can be probed like that. For the protocol
+ * level ones it's OK to request for exact non-existing socket
+ * and check for ENOENT being reported back as error.
+ */
+
+ req.hdr.nlmsg_len = sizeof(req.hdr) + sizeof(req.r.i);
+ req.hdr.nlmsg_flags = NLM_F_REQUEST;
+ req.r.i.sdiag_family = AF_INET;
+
+ req.r.i.sdiag_protocol = IPPROTO_TCP;
+ probe_diag(nl, &req, -ENOENT);
+
+ req.r.i.sdiag_protocol = IPPROTO_UDP; /* UDPLITE is merged with UDP */
+ probe_diag(nl, &req, -ENOENT);
+
+ close(nl);
+ pr_info("Done probing\n");
+}
+
+/*
+ * Save the SO_BINDTODEVICE setting of @sk into soe->so_bound_dev.
+ * A zero-length answer leaves the entry untouched. The stored copy is
+ * allocated with xmalloc().
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+static int dump_bound_dev(int sk, SkOptsEntry *soe)
+{
+	char ifname[IFNAMSIZ];
+	socklen_t ifname_len = sizeof(ifname);
+	int err;
+
+	err = getsockopt(sk, SOL_SOCKET, SO_BINDTODEVICE, &ifname, &ifname_len);
+	if (err) {
+		pr_perror("Can't get bound dev");
+		return err;
+	}
+
+	if (!ifname_len)
+		return 0;
+
+	pr_debug("\tDumping %s bound dev for sk\n", ifname);
+
+	soe->so_bound_dev = xmalloc(ifname_len);
+	if (!soe->so_bound_dev)
+		return -1;
+
+	strcpy(soe->so_bound_dev, ifname);
+	return 0;
+}
+
+/*
+ * Bind @sk to the device named in soe->so_bound_dev, if there is one.
+ * Returns 0 when nothing has to be done, otherwise the result of
+ * do_restore_opt().
+ */
+static int restore_bound_dev(int sk, SkOptsEntry *soe)
+{
+	char *dev = soe->so_bound_dev;
+
+	if (dev == NULL)
+		return 0;
+
+	pr_debug("\tBinding socket to %s dev\n", dev);
+
+	return do_restore_opt(sk, SOL_SOCKET, SO_BINDTODEVICE, dev, strlen(dev));
+}
+
+/*
+ * Protobuf takes care of endianness by itself, but a sock_filter is a
+ * structure rather than a plain u64, so its fields are packed into fixed
+ * bit positions to keep socket images portable across architectures:
+ *
+ *   bits 48..63  code
+ *   bits 40..47  jt
+ *   bits 32..39  jf
+ *   bits  0..31  k
+ */
+
+static void encode_filter(struct sock_filter *f, u64 *img, int n)
+{
+	int i;
+
+	BUILD_BUG_ON(sizeof(*f) != sizeof(*img));
+
+	for (i = 0; i < n; i++) {
+		const struct sock_filter *insn = &f[i];
+		u64 packed = (u64)insn->k;
+
+		packed |= (u64)insn->jf << 32;
+		packed |= (u64)insn->jt << 40;
+		packed |= (u64)insn->code << 48;
+
+		img[i] = packed;
+	}
+}
+
+/*
+ * Inverse of encode_filter(): unpack @n image words into sock_filter
+ * instructions. Each field keeps only the low bits that fit its type.
+ */
+static void decode_filter(u64 *img, struct sock_filter *f, int n)
+{
+	int i;
+
+	for (i = 0; i < n; i++) {
+		u64 packed = img[i];
+		struct sock_filter *insn = &f[i];
+
+		insn->code = packed >> 48;
+		insn->jt = packed >> 40;
+		insn->jf = packed >> 32;
+		insn->k = packed;
+	}
+}
+
+/*
+ * Save the filter attached to @sk into soe->so_filter / n_so_filter.
+ *
+ * A first SO_GET_FILTER query with a NULL buffer obtains the length,
+ * which is used as an element count for both the temporary instruction
+ * array and the encoded image array. No filter (zero length) is not an
+ * error.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+static int dump_socket_filter(int sk, SkOptsEntry *soe)
+{
+	struct sock_filter *insns;
+	socklen_t nr = 0;
+	int ret;
+
+	ret = getsockopt(sk, SOL_SOCKET, SO_GET_FILTER, NULL, &nr);
+	if (ret) {
+		pr_perror("Can't get socket filter len");
+		return ret;
+	}
+
+	if (nr == 0) {
+		pr_info("No filter for socket\n");
+		return 0;
+	}
+
+	insns = xmalloc(nr * sizeof(*insns));
+	if (insns == NULL)
+		return -1;
+
+	ret = getsockopt(sk, SOL_SOCKET, SO_GET_FILTER, insns, &nr);
+	if (ret) {
+		pr_perror("Can't get socket filter");
+		goto out;
+	}
+
+	ret = -1;
+	soe->so_filter = xmalloc(nr * sizeof(*soe->so_filter));
+	if (soe->so_filter == NULL)
+		goto out;
+
+	encode_filter(insns, soe->so_filter, nr);
+	soe->n_so_filter = nr;
+	ret = 0;
+out:
+	/* The raw instructions are only needed for encoding */
+	xfree(insns);
+	return ret;
+}
+
+/*
+ * Re-attach the socket filter saved in @soe to @sk.
+ *
+ * The image words are decoded into a temporary sock_filter array which
+ * is handed to SO_ATTACH_FILTER and then freed.
+ *
+ * Returns 0 if there is no filter or it was attached, non-zero on
+ * failure.
+ */
+static int restore_socket_filter(int sk, SkOptsEntry *soe)
+{
+	int ret;
+	struct sock_fprog sfp;
+
+	if (!soe->n_so_filter)
+		return 0;
+
+	pr_info("Restoring socket filter\n");
+	sfp.len = soe->n_so_filter;
+	/*
+	 * One struct sock_filter per instruction. Sizing the buffer by
+	 * the instruction count alone would make decode_filter() write
+	 * past its end.
+	 */
+	sfp.filter = xmalloc(sfp.len * sizeof(*sfp.filter));
+	if (!sfp.filter)
+		return -1;
+
+	decode_filter(soe->so_filter, sfp.filter, sfp.len);
+	ret = restore_opt(sk, SOL_SOCKET, SO_ATTACH_FILTER, &sfp);
+	xfree(sfp.filter);
+
+	return ret;
+}
+
+/* Hash of collected sockets, chained through socket_desc->next */
+static struct socket_desc *sockets[SK_HASH_SIZE];
+
+/*
+ * Find a collected socket by inode number.
+ *
+ * Returns the descriptor, NULL if there is no such socket, or
+ * ERR_PTR(-EINVAL) if sockets of this family/protocol were not
+ * collected at all.
+ */
+struct socket_desc *lookup_socket(int ino, int family, int proto)
+{
+	struct socket_desc *sd;
+
+	if (!socket_test_collect_bit(family, proto)) {
+		pr_err("Sockets (family %d, proto %d) are not collected\n",
+				family, proto);
+		return ERR_PTR(-EINVAL);
+	}
+
+	pr_debug("\tSearching for socket %x (family %d.%d)\n", ino, family, proto);
+	/*
+	 * Hash on the unsigned value: with a negative ino the signed
+	 * remainder would be negative and index before the array.
+	 */
+	for (sd = sockets[(unsigned int)ino % SK_HASH_SIZE]; sd; sd = sd->next)
+		if (sd->ino == ino) {
+			BUG_ON(sd->family != family);
+			return sd;
+		}
+
+	return NULL;
+}
+
+/*
+ * Initialize the common part of a socket descriptor and put it at the
+ * head of its hash chain. Always returns 0.
+ */
+int sk_collect_one(int ino, int family, struct socket_desc *d)
+{
+	struct socket_desc **chain;
+
+	d->ino = ino;
+	d->family = family;
+	d->already_dumped = 0;
+
+	/*
+	 * Hash on the unsigned value: with a negative ino the signed
+	 * remainder would be negative and index before the array.
+	 */
+	chain = &sockets[(unsigned int)ino % SK_HASH_SIZE];
+	d->next = *chain;
+	*chain = d;
+
+	return 0;
+}
+
+/*
+ * setsockopt() wrapper that logs the failure.
+ * Returns 0 on success, -1 on error.
+ */
+int do_restore_opt(int sk, int level, int name, void *val, int len)
+{
+	if (setsockopt(sk, level, name, val, len) >= 0)
+		return 0;
+
+	pr_perror("Can't set %d:%d (len %d)", level, name, len);
+	return -1;
+}
+
+/*
+ * userns_call() helper: force send and receive buffer sizes of @fd.
+ * @arg points to two u32 values: [0] is the send buffer size, [1] is
+ * the receive buffer size.
+ */
+static int sk_setbufs(void *arg, int fd, pid_t pid)
+{
+	u32 *buf = arg;
+
+	if (restore_opt(fd, SOL_SOCKET, SO_SNDBUFFORCE, &buf[0]) ||
+	    restore_opt(fd, SOL_SOCKET, SO_RCVBUFFORCE, &buf[1]))
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Set sizes of buffers to maximum and prevent blocking
+ * Caller of this fn should call other socket restoring
+ * routines to drop the non-blocking and set proper send
+ * and receive buffers.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int restore_prepare_socket(int sk)
+{
+ int flags;
+ /* In kernel a bufsize has type int and a value is doubled. */
+ u32 maxbuf[2] = { INT_MAX / 2, INT_MAX / 2 };
+
+ /* maxbuf[0] is the send buffer size, maxbuf[1] the receive one */
+ if (userns_call(sk_setbufs, 0, maxbuf, sizeof(maxbuf), sk))
+ return -1;
+
+ /* Prevent blocking on restore */
+ flags = fcntl(sk, F_GETFL, 0);
+ if (flags == -1) {
+ pr_perror("Unable to get flags for %d", sk);
+ return -1;
+ }
+ if (fcntl(sk, F_SETFL, flags | O_NONBLOCK) ) {
+ pr_perror("Unable to set O_NONBLOCK for %d", sk);
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Apply the SOL_SOCKET level options saved in @soe to @sk: buffer sizes,
+ * priority, rcvlowat, mark, the boolean flags (only when they were set),
+ * send/receive timeouts, bound device and attached filter.
+ *
+ * All steps are attempted; their results are OR-ed together, so the
+ * return value is 0 only if every step succeeded.
+ */
+int restore_socket_opts(int sk, SkOptsEntry *soe)
+{
+ int ret = 0, val;
+ struct timeval tv;
+ /* In kernel a bufsize value is doubled. */
+ u32 bufs[2] = { soe->so_sndbuf / 2, soe->so_rcvbuf / 2};
+
+ pr_info("%d restore sndbuf %d rcv buf %d\n", sk, soe->so_sndbuf, soe->so_rcvbuf);
+
+ /* setsockopt() multiplies the input values by 2 */
+ ret |= userns_call(sk_setbufs, UNS_ASYNC, bufs, sizeof(bufs), sk);
+
+ if (soe->has_so_priority) {
+ pr_debug("\trestore priority %d for socket\n", soe->so_priority);
+ ret |= restore_opt(sk, SOL_SOCKET, SO_PRIORITY, &soe->so_priority);
+ }
+ if (soe->has_so_rcvlowat) {
+ pr_debug("\trestore rcvlowat %d for socket\n", soe->so_rcvlowat);
+ ret |= restore_opt(sk, SOL_SOCKET, SO_RCVLOWAT, &soe->so_rcvlowat);
+ }
+ if (soe->has_so_mark) {
+ pr_debug("\trestore mark %d for socket\n", soe->so_mark);
+ ret |= restore_opt(sk, SOL_SOCKET, SO_MARK, &soe->so_mark);
+ }
+ /* Boolean options are only touched when they have to be turned on */
+ if (soe->has_so_passcred && soe->so_passcred) {
+ val = 1;
+ pr_debug("\tset passcred for socket\n");
+ ret |= restore_opt(sk, SOL_SOCKET, SO_PASSCRED, &val);
+ }
+ if (soe->has_so_passsec && soe->so_passsec) {
+ val = 1;
+ pr_debug("\tset passsec for socket\n");
+ ret |= restore_opt(sk, SOL_SOCKET, SO_PASSSEC, &val);
+ }
+ if (soe->has_so_dontroute && soe->so_dontroute) {
+ val = 1;
+ pr_debug("\tset dontroute for socket\n");
+ ret |= restore_opt(sk, SOL_SOCKET, SO_DONTROUTE, &val);
+ }
+ if (soe->has_so_no_check && soe->so_no_check) {
+ val = 1;
+ pr_debug("\tset no_check for socket\n");
+ ret |= restore_opt(sk, SOL_SOCKET, SO_NO_CHECK, &val);
+ }
+
+ tv.tv_sec = soe->so_snd_tmo_sec;
+ tv.tv_usec = soe->so_snd_tmo_usec;
+ ret |= restore_opt(sk, SOL_SOCKET, SO_SNDTIMEO, &tv);
+
+ tv.tv_sec = soe->so_rcv_tmo_sec;
+ tv.tv_usec = soe->so_rcv_tmo_usec;
+ ret |= restore_opt(sk, SOL_SOCKET, SO_RCVTIMEO, &tv);
+
+ ret |= restore_bound_dev(sk, soe);
+ ret |= restore_socket_filter(sk, soe);
+
+ /* The restore of SO_REUSEADDR depends on type of socket */
+
+ return ret;
+}
+
+/*
+ * getsockopt() wrapper for fixed-size options: the kernel must hand
+ * back exactly @len bytes. Returns 0 on success, -1 on a failed call or
+ * a length mismatch.
+ */
+int do_dump_opt(int sk, int level, int name, void *val, int len)
+{
+	socklen_t got = len;
+
+	if (getsockopt(sk, level, name, val, &got) < 0) {
+		pr_perror("Can't get %d:%d opt", level, name);
+		return -1;
+	}
+
+	if (got == len)
+		return 0;
+
+	pr_err("Len mismatch on %d:%d : %d, want %d\n",
+			level, name, got, len);
+	return -1;
+}
+
+/*
+ * Read the SOL_SOCKET level options of @sk into @soe: buffer sizes,
+ * priority, rcvlowat, mark, timeouts, the boolean flags, bound device
+ * and attached filter.
+ *
+ * All steps are attempted; their results are OR-ed together, so the
+ * return value is 0 only if every step succeeded. Memory allocated for
+ * the bound device and the filter is released by release_skopts().
+ */
+int dump_socket_opts(int sk, SkOptsEntry *soe)
+{
+ int ret = 0, val;
+ struct timeval tv;
+
+ ret |= dump_opt(sk, SOL_SOCKET, SO_SNDBUF, &soe->so_sndbuf);
+ ret |= dump_opt(sk, SOL_SOCKET, SO_RCVBUF, &soe->so_rcvbuf);
+ soe->has_so_priority = true;
+ ret |= dump_opt(sk, SOL_SOCKET, SO_PRIORITY, &soe->so_priority);
+ soe->has_so_rcvlowat = true;
+ ret |= dump_opt(sk, SOL_SOCKET, SO_RCVLOWAT, &soe->so_rcvlowat);
+ soe->has_so_mark = true;
+ ret |= dump_opt(sk, SOL_SOCKET, SO_MARK, &soe->so_mark);
+
+ /* Timeouts are stored as separate seconds/microseconds fields */
+ ret |= dump_opt(sk, SOL_SOCKET, SO_SNDTIMEO, &tv);
+ soe->so_snd_tmo_sec = tv.tv_sec;
+ soe->so_snd_tmo_usec = tv.tv_usec;
+
+ ret |= dump_opt(sk, SOL_SOCKET, SO_RCVTIMEO, &tv);
+ soe->so_rcv_tmo_sec = tv.tv_sec;
+ soe->so_rcv_tmo_usec = tv.tv_usec;
+
+ /* Integer-valued flags are normalized to booleans */
+ ret |= dump_opt(sk, SOL_SOCKET, SO_REUSEADDR, &val);
+ soe->reuseaddr = val ? true : false;
+ soe->has_reuseaddr = true;
+
+ ret |= dump_opt(sk, SOL_SOCKET, SO_PASSCRED, &val);
+ soe->has_so_passcred = true;
+ soe->so_passcred = val ? true : false;
+
+ ret |= dump_opt(sk, SOL_SOCKET, SO_PASSSEC, &val);
+ soe->has_so_passsec = true;
+ soe->so_passsec = val ? true : false;
+
+ ret |= dump_opt(sk, SOL_SOCKET, SO_DONTROUTE, &val);
+ soe->has_so_dontroute = true;
+ soe->so_dontroute = val ? true : false;
+
+ ret |= dump_opt(sk, SOL_SOCKET, SO_NO_CHECK, &val);
+ soe->has_so_no_check = true;
+ soe->so_no_check = val ? true : false;
+
+ ret |= dump_bound_dev(sk, soe);
+ ret |= dump_socket_filter(sk, soe);
+
+ return ret;
+}
+
+/*
+ * Free the buffers dump_socket_opts() allocates inside @soe (the encoded
+ * filter and the bound device name). The entry itself is not freed.
+ */
+void release_skopts(SkOptsEntry *soe)
+{
+ xfree(soe->so_filter);
+ xfree(soe->so_bound_dev);
+}
+
+/*
+ * Dump the socket behind @lfd: query its domain with SO_DOMAIN and hand
+ * it to the generic file dumper with the dump ops of that family.
+ *
+ * Returns -1 if the domain can't be read or is not supported, otherwise
+ * the result of do_dump_gen_file().
+ */
+int dump_socket(struct fd_parms *p, int lfd, struct cr_img *img)
+{
+	const struct fdtype_ops *ops = NULL;
+	int family;
+
+	if (dump_opt(lfd, SOL_SOCKET, SO_DOMAIN, &family))
+		return -1;
+
+	if (family == AF_UNIX)
+		ops = &unix_dump_ops;
+	else if (family == AF_INET)
+		ops = &inet_dump_ops;
+	else if (family == AF_INET6)
+		ops = &inet6_dump_ops;
+	else if (family == AF_PACKET)
+		ops = &packet_dump_ops;
+	else if (family == AF_NETLINK)
+		ops = &netlink_dump_ops;
+
+	if (ops == NULL) {
+		pr_err("BUG! Unknown socket collected (family %d)\n", family);
+		return -1;
+	}
+
+	return do_dump_gen_file(p, lfd, ops, img);
+}
+
+/*
+ * Receive callback for inet diag dumps. @arg is the request that was
+ * sent; its protocol tells which socket type the answer describes
+ * (TCP is stream, UDP and UDP-lite are datagram).
+ */
+static int inet_receive_one(struct nlmsghdr *h, void *arg)
+{
+	struct inet_diag_req_v2 *req = arg;
+	int type;
+
+	if (req->sdiag_protocol == IPPROTO_TCP) {
+		type = SOCK_STREAM;
+	} else if (req->sdiag_protocol == IPPROTO_UDP ||
+		   req->sdiag_protocol == IPPROTO_UDPLITE) {
+		type = SOCK_DGRAM;
+	} else {
+		BUG_ON(1);
+		return -1;
+	}
+
+	return inet_collect_one(h, req->sdiag_family, type);
+}
+
+/*
+ * Send one diag dump request and, when it succeeds, mark the requested
+ * family/protocol as collected. The family and protocol are read
+ * through the netlink member of the request union whatever kind of
+ * request was actually filled in.
+ *
+ * Returns the result of do_rtnl_req().
+ */
+static int do_collect_req(int nl, struct sock_diag_req *req, int size,
+		int (*receive_callback)(struct nlmsghdr *h, void *), void *arg)
+{
+	int ret = do_rtnl_req(nl, req, size, receive_callback, NULL, arg);
+
+	if (!ret)
+		set_collect_bit(req->r.n.sdiag_family, req->r.n.sdiag_protocol);
+
+	return ret;
+}
+
+/*
+ * Collect all sockets visible through the sock_diag netlink socket of
+ * net namespace @ns: unix, IPv4/IPv6 TCP, UDP and UDP-lite, packet and
+ * netlink ones. The netlink socket is closed afterwards.
+ *
+ * Every request is attempted even if an earlier one failed; the last
+ * error seen is returned. For the NS_CRIU namespace errors are only
+ * logged and 0 is returned.
+ */
+int collect_sockets(struct ns_id *ns)
+{
+ int err = 0, tmp;
+ int nl = ns->net.nlsk;
+ struct sock_diag_req req;
+
+ /* The same request buffer is reused for all families below */
+ memset(&req, 0, sizeof(req));
+ req.hdr.nlmsg_len = sizeof(req);
+ req.hdr.nlmsg_type = SOCK_DIAG_BY_FAMILY;
+ req.hdr.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST;
+ req.hdr.nlmsg_seq = CR_NLMSG_SEQ;
+
+ /* Collect UNIX sockets */
+ req.r.u.sdiag_family = AF_UNIX;
+ req.r.u.udiag_states = -1; /* All */
+ req.r.u.udiag_show = UDIAG_SHOW_NAME | UDIAG_SHOW_VFS |
+ UDIAG_SHOW_PEER | UDIAG_SHOW_ICONS |
+ UDIAG_SHOW_RQLEN;
+ tmp = do_collect_req(nl, &req, sizeof(req), unix_receive_one, NULL);
+ if (tmp)
+ err = tmp;
+
+ /* Collect IPv4 TCP sockets */
+ req.r.i.sdiag_family = AF_INET;
+ req.r.i.sdiag_protocol = IPPROTO_TCP;
+ req.r.i.idiag_ext = 0;
+ /* Only listening and established sockets supported yet */
+ req.r.i.idiag_states = (1 << TCP_LISTEN) | (1 << TCP_ESTABLISHED);
+ tmp = do_collect_req(nl, &req, sizeof(req), inet_receive_one, &req.r.i);
+ if (tmp)
+ err = tmp;
+
+ /* Collect IPv4 UDP sockets */
+ req.r.i.sdiag_family = AF_INET;
+ req.r.i.sdiag_protocol = IPPROTO_UDP;
+ req.r.i.idiag_ext = 0;
+ req.r.i.idiag_states = -1; /* All */
+ tmp = do_collect_req(nl, &req, sizeof(req), inet_receive_one, &req.r.i);
+ if (tmp)
+ err = tmp;
+
+ /* Collect IPv4 UDP-lite sockets */
+ req.r.i.sdiag_family = AF_INET;
+ req.r.i.sdiag_protocol = IPPROTO_UDPLITE;
+ req.r.i.idiag_ext = 0;
+ req.r.i.idiag_states = -1; /* All */
+ tmp = do_collect_req(nl, &req, sizeof(req), inet_receive_one, &req.r.i);
+ if (tmp)
+ err = tmp;
+
+ /* Collect IPv6 TCP sockets */
+ req.r.i.sdiag_family = AF_INET6;
+ req.r.i.sdiag_protocol = IPPROTO_TCP;
+ req.r.i.idiag_ext = 0;
+ /* Only listening and established sockets supported yet */
+ req.r.i.idiag_states = (1 << TCP_LISTEN) | (1 << TCP_ESTABLISHED);
+ tmp = do_collect_req(nl, &req, sizeof(req), inet_receive_one, &req.r.i);
+ if (tmp)
+ err = tmp;
+
+ /* Collect IPv6 UDP sockets */
+ req.r.i.sdiag_family = AF_INET6;
+ req.r.i.sdiag_protocol = IPPROTO_UDP;
+ req.r.i.idiag_ext = 0;
+ req.r.i.idiag_states = -1; /* All */
+ tmp = do_collect_req(nl, &req, sizeof(req), inet_receive_one, &req.r.i);
+ if (tmp)
+ err = tmp;
+
+ /* Collect IPv6 UDP-lite sockets */
+ req.r.i.sdiag_family = AF_INET6;
+ req.r.i.sdiag_protocol = IPPROTO_UDPLITE;
+ req.r.i.idiag_ext = 0;
+ req.r.i.idiag_states = -1; /* All */
+ tmp = do_collect_req(nl, &req, sizeof(req), inet_receive_one, &req.r.i);
+ if (tmp)
+ err = tmp;
+
+ /* Collect packet sockets */
+ req.r.p.sdiag_family = AF_PACKET;
+ req.r.p.sdiag_protocol = 0;
+ req.r.p.pdiag_show = PACKET_SHOW_INFO | PACKET_SHOW_MCLIST |
+ PACKET_SHOW_FANOUT | PACKET_SHOW_RING_CFG;
+ tmp = do_collect_req(nl, &req, sizeof(req), packet_receive_one, NULL);
+ if (tmp) {
+ pr_warn("The current kernel doesn't support packet_diag\n");
+ /* -ENOENT is tolerated only when ns_pid is non-zero */
+ if (ns->ns_pid == 0 || tmp != -ENOENT) /* Fedora 19 */
+ err = tmp;
+ }
+
+ /* Collect netlink sockets */
+ req.r.n.sdiag_family = AF_NETLINK;
+ req.r.n.sdiag_protocol = NDIAG_PROTO_ALL;
+ req.r.n.ndiag_show = NDIAG_SHOW_GROUPS;
+ tmp = do_collect_req(nl, &req, sizeof(req), netlink_receive_one, NULL);
+ if (tmp) {
+ pr_warn("The current kernel doesn't support netlink_diag\n");
+ /* -ENOENT is tolerated only when ns_pid is non-zero */
+ if (ns->ns_pid == 0 || tmp != -ENOENT) /* Fedora 19 */
+ err = tmp;
+ }
+
+ /* don't need anymore */
+ close(nl);
+ ns->net.nlsk = -1;
+
+ if (err && (ns->type == NS_CRIU)) {
+ /*
+ * If netns isn't dumped, criu will fail only
+ * if an unsupported socket will be really dumped.
+ */
+ pr_info("Uncollected sockets! Will probably fail later.\n");
+ err = 0;
+ }
+
+ return err;
+}
diff --git a/criu/stats.c b/criu/stats.c
new file mode 100644
index 000000000000..2a80bb31bc0a
--- /dev/null
+++ b/criu/stats.c
@@ -0,0 +1,157 @@
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/time.h>
+#include "asm/atomic.h"
+#include "protobuf.h"
+#include "stats.h"
+#include "image.h"
+#include "protobuf/stats.pb-c.h"
+
+/* One timer: when it was last started and the time accumulated so far */
+struct timing {
+ struct timeval start;
+ struct timeval total;
+};
+
+/* Dump statistics: timers plus plain counters */
+struct dump_stats {
+ struct timing timings[DUMP_TIME_NR_STATS];
+ unsigned long counts[DUMP_CNT_NR_STATS];
+};
+
+/*
+ * Restore statistics: timers plus atomic counters. The object is
+ * allocated with shmalloc() in init_stats().
+ */
+struct restore_stats {
+ struct timing timings[RESTORE_TIME_NS_STATS];
+ atomic_t counts[RESTORE_CNT_NR_STATS];
+};
+
+/* Whichever of the two is set selects dump vs restore accounting */
+struct dump_stats *dstats;
+struct restore_stats *rstats;
+
+/*
+ * Add @val to counter @c of the active statistics set: a plain add for
+ * dump stats, an atomic add for restore stats. Calling it with neither
+ * set initialized is a BUG.
+ */
+void cnt_add(int c, unsigned long val)
+{
+	if (dstats != NULL) {
+		BUG_ON(c >= DUMP_CNT_NR_STATS);
+		dstats->counts[c] += val;
+		return;
+	}
+
+	if (rstats != NULL) {
+		BUG_ON(c >= RESTORE_CNT_NR_STATS);
+		atomic_add(val, &rstats->counts[c]);
+		return;
+	}
+
+	BUG();
+}
+
+/*
+ * Add the interval [@from, @to] to the running total in @res.
+ *
+ * The microseconds are borrowed from / carried into the seconds, so
+ * that a normalized @res (tv_usec below USEC_PER_SEC) stays normalized.
+ */
+static void timeval_accumulate(const struct timeval *from, const struct timeval *to,
+		struct timeval *res)
+{
+	suseconds_t usec;
+
+	res->tv_sec += to->tv_sec - from->tv_sec;
+	usec = to->tv_usec;
+	if (usec < from->tv_usec) {
+		/* Borrow one second for the subtraction below */
+		usec += USEC_PER_SEC;
+		res->tv_sec -= 1;
+	}
+	res->tv_usec += usec - from->tv_usec;
+	/* ">=": exactly USEC_PER_SEC microseconds is a whole second too */
+	if (res->tv_usec >= USEC_PER_SEC) {
+		res->tv_usec -= USEC_PER_SEC;
+		res->tv_sec += 1;
+	}
+}
+
+/*
+ * Return timer @t of the active statistics set (dump stats take
+ * precedence over restore stats). Having neither set is a BUG.
+ */
+static struct timing *get_timing(int t)
+{
+	if (dstats == NULL && rstats == NULL) {
+		BUG();
+		return NULL;
+	}
+
+	if (dstats != NULL) {
+		BUG_ON(t >= DUMP_TIME_NR_STATS);
+		return &dstats->timings[t];
+	}
+
+	/*
+	 * FIXME -- this does _NOT_ work when called
+	 * from different tasks.
+	 */
+	BUG_ON(t >= RESTORE_TIME_NS_STATS);
+	return &rstats->timings[t];
+}
+
+/* Record the current time as the start of timer @t. */
+void timing_start(int t)
+{
+	struct timing *tm = get_timing(t);
+
+	gettimeofday(&tm->start, NULL);
+}
+
+/*
+ * Stop timer @t: add the time elapsed since the matching
+ * timing_start() to the timer's total.
+ */
+void timing_stop(int t)
+{
+	struct timing *tm = get_timing(t);
+	struct timeval now;
+
+	gettimeofday(&now, NULL);
+	timeval_accumulate(&tm->start, &now, &tm->total);
+}
+
+/*
+ * Store the accumulated total of timer @t into @to, expressed in the
+ * unit of USEC_PER_SEC (presumably microseconds - confirm against the
+ * macro definition).
+ *
+ * NOTE(review): the destination is 32 bits wide, so totals that don't
+ * fit are truncated on assignment.
+ */
+static void encode_time(int t, u_int32_t *to)
+{
+ struct timing *tm;
+
+ tm = get_timing(t);
+ *to = tm->total.tv_sec * USEC_PER_SEC + tm->total.tv_usec;
+}
+
+/*
+ * Write the collected statistics into the stats image.
+ *
+ * @what selects DUMP_STATS or RESTORE_STATS; any other value makes the
+ * function return without doing anything. A failure to open the image
+ * is silently ignored and the result of the write is not checked.
+ */
+void write_stats(int what)
+{
+ StatsEntry stats = STATS_ENTRY__INIT;
+ DumpStatsEntry ds_entry = DUMP_STATS_ENTRY__INIT;
+ RestoreStatsEntry rs_entry = RESTORE_STATS_ENTRY__INIT;
+ char *name;
+ struct cr_img *img;
+
+ pr_info("Writing stats\n");
+ if (what == DUMP_STATS) {
+ stats.dump = &ds_entry;
+
+ encode_time(TIME_FREEZING, &ds_entry.freezing_time);
+ encode_time(TIME_FROZEN, &ds_entry.frozen_time);
+ encode_time(TIME_MEMDUMP, &ds_entry.memdump_time);
+ encode_time(TIME_MEMWRITE, &ds_entry.memwrite_time);
+ ds_entry.has_irmap_resolve = true;
+ encode_time(TIME_IRMAP_RESOLVE, &ds_entry.irmap_resolve);
+
+ ds_entry.pages_scanned = dstats->counts[CNT_PAGES_SCANNED];
+ ds_entry.pages_skipped_parent = dstats->counts[CNT_PAGES_SKIPPED_PARENT];
+ ds_entry.pages_written = dstats->counts[CNT_PAGES_WRITTEN];
+
+ name = "dump";
+ } else if (what == RESTORE_STATS) {
+ stats.restore = &rs_entry;
+
+ rs_entry.pages_compared = atomic_read(&rstats->counts[CNT_PAGES_COMPARED]);
+ rs_entry.pages_skipped_cow = atomic_read(&rstats->counts[CNT_PAGES_SKIPPED_COW]);
+ rs_entry.has_pages_restored = true;
+ rs_entry.pages_restored = atomic_read(&rstats->counts[CNT_PAGES_RESTORED]);
+
+ encode_time(TIME_FORK, &rs_entry.forking_time);
+ encode_time(TIME_RESTORE, &rs_entry.restore_time);
+
+ name = "restore";
+ } else
+ return;
+
+ /* @name ("dump" or "restore") is passed on to open_image_at() */
+ img = open_image_at(AT_FDCWD, CR_FD_STATS, O_DUMP, name);
+ if (img) {
+ pb_write_one(img, &stats, PB_STATS);
+ close_image(img);
+ }
+}
+
+/*
+ * Allocate the statistics storage: dump stats come from xzalloc(),
+ * everything else gets restore stats from shmalloc().
+ * Returns 0 on success, -1 if the allocation failed.
+ */
+int init_stats(int what)
+{
+	if (what != DUMP_STATS) {
+		rstats = shmalloc(sizeof(struct restore_stats));
+		return rstats ? 0 : -1;
+	}
+
+	dstats = xzalloc(sizeof(*dstats));
+	return dstats ? 0 : -1;
+}
diff --git a/criu/string.c b/criu/string.c
new file mode 100644
index 000000000000..543c642912c6
--- /dev/null
+++ b/criu/string.c
@@ -0,0 +1,60 @@
+/*
+ * Adopted from linux kernel
+ */
+#include <sys/types.h>
+#include <string.h>
+
+#include "string.h"
+
+#ifndef CONFIG_HAS_STRLCPY
+/**
+ * strlcpy - Copy a %NUL terminated string into a sized buffer
+ * @dest: Where to copy the string to
+ * @src: Where to copy the string from
+ * @size: size of destination buffer
+ *
+ * Compatible with *BSD: as long as @size is not zero the result is
+ * always a valid NUL-terminated string that fits in the buffer,
+ * truncated if needed. Unlike strncpy() the rest of the buffer is
+ * not padded.
+ *
+ * Returns the length of @src, so truncation happened if the
+ * return value is >= @size.
+ */
+size_t strlcpy(char *dest, const char *src, size_t size)
+{
+	size_t src_len = strlen(src);
+	size_t ncopy;
+
+	/* No room even for the terminator: leave @dest untouched */
+	if (size == 0)
+		return src_len;
+
+	ncopy = (src_len < size) ? src_len : size - 1;
+	memcpy(dest, src, ncopy);
+	dest[ncopy] = '\0';
+
+	return src_len;
+}
+#endif
+
+#ifndef CONFIG_HAS_STRLCAT
+/**
+ * strlcat - Append a length-limited, %NUL-terminated string to another
+ * @dest: The string to be appended to
+ * @src: The string to append to it
+ * @count: The size of the destination buffer.
+ *
+ * At most @count - 1 bytes end up in @dest, always NUL-terminated.
+ * If @dest holds no NUL within its first @count bytes (there is
+ * no room to append anything) it is left untouched.
+ *
+ * Returns the length of the string it tried to create: the
+ * initial length of @dest (capped at @count) plus the length of
+ * @src. A return value >= @count means truncation.
+ */
+size_t strlcat(char *dest, const char *src, size_t count)
+{
+	size_t dsize = 0;
+	size_t len = strlen(src);
+	size_t res;
+
+	/* Measure @dest without ever looking beyond the buffer */
+	while (dsize < count && dest[dsize] != '\0')
+		dsize++;
+
+	res = dsize + len;
+
+	/*
+	 * @dest already fills the whole buffer. Subtracting
+	 * dsize from count below would wrap around, so stop here.
+	 */
+	if (dsize == count)
+		return res;
+
+	dest += dsize;
+	count -= dsize;
+	if (len >= count)
+		len = count - 1;
+	memcpy(dest, src, len);
+	dest[len] = 0;
+	return res;
+}
+#endif
diff --git a/criu/sysctl.c b/criu/sysctl.c
new file mode 100644
index 000000000000..21ae4cef19e8
--- /dev/null
+++ b/criu/sysctl.c
@@ -0,0 +1,467 @@
+#include <unistd.h>
+#include <fcntl.h>
+#include <ctype.h>
+#include <string.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sched.h>
+
+#include "asm/types.h"
+#include "namespaces.h"
+#include "sysctl.h"
+#include "util.h"
+
+/* These are the namespaces we know how to restore in various ways.
+ */
+#define KNOWN_NS_MASK (CLONE_NEWUTS | CLONE_NEWNET | CLONE_NEWIPC)
+
+struct sysctl_userns_req {
+ int op;
+ unsigned int ns;
+ size_t nr_req;
+ struct sysctl_req *reqs;
+};
+
+#define __SYSCTL_OP(__ret, __fd, __req, __type, __nr, __op) \
+do { \
+ if (__op == CTL_READ) \
+ __ret = sysctl_read_##__type(__fd, __req, \
+ (__type *)(__req)->arg, \
+ __nr); \
+ else if (__op == CTL_WRITE) \
+ __ret = sysctl_write_##__type(__fd, __req, \
+ (__type *)(__req)->arg, \
+ __nr); \
+ else \
+ __ret = -1; \
+} while (0)
+
+#define GEN_SYSCTL_READ_FUNC(__type, __conv)				\
+static int sysctl_read_##__type(int fd,					\
+				struct sysctl_req *req,			\
+				__type *arg,				\
+				int nr)					\
+{									\
+	char buf[1024] = {0};						\
+	int i, ret = -1;						\
+	char *p = buf;							\
+									\
+	/* Keep the last byte zero so __conv() always sees a NUL */	\
+	ret = read(fd, buf, sizeof(buf) - 1);				\
+	if (ret < 0) {							\
+		pr_perror("Can't read %s", req->name);			\
+		goto err;						\
+	}								\
+	/* Parse up to @nr base-10 values; p++ skips one separator */	\
+	for (i = 0; i < nr && p < buf + sizeof(buf); p++, i++)		\
+		((__type *)arg)[i] = __conv(p, &p, 10);			\
+									\
+	if (i != nr) {							\
+		pr_err("Not enough params for %s (%d != %d)\n",		\
+			req->name, i, nr);				\
+		ret = -1; /* ret still held the read() byte count */	\
+		goto err;						\
+	}								\
+									\
+	ret = 0;							\
+err:									\
+	return ret;							\
+}
+
+#define GEN_SYSCTL_WRITE_FUNC(__type, __fmt) \
+static int sysctl_write_##__type(int fd, \
+ struct sysctl_req *req, \
+ __type *arg, \
+ int nr) \
+{ \
+ char buf[1024]; \
+ int i, ret = -1; \
+ int off = 0; \
+ \
+ for (i = 0; i < nr && off < sizeof(buf) - 1; i++) { \
+ snprintf(&buf[off], sizeof(buf) - off, __fmt, arg[i]); \
+ off += strlen(&buf[off]); \
+ } \
+ \
+ if (i != nr) { \
+ pr_err("Not enough space for %s (%d != %d)\n", \
+ req->name, i, nr); \
+ goto err; \
+ } \
+ \
+ /* trailing spaces in format */ \
+ while (off > 0 && isspace(buf[off - 1])) \
+ off--; \
+ buf[off + 0] = '\n'; \
+ ret = write(fd, buf, off + 1); \
+ if (ret < 0) { \
+ pr_perror("Can't write %s", req->name); \
+ ret = -1; \
+ goto err; \
+ } \
+ \
+ ret = 0; \
+err: \
+ return ret; \
+}
+
+GEN_SYSCTL_READ_FUNC(u32, strtoul);
+GEN_SYSCTL_READ_FUNC(u64, strtoull);
+GEN_SYSCTL_READ_FUNC(s32, strtol);
+
+GEN_SYSCTL_WRITE_FUNC(u32, "%u ");
+GEN_SYSCTL_WRITE_FUNC(u64, "%"PRIu64" ");
+GEN_SYSCTL_WRITE_FUNC(s32, "%d ");
+
+static int
+sysctl_write_char(int fd, struct sysctl_req *req, char *arg, int nr)
+{
+ pr_debug("%s nr %d\n", req->name, nr);
+ if (dprintf(fd, "%s\n", arg) < 0)
+ return -1;
+
+ return 0;
+}
+
+static int
+sysctl_read_char(int fd, struct sysctl_req *req, char *arg, int nr)
+{
+ int ret = -1;
+
+ pr_debug("%s nr %d\n", req->name, nr);
+ ret = read(fd, arg, nr);
+ if (ret < 0) {
+ pr_perror("Can't read %s", req->name);
+ goto err;
+ }
+ ret = 0;
+
+err:
+ return ret;
+}
+
+static int sysctl_userns_arg_size(int type)
+{
+ switch(CTL_TYPE(type)) {
+ case __CTL_U32A:
+ return sizeof(u32) * CTL_LEN(type);
+ case CTL_U32:
+ return sizeof(u32);
+ case CTL_32:
+ return sizeof(s32);
+ case __CTL_U64A:
+ return sizeof(u64) * CTL_LEN(type);
+ case CTL_U64:
+ return sizeof(u64);
+ case __CTL_STR:
+ return sizeof(char) * CTL_LEN(type) + 1;
+ default:
+ pr_err("unknown arg type %d\n", type);
+
+ /* Ensure overflow to cause an error */
+ return MAX_UNSFD_MSG_SIZE;
+ }
+}
+
+static int do_sysctl_op(int fd, struct sysctl_req *req, int op)
+{
+ int ret = -1, nr = 1;
+
+ switch (CTL_TYPE(req->type)) {
+ case __CTL_U32A:
+ nr = CTL_LEN(req->type);
+ /* fallthrough */
+ case CTL_U32:
+ __SYSCTL_OP(ret, fd, req, u32, nr, op);
+ break;
+ case CTL_32:
+ __SYSCTL_OP(ret, fd, req, s32, nr, op);
+ break;
+ case __CTL_U64A:
+ nr = CTL_LEN(req->type);
+ /* fallthrough */
+ case CTL_U64:
+ __SYSCTL_OP(ret, fd, req, u64, nr, op);
+ break;
+ case __CTL_STR:
+ nr = CTL_LEN(req->type);
+ __SYSCTL_OP(ret, fd, req, char, nr, op);
+ break;
+ }
+
+ return ret;
+}
+
+static int __userns_sysctl_op(void *arg, int proc_fd, pid_t pid)
+{
+	int fd, ret = -1, dir, i, status, *fds = NULL;
+	struct sysctl_userns_req *userns_req = arg;
+	int op = userns_req->op;
+	struct sysctl_req *req, **reqs = NULL;
+	sigset_t blockmask, oldmask;
+	pid_t worker;
+
+	/* fix up the pointer: the requests are laid out right after the header */
+	req = userns_req->reqs = (struct sysctl_req *) &userns_req[1];
+
+	/*
+	 * @arg is the packed message built by sysctl_op(), the ns files are
+	 * opened relative to @proc_fd and @pid is only used in log messages.
+	 * Returns 0 on success, -1 on any failure.
+	 *
+	 * For files in the IPC/UTS namespaces, restoring is more complicated
+	 * than for net. Unprivileged users cannot even open these files, so
+	 * they must be opened by usernsd. However, the kernel changes the
+	 * value for the IPC/UTS namespace of whoever write()s to the file
+	 * (not who opened it). So we open the files here, fork a worker,
+	 * setns() it into the caller's namespace and let it write() and exit.
+	 */
+	dir = open("/proc/sys", O_RDONLY | O_DIRECTORY);
+	if (dir < 0) {
+		pr_perror("Can't open sysctl dir");
+		return -1;
+	}
+
+	fds = xmalloc(sizeof(int) * userns_req->nr_req);
+	if (!fds)
+		goto out;
+	/* Mark every slot unopened before any path can reach the cleanup */
+	memset(fds, -1, sizeof(int) * userns_req->nr_req);
+
+	reqs = xmalloc(sizeof(*reqs) * userns_req->nr_req);
+	if (!reqs)
+		goto out;
+
+	for (i = 0; i < userns_req->nr_req; i++) {
+		int arg_len = sysctl_userns_arg_size(req->type);
+		int name_len = strlen((char *) &req[1]) + 1;
+		int total_len = sizeof(*req) + arg_len + name_len;
+		int flags;
+
+		/* fix up the pointers */
+		req->name = (char *) &req[1];
+		req->arg = req->name + name_len;
+
+		if (((char *) req) + total_len >= ((char *) userns_req) + MAX_UNSFD_MSG_SIZE) {
+			pr_err("bad sysctl req %s, too big: %d\n", req->name, total_len);
+			goto out;
+		}
+
+		if (op == CTL_READ)
+			flags = O_RDONLY;
+		else
+			flags = O_WRONLY;
+
+		fd = openat(dir, req->name, flags);
+		if (fd < 0 && (errno != ENOENT || !(req->flags & CTL_FLAGS_OPTIONAL))) {
+			pr_perror("Can't open sysctl %s", req->name);
+			goto out;
+		}
+
+		/*
+		 * Save a pointer to the req, so we don't need to recompute
+		 * its location. A missing optional sysctl keeps fd == -1
+		 * here and is skipped by the worker and by the cleanup.
+		 */
+		reqs[i] = req;
+		fds[i] = fd;
+
+		req = (struct sysctl_req *) (((char *) req) + total_len);
+	}
+
+	/*
+	 * Don't let the sigchld_handler() mess with us
+	 * calling waitpid() on the exited worker. The
+	 * same is done in cr_system().
+	 */
+
+	sigemptyset(&blockmask);
+	sigaddset(&blockmask, SIGCHLD);
+	sigprocmask(SIG_BLOCK, &blockmask, &oldmask);
+
+	worker = fork();
+	if (worker < 0)
+		goto out_unblock;
+
+	if (!worker) {
+		int nsfd;
+		const char *nsname = ns_to_string(userns_req->ns);
+
+		BUG_ON(!nsname);
+		nsfd = openat(proc_fd, nsname, O_RDONLY);
+		if (nsfd < 0) {
+			pr_perror("failed to open pid %d's ns %s", pid, nsname);
+			exit(1);
+		}
+
+		if (setns(nsfd, 0) < 0) {
+			pr_perror("failed to setns to %d's ns %s", pid, nsname);
+			exit(1);
+		}
+
+		close(nsfd);
+
+		for (i = 0; i < userns_req->nr_req; i++) {
+			if (fds[i] >= 0 && do_sysctl_op(fds[i], reqs[i], op) < 0)
+				exit(1);
+		}
+
+		exit(0);
+	}
+
+	if (waitpid(worker, &status, 0) != worker) {
+		pr_perror("worker didn't die?");
+		kill(worker, SIGKILL);
+		goto out_unblock;
+	}
+
+	if (!WIFEXITED(status) || WEXITSTATUS(status)) {
+		pr_err("worker failed: %d\n", status);
+		goto out_unblock;
+	}
+
+	ret = 0;
+out_unblock:
+	sigprocmask(SIG_SETMASK, &oldmask, NULL);
+out:
+	if (fds) {
+		for (i = 0; i < userns_req->nr_req; i++) {
+			if (fds[i] < 0)
+				continue;
+			close_safe(&fds[i]);
+		}
+
+		xfree(fds);
+	}
+
+	if (reqs)
+		xfree(reqs);
+
+	close_safe(&dir);
+
+	return ret;
+}
+
+/* Run @op on each of the @nr_req requests directly from this process; missing
+ * optional sysctls are skipped. Returns 0 on success, -1 on the first failure. */
+static int __nonuserns_sysctl_op(struct sysctl_req *req, size_t nr_req, int op)
+{
+	int dir, ret, exit_code = -1;
+
+	dir = open("/proc/sys", O_RDONLY | O_DIRECTORY);
+	if (dir < 0) {
+		pr_perror("Can't open sysctl dir");
+		return -1;
+	}
+
+	while (nr_req--) {
+		int fd, flags;
+
+		flags = (op == CTL_READ) ? O_RDONLY : O_WRONLY;
+
+		fd = openat(dir, req->name, flags);
+		if (fd < 0) {
+			if (errno == ENOENT && (req->flags & CTL_FLAGS_OPTIONAL)) {
+				req++;
+				continue;
+			}
+			pr_perror("Can't open sysctl %s", req->name);
+			goto out;
+		}
+
+		/* Close before checking ret so a failed op cannot leak fd */
+		ret = do_sysctl_op(fd, req, op);
+		close(fd);
+		if (ret)
+			goto out;
+		req++;
+	}
+
+	exit_code = 0;
+out:
+	close(dir);
+	return exit_code;
+}
+
+int sysctl_op(struct sysctl_req *req, size_t nr_req, int op, unsigned int ns)
+{
+ int i, fd, ret;
+ struct sysctl_userns_req *userns_req;
+ struct sysctl_req *cur;
+
+ if (nr_req == 0)
+ return 0;
+
+ if (ns & ~KNOWN_NS_MASK) {
+ pr_err("don't know how to restore some namespaces in %u\n", ns);
+ return -1;
+ }
+
+ /* The way sysctl files behave on open/write depends on the namespace
+ * they correspond to. If we don't want to interact with something in a
+ * namespace (e.g. kernel/cap_last_cap is global), we can do this from
+ * the current process. Similarly, if we're accessing net namespaces,
+ * we can just do the operation from our current process, since
+ * anything with CAP_NET_ADMIN can write to the net/ sysctls, and we
+ * still have that even when restoring in a user ns.
+ *
+ * For IPC/UTS, we restore them as described above.
+ *
+ * For read operations, we need to copy the values back to return.
+ * Fortunately, we only do read on dump (or global reads on restore),
+ * so we can do those in process as well.
+ */
+ if (!ns || ns & CLONE_NEWNET || op == CTL_READ)
+ return __nonuserns_sysctl_op(req, nr_req, op);
+
+ /*
+ * In order to avoid lots of opening of /proc/sys for each struct sysctl_req,
+ * we encode each array of sysctl_reqs into one contiguous region of memory so
+ * it can be passed via userns_call if necessary. It looks like this:
+ *
+ *  struct sysctl_userns_req     struct sysctl_req        name         arg
+ * ------------------------------------------------------------------------------
+ * | op | ns | nr_req | reqs | <fields> | name | arg | "the name" | "the arg" ...
+ * ------------------------------------------------------------------------------
+ *                        |____^            |_____|____^            ^
+ *                                                |_________________|
+ */
+ userns_req = alloca(MAX_UNSFD_MSG_SIZE);
+ userns_req->op = op;
+ userns_req->nr_req = nr_req;
+ userns_req->ns = ns;
+ userns_req->reqs = (struct sysctl_req *) (&userns_req[1]);
+
+ cur = userns_req->reqs;
+ for (i = 0; i < nr_req; i++) {
+ int arg_len = sysctl_userns_arg_size(req[i].type);
+ int name_len = strlen(req[i].name) + 1;
+ int total_len = sizeof(*cur) + arg_len + name_len;
+
+ if (((char *) cur) + total_len >= ((char *) userns_req) + MAX_UNSFD_MSG_SIZE) {
+ pr_err("sysctl msg %s too big: %d\n", req[i].name, total_len);
+ return -1;
+ }
+
+ /* copy over the non-pointer fields */
+ cur->type = req[i].type;
+ cur->flags = req[i].flags;
+
+ cur->name = (char *) &cur[1];
+ strcpy(cur->name, req[i].name);
+
+ cur->arg = cur->name + name_len;
+ memcpy(cur->arg, req[i].arg, arg_len);
+
+ cur = (struct sysctl_req *) (((char *) cur) + total_len);
+ }
+
+ fd = open_proc(PROC_SELF, "ns");
+ if (fd < 0)
+ return -1;
+
+ ret = userns_call(__userns_sysctl_op, 0, userns_req, MAX_UNSFD_MSG_SIZE, fd);
+ close(fd);
+ return ret;
+}
diff --git a/criu/sysfs_parse.c b/criu/sysfs_parse.c
new file mode 100644
index 000000000000..6497d53497e9
--- /dev/null
+++ b/criu/sysfs_parse.c
@@ -0,0 +1,325 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <ctype.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <sys/stat.h>
+
+#include "cr_options.h"
+#include "criu-log.h"
+#include "xmalloc.h"
+#include "files.h"
+#include "proc_parse.h"
+#include "util.h"
+#include "sysfs_parse.h"
+#include "namespaces.h"
+
+/*
+ * Currently, there are two kernel problems dealing with AUFS
+ * filesystems. Until these problems are fixed in the kernel,
+ * we have AUFS support in CRIU to handle the following issues:
+ *
+ * 1) /proc/<pid>/mountinfo: The problem is that for AUFS the root field
+ * of the root entry is missing the pathname (it's only /). For example:
+ *
+ * 90 61 0:33 / / rw,relatime - aufs none rw,si=4476a910a24617e6
+ *
+ * To handle this issue, the user has to specify the root of the AUFS
+ * filesystem with the --root command line option.
+ *
+ * 2) /proc/<pid>/map_files: The symlinks are absolute pathnames of the
+ * corresponding *physical* files in the branch they exist. For example,
+ * for a Docker container using AUFS, a symlink would look like:
+ * 400000-489000 -> /var/lib/docker/aufs/diff/<LAYER_ID>/bin/<cmd>
+ *
+ * Therefore, when we use the link file descriptor vm_file_fd in
+ * dump_one_reg_file() to read the link, we get the file's physical
+ * absolute pathname which does not exist relative to the root of the
+ * mount namespace and even if we used its relative pathname, the dev:ino
+ * values would be different from the physical file's dev:ino causing the
+ * dump to fail.
+ *
+ * To handle this issue, we figure out the "correct" paths when parsing
+ * map_files and save it for later use. See fixup_aufs_vma_fd() for
+ * details.
+ */
+
+struct ns_id *aufs_nsid;
+static char **aufs_branches;
+
+/*
+ * Parse out and save the AUFS superblock info in the
+ * given buffer.
+ */
+static int parse_aufs_sbinfo(struct mount_info *mi, char *sbinfo, int len)
+{
+	const int pfx = 3;	/* strlen("si=") == strlen("si_") */
+	char *opt;
+	int pos;
+
+	opt = strstr(mi->options, "si=");
+	if (opt == NULL) {
+		pr_err("Cannot find sbinfo in option string %s\n", mi->options);
+		return -1;
+	}
+
+	/* need room for "si_" and its terminator at the very least */
+	if (len < 4) {
+		pr_err("Buffer of %d bytes too small for sbinfo\n", len);
+		return -1;
+	}
+	strcpy(sbinfo, "si_");
+
+	/* append the hex digits that follow "si=" in the options */
+	for (pos = pfx; isxdigit(opt[pos]) && pos < len; pos++)
+		sbinfo[pos] = opt[pos];
+
+	if (pos >= len) {
+		pr_err("Sbinfo in options string %s too long\n", mi->options);
+		return -1;
+	}
+
+	sbinfo[pos] = '\0';
+	return 0;
+}
+
+/*
+ * If @path starts with an aufs branch, strip that prefix in place.
+ * Returns the new length, 0 if no branch matched, -1 on error.
+ */
+static int fixup_aufs_path(char *path, int size)
+{
+ char rpath[PATH_MAX];
+ int n;
+ int blen;
+
+ if (aufs_branches == NULL) {
+ pr_err("No aufs branches to search for %s\n", path);
+ return -1;
+ }
+
+ for (n = 0; aufs_branches[n] != NULL; n++) {
+ blen = strlen(aufs_branches[n]);
+ if (!strncmp(path, aufs_branches[n], blen)) /* plain prefix match, no '/' boundary check */
+ break;
+ }
+
+ if (aufs_branches[n] == NULL)
+ return 0; /* not in a branch */
+
+ n = snprintf(rpath, PATH_MAX, "%s", &path[blen]); /* n is reused: now the length of the stripped path */
+ if (n >= min(PATH_MAX, size)) {
+ pr_err("Not enough space to replace %s\n", path);
+ return -1;
+ }
+
+ pr_debug("Replacing %s with %s\n", path, rpath);
+ strcpy(path, rpath);
+ return n;
+}
+
+/*
+ * Kernel stores pathnames to AUFS branches in the br<n> files in
+ * the /sys/fs/aufs/si_<sbinfo> directory where <n> denotes a branch
+ * number and <sbinfo> is a hexadecimal number in %lx format. For
+ * example:
+ *
+ * $ cat /sys/fs/aufs/si_f598876b087ed883/br0
+ * /path/to/branch0/directory=rw
+ *
+ * Sets up the NULL-terminated aufs_branches array; returns 0 or -1.
+ */
+int parse_aufs_branches(struct mount_info *mi)
+{
+	char path[AUFSBR_PATH_LEN];
+	char *cp;
+	int n;
+	int ret;
+	unsigned int br_num;
+	unsigned int br_max;
+	DIR *dp;
+	FILE *fp;
+	struct dirent *de;
+
+	pr_info("Collecting AUFS branch pathnames ...\n");
+
+	if (mi->nsid == 0) {
+		pr_err("No nsid to parse its aufs branches\n");
+		return -1;
+	}
+
+	if (mi->nsid == aufs_nsid) {
+		pr_debug("Using cached aufs branch paths for nsid %p\n", aufs_nsid);
+		return 0;
+	}
+
+	if (aufs_nsid)
+		free_aufs_branches();
+
+	strcpy(path, SYSFS_AUFS);	/* /sys/fs/aufs/ */
+	if (parse_aufs_sbinfo(mi, &path[sizeof SYSFS_AUFS - 1], SBINFO_LEN) < 0)
+		return -1;
+	if ((dp = opendir(path)) == NULL) {
+		pr_perror("Cannot opendir %s", path);
+		return -1;
+	}
+
+	/*
+	 * Find out how many branches we have.
+	 */
+	br_max = 0;
+	ret = 0;
+	while (1) {
+		errno = 0;
+		if ((de = readdir(dp)) == NULL) {
+			if (errno) {
+				pr_perror("Cannot readdir %s", path);
+				ret = -1;
+			}
+			break;
+		}
+
+		/* ret is left alone: sscanf() may itself return EOF (-1) */
+		if (sscanf(de->d_name, "br%u", &br_num) == 1 && br_num > br_max)
+			br_max = br_num;
+	}
+	closedir(dp);
+	if (ret == -1)
+		return -1;
+
+	/*
+	 * Default AUFS maximum is 127, so 1000 should be plenty.
+	 * If you increase the maximum to more than 3 digits,
+	 * make sure to change AUFSBR_PATH_LEN accordingly.
+	 */
+	if (br_max > 999) {
+		pr_err("Too many branches %u\n", br_max);
+		return -1;
+	}
+
+	/*
+	 * Allocate an array of pointers to branch pathnames to be read.
+	 * Branches are indexed from 0 and we need a NULL pointer at the end.
+	 */
+	aufs_branches = xzalloc((br_max + 2) * sizeof (char *));
+	if (!aufs_branches)
+		return -1;
+
+	/*
+	 * Now read branch pathnames from the branch files.
+	 */
+	n = strlen(path);
+	for (br_num = 0; br_num <= br_max; br_num++) {
+		fp = NULL;
+
+		ret = snprintf(&path[n], sizeof path - n, "/br%u", br_num);
+		if (ret < 0 || (size_t)ret >= sizeof path - n) {
+			pr_err("Buffer overrun creating path for branch %u\n", br_num);
+			goto err;
+		}
+
+		if ((fp = fopen(path, "r")) == NULL) {
+			pr_perror("Cannot fopen %s", path);
+			goto err;
+		}
+
+		if (fscanf(fp, "%ms=", &aufs_branches[br_num]) != 1 ||
+		    aufs_branches[br_num] == NULL) {
+			pr_perror("Parse error reading %s", path);
+			goto err;
+		}
+
+		/* chop off the trailing "=..." stuff */
+		if ((cp = strchr(aufs_branches[br_num], '=')) == NULL) {
+			pr_err("Bad format in branch pathname %s\n", aufs_branches[br_num]);
+			goto err;
+		}
+		*cp = '\0';
+
+		fclose(fp);
+		/*
+		 * Log branch information for external utilities that
+		 * want to recreate the process's AUFS filesystem
+		 * before calling criu restore.
+		 *
+		 * DO NOT CHANGE this format!
+		 */
+		pr_info("%s : %s\n", path, aufs_branches[br_num]);
+	}
+
+	aufs_nsid = mi->nsid;
+	return 0;
+
+err:
+	if (fp)
+		fclose(fp);
+	free_aufs_branches();
+	return -1;
+}
+
+/*
+ * AUFS support to compensate for the kernel bug
+ * exposing branch pathnames in map_files and providing
+ * a wrong mnt_id value in /proc/<pid>/fdinfo/<fd>.
+ *
+ * If the link points inside a branch, save the relative
+ * pathname from the root of the mount namespace as well as
+ * the full pathname from global root (/) for later use in
+ * dump_filemap() and parse_smaps(). Returns the fixed-up
+ * length, 0 if the link is outside any branch, -1 on error.
+ */
+int fixup_aufs_vma_fd(struct vma_area *vma)
+{
+ char path[PATH_MAX];
+ int len;
+
+ path[0] = '.'; /* the saved relative path is "." followed by the fixed-up path */
+ len = read_fd_link(vma->vm_file_fd, &path[1], sizeof path - 1);
+ if (len < 0)
+ return -1;
+
+ len = fixup_aufs_path(&path[1], sizeof path - 1);
+ if (len <= 0)
+ return len;
+
+ vma->aufs_rpath = xmalloc(len + 2); /* leading '.' plus the terminating NUL */
+ if (!vma->aufs_rpath)
+ return -1;
+
+ strcpy(vma->aufs_rpath, path);
+ if (opts.root) { /* NOTE(review): aufs_fpath is used below even when this is false -- confirm --root is always set */
+ /* skip ./ in path */
+ vma->aufs_fpath = xsprintf("%s/%s", opts.root, &path[2]);
+ if (!vma->aufs_fpath)
+ return -1;
+ }
+ pr_debug("Saved AUFS paths %s and %s\n", vma->aufs_rpath, vma->aufs_fpath);
+
+ if (stat(vma->aufs_fpath, vma->vmst) < 0) {
+ pr_perror("Failed stat on map %"PRIx64" (%s)",
+ vma->e->start, vma->aufs_fpath);
+ return -1;
+ }
+
+ /* tell parse_smap() not to call get_fd_mntid() */
+ vma->mnt_id = -1;
+ return len;
+}
+
+void free_aufs_branches(void)
+{
+	char **br;
+
+	aufs_nsid = NULL;
+	if (aufs_branches == NULL)
+		return;
+
+	/* Walk the array up to its NULL terminator, then drop the array */
+	for (br = aufs_branches; *br != NULL; br++)
+		xfree(*br);
+	xfree(aufs_branches);
+	aufs_branches = NULL;
+}
diff --git a/criu/timerfd.c b/criu/timerfd.c
new file mode 100644
index 000000000000..019de69ef61b
--- /dev/null
+++ b/criu/timerfd.c
@@ -0,0 +1,211 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+#include <sys/timerfd.h>
+#include <sys/ioctl.h>
+
+#include "protobuf.h"
+#include "protobuf/timerfd.pb-c.h"
+
+#include "proc_parse.h"
+#include "rst-malloc.h"
+#include "cr_options.h"
+#include "restorer.h"
+#include "timerfd.h"
+#include "pstree.h"
+#include "files.h"
+#include "imgset.h"
+#include "util.h"
+#include "log.h"
+#include "bug.h"
+
+#undef LOG_PREFIX
+#define LOG_PREFIX "timerfd: "
+
+struct timerfd_dump_arg {
+ u32 id;
+ const struct fd_parms *p;
+};
+
+struct timerfd_info {
+ TimerfdEntry *tfe;
+ struct file_desc d;
+ int t_fd;
+ struct list_head rlist;
+};
+
+static LIST_HEAD(rst_timerfds);
+
+unsigned long rst_timerfd_cpos;
+unsigned int rst_timerfd_nr = 0;
+
+int check_timerfd(void)
+{
+	int tfd, rc;
+
+	tfd = timerfd_create(CLOCK_MONOTONIC, 0);
+	if (tfd < 0) {
+		pr_perror("timerfd_create failed");
+		return -1;
+	}
+
+	/* Probe with a NULL argument: EFAULT is taken as proof that
+	 * the ioctl exists, any other error as missing c/r support. */
+	rc = ioctl(tfd, TFD_IOC_SET_TICKS, NULL);
+	if (rc < 0 && errno == EFAULT)
+		rc = 0;
+	else if (rc < 0)
+		pr_perror("No timerfd support for c/r");
+
+	close(tfd);
+	return rc;
+}
+
+int is_timerfd_link(char *link)
+{
+ return is_anon_link_type(link, "[timerfd]");
+}
+
+static int dump_timerfd_entry(union fdinfo_entries *e, void *arg)
+{
+ struct timerfd_dump_arg *da = arg;
+ TimerfdEntry *tfy = &e->tfy;
+
+ tfy->id = da->id;
+ tfy->flags = da->p->flags;
+ tfy->fown = (FownEntry *)&da->p->fown;
+
+ pr_info("Dumping id %#x clockid %d it_value(%llu, %llu) it_interval(%llu, %llu)\n",
+ tfy->id, tfy->clockid, (unsigned long long)tfy->vsec, (unsigned long long)tfy->vnsec,
+ (unsigned long long)tfy->isec, (unsigned long long)tfy->insec);
+
+ return pb_write_one(img_from_set(glob_imgset, CR_FD_TIMERFD), &e->tfy, PB_TIMERFD);
+}
+
+static int dump_one_timerfd(int lfd, u32 id, const struct fd_parms *p)
+{
+ struct timerfd_dump_arg da = { .id = id, .p = p, };
+ return parse_fdinfo(lfd, FD_TYPES__TIMERFD, dump_timerfd_entry, &da);
+}
+
+const struct fdtype_ops timerfd_dump_ops = {
+ .type = FD_TYPES__TIMERFD,
+ .dump = dump_one_timerfd,
+};
+
+/*
+ * We need to restore timers at a very late stage in the restorer
+ * to avoid the case where a timer expires before the restore
+ * procedure has finished and signal handlers are set up
+ * properly. The timer settings have to be copied into the
+ * restorer area, which is what the post-open hook is used for.
+ */
+static int timerfd_post_open(struct file_desc *d, int fd)
+{
+ struct timerfd_info *info = container_of(d, struct timerfd_info, d);
+
+ info->t_fd = fd;
+ list_add_tail(&info->rlist, &rst_timerfds);
+ return 0;
+}
+
+int rst_timerfd_prep(void)
+{
+ struct timerfd_info *ti;
+ struct restore_timerfd *t;
+
+ rst_timerfd_cpos = rst_mem_align_cpos(RM_PRIVATE);
+ list_for_each_entry(ti, &rst_timerfds, rlist) {
+ TimerfdEntry *tfe = ti->tfe;
+
+ t = rst_mem_alloc(sizeof(*t), RM_PRIVATE);
+ if (!t)
+ return -1;
+
+ t->id = tfe->id;
+ t->fd = ti->t_fd;
+ t->clockid = tfe->clockid;
+ t->ticks = (unsigned long)tfe->ticks;
+ t->settime_flags = tfe->settime_flags;
+ t->val.it_interval.tv_sec = (time_t)tfe->isec;
+ t->val.it_interval.tv_nsec = (long)tfe->insec;
+ t->val.it_value.tv_sec = (time_t)tfe->vsec;
+ t->val.it_value.tv_nsec = (long)tfe->vnsec;
+
+ rst_timerfd_nr++;
+ }
+
+ return 0;
+}
+
+static int timerfd_open(struct file_desc *d)
+{
+ struct timerfd_info *info;
+ TimerfdEntry *tfe;
+ int tmp = -1;
+
+ info = container_of(d, struct timerfd_info, d);
+ tfe = info->tfe;
+ pr_info("Creating timerfd id %#x clockid %d settime_flags %x ticks %llu "
+ "it_value(%llu, %llu) it_interval(%llu, %llu)\n",
+ tfe->id, tfe->clockid, tfe->settime_flags, (unsigned long long)tfe->ticks,
+ (unsigned long long)tfe->vsec, (unsigned long long)tfe->vnsec,
+ (unsigned long long)tfe->isec, (unsigned long long)tfe->insec);
+
+ tmp = timerfd_create(tfe->clockid, 0);
+ if (tmp < 0) {
+ pr_perror("Can't create for %#x", tfe->id);
+ return -1;
+ }
+
+ if (rst_file_params(tmp, tfe->fown, tfe->flags)) {
+ pr_perror("Can't restore params for %#x", tfe->id);
+ goto err_close;
+ }
+
+ return tmp;
+
+err_close:
+ close_safe(&tmp);
+ return -1;
+}
+
+static struct file_desc_ops timerfd_desc_ops = {
+ .type = FD_TYPES__TIMERFD,
+ .open = timerfd_open,
+ .post_open = timerfd_post_open,
+};
+
+static int verify_timerfd(TimerfdEntry *tfe)
+{
+	switch (tfe->clockid) {
+	case CLOCK_REALTIME:
+	case CLOCK_MONOTONIC:
+		return 0;	/* the only clock ids accepted here */
+	}
+	pr_err("Unknown clock type %d for %#x\n", tfe->clockid, tfe->id);
+	return -1;
+}
+
+static int collect_one_timerfd(void *o, ProtobufCMessage *msg)
+{
+ struct timerfd_info *info = o;
+
+ info->tfe = pb_msg(msg, TimerfdEntry);
+ if (verify_timerfd(info->tfe)) {
+ pr_err("Verification failed for %#x\n", info->tfe->id);
+ return -1;
+ }
+
+ info->t_fd = -1;
+
+ return file_desc_add(&info->d, info->tfe->id, &timerfd_desc_ops);
+}
+
+struct collect_image_info timerfd_cinfo = {
+ .fd_type = CR_FD_TIMERFD,
+ .pb_type = PB_TIMERFD,
+ .priv_size = sizeof(struct timerfd_info),
+ .collect = collect_one_timerfd,
+};
diff --git a/criu/tty.c b/criu/tty.c
new file mode 100644
index 000000000000..ef82583ddfd6
--- /dev/null
+++ b/criu/tty.c
@@ -0,0 +1,1712 @@
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <string.h>
+#include <limits.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+#include <termios.h>
+#include <linux/major.h>
+
+#include "compiler.h"
+#include "asm/types.h"
+
+#include "files.h"
+#include "cr_options.h"
+#include "imgset.h"
+#include "servicefd.h"
+#include "image.h"
+#include "util.h"
+#include "log.h"
+#include "list.h"
+#include "util-pie.h"
+#include "proc_parse.h"
+#include "file-ids.h"
+#include "files-reg.h"
+#include "namespaces.h"
+
+#include "protobuf.h"
+#include "protobuf/tty.pb-c.h"
+
+#include "parasite-syscall.h"
+#include "parasite.h"
+
+#include "pstree.h"
+#include "tty.h"
+
+/*
+ * Here are some notes about the overall TTY c/r design. At the moment
+ * we support unix98 ptys only. Supporting legacy BSD terminals
+ * is impossible without help from the kernel side -- the indices
+ * of such terminals are not reported anywhere in the kernel so that
+ * we can't figure out active pairs.
+ *
+ * Usually the PTYs represent a pair of links -- master peer and slave
+ * peer. Master peer must be opened before slave. Internally, when kernel
+ * creates master peer it also generates a slave interface in a form of
+ * /dev/pts/N, where N is that named pty "index". Master/slave connection
+ * unambiguously identified by this index.
+ *
+ * Still, one master can carry multiple slaves -- for example a user opens
+ * one master via /dev/ptmx and appropriate /dev/pts/N in sequence.
+ * The result will be the following
+ *
+ * master
+ * `- slave 1
+ * `- slave 2
+ *
+ * both slaves will have the same master index but different file descriptors.
+ * Still, inside the kernel the pty parameters are the same for both slaves.
+ * Thus only one slave's parameters should be restored; there is no need to
+ * carry all parameters for every slave peer we've found.
+ *
+ * Note that /dev/pts/ is merely a convenient convention; internally the
+ * kernel doesn't care where exactly the inodes of ptys lie --
+ * it depends on the "devpts" mount point path.
+ */
+
+#undef LOG_PREFIX
+#define LOG_PREFIX "tty: "
+
+struct tty_info_entry {
+ struct list_head list;
+ TtyInfoEntry *tie;
+};
+
+struct tty_info {
+ struct list_head list;
+ struct file_desc d;
+
+ struct file_desc *reg_d;
+
+ TtyFileEntry *tfe;
+ TtyInfoEntry *tie;
+
+ struct list_head sibling;
+ struct tty_driver *driver;
+
+ bool create;
+ bool inherit;
+
+ struct tty_info *ctl_tty;
+};
+
+struct tty_dump_info {
+ struct list_head list;
+
+ u32 id;
+ pid_t sid;
+ pid_t pgrp;
+ int fd;
+ struct tty_driver *driver;
+};
+
+static LIST_HEAD(all_tty_info_entries);
+static LIST_HEAD(all_ttys);
+
+/*
+ * Usually an application has not that many ttys opened.
+ * If this won't be enough in future we simply need to
+ * change tracking mechanism to some more extendable.
+ *
+ * This particular bitmap requires 256 bytes of memory.
+ * Pretty acceptable trade off in a sake of simplicity.
+ */
+
+#define MAX_TTYS 1024
+
+/*
+ * Custom indices should be even numbers just in case if we
+ * need odds for pair numbering someday.
+ */
+
+#define MAX_PTY_INDEX 1000
+#define CONSOLE_INDEX 1002
+#define VT_INDEX 1004
+#define CTTY_INDEX 1006
+#define INDEX_ERR (MAX_TTYS + 1)
+
+static DECLARE_BITMAP(tty_bitmap, (MAX_TTYS << 1));
+static DECLARE_BITMAP(tty_active_pairs, (MAX_TTYS << 1));
+
+struct tty_driver {
+ short type;
+ short subtype;
+ char *name;
+ int index;
+ int (*fd_get_index)(int fd, const struct fd_parms *p);
+ int (*img_get_index)(struct tty_info *ti);
+ int (*open)(struct tty_info *ti);
+};
+
+#define TTY_SUBTYPE_MASTER 0x0001
+#define TTY_SUBTYPE_SLAVE 0x0002
+
+static int ptm_fd_get_index(int fd, const struct fd_parms *p)
+{
+ int index;
+
+ if (ioctl(fd, TIOCGPTN, &index)) {
+ pr_perror("Can't obtain ptmx index");
+ return INDEX_ERR;
+ }
+
+ if (index > MAX_PTY_INDEX) {
+ pr_err("Index %d on ptmx is too big\n", index);
+ return INDEX_ERR;
+ }
+
+ return index;
+}
+
+static int pty_get_index(struct tty_info *ti)
+{
+ return ti->tie->pty->index;
+}
+
+static int pty_open_ptmx(struct tty_info *info);
+
+static struct tty_driver ptm_driver = {
+ .type = TTY_TYPE__PTY,
+ .subtype = TTY_SUBTYPE_MASTER,
+ .name = "ptmx",
+ .fd_get_index = ptm_fd_get_index,
+ .img_get_index = pty_get_index,
+ .open = pty_open_ptmx,
+};
+
+static int open_simple_tty(struct tty_info *info);
+
+static struct tty_driver console_driver = {
+ .type = TTY_TYPE__CONSOLE,
+ .name = "console",
+ .index = CONSOLE_INDEX,
+ .open = open_simple_tty,
+};
+
+static struct tty_driver ctty_driver = {
+ .type = TTY_TYPE__CTTY,
+ .name = "ctty",
+ .index = CTTY_INDEX,
+ .open = open_simple_tty,
+};
+
+static struct tty_driver vt_driver = {
+ .type = TTY_TYPE__VT,
+ .name = "vt",
+ .index = VT_INDEX,
+ .open = open_simple_tty,
+};
+
+static int open_ext_tty(struct tty_info *info);
+static struct tty_driver ext_driver = {
+ .type = TTY_TYPE__EXT_TTY,
+ .name = "ext",
+ .open = open_ext_tty,
+};
+
+static int pts_fd_get_index(int fd, const struct fd_parms *p)
+{
+ int index;
+ const struct fd_link *link = p->link;
+ char *pos = strrchr(link->name, '/');
+
+ if (!pos || pos == (link->name + link->len - 1)) {
+ pr_err("Unexpected format on path %s\n", link->name + 1);
+ return INDEX_ERR;
+ }
+
+ index = atoi(pos + 1);
+ if (index > MAX_PTY_INDEX) {
+ pr_err("Index %d on pts is too big\n", index);
+ return INDEX_ERR;
+ }
+
+ return index;
+}
+
+static struct tty_driver pts_driver = {
+ .type = TTY_TYPE__PTY,
+ .subtype = TTY_SUBTYPE_SLAVE,
+ .name = "pts",
+ .fd_get_index = pts_fd_get_index,
+ .img_get_index = pty_get_index,
+ .open = pty_open_ptmx,
+};
+
+/* Pick the driver for tty device @rdev (id built from @rdev:@dev); NULL if none matches. */
+struct tty_driver *get_tty_driver(dev_t rdev, dev_t dev)
+{
+	int major, minor;
+	char id[42];
+
+	snprintf(id, sizeof(id), "tty[%"PRIx64":%"PRIx64"]", rdev, dev);
+	if (external_lookup_id(id) || inherit_fd_lookup_id(id) >= 0)
+		return &ext_driver;
+
+	major = major(rdev);
+	minor = minor(rdev);
+	switch (major) {
+	case TTYAUX_MAJOR:
+		if (minor == 2)
+			return &ptm_driver;
+		else if (minor == 1)
+			return &console_driver;
+		else if (minor == 0)
+			return &ctty_driver;
+		break;
+	case TTY_MAJOR:
+		/*
+		 * Minors [MIN_NR_CONSOLES; MAX_NR_CONSOLES] stand for
+		 * consoles (virtual terminals, VT in terms of kernel).
+		 */
+		if (minor >= MIN_NR_CONSOLES && minor <= MAX_NR_CONSOLES)
+			return &vt_driver;
+		break;	/* other TTY_MAJOR minors must not fall into the pty cases */
+	case UNIX98_PTY_MASTER_MAJOR ... (UNIX98_PTY_MASTER_MAJOR + UNIX98_PTY_MAJOR_COUNT - 1):
+		return &ptm_driver;
+	case UNIX98_PTY_SLAVE_MAJOR:
+		return &pts_driver;
+	}
+	return NULL;
+}
+
+static inline int is_pty(struct tty_driver *driver)
+{
+ return driver->type == TTY_TYPE__PTY;
+}
+
+/*
+ * /dev/ptmx is a shared resource between all tasks
+ * so we need to serialize access to it.
+ */
+static mutex_t *tty_mutex;
+
+static bool tty_is_master(struct tty_info *info);
+
+int prepare_shared_tty(void)
+{
+ tty_mutex = shmalloc(sizeof(*tty_mutex));
+ if (!tty_mutex) {
+ pr_err("Can't create ptmx index mutex\n");
+ return -1;
+ }
+
+ mutex_init(tty_mutex);
+
+ return 0;
+}
+
+#define winsize_copy(d, s) \
+ do { \
+ ASSIGN_MEMBER((d), (s), ws_row); \
+ ASSIGN_MEMBER((d), (s), ws_col); \
+ ASSIGN_MEMBER((d), (s), ws_xpixel); \
+ ASSIGN_MEMBER((d), (s), ws_ypixel); \
+ } while (0)
+
+/*
+ * Copy terminal settings from @s to @d: the control characters
+ * array is copied with memcpy(), the flag words and the line
+ * discipline are assigned via ASSIGN_MEMBER().
+ *
+ * __t is never used as an object, it only lets sizeof() name the
+ * size of the c_cc array of the system's struct termios.
+ */
+#define termios_copy(d, s) \
+ do { \
+ struct termios __t; \
+ \
+ memcpy((d)->c_cc, (s)->c_cc, \
+ sizeof(__t.c_cc)); \
+ \
+ ASSIGN_MEMBER((d),(s), c_iflag); \
+ ASSIGN_MEMBER((d),(s), c_oflag); \
+ ASSIGN_MEMBER((d),(s), c_cflag); \
+ ASSIGN_MEMBER((d),(s), c_lflag); \
+ ASSIGN_MEMBER((d),(s), c_line); \
+ } while (0)
+
+/*
+ * Build a tty info id: @index shifted left by one bit, with the
+ * lowest bit set for a master-subtype @driver and clear otherwise.
+ */
+static int tty_gen_id(struct tty_driver *driver, int index)
+{
+	int master_bit = (driver->subtype == TTY_SUBTYPE_MASTER);
+
+	return (index << 1) + master_bit;
+}
+
+/* Reverse of tty_gen_id(): drop the lowest bit of @id to get the index. */
+static int tty_get_index(u32 id)
+{
+	u32 index = id >> 1;
+
+	return index;
+}
+
+/*
+ * Make sure the active pairs do exist.
+ *
+ * Every bit set in tty_active_pairs is visited. An even bit is
+ * expected to be followed by the odd bit right after it (tty_gen_id()
+ * sets the lowest bit for master peers only). An even bit without
+ * that neighbour is an error, unless opts.shell_job is set, in which
+ * case exactly one such unpaired slave is accepted.
+ *
+ * Returns 0 when the bitmap passes the check, -1 otherwise.
+ */
+int tty_verify_active_pairs(void)
+{
+ unsigned long i, unpaired_slaves = 0;
+
+ for_each_bit(i, tty_active_pairs) {
+ if ((i % 2) == 0) {
+ if (test_bit(i + 1, tty_active_pairs)) {
+ /* Pair is complete, step over the master bit too */
+ i++;
+ continue;
+ }
+
+ if (!opts.shell_job) {
+ pr_err("Found slave peer index %d without "
+ "correspond master peer\n",
+ tty_get_index(i));
+ return -1;
+ }
+
+ pr_debug("Unpaired slave %d\n", tty_get_index(i));
+
+ if (++unpaired_slaves > 1) {
+ pr_err("Only one slave external peer "
+ "is allowed (index %d)\n",
+ tty_get_index(i));
+ return -1;
+ }
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Set @bit in @bitmap if it is not set yet.
+ *
+ * Returns what test_bit() reported before the update, i.e. non-zero
+ * if the bit was already set and zero if this call has set it.
+ */
+static int tty_test_and_set(int bit, unsigned long *bitmap)
+{
+	int was_set = test_bit(bit, bitmap);
+
+	if (was_set)
+		return was_set;
+
+	set_bit(bit, bitmap);
+	return was_set;
+}
+
+/*
+ * Generate a regular file object in case if such is missed
+ * in the image file, ie obsolete interface has been used on
+ * checkpoint.
+ *
+ * The name is "/dev/ptmx" for a master peer and "/dev/pts/<index>"
+ * otherwise. With @add set the descriptor is registered through
+ * file_desc_add(), otherwise it is only set up with file_desc_init().
+ *
+ * Returns the embedded file descriptor or NULL if allocation failed.
+ */
+static struct file_desc *pty_alloc_reg(struct tty_info *info, bool add)
+{
+ TtyFileEntry *tfe = info->tfe;
+ const size_t namelen = 64;
+ struct reg_file_info *r;
+ static struct file_desc_ops noops = {};
+
+ /*
+ * A single zeroed chunk carries the reg_file_info, the file
+ * entry and the name buffer, placed one right after another.
+ */
+ r = xzalloc(sizeof(*r) + sizeof(*r->rfe) + namelen);
+ if (!r)
+ return NULL;
+
+ r->rfe = (void *)r + sizeof(*r);
+ reg_file_entry__init(r->rfe);
+
+ r->rfe->name = (void *)r + sizeof(*r) + sizeof(*r->rfe);
+ if (tty_is_master(info))
+ strcpy(r->rfe->name, "/dev/ptmx");
+ else
+ snprintf(r->rfe->name, namelen, "/dev/pts/%u",
+ info->tie->pty->index);
+
+ if (add)
+ file_desc_add(&r->d, tfe->id, &noops);
+ else
+ file_desc_init(&r->d, tfe->id, &noops);
+
+ r->rfe->id = tfe->id;
+ r->rfe->flags = tfe->flags;
+ r->rfe->fown = tfe->fown;
+ /* The path is the name without its first character (the '/') */
+ r->path = &r->rfe->name[1];
+
+ return &r->d;
+}
+
+/*
+ * In case if we need to open a fake pty (for example
+ * a master peer which were deleted at checkpoint moment,
+ * or open a slave peer when restoring control terminal)
+ * we need to create a new reg-file object taking @info
+ * as a template. Here is a trick though: the @info might
+ * represent master peer while we need to allocate a slave
+ * one and the reverse. For such case taking path from the
+ * @info as a template we generate that named 'inverted-path'.
+ *
+ * For example if the master peer was /dev/pts/ptmx with index 1,
+ * the inverted path is /dev/pts/1, for inverted slaves it's simpler:
+ * we just add 'ptmx' postfix.
+ *
+ * Returns the new object (to be released with pty_free_fake_reg())
+ * or NULL if the underlying descriptor could not be allocated.
+ */
+static struct reg_file_info *pty_alloc_fake_reg(struct tty_info *info, int subtype)
+{
+	struct reg_file_info *new, *orig;
+	struct file_desc *fake_desc;
+
+	pr_debug("Allocating fake descriptor for %#x (reg_d %p)\n",
+		 info->tfe->id, info->reg_d);
+
+	BUG_ON(!info->reg_d);
+	BUG_ON(!is_pty(info->driver));
+
+	fake_desc = pty_alloc_reg(info, false);
+	if (!fake_desc)
+		return NULL;
+
+	orig = container_of(info->reg_d, struct reg_file_info, d);
+	new = container_of(fake_desc, struct reg_file_info, d);
+
+	if ((subtype == TTY_SUBTYPE_MASTER && tty_is_master(info)) ||
+	    (subtype == TTY_SUBTYPE_SLAVE && !tty_is_master(info))) {
+		/*
+		 * NOTE(review): here ->name points one byte into the
+		 * xstrdup()-ed path while pty_free_fake_reg() releases
+		 * ->name; the result of xstrdup() is not checked either.
+		 * Confirm the intended ownership for this branch.
+		 */
+		new->path = xstrdup(orig->path);
+		new->rfe->name = &new->path[1];
+	} else {
+		char *pos = strrchr(orig->rfe->name, '/');
+		size_t len = strlen(orig->rfe->name) + 1;
+		char *inverted_path = xmalloc(len + 32);
+		size_t slash_at;
+
+		BUG_ON(!pos || !inverted_path);
+
+		/* Only valid once @pos is known to be non-NULL */
+		slash_at = pos - orig->rfe->name;
+
+		/* Take everything up to and including the last slash */
+		memcpy(inverted_path, orig->rfe->name, slash_at + 1);
+		if (subtype == TTY_SUBTYPE_MASTER) {
+			/*
+			 * memcpy() copied no terminator and the buffer
+			 * comes uninitialized from xmalloc(), so cut the
+			 * string here before strcat() looks for its end.
+			 */
+			inverted_path[slash_at + 1] = '\0';
+			strcat(inverted_path, "ptmx");
+		} else {
+			if (slash_at >= 3 && strncmp(&inverted_path[slash_at - 3], "pts", 3))
+				snprintf(&inverted_path[slash_at + 1], 10, "pts/%u",
+					 info->tie->pty->index);
+			else
+				snprintf(&inverted_path[slash_at + 1], 10, "%u",
+					 info->tie->pty->index);
+		}
+
+		new->rfe->name = inverted_path;
+		new->path = &inverted_path[1];
+	}
+
+	return new;
+}
+
+/* Shorthands for pty_alloc_fake_reg() with the wanted subtype fixed */
+#define pty_alloc_fake_master(info) pty_alloc_fake_reg(info, TTY_SUBTYPE_MASTER)
+#define pty_alloc_fake_slave(info) pty_alloc_fake_reg(info, TTY_SUBTYPE_SLAVE)
+
+/*
+ * Release a fake reg-file object: its name, then the object itself,
+ * and reset the caller's pointer to NULL. A NULL *@r is a no-op.
+ */
+static void pty_free_fake_reg(struct reg_file_info **r)
+{
+	struct reg_file_info *fake = *r;
+
+	if (!fake)
+		return;
+
+	xfree(fake->rfe->name);
+	xfree(fake);
+	*r = NULL;
+}
+
+/*
+ * Open the terminal behind @reg_d with @flags plus O_NOCTTY.
+ *
+ * O_NOCTTY is forced so that the terminal is never set as a
+ * control terminal automatically: all ctty magic happens only
+ * in tty_set_sid().
+ */
+static int open_tty_reg(struct file_desc *reg_d, u32 flags)
+{
+	u32 open_flags = flags | O_NOCTTY;
+
+	return open_path(reg_d, do_open_reg_noseek_flags, &open_flags);
+}
+
+/* Path stored in the reg_file_info that embeds descriptor @d. */
+static char *path_from_reg(struct file_desc *d)
+{
+	return container_of(d, struct reg_file_info, d)->path;
+}
+
+/*
+ * Open the ptmx described by @d until the kernel hands out the pty
+ * with the requested @index.
+ *
+ * Up to ARRAY_SIZE(fds) attempts are made with tty_mutex held.
+ * Descriptors with a non-matching index stay open until the search
+ * is over (presumably so the next open yields another index) and
+ * are closed before return.
+ *
+ * Returns the descriptor with the requested index, or -1.
+ */
+static int pty_open_ptmx_index(struct file_desc *d, int index, int flags)
+{
+ int fds[32], i, ret = -1, cur_idx;
+
+ /* All-ones bytes: every slot reads as -1, i.e. nothing to close */
+ memset(fds, 0xff, sizeof(fds));
+
+ mutex_lock(tty_mutex);
+
+ for (i = 0; i < ARRAY_SIZE(fds); i++) {
+ fds[i] = open_tty_reg(d, flags);
+ if (fds[i] < 0) {
+ pr_perror("Can't open %s", path_from_reg(d));
+ break;
+ }
+
+ if (ioctl(fds[i], TIOCGPTN, &cur_idx)) {
+ pr_perror("Can't obtain current index on %s",
+ path_from_reg(d));
+ break;
+ }
+
+ pr_debug("\t\tptmx opened with index %d\n", cur_idx);
+
+ if (cur_idx == index) {
+ pr_info("ptmx opened with index %d\n", cur_idx);
+ ret = fds[i];
+ /* Keep the match out of the cleanup loop below */
+ fds[i] = -1;
+ break;
+ }
+
+ /*
+ * Maybe indices are already borrowed by
+ * someone else, so no need to continue.
+ */
+ if (cur_idx < index && (index - cur_idx) < ARRAY_SIZE(fds))
+ continue;
+
+ pr_err("Unable to open %s with specified index %d\n",
+ path_from_reg(d), index);
+ break;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(fds); i++) {
+ if (fds[i] >= 0)
+ close(fds[i]);
+ }
+
+ mutex_unlock(tty_mutex);
+
+ return ret;
+}
+
+/*
+ * Clear the pty lock on the master descriptor @fd (TIOCSPTLCK
+ * with a zero argument).
+ *
+ * Returns 0 on success, -1 if the ioctl failed.
+ */
+static int unlock_pty(int fd)
+{
+	const int lock = 0;
+
+	/*
+	 * Usually when ptmx opened it gets locked
+	 * by kernel and we need to unlock it to be
+	 * able to connect slave peer.
+	 */
+	if (ioctl(fd, TIOCSPTLCK, &lock)) {
+		pr_err("Unable to unlock pty device via %d\n", fd);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Set the pty lock on the master descriptor @fd (TIOCSPTLCK
+ * with a non-zero argument).
+ *
+ * Returns 0 on success, -1 if the ioctl failed.
+ */
+static int lock_pty(int fd)
+{
+	const int lock = 1;
+	int err = ioctl(fd, TIOCSPTLCK, &lock);
+
+	if (!err)
+		return 0;
+
+	pr_err("Unable to lock pty device via %d\n", fd);
+	return -1;
+}
+
+/*
+ * Issue TIOCSCTTY with argument 1 on @fd.
+ *
+ * Returns 0 on success, -1 if the ioctl failed.
+ */
+static int tty_set_sid(int fd)
+{
+	int err = ioctl(fd, TIOCSCTTY, 1);
+
+	if (!err)
+		return 0;
+
+	pr_perror("Can't set sid on terminal fd %d", fd);
+	return -1;
+}
+
+/*
+ * Set process group @group on the terminal @fd via TIOCSPGRP.
+ *
+ * Returns 0 on success, -1 if the ioctl failed.
+ */
+static int tty_set_prgp(int fd, int group)
+{
+	int err = ioctl(fd, TIOCSPGRP, &group);
+
+	if (!err)
+		return 0;
+
+	pr_perror("Failed to set group %d on %d", group, fd);
+	return -1;
+}
+
+/*
+ * post_open callback of tty descriptors: set up the controlling
+ * terminal described by @d.
+ *
+ * Nothing is done (and 0 is returned) unless @fd is the CTL_TTY_OFF
+ * service descriptor. Otherwise a descriptor to work on is obtained
+ * (inherited for external ttys, opened via a fake slave reg-file for
+ * ptys, opened via the tty's own reg-file for the rest), the session
+ * and the process group from the image are applied to it, and both
+ * this descriptor and @fd get closed.
+ *
+ * Returns 0 on success, -1 on failure. @fd is closed on every path
+ * that got past the service descriptor check.
+ */
+static int tty_restore_ctl_terminal(struct file_desc *d, int fd)
+{
+	struct tty_info *info = container_of(d, struct tty_info, d);
+	struct tty_driver *driver = info->driver;
+	struct reg_file_info *fake = NULL;
+	struct file_desc *slave_d;
+	int slave = -1, ret = -1, index = -1;
+
+	if (!is_service_fd(fd, CTL_TTY_OFF))
+		return 0;
+
+	if (driver->type == TTY_TYPE__EXT_TTY) {
+		slave = -1;
+		/*
+		 * Leave via the common error path so that @fd is
+		 * closed the same way as on any other failure.
+		 */
+		if (!inherited_fd(&info->d, &slave) && slave < 0)
+			goto err;
+		goto out;
+	}
+	if (driver->img_get_index)
+		index = driver->img_get_index(info);
+	else
+		index = driver->index;
+
+	if (is_pty(info->driver)) {
+		fake = pty_alloc_fake_slave(info);
+		if (!fake)
+			goto err;
+
+		slave_d = &fake->d;
+	} else
+		slave_d = info->reg_d;
+
+	slave = open_tty_reg(slave_d, O_RDONLY);
+	if (slave < 0) {
+		pr_perror("Can't open %s", path_from_reg(slave_d));
+		goto err;
+	}
+
+out:
+	pr_info("Restore session %d by %d tty (index %d)\n",
+		 info->tie->sid, (int)getpid(), index);
+
+	ret = tty_set_sid(slave);
+	if (!ret)
+		ret = tty_set_prgp(slave, info->tie->pgrp);
+
+	close(slave);
+err:
+	pty_free_fake_reg(&fake);
+	close(fd);
+	return ret;
+}
+
+/*
+ * Tell whether @info is treated as a master peer.
+ *
+ * That is the case for drivers of the master subtype, for console,
+ * current-tty and external terminals, and for virtual terminals
+ * when opts.shell_job is not set.
+ */
+static bool tty_is_master(struct tty_info *info)
+{
+	struct tty_driver *drv = info->driver;
+
+	if (drv->subtype == TTY_SUBTYPE_MASTER)
+		return true;
+
+	switch (drv->type) {
+	case TTY_TYPE__VT:
+		return !opts.shell_job;
+	case TTY_TYPE__CONSOLE:
+	case TTY_TYPE__CTTY:
+	case TTY_TYPE__EXT_TTY:
+		return true;
+	}
+
+	return false;
+}
+
+/* A terminal whose image entry carries no termios is a hung up one. */
+static bool tty_is_hung(struct tty_info *info)
+{
+	return !info->tie->termios;
+}
+
+/*
+ * Check tty_active_pairs for the id next to the one of @info:
+ * the id below for a master, the id above otherwise.
+ */
+static bool tty_has_active_pair(struct tty_info *info)
+{
+	int delta;
+
+	if (tty_is_master(info))
+		delta = -1;
+	else
+		delta = +1;
+
+	return test_bit(info->tfe->tty_info_id + delta, tty_active_pairs);
+}
+
+/*
+ * Log a one-line summary of @info prefixed with @prefix. The index
+ * is taken from the driver's img_get_index() callback when there is
+ * one and from the driver's fixed index otherwise.
+ */
+static void tty_show_pty_info(char *prefix, struct tty_info *info)
+{
+	struct tty_driver *driver = info->driver;
+	int index = driver->img_get_index ? driver->img_get_index(info)
+					  : driver->index;
+
+	pr_info("%s driver %s id %#x index %d (master %d sid %d pgrp %d inherit %d)\n",
+		prefix, driver->name, info->tfe->id, index,
+		tty_is_master(info), info->tie->sid, info->tie->pgrp, info->inherit);
+}
+
+/*
+ * Terminal parameters handed to do_restore_tty_parms(). The @has
+ * mask tells which of @tl, @t and @w are filled in and have to be
+ * applied; @tty_id is only used in the error message.
+ */
+struct tty_parms {
+ int tty_id;
+ unsigned has;
+#define HAS_TERMIOS_L 0x1
+#define HAS_TERMIOS 0x2
+#define HAS_WINS 0x4
+ struct termios tl;
+ struct termios t;
+ struct winsize w;
+};
+
+/*
+ * Apply the parameters from the struct tty_parms in @arg to the
+ * terminal @fd: locked termios, termios and window size, each one
+ * only if the matching HAS_* bit is set. @pid is not used.
+ *
+ * Returns 0 on success, -1 on the first ioctl that failed.
+ */
+static int do_restore_tty_parms(void *arg, int fd, pid_t pid)
+{
+ struct tty_parms *p = arg;
+
+ /*
+ * Only locked termios need CAP_SYS_ADMIN, but we
+ * restore them all here, since the regular termios
+ * restore is affected by locked and thus we would
+ * have to do synchronous usernsd call which is not
+ * nice.
+ *
+ * Window size is restored here as it might depend
+ * on termios too. Just to be on the safe side.
+ */
+
+ if ((p->has & HAS_TERMIOS_L) &&
+ ioctl(fd, TIOCSLCKTRMIOS, &p->tl) < 0)
+ goto err;
+
+ if ((p->has & HAS_TERMIOS) &&
+ ioctl(fd, TCSETS, &p->t) < 0)
+ goto err;
+
+ if ((p->has & HAS_WINS) &&
+ ioctl(fd, TIOCSWINSZ, &p->w) < 0)
+ goto err;
+
+ return 0;
+
+err:
+ pr_perror("Can't set tty params on %d", p->tty_id);
+ return -1;
+}
+
+/*
+ * Collect the terminal parameters present in the image entry of
+ * @info and pass them to do_restore_tty_parms() through
+ * userns_call() to have them applied to @fd.
+ *
+ * Returns whatever userns_call() returns.
+ */
+static int restore_tty_params(int fd, struct tty_info *info)
+{
+	struct tty_parms p;
+
+	/*
+	 * It's important to zeroify termios
+	 * because it contain @c_cc array which
+	 * is bigger than TERMIOS_NCC. Same applies
+	 * to winsize usage, we can't guarantee the
+	 * structure taken from the system headers will
+	 * never be extended.
+	 *
+	 * The whole structure is cleared at once: all
+	 * sizeof(p) bytes are handed over to userns_call()
+	 * below, including members absent from the image,
+	 * and those must not carry stack garbage.
+	 */
+	memzero(&p, sizeof(p));
+	p.tty_id = info->tfe->id;
+
+	if (info->tie->termios_locked) {
+		p.has |= HAS_TERMIOS_L;
+		termios_copy(&p.tl, info->tie->termios_locked);
+	}
+
+	if (info->tie->termios) {
+		p.has |= HAS_TERMIOS;
+		termios_copy(&p.t, info->tie->termios);
+	}
+
+	if (info->tie->winsize) {
+		p.has |= HAS_WINS;
+		winsize_copy(&p.w, info->tie->winsize);
+	}
+
+	return userns_call(do_restore_tty_parms, UNS_ASYNC, &p, sizeof(p), fd);
+}
+
+/*
+ * Open every peer linked on the sibling list of @info, restore its
+ * terminal parameters and send the descriptor to the task found via
+ * file_master() over a datagram unix socket created for that.
+ *
+ * All siblings must be non-master peers (BUG_ON otherwise).
+ *
+ * Returns 0 on success, -1 on the first failure.
+ */
+static int pty_open_slaves(struct tty_info *info)
+{
+ int sock = -1, fd = -1, ret = -1;
+ struct fdinfo_list_entry *fle;
+ struct tty_info *slave;
+
+ sock = socket(PF_UNIX, SOCK_DGRAM, 0);
+ if (sock < 0) {
+ pr_perror("Can't create socket");
+ goto err;
+ }
+
+ list_for_each_entry(slave, &info->sibling, sibling) {
+ BUG_ON(tty_is_master(slave));
+
+ fd = open_tty_reg(slave->reg_d, slave->tfe->flags);
+ if (fd < 0) {
+ pr_perror("Can't open slave %s", path_from_reg(slave->reg_d));
+ goto err;
+ }
+
+ if (restore_tty_params(fd, slave))
+ goto err;
+
+ fle = file_master(&slave->d);
+
+ pr_debug("send slave %#x fd %d connected on %s (pid %d)\n",
+ slave->tfe->id, fd, path_from_reg(slave->reg_d), fle->pid);
+
+ if (send_fd_to_peer(fd, fle, sock)) {
+ pr_perror("Can't send file descriptor");
+ goto err;
+ }
+
+ /* Sent away, the local copy is not needed anymore */
+ close(fd);
+ fd = -1;
+ }
+ ret = 0;
+
+err:
+ close_safe(&fd);
+ close_safe(&sock);
+ return ret;
+}
+
+/*
+ * Obtain the terminal descriptor for @info from another task: it is
+ * read with recv_fd() from the descriptor recorded in the master
+ * fdinfo entry, which is closed right after. Owner and flags from
+ * the image are then applied with rst_file_params().
+ *
+ * Returns the received descriptor, or -1 if receiving failed.
+ */
+static int receive_tty(struct tty_info *info)
+{
+ struct fdinfo_list_entry *fle;
+ int fd;
+
+ fle = file_master(&info->d);
+ pr_info("\tWaiting tty fd %d (pid %d)\n", fle->fe->fd, fle->pid);
+
+ fd = recv_fd(fle->fe->fd);
+ close(fle->fe->fd);
+ if (fd < 0) {
+ /* NOTE(review): this prints the recv_fd() result, not the fd waited on */
+ pr_err("Can't get fd %d\n", fd);
+ return -1;
+ }
+
+ /* presumably close_safe() leaves a negative value in fd -- confirm */
+ if (rst_file_params(fd, info->tfe->fown, info->tfe->flags))
+ close_safe(&fd);
+
+ return fd;
+}
+
+/*
+ * Open a slave peer which has no master peer in the image.
+ *
+ * The descriptor is either duplicated from the SELF_STDIN_OFF
+ * service descriptor (when the slave is marked as inherited) or
+ * opened after a temporary fake master with the same pty index has
+ * been created and unlocked. Terminal parameters are restored, the
+ * process group is adjusted for an inherited terminal and sibling
+ * slaves are opened and sent to their owners.
+ *
+ * @d is not used. Returns the slave descriptor or -1 on failure;
+ * the fake master is closed and freed in either case.
+ */
+static int pty_open_unpaired_slave(struct file_desc *d, struct tty_info *slave)
+{
+	struct reg_file_info *fake = NULL;
+	int master = -1, ret = -1, fd = -1;
+
+	/*
+	 * We may have 2 cases here: the slave either need to
+	 * be inherited, either it requires a fake master.
+	 */
+
+	if (likely(slave->inherit)) {
+		fd = dup(get_service_fd(SELF_STDIN_OFF));
+		if (fd < 0) {
+			pr_perror("Can't dup SELF_STDIN_OFF");
+			return -1;
+		}
+		pr_info("Migrated slave peer %x -> to fd %d\n",
+			slave->tfe->id, fd);
+	} else {
+		fake = pty_alloc_fake_master(slave);
+		if (!fake)
+			goto err;
+		master = pty_open_ptmx_index(&fake->d, slave->tie->pty->index, O_RDONLY);
+		if (master < 0) {
+			pr_perror("Can't open fake %x (index %d)",
+				  slave->tfe->id, slave->tie->pty->index);
+			goto err;
+		}
+
+		unlock_pty(master);
+
+		fd = open_tty_reg(slave->reg_d, slave->tfe->flags);
+		if (fd < 0) {
+			pr_perror("Can't open slave %s", path_from_reg(slave->reg_d));
+			goto err;
+		}
+
+	}
+
+	if (restore_tty_params(fd, slave))
+		goto err;
+
+	/*
+	 * If tty is migrated we need to set its group
+	 * to the parent group, because signals on key
+	 * presses are delivered to a group of terminal.
+	 *
+	 * Note, at this point the group/session should
+	 * be already restored properly thus we can simply
+	 * use syscalls instead of lookup via process tree.
+	 */
+	if (likely(slave->inherit)) {
+		/*
+		 * The restoration procedure only works if we're
+		 * migrating not a session leader, otherwise it's
+		 * not allowed to restore a group and one better to
+		 * checkpoint complete process tree together with
+		 * the process which keeps the master peer.
+		 */
+		if (root_item->sid != root_item->pid.virt) {
+			pr_debug("Restore inherited group %d\n",
+				 getpgid(getppid()));
+			if (tty_set_prgp(fd, getpgid(getppid())))
+				goto err;
+		}
+	}
+
+	if (pty_open_slaves(slave))
+		goto err;
+
+	/* Success: hand the descriptor over, keep it out of cleanup */
+	ret = fd;
+	fd = -1;
+err:
+	close_safe(&master);
+	close_safe(&fd);
+	pty_free_fake_reg(&fake);
+	return ret;
+}
+
+/*
+ * Open the master peer described by @info with the pty index from
+ * the image, unlock it, restore terminal parameters and packet
+ * mode, open and send out the sibling slaves, and finally lock the
+ * pty again if the image says it was locked.
+ *
+ * Returns the master descriptor or -1 on failure.
+ */
+static int pty_open_ptmx(struct tty_info *info)
+{
+ int master = -1;
+
+ master = pty_open_ptmx_index(info->reg_d, info->tie->pty->index, info->tfe->flags);
+ if (master < 0) {
+ pr_perror("Can't open %x (index %d)",
+ info->tfe->id, info->tie->pty->index);
+ return -1;
+ }
+
+ /* The result is not checked here */
+ unlock_pty(master);
+
+ if (restore_tty_params(master, info))
+ goto err;
+
+ if (info->tie->packet_mode) {
+ int packet_mode = 1;
+
+ if (ioctl(master, TIOCPKT, &packet_mode) < 0) {
+ pr_perror("Can't set packed mode on %x",
+ info->tfe->id);
+ goto err;
+ }
+ }
+
+ if (pty_open_slaves(info))
+ goto err;
+
+ /* Re-lock only after the slaves have been opened above */
+ if (info->tie->locked)
+ lock_pty(master);
+
+ return master;
+err:
+ close_safe(&master);
+ return -1;
+}
+
+/*
+ * Open the terminal of @info through its reg-file descriptor and
+ * restore the terminal parameters on it.
+ *
+ * Returns the opened descriptor or -1 on failure.
+ */
+static int open_simple_tty(struct tty_info *info)
+{
+	int fd = open_tty_reg(info->reg_d, info->tfe->flags);
+
+	if (fd < 0) {
+		pr_perror("Can't open %s %x",
+			  info->driver->name, info->tfe->id);
+		return -1;
+	}
+
+	if (!restore_tty_params(fd, info))
+		return fd;
+
+	close_safe(&fd);
+	return -1;
+}
+
+/*
+ * Get the descriptor of an external terminal via inherited_fd()
+ * and restore the terminal parameters on it.
+ *
+ * Returns the descriptor or -1 on failure.
+ */
+static int open_ext_tty(struct tty_info *info)
+{
+ int fd = -1;
+
+ /*
+ * NOTE(review): with '&&' a true result of inherited_fd()
+ * paired with a negative fd is not rejected here -- confirm
+ * against inherited_fd() whether that combination can happen.
+ */
+ if (!inherited_fd(&info->d, &fd) && fd < 0)
+ return -1;
+
+ if (restore_tty_params(fd, info)) {
+ close(fd);
+ return -1;
+ }
+
+ return fd;
+}
+
+/*
+ * open callback of tty descriptors.
+ *
+ * A terminal not marked with ->create is received from another
+ * task. Otherwise a pty which is not a master goes through
+ * pty_open_unpaired_slave() and everything else is opened by the
+ * driver's own open callback.
+ */
+static int tty_open(struct file_desc *d)
+{
+	struct tty_info *info = container_of(d, struct tty_info, d);
+
+	tty_show_pty_info("open", info);
+
+	if (info->create) {
+		if (!is_pty(info->driver) || tty_is_master(info))
+			return info->driver->open(info);
+
+		return pty_open_unpaired_slave(d, info);
+	}
+
+	return receive_tty(info);
+}
+
+/*
+ * want_transport callback: 1 for terminals which are not created
+ * by their owner (->create is unset), 0 for the ones which are.
+ */
+static int tty_transport(FdinfoEntry *fe, struct file_desc *d)
+{
+	struct tty_info *info = container_of(d, struct tty_info, d);
+
+	return info->create ? 0 : 1;
+}
+
+/*
+ * collect_fd callback: queue @fle on one of the lists of @ri
+ * depending on the kind of terminal behind @d.
+ */
+static void tty_collect_fd(struct file_desc *d, struct fdinfo_list_entry *fle,
+			    struct rst_info *ri)
+{
+	struct tty_info *info = container_of(d, struct tty_info, d);
+	struct list_head *queue;
+
+	/*
+	 * Unix98 pty slave peers require the master peers being
+	 * opened before them. In turn, current ttys should be opened
+	 * after the slave peers so the session must already exist.
+	 */
+
+	if (info->driver->type == TTY_TYPE__CTTY)
+		queue = &ri->tty_ctty;
+	else if (tty_is_master(info))
+		queue = &ri->fds;
+	else
+		queue = &ri->tty_slaves;
+
+	list_add_tail(&fle->ps_list, queue);
+}
+
+/*
+ * name callback: print the "tty[rdev:dev]" id of the terminal
+ * behind @d (both numbers in hex) into @buf of size @s and
+ * return @buf.
+ */
+static char *tty_d_name(struct file_desc *d, char *buf, size_t s)
+{
+	TtyInfoEntry *tie = container_of(d, struct tty_info, d)->tie;
+
+	snprintf(buf, s, "tty[%x:%x]", tie->rdev, tie->dev);
+	return buf;
+}
+
+/*
+ * Callbacks of tty file descriptors; collect_one_tty() registers
+ * every collected terminal with them.
+ */
+static struct file_desc_ops tty_desc_ops = {
+ .type = FD_TYPES__TTY,
+ .open = tty_open,
+ .post_open = tty_restore_ctl_terminal,
+ .want_transport = tty_transport,
+ .collect_fd = tty_collect_fd,
+ .name = tty_d_name,
+};
+
+/*
+ * Return the first item met by for_each_pstree_item() whose
+ * session id equals @sid, or NULL if there is none.
+ */
+static struct pstree_item *find_first_sid(int sid)
+{
+	struct pstree_item *pi;
+
+	for_each_pstree_item(pi) {
+		if (pi->sid == sid)
+			return pi;
+	}
+
+	return NULL;
+}
+
+/*
+ * Decide how the session of the terminal @info gets restored.
+ *
+ * Returns 0 when nothing has to be arranged here or when the
+ * terminal got marked for inheritance (opts.shell_job), the result
+ * of prepare_ctl_tty() when a session leader was found for it, and
+ * -1 when no way to restore the terminal was found.
+ */
+static int tty_find_restoring_task(struct tty_info *info)
+{
+ struct pstree_item *item;
+
+ /*
+ * The overall scenario is the following (note
+ * we might have corrupted image so don't believe
+ * anything).
+ *
+ * SID is present on a peer
+ * ------------------------
+ *
+ * - if it's master peer and we have as well a slave
+ * peer then prefer restore controlling terminal
+ * via slave peer
+ *
+ * - if it's master peer without slave, there must be
+ * a SID leader who will be restoring the peer
+ *
+ * - if it's a slave peer and no session leader found
+ * than we need an option to inherit terminal
+ *
+ * No SID present on a peer
+ * ------------------------
+ *
+ * - if it's a master peer than we are in good shape
+ * and continue in a normal way, we're the peer keepers
+ *
+ * - if it's a slave peer and no appropriate master peer
+ * found we need an option to inherit terminal
+ *
+ * In any case if it's hungup peer, then we jump out
+ * early since it will require fake master peer and
+ * rather non-usable anyway.
+ */
+
+ if (tty_is_hung(info)) {
+ pr_debug("Hungup terminal found id %x\n", info->tfe->id);
+ return 0;
+ }
+
+ /*
+ * Current tty should be skipped here: the
+ * underlied _real_ pty (or anything else
+ * driver in future) should restore the
+ * session.
+ */
+ if (info->driver->type == TTY_TYPE__CTTY)
+ return 0;
+
+ if (info->tie->sid) {
+ if (!tty_is_master(info)) {
+ if (tty_has_active_pair(info))
+ return 0;
+ else
+ goto shell_job;
+ }
+
+ /*
+ * Restoring via leader only. All files
+ * opened over same real tty get propagated
+ * automatically by kernel itself.
+ */
+ if (info->ctl_tty != info)
+ return 0;
+
+ /*
+ * Find out the task which is session leader
+ * and it can restore the controlling terminal
+ * for us.
+ */
+ item = find_first_sid(info->tie->sid);
+ if (item && item->pid.virt == item->sid) {
+ pr_info("Set a control terminal %x to %d\n",
+ info->tfe->id, info->tie->sid);
+ return prepare_ctl_tty(item->pid.virt,
+ rsti(item),
+ info->tfe->id);
+ }
+
+ goto notask;
+ } else {
+ if (tty_is_master(info))
+ return 0;
+ if (tty_has_active_pair(info))
+ return 0;
+ }
+
+ /* Only slave peers without an active master peer get here */
+shell_job:
+ if (opts.shell_job) {
+ pr_info("Inherit terminal for id %x\n", info->tfe->id);
+ info->inherit = true;
+ return 0;
+ }
+
+notask:
+ pr_err("No task found with sid %d\n", info->tie->sid);
+ return -1;
+}
+
+/*
+ * For every non-master terminal on all_ttys whose sibling list has
+ * no master peer, choose one peer among the terminal and its
+ * siblings (by fdinfo_rst_prio() on their master fdinfo entries)
+ * and mark it with ->create, so that it opens the terminal itself.
+ *
+ * Always returns 0.
+ */
+static int tty_setup_orphan_slavery(void)
+{
+ struct tty_info *info, *peer, *m;
+
+ list_for_each_entry(info, &all_ttys, list) {
+ struct fdinfo_list_entry *a, *b;
+ bool has_leader = false;
+
+ if (tty_is_master(info))
+ continue;
+
+ /* Start with the list head itself as the candidate */
+ a = file_master(&info->d);
+ m = info;
+
+ list_for_each_entry(peer, &info->sibling, sibling) {
+ if (tty_is_master(peer)) {
+ has_leader = true;
+ break;
+ }
+
+ /*
+ * Same check as in pipes and files -- need to
+ * order slave ends so that they do not dead lock
+ * waiting for each other.
+ */
+ b = file_master(&peer->d);
+ if (fdinfo_rst_prio(b, a)) {
+ a = b;
+ m = peer;
+ }
+ }
+
+ if (!has_leader) {
+ m->create = true;
+ pr_debug("Found orphan slave fake leader (%#x)\n",
+ m->tfe->id);
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Arrange the collected terminals for restore:
+ *
+ * - choose, per session, the master terminal which sets up the
+ * controlling terminal and point ->ctl_tty of the other
+ * terminals of that session to it;
+ * - find the restoring task for every terminal and move ptys
+ * sharing the pty index from all_ttys onto the sibling list
+ * of the first of them;
+ * - log the resulting layout and handle slaves left without
+ * a master via tty_setup_orphan_slavery().
+ *
+ * Returns 0 on success, -1 if a restoring task could not be found.
+ */
+int tty_setup_slavery(void)
+{
+ struct tty_info *info, *peer, *m;
+
+ /*
+ * The image may carry several terminals opened
+ * belonging to the same session, so choose the
+ * leader which gonna be setting up the controlling
+ * terminal.
+ */
+ list_for_each_entry(info, &all_ttys, list) {
+ if (!info->tie->sid || info->ctl_tty ||
+ info->driver->type == TTY_TYPE__CTTY)
+ continue;
+
+ if (!tty_is_master(info))
+ continue;
+
+ info->ctl_tty = info;
+ pr_debug("ctl tty leader %x\n", info->tfe->id);
+ /* Scan only the entries which follow @info on the list */
+ peer = info;
+ list_for_each_entry_safe_continue(peer, m, &all_ttys, list) {
+ if (!peer->tie->sid || peer->ctl_tty ||
+ peer->driver->type == TTY_TYPE__CTTY)
+ continue;
+ if (peer->tie->sid == info->tie->sid) {
+ pr_debug(" `- slave %x\n", peer->tfe->id);
+ peer->ctl_tty = info;
+ }
+ }
+ }
+
+ list_for_each_entry(info, &all_ttys, list) {
+ if (tty_find_restoring_task(info))
+ return -1;
+ if (!is_pty(info->driver))
+ continue;
+
+ peer = info;
+ list_for_each_entry_safe_continue(peer, m, &all_ttys, list) {
+ if (!is_pty(peer->driver))
+ continue;
+ if (peer->tie->pty->index != info->tie->pty->index)
+ continue;
+
+ if (tty_find_restoring_task(peer))
+ return -1;
+
+ /* Same pty index: the peer becomes a sibling of @info */
+ list_add(&peer->sibling, &info->sibling);
+ list_del(&peer->list);
+ }
+ }
+
+ /*
+ * Print out information about peers.
+ */
+ list_for_each_entry(info, &all_ttys, list) {
+ tty_show_pty_info("head", info);
+ list_for_each_entry(peer, &info->sibling, sibling)
+ tty_show_pty_info(" `- sibling", peer);
+ }
+
+ return tty_setup_orphan_slavery();
+}
+
+/*
+ * Check that the termios entry @e, if there is one, carries at
+ * least TERMIOS_NCC control characters. @id is used for the error
+ * message only.
+ *
+ * Returns 0 if @e is NULL or fine, -1 otherwise.
+ */
+static int verify_termios(u32 id, TermiosEntry *e)
+{
+	if (!e || e->n_c_cc >= TERMIOS_NCC)
+		return 0;
+
+	pr_err("pty ID %#x n_c_cc (%d) has wrong value\n",
+	       id, (int)e->n_c_cc);
+	return -1;
+}
+
+/*
+ * Combine the "is missing" tests for termios, locked termios and
+ * window size of the image entry of @p with the logical operator
+ * @op: '||' tells whether any of them is missing, '&&' whether
+ * all of them are.
+ */
+#define term_opts_missing_cmp(p, op) \
+ (!(p)->tie->termios op \
+ !(p)->tie->termios_locked op \
+ !(p)->tie->winsize)
+
+#define term_opts_missing_any(p) \
+ term_opts_missing_cmp(p, ||)
+
+#define term_opts_missing_all(p) \
+ term_opts_missing_cmp(p, &&)
+
+/*
+ * Sanity checks for a collected terminal: a driver must be set,
+ * terminal parameters must be complete (or, for a slave peer,
+ * completely absent), termios entries must pass verify_termios()
+ * and the tty info id must not exceed the MAX_TTYS based limit.
+ *
+ * Returns 0 if @info looks sane, -1 otherwise.
+ */
+static int verify_info(struct tty_info *info)
+{
+ if (!info->driver) {
+ pr_err("Unknown driver master peer %x\n", info->tfe->id);
+ return -1;
+ }
+
+ /*
+ * Master peer must have all parameters present,
+ * while slave peer must have either all parameters present
+ * or don't have them at all.
+ */
+ if (term_opts_missing_any(info)) {
+ if (tty_is_master(info)) {
+ pr_err("Corrupted master peer %x\n", info->tfe->id);
+ return -1;
+ } else if (!term_opts_missing_all(info)) {
+ pr_err("Corrupted slave peer %x\n", info->tfe->id);
+ return -1;
+ }
+ }
+
+ if (verify_termios(info->tfe->id, info->tie->termios_locked) ||
+ verify_termios(info->tfe->id, info->tie->termios))
+ return -1;
+
+ /*
+ * NOTE(review): this failure is silent, and whether the bound
+ * should be '>' or '>=' depends on the size of the tty bitmaps,
+ * which are declared outside this part of the file -- confirm.
+ */
+ if (info->tie->termios && info->tfe->tty_info_id > (MAX_TTYS << 1))
+ return -1;
+
+ return 0;
+}
+
+/*
+ * Search all_tty_info_entries for the image entry with the given
+ * @id. Returns the entry or NULL if there is no such one.
+ */
+static TtyInfoEntry *lookup_tty_info_entry(u32 id)
+{
+	struct tty_info_entry *cur;
+
+	list_for_each_entry(cur, &all_tty_info_entries, list) {
+		if (cur->tie->id != id)
+			continue;
+
+		return cur->tie;
+	}
+
+	return NULL;
+}
+
+/*
+ * Collect callback for tty info image entries: remember the
+ * message in @obj, make sure pty data is present for pty entries
+ * and absent for all the other known types, then put the entry
+ * on all_tty_info_entries.
+ *
+ * Returns 0 on success, -1 for an inconsistent or unknown entry.
+ */
+static int collect_one_tty_info_entry(void *obj, ProtobufCMessage *msg)
+{
+	struct tty_info_entry *entry = obj;
+
+	entry->tie = pb_msg(msg, TtyInfoEntry);
+
+	switch (entry->tie->type) {
+	case TTY_TYPE__PTY:
+		if (entry->tie->pty)
+			break;
+
+		pr_err("No PTY data found (id %x), corrupted image?\n",
+		       entry->tie->id);
+		return -1;
+	case TTY_TYPE__CTTY:
+	case TTY_TYPE__CONSOLE:
+	case TTY_TYPE__VT:
+	case TTY_TYPE__EXT_TTY:
+		if (!entry->tie->pty)
+			break;
+
+		pr_err("PTY data found (id %x), corrupted image?\n",
+		       entry->tie->id);
+		return -1;
+	default:
+		pr_err("Unexpected TTY type %d (id %x)\n",
+		       entry->tie->type, entry->tie->id);
+		return -1;
+	}
+
+	INIT_LIST_HEAD(&entry->list);
+	list_add(&entry->list, &all_tty_info_entries);
+
+	return 0;
+}
+
+/*
+ * Collect description for the tty info image: entries are handled
+ * by collect_one_tty_info_entry() with a struct tty_info_entry of
+ * private data each.
+ */
+struct collect_image_info tty_info_cinfo = {
+ .fd_type = CR_FD_TTY_INFO,
+ .pb_type = PB_TTY_INFO,
+ .priv_size = sizeof(struct tty_info_entry),
+ .collect = collect_one_tty_info_entry,
+};
+
+/*
+ * Collect callback for tty file image entries.
+ *
+ * Binds the file entry to its tty info entry and driver, verifies
+ * the result, finds (or, for ptys, generates) the reg-file
+ * descriptor used to open the terminal, marks ptys which have
+ * termios as active and registers the terminal both on all_ttys
+ * and as a file descriptor.
+ *
+ * Returns the result of file_desc_add() on success, -1 on error.
+ */
+static int collect_one_tty(void *obj, ProtobufCMessage *msg)
+{
+	struct tty_info *info = obj;
+
+	info->tfe = pb_msg(msg, TtyFileEntry);
+
+	info->tie = lookup_tty_info_entry(info->tfe->tty_info_id);
+	if (!info->tie) {
+		pr_err("No tty-info-id %x found on id %x\n",
+		       info->tfe->tty_info_id, info->tfe->id);
+		return -1;
+	}
+
+	INIT_LIST_HEAD(&info->sibling);
+	info->driver = get_tty_driver(info->tie->rdev, info->tie->dev);
+	if (info->driver == NULL) {
+		pr_err("Unable to find a tty driver\n");
+		return -1;
+	}
+	info->create = tty_is_master(info);
+	info->inherit = false;
+	info->ctl_tty = NULL;
+
+	if (verify_info(info))
+		return -1;
+
+	/*
+	 * The image might have no reg file record in old CRIU, so
+	 * lets don't fail for a while. After a couple of releases
+	 * simply require the record to present.
+	 */
+	info->reg_d = try_collect_special_file(info->tfe->id, 1);
+	if (!info->reg_d) {
+		if (is_pty(info->driver)) {
+			info->reg_d = pty_alloc_reg(info, true);
+			if (!info->reg_d) {
+				pr_err("Can't generate new reg descriptor for id %#x\n",
+				       info->tfe->id);
+				return -1;
+			}
+		} else if (info->driver->type != TTY_TYPE__EXT_TTY) {
+			/*
+			 * Must be an 'else' branch: a pty which just
+			 * got its descriptor generated above is fine.
+			 * Only external ttys may go without one.
+			 */
+			pr_err("No reg_d descriptor for id %#x\n", info->tfe->id);
+			return -1;
+		}
+	}
+
+	/*
+	 * The tty peers which have no @termios are hung up,
+	 * so don't mark them as active, we create them with
+	 * faked master and they are rather a rudiment which
+	 * can't be used. Most likely they appear if a user has
+	 * dumped program when it was closing a peer.
+	 */
+	if (is_pty(info->driver) && info->tie->termios)
+		tty_test_and_set(info->tfe->tty_info_id, tty_active_pairs);
+
+	pr_info("Collected tty ID %#x (%s)\n", info->tfe->id, info->driver->name);
+
+	list_add(&info->list, &all_ttys);
+	return file_desc_add(&info->d, info->tfe->id, &tty_desc_ops);
+}
+
+/*
+ * Collect description for the tty files image: entries are handled
+ * by collect_one_tty() with a struct tty_info of private data each.
+ */
+struct collect_image_info tty_cinfo = {
+ .fd_type = CR_FD_TTY_FILES,
+ .pb_type = PB_TTY_FILE,
+ .priv_size = sizeof(struct tty_info),
+ .collect = collect_one_tty,
+};
+
+/*
+ * Make sure the ttys we're dumping do belong our process tree.
+ *
+ * Every entry on all_ttys is freed along the way, no matter
+ * whether the check passes. Returns 0 if all the sessions are
+ * fine (or opts.shell_job is set), -1 otherwise.
+ */
+int dump_verify_tty_sids(void)
+{
+ struct tty_dump_info *dinfo, *n;
+ int ret = 0;
+
+ /*
+ * There might be a cases where we get sid/pgid on
+ * slave peer. For example the application is running
+ * with redirection and we're migrating shell job.
+ *
+ * # ./app < /dev/zero > /dev/zero &2>1
+ *
+ * Which produce a tree like
+ * PID PPID PGID SID
+ * root 23786 23784 23786 23786 pts/0 \_ -bash
+ * root 24246 23786 24246 23786 pts/0 \_ ./app
+ *
+ * And the application goes background, then we dump
+ * it from the same shell.
+ *
+ * In this case we simply zap sid/pgid and inherit
+ * the peer from the current terminal on restore.
+ */
+ list_for_each_entry_safe(dinfo, n, &all_ttys, list) {
+ /* After the first error entries are only freed, not checked */
+ if (!ret && dinfo->sid) {
+ struct pstree_item *item = find_first_sid(dinfo->sid);
+
+ if (!item || item->pid.virt != dinfo->sid) {
+ if (!opts.shell_job) {
+ pr_err("Found dangling tty with sid %d pgid %d (%s) on peer fd %d.\n",
+ dinfo->sid, dinfo->pgrp,
+ dinfo->driver->name, dinfo->fd);
+ /*
+ * First thing people do with criu is dump smth
+ * run from shell. This is typical pitfall, warn
+ * user about it explicitly.
+ */
+ pr_msg("Task attached to shell terminal. "
+ "Consider using --" OPT_SHELL_JOB " option. "
+ "More details on http://criu.org/Simple_loop\n");
+ ret = -1;
+ }
+ }
+ }
+ xfree(dinfo);
+ }
+
+ return ret;
+}
+
+/*
+ * Write the tty info image entry @id for the terminal opened as
+ * @lfd.
+ *
+ * Session, group and state flags come from the parasite
+ * (parasite_dump_tty()), device numbers from @p, the pty @index
+ * from the caller. Termios, locked termios and window size are
+ * read from @lfd and written as well, unless the terminal is hung
+ * up, in which case only the basic information is written.
+ *
+ * A tty_dump_info record is queued on all_ttys for each call.
+ *
+ * Returns the result of pb_write_one() on success, -1 on error.
+ */
+static int dump_tty_info(int lfd, u32 id, const struct fd_parms *p, struct tty_driver *driver, int index)
+{
+ TtyInfoEntry info = TTY_INFO_ENTRY__INIT;
+ TermiosEntry termios = TERMIOS_ENTRY__INIT;
+ TermiosEntry termios_locked = TERMIOS_ENTRY__INIT;
+ WinsizeEntry winsize = WINSIZE_ENTRY__INIT;
+ TtyPtyEntry pty = TTY_PTY_ENTRY__INIT;
+ struct parasite_tty_args *pti;
+ struct tty_dump_info *dinfo;
+
+ struct termios t;
+ struct winsize w;
+
+ int ret = -1;
+
+ /*
+ * Make sure the structures the system provides us
+ * correlates well with protobuf templates.
+ */
+ BUILD_BUG_ON(ARRAY_SIZE(t.c_cc) < TERMIOS_NCC);
+ BUILD_BUG_ON(sizeof(termios.c_cc) != sizeof(void *));
+ BUILD_BUG_ON((sizeof(termios.c_cc) * TERMIOS_NCC) < sizeof(t.c_cc));
+
+ pti = parasite_dump_tty(p->ctl, p->fd, driver->type);
+ if (!pti)
+ return -1;
+
+ dinfo = xmalloc(sizeof(*dinfo));
+ if (!dinfo)
+ return -1;
+
+ dinfo->id = id;
+ dinfo->sid = pti->sid;
+ dinfo->pgrp = pti->pgrp;
+ dinfo->fd = p->fd;
+ dinfo->driver = driver;
+
+ /* Entries on this list are released in dump_verify_tty_sids() */
+ list_add_tail(&dinfo->list, &all_ttys);
+
+ info.id = id;
+ info.sid = pti->sid;
+ info.pgrp = pti->pgrp;
+ info.rdev = p->stat.st_rdev;
+ info.dev = p->stat.st_dev;
+ info.has_dev = true;
+ info.locked = pti->st_lock;
+ info.exclusive = pti->st_excl;
+ info.packet_mode = pti->st_pckt;
+
+ info.type = driver->type;
+ if (info.type == TTY_TYPE__PTY) {
+ info.pty = &pty;
+ pty.index = index;
+ }
+
+ /*
+ * Nothing we can do on hanging up terminal,
+ * just write out minimum information we can
+ * gather.
+ */
+ if (pti->hangup)
+ return pb_write_one(img_from_set(glob_imgset, CR_FD_TTY_INFO), &info, PB_TTY_INFO);
+
+ /*
+ * Now trace the paired/unpaired ttys. For example
+ * the task might have slave peer assigned but no
+ * master peer. Such "detached" master peers are
+ * not yet supported by our tool and better to
+ * inform a user about such situation.
+ */
+ if (is_pty(driver))
+ tty_test_and_set(id, tty_active_pairs);
+
+ info.termios = &termios;
+ info.termios_locked = &termios_locked;
+ info.winsize = &winsize;
+
+ termios.n_c_cc = TERMIOS_NCC;
+ termios.c_cc = xmalloc(pb_repeated_size(&termios, c_cc));
+
+ termios_locked.n_c_cc = TERMIOS_NCC;
+ termios_locked.c_cc = xmalloc(pb_repeated_size(&termios_locked, c_cc));
+
+ if (!termios.c_cc || !termios_locked.c_cc)
+ goto out;
+
+ /* @t is reused for both termios reads, so clear it each time */
+ memzero(&t, sizeof(t));
+ if (ioctl(lfd, TCGETS, &t) < 0) {
+ pr_perror("Can't get tty params on %x", id);
+ goto out;
+ }
+ termios_copy(&termios, &t);
+
+ memzero(&t, sizeof(t));
+ if (ioctl(lfd, TIOCGLCKTRMIOS, &t) < 0) {
+ pr_perror("Can't get tty locked params on %x", id);
+ goto out;
+ }
+ termios_copy(&termios_locked, &t);
+
+ memzero(&w, sizeof(w));
+ if (ioctl(lfd, TIOCGWINSZ, &w) < 0) {
+ pr_perror("Can't get tty window params on %x", id);
+ goto out;
+ }
+ winsize_copy(&winsize, &w);
+
+ ret = pb_write_one(img_from_set(glob_imgset, CR_FD_TTY_INFO), &info, PB_TTY_INFO);
+out:
+ xfree(termios.c_cc);
+ xfree(termios_locked.c_cc);
+ return ret;
+}
+
+/*
+ * Dump callback for tty descriptors.
+ *
+ * Resolves the driver of the terminal opened as @lfd, dumps the
+ * underlying reg-file (except for external ttys), writes the tty
+ * info entry the first time a given tty info id is met and finally
+ * writes the tty file entry itself.
+ *
+ * Returns 0 on success, non-zero on error.
+ */
+static int dump_one_tty(int lfd, u32 id, const struct fd_parms *p)
+{
+	TtyFileEntry e = TTY_FILE_ENTRY__INIT;
+	int ret = 0, index = -1;
+	struct tty_driver *driver;
+
+	pr_info("Dumping tty %d with id %#x\n", lfd, id);
+
+	driver = get_tty_driver(p->stat.st_rdev, p->stat.st_dev);
+	if (driver == NULL) {
+		/* get_tty_driver() gives NULL for devices it doesn't know */
+		pr_err("Unable to find a tty driver for tty %d id %#x\n", lfd, id);
+		return -1;
+	}
+
+	if (driver->fd_get_index)
+		index = driver->fd_get_index(lfd, p);
+	else
+		index = driver->index;
+
+	if (index == INDEX_ERR) {
+		pr_info("Can't obtain index on tty %d id %#x\n", lfd, id);
+		return -1;
+	}
+
+	if (driver->type != TTY_TYPE__EXT_TTY && dump_one_reg_file(lfd, id, p))
+		return -1;
+
+	e.id = id;
+	e.tty_info_id = tty_gen_id(driver, index);
+	e.flags = p->flags;
+	e.fown = (FownEntry *)&p->fown;
+
+	/*
+	 * FIXME
+	 *
+	 * Figure out how to fetch data buffered in terminal.
+	 * For a while simply flush before dumping. Note
+	 * we don't check for errors here since it makes
+	 * no sense anyway, the buffered data is not handled
+	 * properly yet.
+	 *
+	 * Note as well that if we have only one peer here
+	 * the external end might be sending the data to us
+	 * again and again while kernel buffer is not full,
+	 * this might lead to endless SIGTTOU signal delivery
+	 * to the dumpee, ruining checkpoint procedure.
+	 *
+	 * So simply do not flush the line while we dump
+	 * parameters tty never was being a guaranteed delivery
+	 * transport anyway.
+	 */
+
+	/* The info entry is shared, write it for the first user only */
+	if (!tty_test_and_set(e.tty_info_id, tty_bitmap))
+		ret = dump_tty_info(lfd, e.tty_info_id, p, driver, index);
+
+	if (!ret)
+		ret = pb_write_one(img_from_set(glob_imgset, CR_FD_TTY_FILES), &e, PB_TTY_FILE);
+	return ret;
+}
+
+/* Dump operations for tty descriptors, handled by dump_one_tty(). */
+const struct fdtype_ops tty_dump_ops = {
+ .type = FD_TYPES__TTY,
+ .dump = dump_one_tty,
+};
+
+/*
+ * With opts.shell_job set, require stdin to be a terminal and
+ * install it as the SELF_STDIN_OFF service descriptor. Without
+ * the option there is nothing to do.
+ *
+ * Returns 0 on success, -1 on error.
+ */
+int tty_prep_fds(void)
+{
+	if (opts.shell_job) {
+		if (!isatty(STDIN_FILENO)) {
+			pr_err("Standard stream is not a terminal, aborting\n");
+			return -1;
+		}
+
+		if (install_service_fd(SELF_STDIN_OFF, STDIN_FILENO) < 0) {
+			pr_perror("Can't dup stdin to SELF_STDIN_OFF");
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/* Close the SELF_STDIN_OFF service descriptor set up by tty_prep_fds(). */
+void tty_fini_fds(void)
+{
+ close_service_fd(SELF_STDIN_OFF);
+}
diff --git a/criu/tun.c b/criu/tun.c
new file mode 100644
index 000000000000..dcee704e48b4
--- /dev/null
+++ b/criu/tun.c
@@ -0,0 +1,494 @@
+#include <unistd.h>
+#include <sys/socket.h>
+#include <linux/if.h>
+#include <linux/if_tun.h>
+#include <sys/ioctl.h>
+#include <sched.h>
+
+// MAO required on Centos 6 (linux-3.18.1 kernel)
+#include <linux/filter.h>
+
+#include "cr_options.h"
+#include "imgset.h"
+#include "protobuf.h"
+#include "cr-show.h"
+#include "string.h"
+#include "files.h"
+#include "files-reg.h"
+#include "tun.h"
+#include "net.h"
+#include "namespaces.h"
+
+#include "protobuf/tun.pb-c.h"
+
+#ifndef IFF_PERSIST
+#define IFF_PERSIST 0x0800
+#endif
+
+#ifndef IFF_NOFILTER
+#define IFF_NOFILTER 0x1000
+#endif
+
+#ifndef TUNSETQUEUE
+#define TUNSETQUEUE _IOW('T', 217, int)
+#define IFF_ATTACH_QUEUE 0x0200
+#define IFF_DETACH_QUEUE 0x0400
+#endif
+
+/*
+ * Absence of the 1st ioctl means we cannot restore tun link. But
+ * since the 2nd one appeared at the same time, we'll "check" this
+ * by trying to dump filter and abort dump if it's not there.
+ */
+
+#ifndef TUNSETIFINDEX
+#define TUNSETIFINDEX _IOW('T', 218, unsigned int)
+#endif
+
+#ifndef TUNGETFILTER
+#define TUNGETFILTER _IOR('T', 219, struct sock_fprog)
+#endif
+
+#define TUN_DEV_GEN_PATH "/dev/net/tun"
+
+/*
+ * Probe tun dump/restore support by issuing TUNSETIFINDEX on a
+ * freshly opened tun control device.
+ *
+ * Returns @no_tun_err when the control device cannot be opened,
+ * otherwise the ioctl() result (negative if the ioctl failed).
+ */
+int check_tun_cr(int no_tun_err)
+{
+	int probe_idx = 13; /* value only used for this probe */
+	int tun_fd, res;
+
+	tun_fd = open(TUN_DEV_GEN_PATH, O_RDWR);
+	if (tun_fd < 0) {
+		pr_perror("Can't check tun support");
+		return no_tun_err;
+	}
+
+	res = ioctl(tun_fd, TUNSETIFINDEX, &probe_idx);
+	if (res < 0)
+		pr_perror("No proper support for tun dump/restore");
+
+	close(tun_fd);
+	return res;
+}
+
+/* All tun links known to this file, chained through tun_link.l. */
+static LIST_HEAD(tun_links);
+
+/*
+ * One tun device. The union holds either restore-side data (rst,
+ * filled by list_tun_link()) or dump-side data (dmp, filled by
+ * __dump_tun_link_fd()).
+ */
+struct tun_link {
+ char name[IFNAMSIZ];
+ struct list_head l;
+ union {
+ struct {
+ /* tun flags taken from the image entry */
+ unsigned flags;
+ } rst;
+
+ struct {
+ /* value read with TUNGETSNDBUF */
+ unsigned sndbuf;
+ /* value read with TUNGETVNETHDRSZ */
+ unsigned vnethdr;
+ } dmp;
+ };
+};
+
+/*
+ * Remember a restored tun link: copy its name and tun flags from
+ * @nde into a new tun_link and append it to tun_links.
+ *
+ * Returns 0 on success, -1 if the allocation fails.
+ */
+static int list_tun_link(NetDeviceEntry *nde)
+{
+	struct tun_link *link = xmalloc(sizeof(*link));
+
+	if (link == NULL)
+		return -1;
+
+	strlcpy(link->name, nde->name, sizeof(link->name));
+
+	/*
+	 * Keep tun-flags not only for the persistency fixup (done in
+	 * tunfile_open()), but also for TUNSETIFF -- we must open the
+	 * device with the same flags it should live with (i.e. the
+	 * ones it was created with).
+	 */
+	link->rst.flags = nde->tun->flags;
+
+	list_add_tail(&link->l, &tun_links);
+	return 0;
+}
+
+/*
+ * Look up a tun link by exact device name in tun_links.
+ * Returns the matching entry or NULL when there is none.
+ */
+static struct tun_link *find_tun_link(char *name)
+{
+	struct tun_link *cur;
+
+	list_for_each_entry(cur, &tun_links, l) {
+		if (strcmp(cur->name, name) == 0)
+			return cur;
+	}
+
+	return NULL;
+}
+
+/*
+ * Allocate a tun_link for @name and fill its dump-side fields
+ * (vnethdr size, sndbuf) by querying @fd. Dumping is refused when a
+ * tap device has a filter attached, or when a non-tap device was
+ * not opened with IFF_NOFILTER.
+ *
+ * Returns the new (unlisted) tun_link, or NULL on any failure.
+ */
+static struct tun_link *__dump_tun_link_fd(int fd, char *name, unsigned flags)
+{
+	struct sock_fprog filter;
+	struct tun_link *link;
+
+	link = xmalloc(sizeof(*link));
+	if (!link)
+		goto fail;
+	strlcpy(link->name, name, sizeof(link->name));
+
+	if (ioctl(fd, TUNGETVNETHDRSZ, &link->dmp.vnethdr) < 0) {
+		pr_perror("Can't dump vnethdr size for %s", name);
+		goto fail;
+	}
+
+	if (ioctl(fd, TUNGETSNDBUF, &link->dmp.sndbuf) < 0) {
+		pr_perror("Can't dump sndbuf for %s", name);
+		goto fail;
+	}
+
+	if (!(flags & IFF_TAP)) {
+		if (flags & IFF_NOFILTER)
+			return link;
+
+		pr_err("No info about %s filter, kernel is too old\n", name);
+		goto fail;
+	}
+
+	pr_debug("Checking filter for tap %s\n", name);
+	if (ioctl(fd, TUNGETFILTER, &filter) < 0) {
+		pr_perror("Can't get tun filter for %s", name);
+		goto fail;
+	}
+
+	/*
+	 * TUN filters are tricky -- the program itself is 'somewhere'
+	 * in the task's memory, so we can't get one for an unattached
+	 * persistent device. The only way of doing it is opening the
+	 * device with IFF_NOFILTER and attaching some fake one :(
+	 */
+	if (filter.len != 0) {
+		pr_err("Can't dump %s with filter on-board\n", name);
+		goto fail;
+	}
+
+	return link;
+
+fail:
+	xfree(link);
+	return NULL;
+}
+
+/*
+ * Return the tun_link for @name, creating it from @fd and adding it
+ * to tun_links if it is not known yet. Returns NULL on failure.
+ */
+static struct tun_link *dump_tun_link_fd(int fd, char *name, unsigned flags)
+{
+	struct tun_link *link = find_tun_link(name);
+
+	if (link)
+		return link;
+
+	link = __dump_tun_link_fd(fd, name, flags);
+	if (!link)
+		return NULL;
+
+	/*
+	 * Keep this in the list till the links dumping code starts.
+	 * We can't let it dump all this stuff itself, since the
+	 * number of attaches to one tun device is limited and we
+	 * may not be able to do it that late.
+	 *
+	 * For persistent detached devices get_tun_link_fd() will
+	 * attach to the device and get the needed stuff.
+	 */
+	list_add(&link->l, &tun_links);
+	return link;
+}
+
+/*
+ * Open the tun control device and bind it to device @name with
+ * @flags via TUNSETIFF. A non-zero @idx is set with TUNSETIFINDEX
+ * before that.
+ *
+ * Returns the open fd, or -1 on failure (the fd is closed then).
+ */
+static int open_tun_dev(char *name, unsigned int idx, unsigned flags)
+{
+	struct ifreq req;
+	int tun_fd;
+
+	tun_fd = open(TUN_DEV_GEN_PATH, O_RDWR);
+	if (tun_fd < 0) {
+		pr_perror("Can't open tun device");
+		return -1;
+	}
+
+	if (idx != 0) {
+		pr_debug(" restoring %u for %s tun\n", idx, name);
+		if (ioctl(tun_fd, TUNSETIFINDEX, &idx) < 0) {
+			pr_perror("Can't restore tun's index");
+			close(tun_fd);
+			return -1;
+		}
+	}
+
+	memset(&req, 0, sizeof(req));
+	req.ifr_flags = flags;
+	strlcpy(req.ifr_name, name, sizeof(req.ifr_name));
+
+	if (ioctl(tun_fd, TUNSETIFF, &req)) {
+		pr_perror("Can't create tun device");
+		close(tun_fd);
+		return -1;
+	}
+
+	return tun_fd;
+}
+
+/*
+ * Return dump info for tun device @name. A link already collected
+ * in tun_links is returned as is; otherwise the device must be
+ * persistent, and it is opened temporarily (with IFF_NOFILTER) to
+ * read its parameters.
+ *
+ * Returns NULL on failure. A link created here is not added to
+ * tun_links.
+ */
+static struct tun_link *get_tun_link_fd(char *name, unsigned flags)
+{
+	struct tun_link *link;
+	int tun_fd;
+
+	link = find_tun_link(name);
+	if (link != NULL)
+		return link;
+
+	/*
+	 * If we haven't found this thing, then the device we see
+	 * via netlink exists w/o any fds attached, i.e. it is
+	 * persistent.
+	 */
+	if ((flags & IFF_PERSIST) == 0) {
+		pr_err("No fd infor for non persistent tun device %s\n", name);
+		return NULL;
+	}
+
+	/*
+	 * The kernel will try to attach the filter (if it exists)
+	 * to our memory, avoid this.
+	 */
+	flags |= IFF_NOFILTER;
+
+	tun_fd = open_tun_dev(name, 0, flags);
+	if (tun_fd < 0)
+		return NULL;
+
+	link = __dump_tun_link_fd(tun_fd, name, flags);
+	close(tun_fd);
+
+	return link;
+}
+
+/*
+ * Dump one tun file descriptor: write the regular-file part, then a
+ * TunfileEntry that names the attached device (if any) and whether
+ * the queue is detached. Requires the dumped tree to live in its own
+ * net namespace.
+ *
+ * Returns the pb_write_one() result on success paths, -1 on error.
+ */
+static int dump_tunfile(int lfd, u32 id, const struct fd_parms *p)
+{
+	TunfileEntry tfe = TUNFILE_ENTRY__INIT;
+	struct ifreq ifr;
+
+	if (!(root_ns_mask & CLONE_NEWNET)) {
+		pr_err("Net namespace is required to dump tun link\n");
+		return -1;
+	}
+
+	if (dump_one_reg_file(lfd, id, p))
+		return -1;
+
+	pr_info("Dumping tun-file %d with id %#x\n", lfd, id);
+
+	tfe.id = id;
+	if (ioctl(lfd, TUNGETIFF, &ifr) >= 0) {
+		tfe.netdev = ifr.ifr_name;
+		pr_info("`- attached to device %s (flags %x)\n", tfe.netdev, ifr.ifr_flags);
+
+		if (ifr.ifr_flags & IFF_DETACH_QUEUE) {
+			tfe.has_detached = true;
+			tfe.detached = true;
+		}
+
+		if (!dump_tun_link_fd(lfd, tfe.netdev, ifr.ifr_flags))
+			return -1;
+	} else if (errno != EBADFD) {
+		pr_perror("Can't dump tun-file device");
+		return -1;
+	}
+	/*
+	 * With EBADFD this is just an opened file with no tun device
+	 * attached yet. Go ahead and write the respective entry.
+	 */
+
+	return pb_write_one(img_from_set(glob_imgset, CR_FD_TUNFILE), &tfe, PB_TUNFILE);
+}
+
+/*
+ * Dump-side fd-type operations for tun files: entries are tagged
+ * FD_TYPES__TUNF and dumped with dump_tunfile().
+ */
+const struct fdtype_ops tunfile_dump_ops = {
+ .type = FD_TYPES__TUNF,
+ .dump = dump_tunfile,
+};
+
+/*
+ * Restore-side record for one tun file: the generic file descriptor
+ * plus the image entry it was collected from (see
+ * collect_one_tunfile()).
+ */
+struct tunfile_info {
+ struct file_desc d;
+ TunfileEntry *tfe;
+};
+
+/*
+ * Restore one tun file: reopen the underlying regular file and, if
+ * the entry names a device, attach to it with the flags remembered
+ * in tun_links. The queue is detached again when the image says so,
+ * and persistency is dropped for devices that were not persistent.
+ *
+ * Returns the open fd, or -1 on failure.
+ */
+static int tunfile_open(struct file_desc *d)
+{
+	struct tunfile_info *info = container_of(d, struct tunfile_info, d);
+	TunfileEntry *entry = info->tfe;
+	struct tun_link *link;
+	struct ifreq req;
+	int fd;
+
+	fd = open_reg_by_id(entry->id);
+	if (fd < 0)
+		return -1;
+
+	/* just-opened tun file, nothing to attach to */
+	if (!entry->netdev)
+		return fd;
+
+	link = find_tun_link(entry->netdev);
+	if (link == NULL) {
+		pr_err("No tun device for file %s\n", entry->netdev);
+		goto fail;
+	}
+
+	memset(&req, 0, sizeof(req));
+	req.ifr_flags = link->rst.flags;
+	strlcpy(req.ifr_name, link->name, sizeof(req.ifr_name));
+
+	if (ioctl(fd, TUNSETIFF, &req) < 0) {
+		pr_perror("Can't attach tunfile to device");
+		goto fail;
+	}
+
+	if (entry->has_detached && entry->detached) {
+		pr_info("Detaching from %s queue\n", entry->netdev);
+		req.ifr_flags = IFF_DETACH_QUEUE;
+		if (ioctl(fd, TUNSETQUEUE, &req) < 0) {
+			pr_perror("Can't detach queue");
+			goto fail;
+		}
+	}
+
+	if ((link->rst.flags & IFF_PERSIST) == 0) {
+		pr_info("Dropping persistency for %s\n", link->name);
+		if (ioctl(fd, TUNSETPERSIST, 0) < 0) {
+			pr_perror("Error dropping persistency");
+			goto fail;
+		}
+	}
+
+	return fd;
+
+fail:
+	close(fd);
+	return -1;
+}
+
+/* Restore-side file_desc operations for tun files; opened by tunfile_open(). */
+static struct file_desc_ops tunfile_desc_ops = {
+ .type = FD_TYPES__TUNF,
+ .open = tunfile_open,
+};
+
+/*
+ * Image-collection callback for tun files: bind the decoded
+ * TunfileEntry to the per-entry tunfile_info @o and register it as
+ * a file descriptor with tunfile_desc_ops.
+ *
+ * Always returns 0.
+ */
+static int collect_one_tunfile(void *o, ProtobufCMessage *base)
+{
+	struct tunfile_info *ti = o;
+
+	ti->tfe = pb_msg(base, TunfileEntry);
+	file_desc_add(&ti->d, ti->tfe->id, &tunfile_desc_ops);
+
+	/*
+	 * netdev is NULL for a tun file that was opened but never
+	 * attached to a device (tunfile_open() handles that case),
+	 * and NULL must not be passed to %s.
+	 */
+	pr_info("Collected %s tunfile\n",
+		ti->tfe->netdev ? ti->tfe->netdev : "(unattached)");
+
+	return 0;
+}
+
+/*
+ * Collection descriptor for the tun-file image: each PB_TUNFILE
+ * entry gets a struct tunfile_info and is handed to
+ * collect_one_tunfile().
+ */
+struct collect_image_info tunfile_cinfo = {
+ .fd_type = CR_FD_TUNFILE,
+ .pb_type = PB_TUNFILE,
+ .priv_size = sizeof(struct tunfile_info),
+ .collect = collect_one_tunfile,
+};
+
+/*
+ * Dump the tun-specific part of net device @nde: flags, owner and
+ * group are read from the device's sysfs files, vnethdr and sndbuf
+ * come from the tun_link found (or created) by get_tun_link_fd().
+ * The entry is then written with write_netdev_img().
+ *
+ * Returns the write_netdev_img() result, or a negative value on
+ * failure.
+ */
+int dump_tun_link(NetDeviceEntry *nde, struct cr_imgset *fds)
+{
+	TunLinkEntry tle = TUN_LINK_ENTRY__INIT;
+	char spath[64];
+	char buf[64];
+	int ret;
+	struct tun_link *tl;
+
+	/*
+	 * Check every read before parsing: buf holds no valid data
+	 * when read_ns_sys_file() fails (a negative return is treated
+	 * as failure, as the original "ret < 0" check did).
+	 */
+	snprintf(spath, sizeof(spath), "class/net/%s/tun_flags", nde->name);
+	ret = read_ns_sys_file(spath, buf, sizeof(buf));
+	if (ret < 0)
+		return ret;
+	tle.flags = strtol(buf, NULL, 0);
+
+	snprintf(spath, sizeof(spath), "class/net/%s/owner", nde->name);
+	ret = read_ns_sys_file(spath, buf, sizeof(buf));
+	if (ret < 0)
+		return ret;
+	tle.owner = strtol(buf, NULL, 10);
+
+	snprintf(spath, sizeof(spath), "class/net/%s/group", nde->name);
+	ret = read_ns_sys_file(spath, buf, sizeof(buf));
+	if (ret < 0)
+		return ret;
+	tle.group = strtol(buf, NULL, 10);
+
+	/*
+	 * NOTE(review): a link freshly created by get_tun_link_fd() is
+	 * not in tun_links and is not freed here -- confirm whether
+	 * that is intended.
+	 */
+	tl = get_tun_link_fd(nde->name, tle.flags);
+	if (!tl)
+		/* ret is non-negative here, so report the failure explicitly */
+		return -1;
+
+	tle.vnethdr = tl->dmp.vnethdr;
+	tle.sndbuf = tl->dmp.sndbuf;
+
+	nde->tun = &tle;
+	return write_netdev_img(nde, fds);
+}
+
+/*
+ * Re-create tun device @nde: open it with the saved index and
+ * flags, restore owner, group, sndbuf and vnethdr size, make it
+ * persistent, restore the generic link parameters and remember the
+ * link in tun_links for tunfile_open().
+ *
+ * Returns 0 on success, -1 on failure. The temporary fd is closed
+ * in both cases.
+ */
+int restore_one_tun(NetDeviceEntry *nde, int nlsk)
+{
+	int tun_fd, val, err = -1;
+
+	if (nde->tun == NULL) {
+		pr_err("Corrupted TUN link entry %x\n", nde->ifindex);
+		return -1;
+	}
+
+	pr_info("Restoring tun device %s\n", nde->name);
+
+	tun_fd = open_tun_dev(nde->name, nde->ifindex, nde->tun->flags);
+	if (tun_fd < 0)
+		return -1;
+
+	/* -1 means "no owner"/"no group", skip the ioctl then */
+	val = nde->tun->owner;
+	if (val != -1 && ioctl(tun_fd, TUNSETOWNER, val) < 0) {
+		pr_perror("Can't set owner");
+		goto done;
+	}
+
+	val = nde->tun->group;
+	if (val != -1 && ioctl(tun_fd, TUNSETGROUP, val) < 0) {
+		pr_perror("Can't set group");
+		goto done;
+	}
+
+	val = nde->tun->sndbuf;
+	if (ioctl(tun_fd, TUNSETSNDBUF, &val) < 0) {
+		pr_perror("Can't set sndbuf");
+		goto done;
+	}
+
+	val = nde->tun->vnethdr;
+	if (ioctl(tun_fd, TUNSETVNETHDRSZ, &val) < 0) {
+		pr_perror("Can't set vnethdr");
+		goto done;
+	}
+
+	/*
+	 * Set this device persistent anyway and schedule the
+	 * persistence drop if it should not be such. The first
+	 * _real_ opener will do it.
+	 */
+	if (ioctl(tun_fd, TUNSETPERSIST, 1)) {
+		pr_perror("Can't make tun device persistent");
+		goto done;
+	}
+
+	if (restore_link_parms(nde, nlsk)) {
+		pr_err("Error restoring %s link params\n", nde->name);
+		goto done;
+	}
+
+	err = list_tun_link(nde);
+done:
+	close(tun_fd);
+	return err;
+}
diff --git a/criu/util.c b/criu/util.c
new file mode 100644
index 000000000000..00c327396c8a
--- /dev/null
+++ b/criu/util.c
@@ -0,0 +1,1002 @@
+#define _XOPEN_SOURCE
+
+#include <stdio.h>
+#include <stdarg.h>
+#include <string.h>
+#include <errno.h>
+#include <stdbool.h>
+#include <limits.h>
+#include <signal.h>
+#include <unistd.h>
+#include <errno.h>
+#include <string.h>
+#include <dirent.h>
+#include <sys/sendfile.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <sys/mount.h>
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/ptrace.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <sys/vfs.h>
+#include <sys/ptrace.h>
+#include <sys/wait.h>
+#include <sys/resource.h>
+#include <sys/wait.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <sched.h>
+
+#include "compiler.h"
+#include "asm/types.h"
+#include "list.h"
+#include "util.h"
+#include "rst-malloc.h"
+#include "image.h"
+#include "vma.h"
+#include "mem.h"
+#include "namespaces.h"
+
+#include "cr_options.h"
+#include "servicefd.h"
+#include "cr-service.h"
+#include "files.h"
+
+#include "cr-errno.h"
+
+#define VMA_OPT_LEN 128
+
+/*
+ * Append a printf-style formatted string to @str, reallocating it.
+ *
+ * 1) @str is either NULL (a new string is started) or a
+ *    NUL-terminated string previously allocated by malloc.
+ * 2) The passed pointer can't be reused afterwards: it is either
+ *    freed (on error) or may have been moved by the reallocation.
+ *
+ * Returns the resulting string, or NULL on allocation or
+ * formatting failure.
+ */
+static char *xvstrcat(char *str, const char *fmt, va_list args)
+{
+	size_t offset = 0, delta;
+	int ret;
+	char *new;
+	va_list tmp;
+
+	if (str)
+		offset = strlen(str);
+	/* First guess; +1 keeps the request non-zero for an empty fmt */
+	delta = strlen(fmt) * 2 + 1;
+
+	for (;;) {
+		new = xrealloc(str, offset + delta);
+		if (!new) {
+			/* realloc failed. We must release the former string */
+			pr_err("Failed to allocate string\n");
+			xfree(str);
+			return NULL;
+		}
+		/* The block may have moved: the old pointer is dead now */
+		str = new;
+
+		va_copy(tmp, args);
+		ret = vsnprintf(new + offset, delta, fmt, tmp);
+		va_end(tmp);
+
+		if (ret < 0) {
+			/* vsnprintf failed */
+			pr_err("Failed to print string\n");
+			xfree(new);
+			return NULL;
+		}
+
+		/* Everything fit, including a zero-length result */
+		if ((size_t)ret < delta)
+			return new;
+
+		/*
+		 * NOTE: vsnprintf returns the number of characters it
+		 * wanted to write, excluding the terminating NUL.
+		 */
+		delta = (size_t)ret + 1;
+	}
+}
+
+/*
+ * Append a printf-style formatted string to @str via xvstrcat().
+ * @str must not be used after the call; use the returned pointer
+ * (NULL on failure) instead.
+ */
+char *xstrcat(char *str, const char *fmt, ...)
+{
+ va_list args;
+
+ va_start(args, fmt);
+ str = xvstrcat(str, fmt, args);
+ va_end(args);
+
+ return str;
+}
+
+/*
+ * Format into a newly allocated string (xvstrcat() starting from
+ * NULL). Returns the string, or NULL on failure.
+ */
+char *xsprintf(const char *fmt, ...)
+{
+ va_list args;
+ char *str;
+
+ va_start(args, fmt);
+ str = xvstrcat(NULL, fmt, args);
+ va_end(args);
+
+ return str;
+}
+
+/*
+ * Render the status bits of @v into @opt as space-terminated short
+ * tags ("reg ", "stk ", ...), in the fixed order listed below.
+ * @opt is written with sprintf() and no bound check; the only
+ * caller visible here (pr_vma()) passes a VMA_OPT_LEN buffer.
+ */
+static void vma_opt_str(const struct vma_area *v, char *opt)
+{
+ int p = 0;
+
+/* Append the tag _s followed by a space when status bit _o is set. */
+#define opt2s(_o, _s) do { \
+ if (v->e->status & _o) \
+ p += sprintf(opt + p, _s " "); \
+ } while (0)
+
+ /* Start from an empty string in case no bit matches */
+ opt[p] = '\0';
+ opt2s(VMA_AREA_REGULAR, "reg");
+ opt2s(VMA_AREA_STACK, "stk");
+ opt2s(VMA_AREA_VSYSCALL, "vsys");
+ opt2s(VMA_AREA_VDSO, "vdso");
+ opt2s(VMA_AREA_VVAR, "vvar");
+ opt2s(VMA_AREA_HEAP, "heap");
+ opt2s(VMA_FILE_PRIVATE, "fp");
+ opt2s(VMA_FILE_SHARED, "fs");
+ opt2s(VMA_ANON_SHARED, "as");
+ opt2s(VMA_ANON_PRIVATE, "ap");
+ opt2s(VMA_AREA_SYSVIPC, "sysv");
+ opt2s(VMA_AREA_SOCKET, "sk");
+
+#undef opt2s
+}
+
+/*
+ * Log one VMA at @loglevel: address range, size in KiB, prot,
+ * flags, status, page offset, decoded status tags and shmid.
+ * Does nothing when @vma_area is NULL.
+ */
+void pr_vma(unsigned int loglevel, const struct vma_area *vma_area)
+{
+	char opt[VMA_OPT_LEN] = { 0 };
+	const VmaEntry *e;
+
+	if (vma_area == NULL)
+		return;
+
+	e = vma_area->e;
+	vma_opt_str(vma_area, opt);
+	print_on_level(loglevel, "%#"PRIx64"-%#"PRIx64" (%"PRIi64"K) prot %#x flags %#x st %#x off %#"PRIx64" "
+			"%s shmid: %#"PRIx64"\n",
+			e->start, e->end,
+			KBYTES(vma_area_len(vma_area)),
+			e->prot,
+			e->flags,
+			e->status,
+			e->pgoff,
+			opt, e->shmid);
+}
+
+/*
+ * Close *@fd if it is a valid (non-negative) descriptor and reset
+ * it to -1 on success. A failed close() is logged and *@fd is left
+ * untouched.
+ *
+ * Returns 0 if nothing had to be closed or close() succeeded,
+ * otherwise the close() return value.
+ */
+int close_safe(int *fd)
+{
+	int rc;
+
+	if (*fd < 0)
+		return 0;
+
+	rc = close(*fd);
+	if (rc) {
+		pr_perror("Unable to close fd %d", *fd);
+		return rc;
+	}
+
+	*fd = -1;
+	return 0;
+}
+
+/*
+ * Move @old_fd to @new_fd with dup2() and close @old_fd. When the
+ * two are equal nothing is done. Unless @allow_reuse_fd is set,
+ * @new_fd must not be open already. @file and @line identify the
+ * call site in error messages.
+ *
+ * Returns 0 on success, a negative value on failure.
+ */
+int reopen_fd_as_safe(char *file, int line, int new_fd, int old_fd, bool allow_reuse_fd)
+{
+	int rc;
+
+	if (old_fd == new_fd)
+		return 0;
+
+	/* make sure we won't clash with an inherit fd */
+	if (inherit_fd_resolve_clash(new_fd) < 0)
+		return -1;
+
+	/* new_fd counts as free only if F_GETFD fails with EBADF */
+	if (!allow_reuse_fd &&
+	    (fcntl(new_fd, F_GETFD) != -1 || errno != EBADF)) {
+		pr_err("fd %d already in use (called at %s:%d)\n",
+			new_fd, file, line);
+		return -1;
+	}
+
+	rc = dup2(old_fd, new_fd);
+	if (rc < 0) {
+		pr_perror("Dup %d -> %d failed (called at %s:%d)",
+			  old_fd, new_fd, file, line);
+		return rc;
+	}
+
+	/* Just to have an error message if the close fails */
+	close_safe(&old_fd);
+
+	return 0;
+}
+
+/*
+ * Make sure *@img_fd does not occupy descriptor @want_fd: if it
+ * does, duplicate it elsewhere, close the original and store the
+ * duplicate in *@img_fd.
+ *
+ * Returns 0 on success, -1 if dup() fails.
+ */
+int move_img_fd(int *img_fd, int want_fd)
+{
+	int copy;
+
+	if (*img_fd != want_fd)
+		return 0;
+
+	copy = dup(*img_fd);
+	if (copy < 0) {
+		pr_perror("Can't dup file");
+		return -1;
+	}
+
+	close(*img_fd);
+	*img_fd = copy;
+
+	return 0;
+}
+
+/*
+ * Cached opened /proc/$pid and /proc/self files.
+ * Used for faster access to /proc/.../foo files
+ * by using openat()-s
+ */
+
+/* pid whose /proc/$pid fd is cached in open_proc_fd (PROC_NONE if none) */
+static pid_t open_proc_pid = PROC_NONE;
+/* cached /proc/$pid fd, -1 when nothing is cached */
+static int open_proc_fd = -1;
+/* getpid() value at the time open_proc_self_fd was stored */
+static pid_t open_proc_self_pid;
+/* cached /proc/self fd, -1 when nothing is cached */
+static int open_proc_self_fd = -1;
+
+/*
+ * Replace the cached /proc/self fd with @fd (closing the previous
+ * one, if any) and record the current pid as its owner.
+ */
+static inline void set_proc_self_fd(int fd)
+{
+ if (open_proc_self_fd >= 0)
+ close(open_proc_self_fd);
+
+ open_proc_self_fd = fd;
+ open_proc_self_pid = getpid();
+}
+
+/*
+ * Replace the cached /proc/$pid fd with @fd for @pid, closing the
+ * previously cached one, if any.
+ */
+static inline void set_proc_pid_fd(int pid, int fd)
+{
+ if (open_proc_fd >= 0)
+ close(open_proc_fd);
+
+ open_proc_pid = pid;
+ open_proc_fd = fd;
+}
+
+/*
+ * Return the cached /proc fd for @pid, or -1 if there is none.
+ * A cached /proc/self fd stored under a different pid is closed
+ * and dropped first.
+ */
+static inline int get_proc_fd(int pid)
+{
+	if (pid != PROC_SELF)
+		return (pid == open_proc_pid) ? open_proc_fd : -1;
+
+	if (open_proc_self_fd != -1 && open_proc_self_pid != getpid()) {
+		/* the cached fd was stored under another pid */
+		close(open_proc_self_fd);
+		open_proc_self_fd = -1;
+	}
+
+	return open_proc_self_fd;
+}
+
+/*
+ * Drop both cached per-process /proc fds (self and $pid), closing
+ * them if open. Always returns 0.
+ */
+int close_pid_proc(void)
+{
+ set_proc_self_fd(-1);
+ set_proc_pid_fd(PROC_NONE, -1);
+ return 0;
+}
+
+/* Drop the cached per-process /proc fds and the PROC_FD_OFF service fd. */
+void close_proc()
+{
+ close_pid_proc();
+ close_service_fd(PROC_FD_OFF);
+}
+
+/*
+ * Install @fd as the PROC_FD_OFF service fd.
+ * Returns 0 on success, -1 on failure.
+ */
+int set_proc_fd(int fd)
+{
+	return install_service_fd(PROC_FD_OFF, fd) < 0 ? -1 : 0;
+}
+
+/*
+ * Open directory @path and install it as the PROC_FD_OFF service
+ * fd, dropping all previously cached /proc fds first.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static int open_proc_sfd(char *path)
+{
+	int dir_fd, sfd;
+
+	close_proc();
+
+	dir_fd = open(path, O_DIRECTORY | O_PATH);
+	if (dir_fd == -1) {
+		pr_perror("Can't open %s", path);
+		return -1;
+	}
+
+	/* the service slot holds its own duplicate, so dir_fd can go */
+	sfd = install_service_fd(PROC_FD_OFF, dir_fd);
+	close(dir_fd);
+
+	return sfd < 0 ? -1 : 0;
+}
+
+/*
+ * Return an O_PATH fd for /proc/@pid, using and updating the fd
+ * cache. PROC_SELF maps to /proc/self and PROC_GEN to the /proc
+ * service fd itself. The /proc service fd is opened on demand.
+ *
+ * Returns the fd, or -1 on failure (cr_errno is set to ESRCH when
+ * the per-pid directory cannot be opened).
+ */
+inline int open_pid_proc(pid_t pid)
+{
+	char name[18];
+	int proc_dfd, pid_fd;
+
+	pid_fd = get_proc_fd(pid);
+	if (pid_fd >= 0)
+		return pid_fd;
+
+	proc_dfd = get_service_fd(PROC_FD_OFF);
+	if (proc_dfd < 0) {
+		if (open_proc_sfd("/proc") < 0)
+			return -1;
+
+		proc_dfd = get_service_fd(PROC_FD_OFF);
+	}
+
+	/*
+	 * Don't cache it, close_pid_proc() would
+	 * close the service descriptor otherwise.
+	 */
+	if (pid == PROC_GEN)
+		return proc_dfd;
+
+	if (pid == PROC_SELF)
+		snprintf(name, sizeof(name), "self");
+	else
+		snprintf(name, sizeof(name), "%d", pid);
+
+	pid_fd = openat(proc_dfd, name, O_PATH);
+	if (pid_fd < 0) {
+		pr_perror("Can't open %s", name);
+		set_cr_errno(ESRCH);
+		return -1;
+	}
+
+	if (pid == PROC_SELF)
+		set_proc_self_fd(pid_fd);
+	else
+		set_proc_pid_fd(pid, pid_fd);
+
+	return pid_fd;
+}
+
+/*
+ * Open a file below /proc/@pid. The relative path is built from
+ * the printf-style @fmt into a 128-byte buffer and opened with
+ * @flags relative to the directory fd from open_pid_proc().
+ *
+ * Returns the openat() result, or -1 if the directory fd is not
+ * available.
+ */
+int do_open_proc(pid_t pid, int flags, const char *fmt, ...)
+{
+	char relpath[128];
+	va_list ap;
+	int proc_dir = open_pid_proc(pid);
+
+	if (proc_dir < 0)
+		return -1;
+
+	va_start(ap, fmt);
+	vsnprintf(relpath, sizeof(relpath), fmt, ap);
+	va_end(ap);
+
+	return openat(proc_dir, relpath, flags);
+}
+
+/* RLIMIT_NOFILE soft limit captured by init_service_fd(); base for service fd numbers */
+static int service_fd_rlim_cur;
+/* Index of the service fd set currently in use (see clone_service_fd()) */
+static int service_fd_id = 0;
+
+/*
+ * Record the current RLIMIT_NOFILE soft limit, from which service
+ * fd numbers are derived.
+ *
+ * Service FDs are those that most likely won't conflict with any
+ * 'real-life' ones.
+ *
+ * Returns 0 on success, -1 if getrlimit() fails. Triggers BUG_ON
+ * if the limit is smaller than SERVICE_FD_MAX.
+ */
+int init_service_fd(void)
+{
+	struct rlimit nofile;
+
+	if (getrlimit(RLIMIT_NOFILE, &nofile) != 0) {
+		pr_perror("Can't get rlimit");
+		return -1;
+	}
+
+	service_fd_rlim_cur = (int)nofile.rlim_cur;
+	BUG_ON(service_fd_rlim_cur < SERVICE_FD_MAX);
+
+	return 0;
+}
+
+/*
+ * Compute the descriptor number of service fd @type in set
+ * @service_fd_id: slots are counted down from service_fd_rlim_cur,
+ * each set taking SERVICE_FD_MAX slots. Pure arithmetic, nothing
+ * is checked or opened.
+ */
+static int __get_service_fd(enum sfd_type type, int service_fd_id)
+{
+ return service_fd_rlim_cur - type - SERVICE_FD_MAX * service_fd_id;
+}
+
+/* One bit per sfd_type, set while that service fd is reserved or installed */
+static DECLARE_BITMAP(sfd_map, SERVICE_FD_MAX);
+
+/*
+ * Mark service fd @type as taken without installing anything into
+ * it, and return the descriptor number reserved for it.
+ */
+int reserve_service_fd(enum sfd_type type)
+{
+	BUG_ON((int)type <= SERVICE_FD_MIN || (int)type >= SERVICE_FD_MAX);
+
+	set_bit(type, sfd_map);
+
+	return __get_service_fd(type, service_fd_id);
+}
+
+/*
+ * Duplicate @fd into the slot of service fd @type (close-on-exec)
+ * and mark the type as taken. @fd itself stays open.
+ *
+ * Returns the service descriptor number, or -1 if dup3() fails.
+ */
+int install_service_fd(enum sfd_type type, int fd)
+{
+	int slot;
+
+	BUG_ON((int)type <= SERVICE_FD_MIN || (int)type >= SERVICE_FD_MAX);
+
+	slot = __get_service_fd(type, service_fd_id);
+	if (dup3(fd, slot, O_CLOEXEC) != slot) {
+		pr_perror("Dup %d -> %d failed", fd, slot);
+		return -1;
+	}
+
+	set_bit(type, sfd_map);
+	return slot;
+}
+
+/*
+ * Return the descriptor number of service fd @type, or -1 if that
+ * type is not currently marked as taken.
+ */
+int get_service_fd(enum sfd_type type)
+{
+	BUG_ON((int)type <= SERVICE_FD_MIN || (int)type >= SERVICE_FD_MAX);
+
+	return test_bit(type, sfd_map) ? __get_service_fd(type, service_fd_id) : -1;
+}
+
+/* Return the IMG_FD_OFF service fd, or -1 if it is not installed. */
+int criu_get_image_dir(void)
+{
+ return get_service_fd(IMG_FD_OFF);
+}
+
+/*
+ * Close service fd @type if it is in use and clear its bit.
+ *
+ * Returns 0 on success or if the fd was not in use, -1 if closing
+ * failed (the bit then stays set).
+ */
+int close_service_fd(enum sfd_type type)
+{
+	int sfd = get_service_fd(type);
+
+	if (sfd >= 0) {
+		if (close_safe(&sfd))
+			return -1;
+
+		clear_bit(type, sfd_map);
+	}
+
+	return 0;
+}
+
+/*
+ * Duplicate every service fd slot of the current set into the slots
+ * of set @id and switch to that set. Slots that are not open
+ * (dup2() fails with EBADF) are skipped silently; other dup2()
+ * failures are logged but do not abort the switch.
+ *
+ * Always returns 0.
+ */
+int clone_service_fd(int id)
+{
+	int type;
+
+	if (service_fd_id == id)
+		return 0;
+
+	for (type = SERVICE_FD_MIN + 1; type < SERVICE_FD_MAX; type++) {
+		int old = __get_service_fd(type, service_fd_id);
+		int new = __get_service_fd(type, id);
+
+		if (dup2(old, new) == -1 && errno != EBADF)
+			pr_perror("Unable to clone %d->%d", old, new);
+	}
+
+	service_fd_id = id;
+	return 0;
+}
+
+/*
+ * True when @fd lies strictly between the slot numbers computed for
+ * SERVICE_FD_MAX and SERVICE_FD_MIN in the current set, i.e. inside
+ * the range used for service fds. Whether the slot is actually in
+ * use is not checked.
+ */
+bool is_any_service_fd(int fd)
+{
+ return fd > __get_service_fd(SERVICE_FD_MAX, service_fd_id) &&
+ fd < __get_service_fd(SERVICE_FD_MIN, service_fd_id);
+}
+
+/* True when @fd equals what get_service_fd(@type) returns. */
+bool is_service_fd(int fd, enum sfd_type type)
+{
+ return fd == get_service_fd(type);
+}
+
+/*
+ * Copy data from @fd_in to @fd_out with sendfile() until the
+ * source is exhausted. With @bytes == 0 the copy runs in 4096-byte
+ * chunks and any length is accepted; otherwise exactly @bytes must
+ * have been copied when end of input is reached.
+ *
+ * Returns 0 on success, -1 on a sendfile() error or size mismatch.
+ */
+int copy_file(int fd_in, int fd_out, size_t bytes)
+{
+	/* size_t so that it matches both %zu and the type of @bytes */
+	size_t written = 0;
+	size_t chunk = bytes ? bytes : 4096;
+
+	while (1) {
+		ssize_t ret;
+
+		ret = sendfile(fd_out, fd_in, NULL, chunk);
+		if (ret < 0) {
+			pr_perror("Can't send data to ghost file");
+			return -1;
+		}
+
+		if (ret == 0) {
+			if (bytes && (written != bytes)) {
+				pr_err("Ghost file size mismatch %zu/%zu\n",
+						written, bytes);
+				return -1;
+			}
+			break;
+		}
+
+		/* ret is positive here */
+		written += (size_t)ret;
+	}
+
+	return 0;
+}
+
+/*
+ * Read the target of /proc/self/fd/@lfd into @buf (of @size bytes)
+ * and NUL-terminate it.
+ *
+ * Returns the link length, or -1 if readlink() fails or the result
+ * does not leave room for the terminator.
+ */
+int read_fd_link(int lfd, char *buf, size_t size)
+{
+	char link_path[32];
+	ssize_t len;
+
+	snprintf(link_path, sizeof(link_path), "/proc/self/fd/%d", lfd);
+
+	len = readlink(link_path, buf, size);
+	if (len < 0) {
+		pr_perror("Can't read link of fd %d", lfd);
+		return -1;
+	}
+
+	if ((size_t)len >= size) {
+		pr_err("Buffer for read link of fd %d is too small\n", lfd);
+		return -1;
+	}
+
+	buf[len] = 0;
+	return len;
+}
+
+/*
+ * Check whether @link is exactly "anon_inode:" followed by @type.
+ * The expected string is built in a 32-byte buffer, so longer
+ * combinations are truncated before the comparison.
+ *
+ * Returns 1 on a match, 0 otherwise.
+ */
+int is_anon_link_type(char *link, char *type)
+{
+	char expected[32];
+
+	snprintf(expected, sizeof(expected), "anon_inode:%s", type);
+
+	return strcmp(link, expected) == 0;
+}
+
+/* Allocate @bytes from the RM_SHARED restore memory pool. */
+void *shmalloc(size_t bytes)
+{
+ return rst_mem_alloc(bytes, RM_SHARED);
+}
+
+/*
+ * Only last chunk can be released: this frees the most recent
+ * RM_SHARED allocation. @ptr is not used.
+ */
+void shfree_last(void *ptr)
+{
+ rst_mem_free_last(RM_SHARED);
+}
+
+/*
+ * dup(@fd) as an expression: evaluates to the new descriptor, or
+ * logs the error and jumps to label @out when dup() fails. Uses a
+ * statement expression and evaluates @fd more than once.
+ */
+#define DUP_SAFE(fd, out) \
+ ({ \
+ int ret__; \
+ ret__ = dup(fd); \
+ if (ret__ == -1) { \
+ pr_perror("dup(%d) failed", fd); \
+ goto out; \
+ } \
+ ret__; \
+ })
+
+/*
+ * Run @cmd with @argv in a child process and wait for it; a thin
+ * wrapper around cr_system_userns() with no user namespace switch
+ * (userns_pid == -1).
+ *
+ * If "in" is negative, stdin will be closed.
+ * If "out" or "err" are negative, a log file descriptor will be used.
+ */
+int cr_system(int in, int out, int err, char *cmd, char *const argv[], unsigned flags)
+{
+ return cr_system_userns(in, out, err, cmd, argv, flags, -1);
+}
+
+/*
+ * Fork, wire @in/@out/@err to the child's stdin/stdout/stderr and
+ * exec @cmd with @argv via execvp(), then wait for the child.
+ *
+ * A negative @in closes the child's stdin; a negative @out or @err
+ * is replaced by the log fd. With @userns_pid > 0 the child first
+ * switches into that process' user namespace and sets uid/gid to 0.
+ * CRS_CAN_FAIL in @flags only suppresses the message about a
+ * non-zero exit status.
+ *
+ * SIGCHLD is blocked for the duration of the call and the previous
+ * signal mask is restored before returning.
+ *
+ * Returns 0 if the child's wait status is 0, -1 otherwise (also on
+ * fork or waitpid failure).
+ */
+int cr_system_userns(int in, int out, int err, char *cmd,
+ char *const argv[], unsigned flags, int userns_pid)
+{
+ sigset_t blockmask, oldmask;
+ int ret = -1, status;
+ pid_t pid;
+
+ sigemptyset(&blockmask);
+ sigaddset(&blockmask, SIGCHLD);
+ if (sigprocmask(SIG_BLOCK, &blockmask, &oldmask) == -1) {
+ pr_perror("Can not set mask of blocked signals");
+ return -1;
+ }
+
+ pid = fork();
+ if (pid == -1) {
+ pr_perror("fork() failed");
+ goto out;
+ } else if (pid == 0) {
+ /* Child: every failure below ends in _exit(1) */
+ if (userns_pid > 0) {
+ if (switch_ns(userns_pid, &user_ns_desc, NULL))
+ goto out_chld;
+ if (setuid(0) || setgid(0)) {
+ pr_perror("Unable to set uid or gid");
+ goto out_chld;
+ }
+ }
+
+ if (out < 0)
+ out = log_get_fd();
+ if (err < 0)
+ err = log_get_fd();
+
+ /*
+ * out, err, in should be a separate fds,
+ * because reopen_fd_as() closes an old fd
+ */
+ if (err == out || err == in)
+ err = DUP_SAFE(err, out_chld);
+
+ if (out == in)
+ out = DUP_SAFE(out, out_chld);
+
+ /* Get out/err off fd 0 before stdin is set up there */
+ if (move_img_fd(&out, STDIN_FILENO) ||
+ move_img_fd(&err, STDIN_FILENO))
+ goto out_chld;
+
+ if (in < 0) {
+ close(STDIN_FILENO);
+ } else {
+ if (reopen_fd_as_nocheck(STDIN_FILENO, in))
+ goto out_chld;
+ }
+
+ /* Likewise get err off fd 1 before stdout is set up there */
+ if (move_img_fd(&err, STDOUT_FILENO))
+ goto out_chld;
+
+ if (reopen_fd_as_nocheck(STDOUT_FILENO, out))
+ goto out_chld;
+
+ if (reopen_fd_as_nocheck(STDERR_FILENO, err))
+ goto out_chld;
+
+ execvp(cmd, argv);
+
+ /* Only reached when execvp() failed */
+ pr_perror("exec failed");
+out_chld:
+ _exit(1);
+ }
+
+ /* Parent: wait until the child has exited or was killed */
+ while (1) {
+ ret = waitpid(pid, &status, 0);
+ if (ret == -1) {
+ pr_perror("waitpid() failed");
+ goto out;
+ }
+
+ if (WIFEXITED(status)) {
+ if (!(flags & CRS_CAN_FAIL) && WEXITSTATUS(status))
+ pr_err("exited, status=%d\n", WEXITSTATUS(status));
+ break;
+ } else if (WIFSIGNALED(status)) {
+ pr_err("killed by signal %d\n", WTERMSIG(status));
+ break;
+ } else if (WIFSTOPPED(status)) {
+ pr_err("stopped by signal %d\n", WSTOPSIG(status));
+ } else if (WIFCONTINUED(status)) {
+ pr_err("continued\n");
+ }
+ }
+
+ /* Any non-zero wait status (exit code or signal) counts as failure */
+ ret = status ? -1 : 0;
+out:
+ if (sigprocmask(SIG_SETMASK, &oldmask, NULL) == -1) {
+ pr_perror("Can not unset mask of blocked signals");
+ BUG();
+ }
+
+ return ret;
+}
+
+/*
+ * Fork a daemon. The parent gets the child's pid back (or -1 if
+ * fork() failed). The child starts a new session, changes to "/"
+ * unless @nochdir is set and, unless @noclose is set, closes
+ * @close_fd (if not -1), moves *@keep_fd to descriptor 3 (if not
+ * -1) and points fds 0, 1 and 2 at /dev/null.
+ *
+ * In the child: returns 0, or -1 if /dev/null cannot be opened.
+ */
+int cr_daemon(int nochdir, int noclose, int *keep_fd, int close_fd)
+{
+	int child, null_fd;
+
+	child = fork();
+	if (child < 0) {
+		pr_perror("Can't fork");
+		return -1;
+	}
+
+	if (child > 0)
+		return child;
+
+	setsid();
+
+	if (!nochdir && chdir("/") == -1)
+		pr_perror("Can't change directory");
+
+	if (noclose)
+		return 0;
+
+	if (close_fd != -1)
+		close(close_fd);
+
+	if (*keep_fd != -1)
+		*keep_fd = dup2(*keep_fd, 3);
+
+	null_fd = open("/dev/null", O_RDWR);
+	if (null_fd < 0) {
+		pr_perror("Can't open /dev/null");
+		return -1;
+	}
+
+	dup2(null_fd, 0);
+	dup2(null_fd, 1);
+	dup2(null_fd, 2);
+	close(null_fd);
+
+	return 0;
+}
+
+/*
+ * Returns 1 when the effective uid is 0; otherwise logs an error
+ * and returns 0.
+ */
+int is_root_user()
+{
+	if (geteuid() == 0)
+		return 1;
+
+	pr_err("You need to be root to run this command\n");
+	return 0;
+}
+
+/*
+ * Check whether the directory behind @dirfd has no entries other
+ * than those dir_dots() recognises. The descriptor is handed to
+ * fdopendir() and closed by closedir().
+ *
+ * Returns 1 if empty, 0 if not, -1 if fdopendir() fails.
+ */
+int is_empty_dir(int dirfd)
+{
+	struct dirent *ent;
+	int empty = 1;
+	DIR *dir;
+
+	dir = fdopendir(dirfd);
+	if (dir == NULL)
+		return -1;
+
+	while ((ent = readdir(dir)) != NULL) {
+		if (!dir_dots(ent)) {
+			empty = 0;
+			break;
+		}
+	}
+
+	closedir(dir);
+	return empty;
+}
+
+/*
+ * Read the pagemap entry covering @vaddr for the current process
+ * and store it, masked with PME_PFRAME_MASK, in *@pfn.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int vaddr_to_pfn(unsigned long vaddr, u64 *pfn)
+{
+	int pm_fd, rc;
+	off_t pos;
+
+	pm_fd = open_proc(getpid(), "pagemap");
+	if (pm_fd < 0)
+		return -1;
+
+	/* one u64 entry per page */
+	pos = (vaddr / page_size()) * sizeof(u64);
+
+	rc = pread(pm_fd, pfn, sizeof(*pfn), pos);
+	if (rc == sizeof(*pfn)) {
+		*pfn &= PME_PFRAME_MASK;
+		rc = 0;
+	} else {
+		pr_perror("Can't read pme for pid %d", getpid());
+		rc = -1;
+	}
+
+	close(pm_fd);
+	return rc;
+}
+
+/*
+ * Allocate a zeroed vma_area together with its VmaEntry, which is
+ * placed right behind the structure in the same allocation.
+ *
+ * Note since VMA_AREA_NONE = 0 we can skip the status assignment
+ * here and simply rely on xzalloc.
+ *
+ * Returns the new area with both fds set to -1, or NULL if the
+ * allocation fails.
+ */
+struct vma_area *alloc_vma_area(void)
+{
+	struct vma_area *area = xzalloc(sizeof(*area) + sizeof(VmaEntry));
+
+	if (area == NULL)
+		return NULL;
+
+	area->e = (VmaEntry *)(area + 1);
+	vma_entry__init(area->e);
+	area->vm_file_fd = -1;
+	area->e->fd = -1;
+
+	return area;
+}
+
+/*
+ * Create @path and all of its missing parent directories relative
+ * to directory @fd, like "mkdir -p", with mode 0755. Components
+ * that already exist are fine.
+ *
+ * Returns 0 on success, -1 if the path is too long or a mkdirat()
+ * fails for a reason other than EEXIST.
+ */
+int mkdirpat(int fd, const char *path)
+{
+	char made_path[PATH_MAX];
+	char *cursor, *slash;
+
+	if (strlen(path) >= PATH_MAX) {
+		pr_err("path %s is longer than PATH_MAX\n", path);
+		return -1;
+	}
+
+	strcpy(made_path, path);
+
+	/* don't try to create the root of an absolute path */
+	cursor = made_path;
+	if (*cursor == '/')
+		cursor++;
+
+	while (*cursor != '\0') {
+		/* cut the path after the current component */
+		slash = strchr(cursor, '/');
+		if (slash)
+			*slash = '\0';
+
+		if (mkdirat(fd, made_path, 0755) < 0 && errno != EEXIST) {
+			pr_perror("couldn't mkdirpat directory %s", made_path);
+			return -1;
+		}
+
+		if (!slash)
+			break;
+
+		/* restore the separator and step to the next component */
+		*slash = '/';
+		cursor = slash + 1;
+	}
+
+	return 0;
+}
+
+/*
+ * True when @prefix is a prefix of @path that ends on a path
+ * component boundary, i.e. @path continues with '/' or ends there.
+ */
+bool is_path_prefix(const char *path, const char *prefix)
+{
+	char next;
+
+	if (!strstartswith(path, prefix))
+		return false;
+
+	next = path[strlen(prefix)];
+	return next == '\0' || next == '/';
+}
+
+/*
+ * fopen()-like helper that opens @path relative to directory
+ * @dirfd. The fopen-style mode string @cflags is translated to
+ * open flags character by character ('r', 'a', 'w', '+'), and new
+ * files are created with mode 0600.
+ *
+ * Returns the stream, or NULL on failure with errno set by the
+ * failing call.
+ */
+FILE *fopenat(int dirfd, char *path, char *cflags)
+{
+	int tmp, flags = 0;
+	char *iter;
+	FILE *fp;
+
+	for (iter = cflags; *iter; iter++) {
+		switch (*iter) {
+		case 'r':
+			flags |= O_RDONLY;
+			break;
+		case 'a':
+			flags |= O_APPEND;
+			break;
+		case 'w':
+			flags |= O_WRONLY | O_CREAT;
+			break;
+		case '+':
+			/* replaces, not ORs: earlier flags are dropped */
+			flags = O_RDWR | O_CREAT;
+			break;
+		}
+	}
+
+	tmp = openat(dirfd, path, flags, S_IRUSR | S_IWUSR);
+	if (tmp < 0)
+		return NULL;
+
+	fp = fdopen(tmp, cflags);
+	if (!fp) {
+		/* fdopen() did not take the fd over, don't leak it */
+		int saved_errno = errno;
+
+		close(tmp);
+		errno = saved_errno;
+	}
+
+	return fp;
+}
+
+/*
+ * Split @str at every occurrence of @token into newly allocated
+ * strings.
+ *
+ * On success *@out points to an array of *@n strings; empty fields
+ * (leading, trailing or between adjacent separators) are kept as
+ * empty strings, so an empty @str yields one empty string. The
+ * caller frees each string and the array.
+ *
+ * On allocation failure *@out is NULL and *@n is -1. A NULL @str
+ * yields *@out == NULL and *@n == 0. @str is not modified.
+ */
+void split(char *str, char token, char ***out, int *n)
+{
+	int i, count = 1;
+	char *cur;
+
+	*out = NULL;
+	*n = 0;
+	if (!str)
+		return;
+
+	/*
+	 * One field plus one per separator. Searching for '\0' would
+	 * find the terminator itself, so that token never splits.
+	 */
+	if (token != '\0')
+		for (cur = strchr(str, token); cur; cur = strchr(cur + 1, token))
+			count++;
+
+	*out = xmalloc(count * sizeof(char *));
+	if (!*out) {
+		*n = -1;
+		return;
+	}
+
+	cur = str;
+	for (i = 0; i < count; i++) {
+		/* the last field runs up to the terminating NUL */
+		char *end = (i < count - 1) ? strchr(cur, token) : cur + strlen(cur);
+		size_t len = end - cur;
+		char *item = xmalloc(len + 1);
+
+		if (!item) {
+			while (i--)
+				xfree((*out)[i]);
+			xfree(*out);
+			*out = NULL;
+			*n = -1;
+			return;
+		}
+
+		memcpy(item, cur, len);
+		item[len] = '\0';
+		(*out)[i] = item;
+
+		cur = end + 1;
+	}
+
+	*n = count;
+}
+
+/*
+ * Poll @lfd for POLLIN without blocking.
+ *
+ * Returns the poll() result: positive if an event is reported,
+ * 0 if not, negative on error (which is logged).
+ */
+int fd_has_data(int lfd)
+{
+	struct pollfd pfd = {
+		.fd = lfd,
+		.events = POLLIN,
+		.revents = 0,
+	};
+	int ready = poll(&pfd, 1, 0);
+
+	if (ready < 0)
+		pr_perror("poll() failed");
+
+	return ready;
+}
+
+/*
+ * Read from @fd until @buff holds exactly @size bytes.
+ *
+ * Returns @size when the buffer was filled, 0 if end of file was
+ * hit first (even after a partial read), or (size_t)-1 if read()
+ * failed.
+ */
+size_t read_into_buffer(int fd, char *buff, size_t size)
+{
+	/* signed, so that a read() error (-1) is seen as such */
+	ssize_t n;
+	size_t curr = 0;
+
+	while (1) {
+		n = read(fd, buff + curr, size - curr);
+		if (n < 1)
+			return (size_t)n;
+		curr += (size_t)n;
+		if (curr == size)
+			return size;
+	}
+}
+
+/*
+ * Mount a tmpfs on @path and mark that mount private.
+ * Returns 0 on success, -1 if either mount() call fails.
+ */
+int make_yard(char *path)
+{
+	if (mount("none", path, "tmpfs", 0, NULL) != 0) {
+		pr_perror("Unable to mount tmpfs in %s", path);
+		return -1;
+	}
+
+	if (mount("none", path, NULL, MS_PRIVATE, NULL) != 0) {
+		pr_perror("Unable to mark yard as private");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Map a single CLONE_NEW* namespace flag to its short name ("ipc",
+ * "mnt", "net", "pid", "user", "uts").
+ * Returns NULL for any other value.
+ */
+const char *ns_to_string(unsigned int ns)
+{
+	static const struct {
+		unsigned int flag;
+		const char *name;
+	} ns_names[] = {
+		{ CLONE_NEWIPC,		"ipc" },
+		{ CLONE_NEWNS,		"mnt" },
+		{ CLONE_NEWNET,		"net" },
+		{ CLONE_NEWPID,		"pid" },
+		{ CLONE_NEWUSER,	"user" },
+		{ CLONE_NEWUTS,		"uts" },
+	};
+	size_t i;
+
+	for (i = 0; i < sizeof(ns_names) / sizeof(ns_names[0]); i++) {
+		if (ns_names[i].flag == ns)
+			return ns_names[i].name;
+	}
+
+	return NULL;
+}
+
+/*
+ * Set the TCP_CORK option on socket @sk to 1 or 0 according to
+ * @on. The setsockopt() result is ignored.
+ */
+void tcp_cork(int sk, bool on)
+{
+ int val = on ? 1 : 0;
+ setsockopt(sk, SOL_TCP, TCP_CORK, &val, sizeof(val));
+}
+
+/*
+ * Set the TCP_NODELAY option on socket @sk to 1 or 0 according to
+ * @on. The setsockopt() result is ignored.
+ */
+void tcp_nodelay(int sk, bool on)
+{
+ int val = on ? 1 : 0;
+ setsockopt(sk, SOL_TCP, TCP_NODELAY, &val, sizeof(val));
+}
diff --git a/criu/uts_ns.c b/criu/uts_ns.c
new file mode 100644
index 000000000000..ed64d77ec467
--- /dev/null
+++ b/criu/uts_ns.c
@@ -0,0 +1,71 @@
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/utsname.h>
+#include <string.h>
+#include <sched.h>
+
+#include "util.h"
+#include "namespaces.h"
+#include "sysctl.h"
+#include "uts_ns.h"
+
+#include "protobuf.h"
+#include "protobuf/utsns.pb-c.h"
+
+/*
+ * Dump the UTS namespace: write the node and domain names reported
+ * by uname() into the CR_FD_UTSNS image for @ns_id.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int dump_uts_ns(int ns_id)
+{
+	UtsnsEntry ue = UTSNS_ENTRY__INIT;
+	struct utsname names;
+	struct cr_img *img;
+	int err;
+
+	img = open_image(CR_FD_UTSNS, O_DUMP, ns_id);
+	if (img == NULL)
+		return -1;
+
+	if (uname(&names) < 0) {
+		pr_perror("Error calling uname");
+		err = -1;
+	} else {
+		ue.nodename = names.nodename;
+		ue.domainname = names.domainname;
+
+		err = pb_write_one(img, &ue, PB_UTSNS) < 0 ? -1 : 0;
+	}
+
+	close_image(img);
+	return err;
+}
+
+/*
+ * Restore the UTS namespace: read the entry from the CR_FD_UTSNS
+ * image for @pid and write its node and domain names to the
+ * kernel/hostname and kernel/domainname sysctls.
+ *
+ * Returns the sysctl_op() result, -1 if the image cannot be opened,
+ * or the negative pb_read_one() result.
+ */
+int prepare_utsns(int pid)
+{
+	struct sysctl_req req[] = {
+		{ "kernel/hostname" },
+		{ "kernel/domainname" },
+	};
+	struct cr_img *img;
+	UtsnsEntry *ue;
+	int ret;
+
+	img = open_image(CR_FD_UTSNS, O_RSTR, pid);
+	if (img == NULL)
+		return -1;
+
+	ret = pb_read_one(img, &ue, PB_UTSNS);
+	if (ret >= 0) {
+		req[0].arg = ue->nodename;
+		req[0].type = CTL_STR(strlen(ue->nodename));
+		req[1].arg = ue->domainname;
+		req[1].type = CTL_STR(strlen(ue->domainname));
+
+		ret = sysctl_op(req, ARRAY_SIZE(req), CTL_WRITE, CLONE_NEWUTS);
+		utsns_entry__free_unpacked(ue, NULL);
+	}
+
+	close_image(img);
+	return ret;
+}
+
+/* Namespace descriptor for the UTS namespace (CLONE_NEWUTS, "uts"). */
+struct ns_desc uts_ns_desc = NS_DESC_ENTRY(CLONE_NEWUTS, "uts");
diff --git a/criu/vdso.c b/criu/vdso.c
new file mode 100644
index 000000000000..bccf11cc11f8
--- /dev/null
+++ b/criu/vdso.c
@@ -0,0 +1,320 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+#include <elf.h>
+#include <fcntl.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+
+#include "asm/types.h"
+#include "asm/parasite-syscall.h"
+
+#include "parasite-syscall.h"
+#include "parasite.h"
+#include "compiler.h"
+#include "kerndat.h"
+#include "vdso.h"
+#include "util.h"
+#include "log.h"
+#include "mem.h"
+#include "vma.h"
+
+#ifdef LOG_PREFIX
+# undef LOG_PREFIX
+#endif
+#define LOG_PREFIX "vdso: "
+
+struct vdso_symtable vdso_sym_rt = VDSO_SYMTABLE_INIT;
+u64 vdso_pfn = VDSO_BAD_PFN;
+/*
+ * The VMA list might contain proxy vdso/vvar areas left
+ * from a previous dump/restore cycle, so we need to detect
+ * them and eliminate them from the VMA list; they will be
+ * generated again on restore if needed.
+ */
+int parasite_fixup_vdso(struct parasite_ctl *ctl, pid_t pid,
+ struct vm_area_list *vma_area_list)
+{ /* returns 0 on success, -1 on any failure */
+ unsigned long proxy_vdso_addr = VDSO_BAD_ADDR;
+ unsigned long proxy_vvar_addr = VVAR_BAD_ADDR;
+ struct vma_area *proxy_vdso_marked = NULL;
+ struct vma_area *proxy_vvar_marked = NULL;
+ struct parasite_vdso_vma_entry *args;
+ int fd = -1, ret, exit_code = -1; /* fd: pagemap of pid, opened only when kdat.pmap == PM_FULL */
+ u64 pfn = VDSO_BAD_PFN;
+ struct vma_area *vma;
+ off_t off;
+
+ args = parasite_args(ctl, struct parasite_vdso_vma_entry);
+ if (kdat.pmap == PM_FULL) {
+ BUG_ON(vdso_pfn == VDSO_BAD_PFN);
+ fd = open_proc(pid, "pagemap");
+ if (fd < 0)
+ return -1;
+ } else
+ pr_info("Pagemap is unavailable, trying a slow way\n");
+
+ list_for_each_entry(vma, &vma_area_list->h, list) {
+ if (!vma_area_is(vma, VMA_AREA_REGULAR))
+ continue;
+
+ if (vma_area_is(vma, VMA_FILE_SHARED) ||
+ vma_area_is(vma, VMA_FILE_PRIVATE))
+ continue;
+ /*
+ * This may be the VVAR area belonging to a marked
+ * vDSO zone. Check for it before the VDSO_PROT test,
+ * because VVAR_PROT is a subset of VDSO_PROT, and
+ * only 'continue' when the address really matches
+ * the proxy VVAR; otherwise fall through.
+ */
+ BUILD_BUG_ON(!(VDSO_PROT & VVAR_PROT));
+
+ if ((vma->e->prot & VVAR_PROT) == VVAR_PROT) {
+ if (proxy_vvar_addr != VVAR_BAD_ADDR && /* NOTE(review): set only after the marked vDSO was seen, so a VVAR vma listed earlier is not caught here */
+ proxy_vvar_addr == vma->e->start) {
+ BUG_ON(proxy_vvar_marked);
+ proxy_vvar_marked = vma;
+ continue;
+ }
+ }
+
+ if ((vma->e->prot & VDSO_PROT) != VDSO_PROT)
+ continue;
+
+ if (vma->e->start > kdat.task_size)
+ continue;
+
+ if (vma->e->flags & MAP_GROWSDOWN)
+ continue;
+
+ /*
+ * Every potentially marked vma has to be poked:
+ * if the task never called any vdso function, its
+ * page frame number won't be reported otherwise.
+ *
+ * Moreover, if page frame numbers are not accessible
+ * we have to scan the vma zone for the vDSO ELF
+ * structure, which is the slow way.
+ */
+ args->start = vma->e->start;
+ args->len = vma_area_len(vma);
+ args->try_fill_symtable = (fd < 0) ? true : false; /* requested only when no pagemap fd is open */
+ args->is_vdso = false;
+
+ if (parasite_execute_daemon(PARASITE_CMD_CHECK_VDSO_MARK, ctl)) {
+ pr_err("Parasite failed to poke for mark\n");
+ goto err;
+ }
+
+ /*
+ * Defer handling the marked vdso until we have walked
+ * over all vmas and restored the status of a potentially
+ * remapped vDSO area.
+ */
+ if (unlikely(args->is_marked)) {
+ if (proxy_vdso_marked) {
+ pr_err("Ow! Second vdso mark detected!\n");
+ goto err;
+ }
+ proxy_vdso_marked = vma;
+ proxy_vdso_addr = args->proxy_vdso_addr;
+ proxy_vvar_addr = args->proxy_vvar_addr;
+ continue;
+ }
+
+ /*
+ * If we have an access to pagemap we can handle vDSO
+ * status early. Otherwise, in worst scenario, where
+ * the dumpee has been remapping vdso on its own and
+ * the kernel version is < 3.16, the vdso won't be
+ * detected via procfs status so we have to parse
+ * symbols in parasite code.
+ */
+ if (fd >= 0) {
+ off = (vma->e->start / PAGE_SIZE) * sizeof(u64); /* one u64 entry per page */
+ ret = pread(fd, &pfn, sizeof(pfn), off);
+ if (ret < 0 || ret != sizeof(pfn)) { /* error or short read */
+ pr_perror("Can't read pme for pid %d", pid);
+ goto err;
+ }
+
+ pfn = PME_PFRAME(pfn);
+ if (!pfn) {
+ pr_err("Unexpected page fram number 0 for pid %d\n", pid);
+ goto err;
+ }
+ }
+
+ /*
+ * Setup proper VMA status. Note starting with 3.16
+ * the [vdso]/[vvar] marks are reported correctly
+ * even when they are remapped into a new place,
+ * but only since that particular version of the
+ * kernel!
+ */
+ if ((pfn == vdso_pfn && pfn != VDSO_BAD_PFN) || args->is_vdso) {
+ if (!vma_area_is(vma, VMA_AREA_VDSO)) {
+ pr_debug("Restore vDSO status by pfn/symtable at %lx\n",
+ (long)vma->e->start);
+ vma->e->status |= VMA_AREA_VDSO;
+ }
+ } else {
+ if (unlikely(vma_area_is(vma, VMA_AREA_VDSO))) {
+ pr_debug("Drop mishinted vDSO status at %lx\n",
+ (long)vma->e->start);
+ vma->e->status &= ~VMA_AREA_VDSO;
+ }
+ }
+ }
+
+ /*
+ * A marked vdso was found: such a vdso is autogenerated
+ * and must be dropped from the vma list.
+ */
+ if (proxy_vdso_marked) {
+ pr_debug("vdso: Found marked at %lx (proxy vDSO at %lx VVAR at %lx)\n",
+ (long)proxy_vdso_marked->e->start,
+ (long)proxy_vdso_addr, (long)proxy_vvar_addr);
+
+ /*
+ * Don't forget to restore the proxy vdso/vvar status, since
+ * it's unknown to the kernel.
+ */
+ list_for_each_entry(vma, &vma_area_list->h, list) {
+ if (vma->e->start == proxy_vdso_addr) {
+ vma->e->status |= VMA_AREA_REGULAR | VMA_AREA_VDSO;
+ pr_debug("vdso: Restore proxy vDSO status at %lx\n",
+ (long)vma->e->start);
+ } else if (vma->e->start == proxy_vvar_addr) {
+ vma->e->status |= VMA_AREA_REGULAR | VMA_AREA_VVAR;
+ pr_debug("vdso: Restore proxy VVAR status at %lx\n",
+ (long)vma->e->start);
+ }
+ }
+
+ pr_debug("vdso: Droppping marked vdso at %lx\n",
+ (long)proxy_vdso_marked->e->start);
+ list_del(&proxy_vdso_marked->list);
+ xfree(proxy_vdso_marked);
+ vma_area_list->nr--;
+
+ if (proxy_vvar_marked) { /* NOTE(review): this vma starts at proxy_vvar_addr, i.e. the one just flagged VMA_AREA_VVAR above -- confirm dropping it is intended */
+ pr_debug("vdso: Droppping marked vvar at %lx\n",
+ (long)proxy_vvar_marked->e->start);
+ list_del(&proxy_vvar_marked->list);
+ xfree(proxy_vvar_marked);
+ vma_area_list->nr--;
+ }
+ }
+ exit_code = 0;
+err:
+ close_safe(&fd);
+ return exit_code;
+}
+
+static int vdso_fill_self_symtable(struct vdso_symtable *s) /* fill *s from our own "maps": [vdso]/[vvar] bounds plus vDSO symbols; 0 or -1 */
+{
+ char buf[512]; /* holds one line of the maps file */
+ int ret, exit_code = -1;
+ FILE *maps;
+
+ *s = (struct vdso_symtable)VDSO_SYMTABLE_INIT;
+
+ maps = fopen_proc(PROC_SELF, "maps");
+ if (!maps) {
+ pr_perror("Can't open self-vma");
+ return -1;
+ }
+
+ while (fgets(buf, sizeof(buf), maps)) {
+ unsigned long start, end;
+ char *has_vdso, *has_vvar;
+
+ has_vdso = strstr(buf, "[vdso]");
+ if (!has_vdso)
+ has_vvar = strstr(buf, "[vvar]");
+ else
+ has_vvar = NULL;
+
+ if (!has_vdso && !has_vvar)
+ continue;
+
+ ret = sscanf(buf, "%lx-%lx", &start, &end); /* the line begins with "start-end" in hex */
+ if (ret != 2) {
+ pr_err("Can't find vDSO/VVAR bounds\n");
+ goto err;
+ }
+
+ if (has_vdso) {
+ if (s->vma_start != VDSO_BAD_ADDR) {
+ pr_err("Got second vDSO entry\n");
+ goto err;
+ }
+ s->vma_start = start;
+ s->vma_end = end;
+
+ ret = vdso_fill_symtable((void *)start, end - start, s); /* parses the vDSO mapped in this process */
+ if (ret)
+ goto err;
+ } else {
+ if (s->vvar_start != VVAR_BAD_ADDR) {
+ pr_err("Got second VVAR entry\n");
+ goto err;
+ }
+ s->vvar_start = start;
+ s->vvar_end = end;
+ }
+ }
+
+ /*
+ * Validate the layout -- for the new vDSO format the
+ * two areas must be adjacent, like
+ *
+ * 7fff1f5fd000-7fff1f5fe000 r-xp 00000000 00:00 0 [vdso]
+ * 7fff1f5fe000-7fff1f600000 r--p 00000000 00:00 0 [vvar]
+ *
+ * The areas may be in reverse order.
+ *
+ * 7fffc3502000-7fffc3504000 r--p 00000000 00:00 0 [vvar]
+ * 7fffc3504000-7fffc3506000 r-xp 00000000 00:00 0 [vdso]
+ *
+ */
+ if (s->vma_start != VDSO_BAD_ADDR) {
+ if (s->vvar_start != VVAR_BAD_ADDR) { /* a missing [vvar] is tolerated */
+ if (s->vma_end != s->vvar_start &&
+ s->vvar_end != s->vma_start) { /* adjacent in neither order */
+ pr_err("Unexpected rt vDSO area bounds\n");
+ goto err;
+ }
+ }
+ } else {
+ pr_err("Can't find rt vDSO\n");
+ goto err;
+ }
+
+ pr_debug("rt [vdso] %lx-%lx [vvar] %lx-%lx\n",
+ s->vma_start, s->vma_end,
+ s->vvar_start, s->vvar_end);
+
+ exit_code = 0;
+err:
+ fclose(maps);
+ return exit_code;
+}
+
+int vdso_init(void) /* fill vdso_sym_rt and, when kdat.pmap == PM_FULL, vdso_pfn; 0 on success, -1 on error */
+{
+ if (vdso_fill_self_symtable(&vdso_sym_rt))
+ return -1;
+
+ if (kdat.pmap != PM_FULL)
+ pr_info("VDSO detection turned off\n"); /* vdso_pfn is not assigned on this path */
+ else if (vaddr_to_pfn(vdso_sym_rt.vma_start, &vdso_pfn))
+ return -1;
+
+ return 0;
+}
diff --git a/crtools b/crtools
deleted file mode 120000
index 33c504fed369..000000000000
--- a/crtools
+++ /dev/null
@@ -1 +0,0 @@
-criu
\ No newline at end of file
diff --git a/crtools.c b/crtools.c
deleted file mode 100644
index 44060293e730..000000000000
--- a/crtools.c
+++ /dev/null
@@ -1,836 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <limits.h>
-#include <unistd.h>
-#include <errno.h>
-#include <getopt.h>
-#include <string.h>
-#include <ctype.h>
-#include <sched.h>
-
-#include <fcntl.h>
-
-#include <sys/types.h>
-#include <sys/stat.h>
-
-#include <sys/socket.h>
-#include <netinet/in.h>
-#include <arpa/inet.h>
-
-#include <dlfcn.h>
-
-#include "asm/types.h"
-
-#include "compiler.h"
-#include "crtools.h"
-#include "cr_options.h"
-#include "sockets.h"
-#include "files.h"
-#include "sk-inet.h"
-#include "net.h"
-#include "version.h"
-#include "page-xfer.h"
-#include "tty.h"
-#include "file-lock.h"
-#include "cr-service.h"
-#include "plugin.h"
-#include "mount.h"
-#include "cgroup.h"
-#include "cpu.h"
-#include "action-scripts.h"
-#include "irmap.h"
-#include "fault-injection.h"
-#include "lsm.h"
-#include "proc_parse.h"
-
-#include "setproctitle.h"
-
-struct cr_options opts;
-
-void init_opts(void)
-{
- memset(&opts, 0, sizeof(opts));
-
- /* Default options */
- opts.final_state = TASK_DEAD;
- INIT_LIST_HEAD(&opts.ext_unixsk_ids);
- INIT_LIST_HEAD(&opts.veth_pairs);
- INIT_LIST_HEAD(&opts.scripts);
- INIT_LIST_HEAD(&opts.ext_mounts);
- INIT_LIST_HEAD(&opts.inherit_fds);
- INIT_LIST_HEAD(&opts.external);
- INIT_LIST_HEAD(&opts.new_cgroup_roots);
- INIT_LIST_HEAD(&opts.irmap_scan_paths);
-
- opts.cpu_cap = CPU_CAP_DEFAULT;
- opts.manage_cgroups = CG_MODE_DEFAULT;
- opts.ps_socket = -1;
- opts.ghost_limit = DEFAULT_GHOST_LIMIT;
- opts.timeout = DEFAULT_TIMEOUT;
-}
-
-static int parse_ns_string(const char *ptr)
-{
- const char *end = ptr + strlen(ptr);
-
- do {
- if (ptr[3] != ',' && ptr[3] != '\0')
- goto bad_ns;
- if (!strncmp(ptr, "uts", 3))
- opts.rst_namespaces_flags |= CLONE_NEWUTS;
- else if (!strncmp(ptr, "ipc", 3))
- opts.rst_namespaces_flags |= CLONE_NEWIPC;
- else if (!strncmp(ptr, "mnt", 3))
- opts.rst_namespaces_flags |= CLONE_NEWNS;
- else if (!strncmp(ptr, "pid", 3))
- opts.rst_namespaces_flags |= CLONE_NEWPID;
- else if (!strncmp(ptr, "net", 3))
- opts.rst_namespaces_flags |= CLONE_NEWNET;
- else
- goto bad_ns;
- ptr += 4;
- } while (ptr < end);
- return 0;
-
-bad_ns:
- pr_msg("Error: unknown namespace: %s\n", ptr);
- return -1;
-}
-
-static int parse_cpu_cap(struct cr_options *opts, const char *optarg)
-{
- bool inverse = false;
-
-#define ____cpu_set_cap(__opts, __cap, __inverse) \
- do { \
- if ((__inverse)) \
- (__opts)->cpu_cap &= ~(__cap); \
- else \
- (__opts)->cpu_cap |= (__cap); \
- } while (0)
-
- if (!optarg) {
- ____cpu_set_cap(opts, CPU_CAP_ALL, false);
- return 0;
- }
-
- while (*optarg) {
- if (optarg[0] == '^') {
- inverse = !inverse;
- optarg++;
- continue;
- } else if (optarg[0] == ',') {
- inverse = false;
- optarg++;
- continue;
- }
-
- if (!strncmp(optarg, "fpu", 3)) {
- ____cpu_set_cap(opts, CPU_CAP_FPU, inverse);
- optarg += 3;
- } else if (!strncmp(optarg, "all", 3)) {
- ____cpu_set_cap(opts, CPU_CAP_ALL, inverse);
- optarg += 3;
- } else if (!strncmp(optarg, "none", 4)) {
- if (inverse)
- opts->cpu_cap = CPU_CAP_ALL;
- else
- opts->cpu_cap = CPU_CAP_NONE;
- optarg += 4;
- } else if (!strncmp(optarg, "cpu", 3)) {
- ____cpu_set_cap(opts, CPU_CAP_CPU, inverse);
- optarg += 3;
- } else if (!strncmp(optarg, "ins", 3)) {
- ____cpu_set_cap(opts, CPU_CAP_INS, inverse);
- optarg += 3;
- } else
- goto Esyntax;
- }
-#undef ____cpu_set_cap
-
- return 0;
-
-Esyntax:
- pr_err("Unknown FPU mode `%s' selected\n", optarg);
- return -1;
-}
-
-static int parse_manage_cgroups(struct cr_options *opts, const char *optarg)
-{
- if (!optarg) {
- opts->manage_cgroups = CG_MODE_SOFT;
- return 0;
- }
-
- if (!strcmp(optarg, "none")) {
- opts->manage_cgroups = CG_MODE_NONE;
- } else if (!strcmp(optarg, "props")) {
- opts->manage_cgroups = CG_MODE_PROPS;
- } else if (!strcmp(optarg, "soft")) {
- opts->manage_cgroups = CG_MODE_SOFT;
- } else if (!strcmp(optarg, "full")) {
- opts->manage_cgroups = CG_MODE_FULL;
- } else if (!strcmp(optarg, "strict")) {
- opts->manage_cgroups = CG_MODE_STRICT;
- } else
- goto Esyntax;
-
- return 0;
-
-Esyntax:
- pr_err("Unknown cgroups mode `%s' selected\n", optarg);
- return -1;
-}
-
-static size_t parse_size(char *optarg)
-{
- if (index(optarg, 'K'))
- return (size_t)KILO(atol(optarg));
- else if (index(optarg, 'M'))
- return (size_t)MEGA(atol(optarg));
- else if (index(optarg, 'G'))
- return (size_t)GIGA(atol(optarg));
- return (size_t)atol(optarg);
-}
-
-int add_external(char *key)
-{
- struct external *ext;
-
- ext = xmalloc(sizeof(*ext));
- if (!ext)
- return -1;
- ext->id = key;
- list_add(&ext->node, &opts.external);
-
- return 0;
-}
-
-int main(int argc, char *argv[], char *envp[])
-{
- pid_t pid = 0, tree_id = 0;
- int ret = -1;
- bool usage_error = true;
- bool has_exec_cmd = false;
- int opt, idx;
- int log_level = LOG_UNSET;
- char *imgs_dir = ".";
- char *work_dir = NULL;
- static const char short_opts[] = "dSsRf:F:t:p:hcD:o:n:v::x::Vr:jlW:L:M:";
- static struct option long_opts[] = {
- { "tree", required_argument, 0, 't' },
- { "pid", required_argument, 0, 'p' },
- { "leave-stopped", no_argument, 0, 's' },
- { "leave-running", no_argument, 0, 'R' },
- { "restore-detached", no_argument, 0, 'd' },
- { "restore-sibling", no_argument, 0, 'S' },
- { "daemon", no_argument, 0, 'd' },
- { "contents", no_argument, 0, 'c' },
- { "file", required_argument, 0, 'f' },
- { "fields", required_argument, 0, 'F' },
- { "images-dir", required_argument, 0, 'D' },
- { "work-dir", required_argument, 0, 'W' },
- { "log-file", required_argument, 0, 'o' },
- { "namespaces", required_argument, 0, 'n' },
- { "root", required_argument, 0, 'r' },
- { USK_EXT_PARAM, optional_argument, 0, 'x' },
- { "help", no_argument, 0, 'h' },
- { SK_EST_PARAM, no_argument, 0, 1042 },
- { "close", required_argument, 0, 1043 },
- { "log-pid", no_argument, 0, 1044 },
- { "version", no_argument, 0, 'V' },
- { "evasive-devices", no_argument, 0, 1045 },
- { "pidfile", required_argument, 0, 1046 },
- { "veth-pair", required_argument, 0, 1047 },
- { "action-script", required_argument, 0, 1049 },
- { LREMAP_PARAM, no_argument, 0, 1041 },
- { OPT_SHELL_JOB, no_argument, 0, 'j' },
- { OPT_FILE_LOCKS, no_argument, 0, 'l' },
- { "page-server", no_argument, 0, 1050 },
- { "address", required_argument, 0, 1051 },
- { "port", required_argument, 0, 1052 },
- { "prev-images-dir", required_argument, 0, 1053 },
- { "ms", no_argument, 0, 1054 },
- { "track-mem", no_argument, 0, 1055 },
- { "auto-dedup", no_argument, 0, 1056 },
- { "libdir", required_argument, 0, 'L' },
- { "cpu-cap", optional_argument, 0, 1057 },
- { "force-irmap", no_argument, 0, 1058 },
- { "ext-mount-map", required_argument, 0, 'M' },
- { "exec-cmd", no_argument, 0, 1059 },
- { "manage-cgroups", optional_argument, 0, 1060 },
- { "cgroup-root", required_argument, 0, 1061 },
- { "inherit-fd", required_argument, 0, 1062 },
- { "feature", required_argument, 0, 1063 },
- { "skip-mnt", required_argument, 0, 1064 },
- { "enable-fs", required_argument, 0, 1065 },
- { "enable-external-sharing", no_argument, 0, 1066 },
- { "enable-external-masters", no_argument, 0, 1067 },
- { "freeze-cgroup", required_argument, 0, 1068 },
- { "ghost-limit", required_argument, 0, 1069 },
- { "irmap-scan-path", required_argument, 0, 1070 },
- { "lsm-profile", required_argument, 0, 1071 },
- { "timeout", required_argument, 0, 1072 },
- { "external", required_argument, 0, 1073 },
- { },
- };
-
- BUILD_BUG_ON(PAGE_SIZE != PAGE_IMAGE_SIZE);
-
- if (fault_injection_init())
- return 1;
-
- cr_pb_init();
- setproctitle_init(argc, argv, envp);
-
- if (argc < 2)
- goto usage;
-
- init_opts();
-
- if (init_service_fd())
- return 1;
-
- if (!strcmp(argv[1], "swrk")) {
- if (argc < 3)
- goto usage;
- /*
- * This is to start criu service worker from libcriu calls.
- * The usage is "criu swrk <fd>" and is not for CLI/scripts.
- * The arguments semantics can change at any tyme with the
- * corresponding lib call change.
- */
- opts.swrk_restore = true;
- return cr_service_work(atoi(argv[2]));
- }
-
- while (1) {
- idx = -1;
- opt = getopt_long(argc, argv, short_opts, long_opts, &idx);
- if (opt == -1)
- break;
-
- switch (opt) {
- case 's':
- opts.final_state = TASK_STOPPED;
- break;
- case 'R':
- opts.final_state = TASK_ALIVE;
- break;
- case 'x':
- if (optarg && unix_sk_ids_parse(optarg) < 0)
- return 1;
- opts.ext_unix_sk = true;
- break;
- case 'p':
- pid = atoi(optarg);
- if (pid <= 0)
- goto bad_arg;
- break;
- case 't':
- tree_id = atoi(optarg);
- if (tree_id <= 0)
- goto bad_arg;
- break;
- case 'c':
- opts.show_pages_content = true;
- break;
- case 'f':
- opts.show_dump_file = optarg;
- break;
- case 'F':
- opts.show_fmt = optarg;
- break;
- case 'r':
- opts.root = optarg;
- break;
- case 'd':
- opts.restore_detach = true;
- break;
- case 'S':
- opts.restore_sibling = true;
- break;
- case 'D':
- imgs_dir = optarg;
- break;
- case 'W':
- work_dir = optarg;
- break;
- case 'o':
- opts.output = optarg;
- break;
- case 'n':
- if (parse_ns_string(optarg))
- goto bad_arg;
- break;
- case 'v':
- if (log_level == LOG_UNSET)
- log_level = 0;
- if (optarg) {
- if (optarg[0] == 'v')
- /* handle -vvvvv */
- log_level += strlen(optarg) + 1;
- else
- log_level = atoi(optarg);
- } else
- log_level++;
- break;
- case 1041:
- pr_info("Will allow link remaps on FS\n");
- opts.link_remap_ok = true;
- break;
- case 1042:
- pr_info("Will dump TCP connections\n");
- opts.tcp_established_ok = true;
- break;
- case 1043: {
- int fd;
-
- fd = atoi(optarg);
- pr_info("Closing fd %d\n", fd);
- close(fd);
- break;
- }
- case 1044:
- opts.log_file_per_pid = 1;
- break;
- case 1045:
- opts.evasive_devices = true;
- break;
- case 1046:
- opts.pidfile = optarg;
- break;
- case 1047:
- {
- char *aux;
-
- aux = strchr(optarg, '=');
- if (aux == NULL)
- goto bad_arg;
-
- *aux = '\0';
- if (veth_pair_add(optarg, aux + 1))
- return 1;
- }
- break;
- case 1049:
- if (add_script(optarg, 0))
- return 1;
-
- break;
- case 1050:
- opts.use_page_server = true;
- break;
- case 1051:
- opts.addr = optarg;
- break;
- case 1052:
- opts.port = htons(atoi(optarg));
- if (!opts.port)
- goto bad_arg;
- break;
- case 'j':
- opts.shell_job = true;
- break;
- case 'l':
- opts.handle_file_locks = true;
- break;
- case 1053:
- opts.img_parent = optarg;
- break;
- case 1055:
- opts.track_mem = true;
- break;
- case 1056:
- opts.auto_dedup = true;
- break;
- case 1057:
- if (parse_cpu_cap(&opts, optarg))
- goto usage;
- break;
- case 1058:
- opts.force_irmap = true;
- break;
- case 1054:
- opts.check_ms_kernel = true;
- break;
- case 'L':
- opts.libdir = optarg;
- break;
- case 1059:
- has_exec_cmd = true;
- break;
- case 1060:
- if (parse_manage_cgroups(&opts, optarg))
- goto usage;
- break;
- case 1061:
- {
- char *path, *ctl;
-
- path = strchr(optarg, ':');
- if (path) {
- *path = '\0';
- path++;
- ctl = optarg;
- } else {
- path = optarg;
- ctl = NULL;
- }
-
- if (new_cg_root_add(ctl, path))
- return -1;
- }
- break;
- case 1062:
- if (inherit_fd_parse(optarg) < 0)
- return 1;
- break;
- case 1063:
- if (check_add_feature(optarg) < 0)
- return 1;
- break;
- case 1064:
- if (!add_skip_mount(optarg))
- return 1;
- break;
- case 1065:
- if (!add_fsname_auto(optarg))
- return 1;
- break;
- case 1066:
- opts.enable_external_sharing = true;
- break;
- case 1067:
- opts.enable_external_masters = true;
- break;
- case 1068:
- opts.freeze_cgroup = optarg;
- break;
- case 1069:
- opts.ghost_limit = parse_size(optarg);
- break;
- case 1070:
- if (irmap_scan_path_add(optarg))
- return -1;
- break;
- case 1071:
- if (parse_lsm_arg(optarg) < 0)
- return -1;
- break;
- case 1072:
- opts.timeout = atoi(optarg);
- break;
- case 'M':
- {
- char *aux;
-
- if (strcmp(optarg, "auto") == 0) {
- opts.autodetect_ext_mounts = true;
- break;
- }
-
- aux = strchr(optarg, ':');
- if (aux == NULL)
- goto bad_arg;
-
- *aux = '\0';
- if (ext_mount_add(optarg, aux + 1))
- return 1;
- }
- break;
- case 1073:
- if (add_external(optarg))
- return 1;
- break;
- case 'V':
- pr_msg("Version: %s\n", CRIU_VERSION);
- if (strcmp(CRIU_GITID, "0"))
- pr_msg("GitID: %s\n", CRIU_GITID);
- return 0;
- case 'h':
- usage_error = false;
- goto usage;
- default:
- goto usage;
- }
- }
-
- if (!opts.restore_detach && opts.restore_sibling) {
- pr_msg("--restore-sibling only makes sense with --restore-detach\n");
- return 1;
- }
-
- if (!opts.autodetect_ext_mounts && (opts.enable_external_masters || opts.enable_external_sharing)) {
- pr_msg("must specify --ext-mount-map auto with --enable-external-{sharing|masters}");
- return 1;
- }
-
- if (work_dir == NULL)
- work_dir = imgs_dir;
-
- if (optind >= argc) {
- pr_msg("Error: command is required\n");
- goto usage;
- }
-
- if (has_exec_cmd) {
- if (argc - optind <= 1) {
- pr_msg("Error: --exec-cmd requires a command\n");
- goto usage;
- }
-
- if (strcmp(argv[optind], "restore")) {
- pr_msg("Error: --exec-cmd is available for the restore command only\n");
- goto usage;
- }
-
- if (opts.restore_detach) {
- pr_msg("Error: --restore-detached and --exec-cmd cannot be used together\n");
- goto usage;
- }
-
- opts.exec_cmd = xmalloc((argc - optind) * sizeof(char *));
- if (!opts.exec_cmd)
- return 1;
- memcpy(opts.exec_cmd, &argv[optind + 1], (argc - optind - 1) * sizeof(char *));
- opts.exec_cmd[argc - optind - 1] = NULL;
- }
-
- /* We must not open imgs dir, if service is called */
- if (strcmp(argv[optind], "service")) {
- ret = open_image_dir(imgs_dir);
- if (ret < 0)
- return 1;
- }
-
- if (chdir(work_dir)) {
- pr_perror("Can't change directory to %s", work_dir);
- return 1;
- }
-
- log_set_loglevel(log_level);
-
- if (log_init(opts.output))
- return 1;
-
- if (!list_empty(&opts.external) && strcmp(argv[optind], "dump")) {
- pr_err("--external is dump-only option\n");
- return 1;
- }
-
- if (!list_empty(&opts.inherit_fds)) {
- if (strcmp(argv[optind], "restore")) {
- pr_err("--inherit-fd is restore-only option\n");
- return 1;
- }
- /* now that log file is set up, print inherit fd list */
- inherit_fd_log();
- }
-
- if (opts.img_parent)
- pr_info("Will do snapshot from %s\n", opts.img_parent);
-
- if (!strcmp(argv[optind], "dump")) {
- preload_socket_modules();
-
- if (!tree_id)
- goto opt_pid_missing;
- return cr_dump_tasks(tree_id);
- }
-
- if (!strcmp(argv[optind], "pre-dump")) {
- if (!tree_id)
- goto opt_pid_missing;
-
- return cr_pre_dump_tasks(tree_id) != 0;
- }
-
- if (!strcmp(argv[optind], "restore")) {
- if (tree_id)
- pr_warn("Using -t with criu restore is obsoleted\n");
-
- ret = cr_restore_tasks();
- if (ret == 0 && opts.exec_cmd) {
- close_pid_proc();
- execvp(opts.exec_cmd[0], opts.exec_cmd);
- pr_perror("Failed to exec command %s", opts.exec_cmd[0]);
- ret = 1;
- }
-
- return ret != 0;
- }
-
- if (!strcmp(argv[optind], "show"))
- return cr_show(pid) != 0;
-
- if (!strcmp(argv[optind], "check"))
- return cr_check() != 0;
-
- if (!strcmp(argv[optind], "exec")) {
- if (!pid)
- pid = tree_id; /* old usage */
- if (!pid)
- goto opt_pid_missing;
- return cr_exec(pid, argv + optind + 1) != 0;
- }
-
- if (!strcmp(argv[optind], "page-server"))
- return cr_page_server(opts.daemon_mode, -1) > 0 ? 0 : 1;
-
- if (!strcmp(argv[optind], "service"))
- return cr_service(opts.daemon_mode);
-
- if (!strcmp(argv[optind], "dedup"))
- return cr_dedup() != 0;
-
- if (!strcmp(argv[optind], "cpuinfo")) {
- if (!argv[optind + 1])
- goto usage;
- if (!strcmp(argv[optind + 1], "dump"))
- return cpuinfo_dump();
- else if (!strcmp(argv[optind + 1], "check"))
- return cpuinfo_check();
- }
-
- pr_msg("Error: unknown command: %s\n", argv[optind]);
-usage:
- pr_msg("\n"
-"Usage:\n"
-" criu dump|pre-dump -t PID [<options>]\n"
-" criu restore [<options>]\n"
-" criu check [--ms]\n"
-" criu exec -p PID <syscall-string>\n"
-" criu page-server\n"
-" criu service [<options>]\n"
-" criu dedup\n"
-"\n"
-"Commands:\n"
-" dump checkpoint a process/tree identified by pid\n"
-" pre-dump pre-dump task(s) minimizing their frozen time\n"
-" restore restore a process/tree\n"
-" check checks whether the kernel support is up-to-date\n"
-" exec execute a system call by other task\n"
-" page-server launch page server\n"
-" service launch service\n"
-" dedup remove duplicates in memory dump\n"
-" cpuinfo dump writes cpu information into image file\n"
-" cpuinfo check validates cpu information read from image file\n"
- );
-
- if (usage_error) {
- pr_msg("\nTry -h|--help for more info\n");
- return 1;
- }
-
- pr_msg("\n"
-"Dump/Restore options:\n"
-"\n"
-"* Generic:\n"
-" -t|--tree PID checkpoint a process tree identified by PID\n"
-" -d|--restore-detached detach after restore\n"
-" -S|--restore-sibling restore root task as sibling\n"
-" -s|--leave-stopped leave tasks in stopped state after checkpoint\n"
-" -R|--leave-running leave tasks in running state after checkpoint\n"
-" -D|--images-dir DIR directory for image files\n"
-" --pidfile FILE write root task, service or page-server pid to FILE\n"
-" -W|--work-dir DIR directory to cd and write logs/pidfiles/stats to\n"
-" (if not specified, value of --images-dir is used)\n"
-" --cpu-cap [CAP] require certain cpu capability. CAP: may be one of:\n"
-" 'cpu','fpu','all','ins','none'. To disable capability, prefix it with '^'.\n"
-" --exec-cmd execute the command specified after '--' on successful\n"
-" restore making it the parent of the restored process\n"
-" --freeze-cgroup\n"
-" use cgroup freezer to collect processes\n"
-"\n"
-"* Special resources support:\n"
-" -x|--" USK_EXT_PARAM "inode,.." " allow external unix connections (optionally can be assign socket's inode that allows one-sided dump)\n"
-" --" SK_EST_PARAM " checkpoint/restore established TCP connections\n"
-" -r|--root PATH change the root filesystem (when run in mount namespace)\n"
-" --evasive-devices use any path to a device file if the original one\n"
-" is inaccessible\n"
-" --veth-pair IN=OUT map inside veth device name to outside one\n"
-" can optionally append @<bridge-name> to OUT for moving\n"
-" the outside veth to the named bridge\n"
-" --link-remap allow one to link unlinked files back when possible\n"
-" --ghost-limit size specify maximum size of deleted file contents to be carried inside an image file\n"
-" --action-script FILE add an external action script\n"
-" -j|--" OPT_SHELL_JOB " allow one to dump and restore shell jobs\n"
-" -l|--" OPT_FILE_LOCKS " handle file locks, for safety, only used for container\n"
-" -L|--libdir path to a plugin directory (by default " CR_PLUGIN_DEFAULT ")\n"
-" --force-irmap force resolving names for inotify/fsnotify watches\n"
-" --irmap-scan-path FILE\n"
-" add a path the irmap hints to scan\n"
-" -M|--ext-mount-map KEY:VALUE\n"
-" add external mount mapping\n"
-" -M|--ext-mount-map auto\n"
-" attempt to autodetect external mount mapings\n"
-" --enable-external-sharing\n"
-" allow autoresolving mounts with external sharing\n"
-" --enable-external-masters\n"
-" allow autoresolving mounts with external masters\n"
-" --manage-cgroups [m] dump or restore cgroups the process is in usig mode:\n"
-" 'none', 'props', 'soft' (default), 'full' and 'strict'.\n"
-" --cgroup-root [controller:]/newroot\n"
-" change the root cgroup the controller will be\n"
-" installed into. No controller means that root is the\n"
-" default for all controllers not specified.\n"
-" --skip-mnt PATH ignore this mountpoint when dumping the mount namespace.\n"
-" --enable-fs FSNAMES a comma separated list of filesystem names or \"all\".\n"
-" force criu to (try to) dump/restore these filesystem's\n"
-" mountpoints even if fs is not supported.\n"
-" --external RES dump objects from this list as external resources:\n"
-" Formats of RES:\n"
-" tty[rdev:dev]\n"
-" --inherit-fd fd[<num>]:<existing>\n"
-" Inherit file descriptors. This allows to treat file descriptor\n"
-" <num> as being already opened via <existing> one and instead of\n"
-" trying to open we inherit it:\n"
-" tty[rdev:dev]\n"
-" pipe[inode]\n"
-" socket[inode]\n"
-"\n"
-"* Logging:\n"
-" -o|--log-file FILE log file name\n"
-" --log-pid enable per-process logging to separate FILE.pid files\n"
-" -v[NUM] set logging level (higher level means more output):\n"
-" -v1|-v - only errors and messages\n"
-" -v2|-vv - also warnings (default level)\n"
-" -v3|-vvv - also information messages and timestamps\n"
-" -v4|-vvvv - lots of debug\n"
-"\n"
-"* Memory dumping options:\n"
-" --track-mem turn on memory changes tracker in kernel\n"
-" --prev-images-dir DIR path to images from previous dump (relative to -D)\n"
-" --page-server send pages to page server (see options below as well)\n"
-" --auto-dedup when used on dump it will deduplicate \"old\" data in\n"
-" pages images of previous dump\n"
-" when used on restore, as soon as page is restored, it\n"
-" will be punched from the image.\n"
-"\n"
-"Page/Service server options:\n"
-" --address ADDR address of server or service\n"
-" --port PORT port of page server\n"
-" -d|--daemon run in the background after creating socket\n"
-"\n"
-"Other options:\n"
-" -h|--help show this text\n"
-" -V|--version show version\n"
-" --ms don't check not yet merged kernel features\n"
- );
-
- return 0;
-
-opt_pid_missing:
- pr_msg("Error: pid not specified\n");
- return 1;
-
-bad_arg:
- if (idx < 0) /* short option */
- pr_msg("Error: invalid argument for -%c: %s\n",
- opt, optarg);
- else /* long option */
- pr_msg("Error: invalid argument for --%s: %s\n",
- long_opts[idx].name, optarg);
- return 1;
-}
diff --git a/eventfd.c b/eventfd.c
deleted file mode 100644
index 21b5c9d7b6d8..000000000000
--- a/eventfd.c
+++ /dev/null
@@ -1,129 +0,0 @@
-#include <unistd.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <stdlib.h>
-#include <string.h>
-#include <limits.h>
-#include <sys/stat.h>
-#include <sys/statfs.h>
-#include <sys/types.h>
-#include <sys/ioctl.h>
-#include <sys/eventfd.h>
-
-#include "compiler.h"
-#include "asm/types.h"
-#include "imgset.h"
-#include "eventfd.h"
-#include "proc_parse.h"
-#include "image.h"
-#include "util.h"
-#include "log.h"
-
-#include "protobuf.h"
-#include "protobuf/eventfd.pb-c.h"
-
-#undef LOG_PREFIX
-#define LOG_PREFIX "eventfd: "
-
-struct eventfd_file_info {
- EventfdFileEntry *efe;
- struct file_desc d;
-};
-
-/* Checks if file descriptor @lfd is eventfd */
-int is_eventfd_link(char *link)
-{
- return is_anon_link_type(link, "[eventfd]");
-}
-
-static void pr_info_eventfd(char *action, EventfdFileEntry *efe)
-{
- pr_info("%s: id %#08x flags %#04x counter %#016"PRIx64"\n",
- action, efe->id, efe->flags, efe->counter);
-}
-
-struct eventfd_dump_arg {
- u32 id;
- const struct fd_parms *p;
- bool dumped;
-};
-
-static int dump_eventfd_entry(union fdinfo_entries *e, void *arg)
-{
- struct eventfd_dump_arg *da = arg;
-
- if (da->dumped) {
- pr_err("Several counters in a file?\n");
- return -1;
- }
-
- da->dumped = true;
- e->efd.id = da->id;
- e->efd.flags = da->p->flags;
- e->efd.fown = (FownEntry *)&da->p->fown;
-
- pr_info_eventfd("Dumping ", &e->efd);
- return pb_write_one(img_from_set(glob_imgset, CR_FD_EVENTFD_FILE),
- &e->efd, PB_EVENTFD_FILE);
-}
-
-static int dump_one_eventfd(int lfd, u32 id, const struct fd_parms *p)
-{
- struct eventfd_dump_arg da = { .id = id, .p = p, };
- return parse_fdinfo(lfd, FD_TYPES__EVENTFD, dump_eventfd_entry, &da);
-}
-
-const struct fdtype_ops eventfd_dump_ops = {
- .type = FD_TYPES__EVENTFD,
- .dump = dump_one_eventfd,
-};
-
-static int eventfd_open(struct file_desc *d)
-{
- struct eventfd_file_info *info;
- int tmp;
-
- info = container_of(d, struct eventfd_file_info, d);
-
- tmp = eventfd(info->efe->counter, 0);
- if (tmp < 0) {
- pr_perror("Can't create eventfd %#08x",
- info->efe->id);
- return -1;
- }
-
- if (rst_file_params(tmp, info->efe->fown, info->efe->flags)) {
- pr_perror("Can't restore params on eventfd %#08x",
- info->efe->id);
- goto err_close;
- }
-
- return tmp;
-
-err_close:
- close(tmp);
- return -1;
-}
-
-static struct file_desc_ops eventfd_desc_ops = {
- .type = FD_TYPES__EVENTFD,
- .open = eventfd_open,
-};
-
-static int collect_one_efd(void *obj, ProtobufCMessage *msg)
-{
- struct eventfd_file_info *info = obj;
-
- info->efe = pb_msg(msg, EventfdFileEntry);
- pr_info_eventfd("Collected ", info->efe);
- return file_desc_add(&info->d, info->efe->id, &eventfd_desc_ops);
-}
-
-struct collect_image_info eventfd_cinfo = {
- .fd_type = CR_FD_EVENTFD_FILE,
- .pb_type = PB_EVENTFD_FILE,
- .priv_size = sizeof(struct eventfd_file_info),
- .collect = collect_one_efd,
-};
diff --git a/eventpoll.c b/eventpoll.c
deleted file mode 100644
index c414c35b9923..000000000000
--- a/eventpoll.c
+++ /dev/null
@@ -1,229 +0,0 @@
-#include <unistd.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <stdlib.h>
-#include <string.h>
-#include <limits.h>
-#include <sys/stat.h>
-#include <sys/statfs.h>
-#include <sys/types.h>
-#include <sys/ioctl.h>
-#include <sys/epoll.h>
-
-#include "compiler.h"
-#include "asm/types.h"
-#include "imgset.h"
-#include "rst_info.h"
-#include "eventpoll.h"
-#include "proc_parse.h"
-#include "image.h"
-#include "util.h"
-#include "log.h"
-
-#include "protobuf.h"
-#include "protobuf/eventpoll.pb-c.h"
-
-#undef LOG_PREFIX
-#define LOG_PREFIX "epoll: "
-
-struct eventpoll_file_info {
- EventpollFileEntry *efe;
- struct file_desc d;
-};
-
-struct eventpoll_tfd_file_info {
- EventpollTfdEntry *tdefe;
- struct list_head list;
-};
-
-static LIST_HEAD(eventpoll_tfds);
-
-/* Checks if file descriptor @lfd is eventfd */
-int is_eventpoll_link(char *link)
-{
- return is_anon_link_type(link, "[eventpoll]");
-}
-
-static void pr_info_eventpoll_tfd(char *action, EventpollTfdEntry *e)
-{
- pr_info("%seventpoll-tfd: id %#08x tfd %#08x events %#08x data %#016"PRIx64"\n",
- action, e->id, e->tfd, e->events, e->data);
-}
-
-static void pr_info_eventpoll(char *action, EventpollFileEntry *e)
-{
- pr_info("%seventpoll: id %#08x flags %#04x\n", action, e->id, e->flags);
-}
-
-struct eventpoll_list {
- struct list_head list;
- int n;
-};
-
-static int dump_eventpoll_entry(union fdinfo_entries *e, void *arg)
-{
- struct eventpoll_list *ep_list = (struct eventpoll_list *) arg;
- EventpollTfdEntry *efd = &e->epl.e;
-
- pr_info_eventpoll_tfd("Dumping: ", efd);
-
- list_add_tail(&e->epl.node, &ep_list->list);
- ep_list->n++;
-
- return 0;
-}
-
-static int dump_one_eventpoll(int lfd, u32 id, const struct fd_parms *p)
-{
- EventpollFileEntry e = EVENTPOLL_FILE_ENTRY__INIT;
- struct eventpoll_list ep_list = {LIST_HEAD_INIT(ep_list.list), 0};
- union fdinfo_entries *te, *tmp;
- int i, ret = -1;
-
- e.id = id;
- e.flags = p->flags;
- e.fown = (FownEntry *)&p->fown;
-
- if (parse_fdinfo(lfd, FD_TYPES__EVENTPOLL, dump_eventpoll_entry, &ep_list))
- goto out;
-
- e.tfd = xmalloc(sizeof(struct EventpollTfdEntry *) * ep_list.n);
- if (!e.tfd)
- goto out;
-
- i = 0;
- list_for_each_entry(te, &ep_list.list, epl.node)
- e.tfd[i++] = &te->epl.e;
- e.n_tfd = ep_list.n;
-
- pr_info_eventpoll("Dumping ", &e);
- ret = pb_write_one(img_from_set(glob_imgset, CR_FD_EVENTPOLL_FILE),
- &e, PB_EVENTPOLL_FILE);
-out:
- list_for_each_entry_safe(te, tmp, &ep_list.list, epl.node)
- free_event_poll_entry(te);
-
- return ret;
-}
-
-const struct fdtype_ops eventpoll_dump_ops = {
- .type = FD_TYPES__EVENTPOLL,
- .dump = dump_one_eventpoll,
-};
-
-static int eventpoll_open(struct file_desc *d)
-{
- struct eventpoll_file_info *info;
- int tmp;
-
- info = container_of(d, struct eventpoll_file_info, d);
-
- pr_info_eventpoll("Restore ", info->efe);
-
- tmp = epoll_create(1);
- if (tmp < 0) {
- pr_perror("Can't create epoll %#08x",
- info->efe->id);
- return -1;
- }
-
- if (rst_file_params(tmp, info->efe->fown, info->efe->flags)) {
- pr_perror("Can't restore file params on epoll %#08x",
- info->efe->id);
- goto err_close;
- }
-
- return tmp;
-err_close:
- close(tmp);
- return -1;
-}
-static int eventpoll_retore_tfd(int fd, int id, EventpollTfdEntry *tdefe)
-{
- struct epoll_event event;
-
- pr_info_eventpoll_tfd("Restore ", tdefe);
-
- event.events = tdefe->events;
- event.data.u64 = tdefe->data;
- if (epoll_ctl(fd, EPOLL_CTL_ADD, tdefe->tfd, &event)) {
- pr_perror("Can't add event on %#08x", id);
- return -1;
- }
-
- return 0;
-}
-
-static int eventpoll_post_open(struct file_desc *d, int fd)
-{
- struct eventpoll_tfd_file_info *td_info;
- struct eventpoll_file_info *info;
- int i;
-
- info = container_of(d, struct eventpoll_file_info, d);
-
- for (i = 0; i < info->efe->n_tfd; i++) {
- if (eventpoll_retore_tfd(fd, info->efe->id, info->efe->tfd[i]))
- return -1;
- }
-
- list_for_each_entry(td_info, &eventpoll_tfds, list) {
- if (td_info->tdefe->id != info->efe->id)
- continue;
-
- if (eventpoll_retore_tfd(fd, info->efe->id, td_info->tdefe))
- return -1;
-
- }
-
- return 0;
-}
-
-static void eventpoll_collect_fd(struct file_desc *d,
- struct fdinfo_list_entry *fle, struct rst_info *ri)
-{
- list_add_tail(&fle->ps_list, &ri->eventpoll);
-}
-
-static struct file_desc_ops desc_ops = {
- .type = FD_TYPES__EVENTPOLL,
- .open = eventpoll_open,
- .post_open = eventpoll_post_open,
- .collect_fd = eventpoll_collect_fd,
-};
-
-static int collect_one_epoll_tfd(void *o, ProtobufCMessage *msg)
-{
- struct eventpoll_tfd_file_info *info = o;
-
- info->tdefe = pb_msg(msg, EventpollTfdEntry);
- list_add(&info->list, &eventpoll_tfds);
- pr_info_eventpoll_tfd("Collected ", info->tdefe);
-
- return 0;
-}
-
-struct collect_image_info epoll_tfd_cinfo = {
- .fd_type = CR_FD_EVENTPOLL_TFD,
- .pb_type = PB_EVENTPOLL_TFD,
- .priv_size = sizeof(struct eventpoll_tfd_file_info),
- .collect = collect_one_epoll_tfd,
-};
-
-static int collect_one_epoll(void *o, ProtobufCMessage *msg)
-{
- struct eventpoll_file_info *info = o;
-
- info->efe = pb_msg(msg, EventpollFileEntry);
- pr_info_eventpoll("Collected ", info->efe);
- return file_desc_add(&info->d, info->efe->id, &desc_ops);
-}
-
-struct collect_image_info epoll_cinfo = {
- .fd_type = CR_FD_EVENTPOLL_FILE,
- .pb_type = PB_EVENTPOLL_FILE,
- .priv_size = sizeof(struct eventpoll_file_info),
- .collect = collect_one_epoll,
-};
diff --git a/fault-injection.c b/fault-injection.c
deleted file mode 100644
index f239fd9db649..000000000000
--- a/fault-injection.c
+++ /dev/null
@@ -1,22 +0,0 @@
-#include <stdlib.h>
-#include "fault-injection.h"
-
-enum faults fi_strategy;
-
-int fault_injection_init()
-{
- char *val;
- int strat;
-
- val = getenv("CRIU_FAULT");
- if (val == NULL)
- return 0;
-
- strat = atoi(val);
-
- if (strat <= 0 || strat >= FI_MAX)
- return -1;
-
- fi_strategy = strat;
- return 0;
-}
diff --git a/fifo.c b/fifo.c
deleted file mode 100644
index bd06da9c16e7..000000000000
--- a/fifo.c
+++ /dev/null
@@ -1,168 +0,0 @@
-#include <unistd.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <stdlib.h>
-
-#include "imgset.h"
-#include "image.h"
-#include "files.h"
-#include "files-reg.h"
-#include "pipes.h"
-
-#include "fifo.h"
-
-#include "protobuf.h"
-#include "protobuf/regfile.pb-c.h"
-#include "protobuf/fifo.pb-c.h"
-
-/*
- * FIFO checkpoint and restore is done in a bit unusual manner.
- * We use files-reg.c engine to save fifo path and flags,
- * thus regular files image will contain fifo descriptors which
- * are useless for reg-files engine itself but needed for our fifo
- * engine.
- *
- * In particular we dump fifo-entry automatically and appropriate
- * reg-file entry manually, thus on restore we need to ask reg-file
- * engine to restore fifo path and flags via direct call.
- */
-
-struct fifo_info {
- struct list_head list;
- struct file_desc d;
- FifoEntry *fe;
- bool restore_data;
- struct file_desc *reg_d;
-};
-
-static LIST_HEAD(fifo_head);
-static struct pipe_data_dump pd_fifo = { .img_type = CR_FD_FIFO_DATA, };
-
-static int dump_one_fifo(int lfd, u32 id, const struct fd_parms *p)
-{
- struct cr_img *img = img_from_set(glob_imgset, CR_FD_FIFO);
- FifoEntry e = FIFO_ENTRY__INIT;
-
- /*
- * It's a trick here, we use regular files dumping
- * code to save path to a fifo, then we reuse it
- * on restore.
- */
- if (dump_one_reg_file(lfd, id, p))
- return -1;
-
- pr_info("Dumping fifo %d with id %#x pipe_id %#x\n",
- lfd, id, pipe_id(p));
-
- e.id = id;
- e.pipe_id = pipe_id(p);
-
- if (pb_write_one(img, &e, PB_FIFO))
- return -1;
-
- return dump_one_pipe_data(&pd_fifo, lfd, p);
-}
-
-const struct fdtype_ops fifo_dump_ops = {
- .type = FD_TYPES__FIFO,
- .dump = dump_one_fifo,
-};
-
-static struct pipe_data_rst *pd_hash_fifo[PIPE_DATA_HASH_SIZE];
-
-static int do_open_fifo(int ns_root_fd, struct reg_file_info *rfi, void *arg)
-{
- struct fifo_info *info = arg;
- int new_fifo, fake_fifo = -1;
-
- /*
- * The fifos (except read-write fifos) do wait until
- * another pipe-end get connected, so to be able to
- * proceed the restoration procedure we open a fake
- * fifo here.
- */
- fake_fifo = openat(ns_root_fd, rfi->path, O_RDWR);
- if (fake_fifo < 0) {
- pr_perror("Can't open fake fifo %#x [%s]", info->fe->id, rfi->path);
- return -1;
- }
-
- new_fifo = openat(ns_root_fd, rfi->path, rfi->rfe->flags);
- if (new_fifo < 0) {
- pr_perror("Can't open fifo %#x [%s]", info->fe->id, rfi->path);
- goto out;
- }
-
- if (info->restore_data)
- if (restore_pipe_data(CR_FD_FIFO_DATA, fake_fifo,
- info->fe->pipe_id, pd_hash_fifo)) {
- close(new_fifo);
- new_fifo = -1;
- }
-
-out:
- close(fake_fifo);
- return new_fifo;
-}
-
-static int open_fifo_fd(struct file_desc *d)
-{
- struct fifo_info *info = container_of(d, struct fifo_info, d);
-
- return open_path(info->reg_d, do_open_fifo, info);
-}
-
-static void collect_fifo_fd(struct file_desc *d,
- struct fdinfo_list_entry *fle, struct rst_info *ri)
-{
- struct fifo_info *info;
-
- info = container_of(d, struct fifo_info, d);
- info->reg_d = collect_special_file(info->fe->id);
- BUG_ON(info->reg_d == NULL);
- collect_gen_fd(fle, ri);
-}
-
-static struct file_desc_ops fifo_desc_ops = {
- .type = FD_TYPES__FIFO,
- .open = open_fifo_fd,
- .collect_fd = collect_fifo_fd,
-};
-
-static int collect_one_fifo(void *o, ProtobufCMessage *base)
-{
- struct fifo_info *info = o, *f;
-
- info->fe = pb_msg(base, FifoEntry);
- pr_info("Collected fifo entry ID %#x PIPE ID %#x\n",
- info->fe->id, info->fe->pipe_id);
-
- /* check who will restore the fifo data */
- list_for_each_entry(f, &fifo_head, list)
- if (f->fe->pipe_id == info->fe->pipe_id)
- break;
-
- if (&f->list == &fifo_head) {
- list_add(&info->list, &fifo_head);
- info->restore_data = true;
- } else {
- INIT_LIST_HEAD(&info->list);
- info->restore_data = false;
- }
-
- return file_desc_add(&info->d, info->fe->id, &fifo_desc_ops);
-
-}
-
-struct collect_image_info fifo_cinfo = {
- .fd_type = CR_FD_FIFO,
- .pb_type = PB_FIFO,
- .priv_size = sizeof(struct fifo_info),
- .collect = collect_one_fifo,
-};
-
-int collect_fifo(void)
-{
- return collect_pipe_data(CR_FD_FIFO_DATA, pd_hash_fifo);
-}
diff --git a/file-ids.c b/file-ids.c
deleted file mode 100644
index f23924a0516b..000000000000
--- a/file-ids.c
+++ /dev/null
@@ -1,113 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdarg.h>
-#include <signal.h>
-#include <limits.h>
-#include <unistd.h>
-#include <errno.h>
-#include <string.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-
-#include "asm/types.h"
-#include "file-ids.h"
-#include "rbtree.h"
-#include "kcmp-ids.h"
-#include "compiler.h"
-#include "image.h"
-#include "util.h"
-#include "irmap.h"
-#include "files.h"
-
-static DECLARE_KCMP_TREE(fd_tree, KCMP_FILE);
-
-#define FDID_BITS 5
-#define FDID_SIZE (1 << FDID_BITS)
-#define FDID_MASK (FDID_SIZE - 1)
-
-static inline int fdid_hashfn(unsigned int s_dev, unsigned long i_ino)
-{
- return (s_dev + i_ino) & FDID_MASK;
-}
-
-struct fd_id {
- int mnt_id;
- unsigned int dev;
- unsigned long ino;
- u32 id;
- struct fd_id *n;
-};
-
-static struct fd_id *fd_id_cache[FDID_SIZE];
-
-static void fd_id_cache_one(u32 id, struct fd_parms *p)
-{
- struct fd_id *fi;
- unsigned hv;
-
- fi = xmalloc(sizeof(*fi));
- if (fi) {
- fi->dev = p->stat.st_dev;
- fi->ino = p->stat.st_ino;
- fi->mnt_id = p->mnt_id;
- fi->id = id;
-
- hv = fdid_hashfn(p->stat.st_dev, p->stat.st_ino);
- fi->n = fd_id_cache[hv];
- fd_id_cache[hv] = fi;
- }
-}
-
-static struct fd_id *fd_id_cache_lookup(struct fd_parms *p)
-{
- struct stat *st = &p->stat;
- struct fd_id *fi;
-
- for (fi = fd_id_cache[fdid_hashfn(st->st_dev, st->st_ino)];
- fi; fi = fi->n)
- if (fi->dev == st->st_dev &&
- fi->ino == st->st_ino &&
- fi->mnt_id == p->mnt_id)
- return fi;
-
- return NULL;
-}
-
-int fd_id_generate_special(struct fd_parms *p, u32 *id)
-{
- if (p) {
- struct fd_id *fi;
-
- fi = fd_id_cache_lookup(p);
- if (fi) {
- *id = fi->id;
- return 0;
- }
- }
-
- *id = fd_tree.subid++;
- if (p)
- fd_id_cache_one(*id, p);
- return 1;
-}
-
-int fd_id_generate(pid_t pid, FdinfoEntry *fe, struct fd_parms *p)
-{
- u32 id;
- struct kid_elem e;
- int new_id = 0;
-
- e.pid = pid;
- e.genid = fe->id;
- e.idx = fe->fd;
-
- id = kid_generate_gen(&fd_tree, &e, &new_id);
- if (!id)
- return -ENOMEM;
-
- if (new_id)
- fd_id_cache_one(id, p);
-
- fe->id = id;
- return new_id;
-}
diff --git a/file-lock.c b/file-lock.c
deleted file mode 100644
index 8e4e48192ecf..000000000000
--- a/file-lock.c
+++ /dev/null
@@ -1,377 +0,0 @@
-#include <stdlib.h>
-#include <unistd.h>
-#include <sys/file.h>
-#include <fcntl.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-
-#include "cr_options.h"
-#include "imgset.h"
-#include "files.h"
-#include "fs-magic.h"
-#include "kerndat.h"
-#include "image.h"
-#include "mount.h"
-#include "proc_parse.h"
-#include "servicefd.h"
-#include "file-lock.h"
-#include "parasite.h"
-#include "parasite-syscall.h"
-
-struct file_lock_rst {
- FileLockEntry *fle;
- struct list_head l;
-};
-
-struct list_head file_lock_list = LIST_HEAD_INIT(file_lock_list);
-
-static int collect_one_file_lock(void *o, ProtobufCMessage *m)
-{
- struct file_lock_rst *lr = o;
-
- lr->fle = pb_msg(m, FileLockEntry);
- list_add_tail(&lr->l, &file_lock_list);
-
- return 0;
-}
-
-struct collect_image_info file_locks_cinfo = {
- .fd_type = CR_FD_FILE_LOCKS,
- .pb_type = PB_FILE_LOCK,
- .priv_size = sizeof(struct file_lock_rst),
- .collect = collect_one_file_lock,
-};
-
-struct file_lock *alloc_file_lock(void)
-{
- struct file_lock *flock;
-
- flock = xzalloc(sizeof(*flock));
- if (!flock)
- return NULL;
-
- INIT_LIST_HEAD(&flock->list);
- flock->real_owner = -1;
- flock->owners_fd = -1;
-
- return flock;
-}
-
-void free_file_locks(void)
-{
- struct file_lock *flock, *tmp;
-
- list_for_each_entry_safe(flock, tmp, &file_lock_list, list) {
- xfree(flock);
- }
-
- INIT_LIST_HEAD(&file_lock_list);
-}
-
-static int dump_one_file_lock(FileLockEntry *fle)
-{
- pr_info("LOCK flag: %d,type: %d,pid: %d,fd: %d,start: %8"PRIx64",len: %8"PRIx64"\n",
- fle->flag, fle->type, fle->pid, fle->fd, fle->start, fle->len);
-
- return pb_write_one(img_from_set(glob_imgset, CR_FD_FILE_LOCKS),
- fle, PB_FILE_LOCK);
-}
-
-static void fill_flock_entry(FileLockEntry *fle, int fl_kind, int fl_ltype)
-{
- fle->flag |= fl_kind;
- fle->type = fl_ltype;
-}
-
-int dump_file_locks(void)
-{
- FileLockEntry fle;
- struct file_lock *fl;
- int ret = 0;
-
- pr_info("Dumping file-locks\n");
-
- list_for_each_entry(fl, &file_lock_list, list) {
- if (fl->real_owner == -1) {
- if (fl->fl_kind == FL_POSIX) {
- pr_err("Unresolved lock found pid %d ino %ld\n",
- fl->fl_owner, fl->i_no);
- return -1;
- }
-
- continue;
- }
-
- file_lock_entry__init(&fle);
- fle.pid = fl->real_owner;
- fle.fd = fl->owners_fd;
- fill_flock_entry(&fle, fl->fl_kind, fl->fl_ltype);
- fle.start = fl->start;
- if (!strncmp(fl->end, "EOF", 3))
- fle.len = 0;
- else
- fle.len = (atoll(fl->end) + 1) - fl->start;
-
- ret = dump_one_file_lock(&fle);
- if (ret) {
- pr_err("Dump file lock failed!\n");
- goto err;
- }
- }
-
-err:
- return ret;
-}
-
-static int lock_btrfs_file_match(pid_t pid, int fd, struct file_lock *fl, struct fd_parms *p)
-{
- int phys_dev = MKKDEV(fl->maj, fl->min);
- char link[PATH_MAX], t[32];
- struct ns_id *ns;
- int ret;
-
- snprintf(t, sizeof(t), "/proc/%d/fd/%d", pid, fd);
- ret = readlink(t, link, sizeof(link)) - 1;
- if (ret < 0) {
- pr_perror("Can't read link of fd %d", fd);
- return -1;
- } else if ((size_t)ret == sizeof(link)) {
- pr_err("Buffer for read link of fd %d is too small\n", fd);
- return -1;
- }
- link[ret] = 0;
-
- ns = lookup_nsid_by_mnt_id(p->mnt_id);
- return phys_stat_dev_match(p->stat.st_dev, phys_dev, ns, link);
-}
-
-static inline int lock_file_match(pid_t pid, int fd, struct file_lock *fl, struct fd_parms *p)
-{
- dev_t dev = p->stat.st_dev;
-
- if (fl->i_no != p->stat.st_ino)
- return 0;
-
- /*
- * Get the right devices for BTRFS. Look at phys_stat_resolve_dev()
- * for more details.
- */
- if (p->fs_type == BTRFS_SUPER_MAGIC) {
- if (p->mnt_id != -1) {
- struct mount_info *m;
-
- m = lookup_mnt_id(p->mnt_id);
- BUG_ON(m == NULL);
- dev = kdev_to_odev(m->s_dev);
- } else /* old kernel */
- return lock_btrfs_file_match(pid, fd, fl, p);
- }
-
- return makedev(fl->maj, fl->min) == dev;
-}
-
-static int lock_check_fd(int lfd, struct file_lock *fl)
-{
- int ret;
-
- if (fl->fl_ltype & LOCK_MAND)
- ret = flock(lfd, LOCK_MAND | LOCK_RW);
- else
- ret = flock(lfd, LOCK_EX | LOCK_NB);
- pr_debug(" `- %d/%d\n", ret, errno);
- if (ret != 0) {
- if (errno != EAGAIN) {
- pr_err("Bogus lock test result %d\n", ret);
- return -1;
- }
-
- return 0;
- } else {
- /*
- * The ret == 0 means, that new lock doesn't conflict
- * with any others on the file. But since we do know,
- * that there should be some other one (file is found
- * in /proc/locks), it means that the lock is already
- * on file pointed by fd.
- */
- pr_debug(" `- downgrading lock back\n");
- if (fl->fl_ltype & LOCK_MAND)
- flock(lfd, fl->fl_ltype);
- else if (fl->fl_ltype == F_RDLCK)
- flock(lfd, LOCK_SH);
- }
-
- return 1;
-}
-
-int note_file_lock(struct pid *pid, int fd, int lfd, struct fd_parms *p)
-{
- struct file_lock *fl;
- int ret;
-
- if (kdat.has_fdinfo_lock)
- return 0;
-
- list_for_each_entry(fl, &file_lock_list, list) {
- ret = lock_file_match(pid->real, fd, fl, p);
- if (ret < 0)
- return -1;
- if (ret == 0)
- continue;
-
- if (!opts.handle_file_locks) {
- pr_err("Some file locks are hold by dumping tasks!"
- "You can try --" OPT_FILE_LOCKS " to dump them.\n");
- return -1;
- }
-
- if (fl->fl_kind == FL_POSIX) {
- /*
- * POSIX locks cannot belong to anyone
- * but creator.
- */
- if (fl->fl_owner != pid->real)
- continue;
- } else /* fl->fl_kind == FL_FLOCK */ {
- int ret;
-
- /*
- * FLOCKs can be inherited across fork,
- * thus we can have any task as lock
- * owner. But the creator is preferred
- * anyway.
- */
-
- if (fl->fl_owner != pid->real &&
- fl->real_owner != -1)
- continue;
-
- pr_debug("Checking lock holder %d:%d\n", pid->real, fd);
- ret = lock_check_fd(lfd, fl);
- if (ret < 0)
- return ret;
- if (ret == 0)
- continue;
- }
-
- fl->real_owner = pid->virt;
- fl->owners_fd = fd;
-
- pr_info("Found lock entry %d.%d %d vs %d\n",
- pid->real, pid->virt, fd,
- fl->fl_owner);
- }
-
- return 0;
-}
-
-static int restore_file_lock(FileLockEntry *fle)
-{
- int ret = -1;
- unsigned int cmd;
-
- if (fle->flag & FL_FLOCK) {
- if (fle->type & LOCK_MAND) {
- cmd = fle->type;
- } else if (fle->type == F_RDLCK) {
- cmd = LOCK_SH;
- } else if (fle->type == F_WRLCK) {
- cmd = LOCK_EX;
- } else if (fle->type == F_UNLCK) {
- cmd = LOCK_UN;
- } else {
- pr_err("Unknown flock type!\n");
- goto err;
- }
-
- pr_info("(flock)flag: %d, type: %d, cmd: %d, pid: %d, fd: %d\n",
- fle->flag, fle->type, cmd, fle->pid, fle->fd);
-
- ret = flock(fle->fd, cmd);
- if (ret < 0) {
- pr_err("Can not set flock!\n");
- goto err;
- }
- } else if (fle->flag & FL_POSIX) {
- struct flock flk;
- memset(&flk, 0, sizeof(flk));
-
- flk.l_whence = SEEK_SET;
- flk.l_start = fle->start;
- flk.l_len = fle->len;
- flk.l_pid = fle->pid;
- flk.l_type = fle->type;
-
- pr_info("(posix)flag: %d, type: %d, pid: %d, fd: %d, "
- "start: %8"PRIx64", len: %8"PRIx64"\n",
- fle->flag, fle->type, fle->pid, fle->fd,
- fle->start, fle->len);
-
- ret = fcntl(fle->fd, F_SETLKW, &flk);
- if (ret < 0) {
- pr_err("Can not set posix lock!\n");
- goto err;
- }
- } else {
- pr_err("Unknown file lock style!\n");
- goto err;
- }
-
- return 0;
-err:
- return ret;
-}
-
-static int restore_file_locks(int pid)
-{
- int ret = 0;
- struct file_lock_rst *lr;
-
- list_for_each_entry(lr, &file_lock_list, l) {
- if (lr->fle->pid == pid) {
- ret = restore_file_lock(lr->fle);
- if (ret)
- break;
- }
- }
-
- return ret;
-}
-
-static int restore_file_locks_legacy(int pid)
-{
- int ret = -1;
- struct cr_img *img;
- FileLockEntry *fle;
-
- img = open_image(CR_FD_FILE_LOCKS_PID, O_RSTR, pid);
- if (!img)
- return -1;
-
- while (1) {
- ret = pb_read_one_eof(img, &fle, PB_FILE_LOCK);
- if (ret <= 0)
- break;
-
- ret = restore_file_lock(fle);
- file_lock_entry__free_unpacked(fle, NULL);
- if (ret)
- break;
- }
-
- close_image(img);
- return ret;
-}
-
-int prepare_file_locks(int pid)
-{
- if (!opts.handle_file_locks)
- return 0;
-
- pr_info("Restore file locks.\n");
- if (file_locks_cinfo.flags & COLLECT_HAPPENED)
- return restore_file_locks(pid);
-
- return restore_file_locks_legacy(pid);
-}
diff --git a/files-ext.c b/files-ext.c
deleted file mode 100644
index b196b259006b..000000000000
--- a/files-ext.c
+++ /dev/null
@@ -1,93 +0,0 @@
-/* An external file is a file, which is dumped with help a plugin */
-
-#include <unistd.h>
-
-#include "imgset.h"
-#include "files.h"
-#include "plugin.h"
-
-#include "protobuf.h"
-#include "protobuf/ext-file.pb-c.h"
-
-static int dump_one_ext_file(int lfd, u32 id, const struct fd_parms *p)
-{
- int ret;
- struct cr_img *rimg;
-
- ExtFileEntry xfe = EXT_FILE_ENTRY__INIT;
-
- ret = run_plugins(DUMP_EXT_FILE, lfd, id);
- if (ret < 0)
- return ret;
-
- xfe.id = id;
- xfe.fown = (FownEntry *)&p->fown;
-
- rimg = img_from_set(glob_imgset, CR_FD_EXT_FILES);
- return pb_write_one(rimg, &xfe, PB_EXT_FILE);
-}
-
-const struct fdtype_ops ext_dump_ops = {
- .type = FD_TYPES__EXT,
- .dump = dump_one_ext_file,
-};
-
-struct ext_file_info {
- struct file_desc d;
- ExtFileEntry *xfe;
-};
-
-static int open_fd(struct file_desc *d)
-{
- struct ext_file_info *xfi;
- int fd;
-
- xfi = container_of(d, struct ext_file_info, d);
-
- fd = run_plugins(RESTORE_EXT_FILE, xfi->xfe->id);
- if (fd < 0) {
- pr_err("Unable to restore %#x\n", xfi->xfe->id);
- return -1;
- }
-
- if (restore_fown(fd, xfi->xfe->fown))
- return -1;
-
- return fd;
-}
-
-static struct file_desc_ops ext_desc_ops = {
- .type = FD_TYPES__EXT,
- .open = open_fd,
-};
-
-static int collect_one_ext(void *o, ProtobufCMessage *base)
-{
- struct ext_file_info *xfi = o;
-
- xfi->xfe = pb_msg(base, ExtFileEntry);
-
- pr_info("Collected external file with ID %#x\n", xfi->xfe->id);
- return file_desc_add(&xfi->d, xfi->xfe->id, &ext_desc_ops);
-}
-
-struct collect_image_info ext_file_cinfo = {
- .fd_type = CR_FD_EXT_FILES,
- .pb_type = PB_EXT_FILE,
- .priv_size = sizeof(struct ext_file_info),
- .collect = collect_one_ext,
-};
-
-int dump_unsupp_fd(struct fd_parms *p, int lfd,
- struct cr_img *img, char *more, char *info)
-{
- int ret;
-
- ret = do_dump_gen_file(p, lfd, &ext_dump_ops, img);
- if (ret == 0)
- return 0;
- if (ret == -ENOTSUP)
- pr_err("Can't dump file %d of that type [%o] (%s %s)\n",
- p->fd, p->stat.st_mode, more, info);
- return -1;
-}
diff --git a/files-reg.c b/files-reg.c
deleted file mode 100644
index 7911d667351b..000000000000
--- a/files-reg.c
+++ /dev/null
@@ -1,1643 +0,0 @@
-#include <stdlib.h>
-#include <unistd.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <string.h>
-#include <sys/mman.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/vfs.h>
-#include <sys/prctl.h>
-#include <ctype.h>
-#include <sched.h>
-
-/* Stolen from kernel/fs/nfs/unlink.c */
-#define SILLYNAME_PREF ".nfs"
-#define SILLYNAME_SUFF_LEN (((unsigned)sizeof(u64) << 1) + ((unsigned)sizeof(unsigned int) << 1))
-
-#include "cr_options.h"
-#include "imgset.h"
-#include "file-ids.h"
-#include "mount.h"
-#include "files.h"
-#include "image.h"
-#include "list.h"
-#include "util.h"
-#include "fs-magic.h"
-#include "asm/atomic.h"
-#include "namespaces.h"
-#include "proc_parse.h"
-#include "pstree.h"
-
-#include "protobuf.h"
-#include "protobuf/regfile.pb-c.h"
-#include "protobuf/remap-file-path.pb-c.h"
-
-#include "files-reg.h"
-#include "plugin.h"
-
-int setfsuid(uid_t fsuid);
-
-/*
- * Ghost files are those not visible from the FS. Dumping them is
- * nasty and the only way we have -- just carry its contents with
- * us. Any brave soul to implement link unlinked file back?
- */
-struct ghost_file {
- struct list_head list;
- u32 id;
-
- u32 dev;
- u32 ino;
-
- struct file_remap remap;
-};
-
-static u32 ghost_file_ids = 1;
-static LIST_HEAD(ghost_files);
-
-static mutex_t *ghost_file_mutex;
-
-static LIST_HEAD(remaps);
-
-/*
- * Remember the name to delete it if needed on error or
- * rollback action. Note we don't expect that there will
- * be a HUGE number of link remaps, so in a sake of speed
- * we keep all data in memory.
- */
-struct link_remap_rlb {
- struct list_head list;
- struct ns_id *mnt_ns;
- char *path;
-};
-
-static int note_link_remap(char *path, struct ns_id *nsid)
-{
- struct link_remap_rlb *rlb;
-
- rlb = xmalloc(sizeof(*rlb));
- if (!rlb)
- goto err;
-
- rlb->path = strdup(path);
- if (!rlb->path)
- goto err2;
-
- rlb->mnt_ns = nsid;
- list_add(&rlb->list, &remaps);
-
- return 0;
-
-err2:
- xfree(rlb);
-err:
- pr_err("Can't note link remap for %s\n", path);
- return -1;
-}
-
-/* Trim "a/b/c/d" to "a/b/d" */
-static int trim_last_parent(char *path)
-{
- char *fname, *p;
-
- p = strrchr(path, '/');
- fname = p + 1;
- if (!p || *fname == '\0')
- return -1;
-
- while (p >= path && *p == '/')
- p--;
-
- if (p < path)
- return -1;
-
- while (p >= path && *p != '/')
- p--;
- p++;
-
- while (*fname != '\0')
- *p++ = *fname++;
- *p = '\0';
-
- return 0;
-}
-
-static int mkreg_ghost(char *path, u32 mode, struct ghost_file *gf, struct cr_img *img)
-{
- int gfd, ret;
-
- gfd = open(path, O_WRONLY | O_CREAT | O_EXCL, mode);
- if (gfd < 0)
- return -1;
-
- ret = copy_file(img_raw_fd(img), gfd, 0);
- close(gfd);
-
- return ret;
-}
-
-static int ghost_apply_metadata(const char *path, GhostFileEntry *gfe)
-{
- struct timeval tv[2];
- int ret = -1;
-
- if (chown(path, gfe->uid, gfe->gid) < 0) {
- pr_perror("Can't reset user/group on ghost %s", path);
- goto err;
- }
-
- if (chmod(path, gfe->mode)) {
- pr_perror("Can't set perms %o on ghost %s", gfe->mode, path);
- goto err;
- }
-
- if (gfe->atim) {
- tv[0].tv_sec = gfe->atim->tv_sec;
- tv[0].tv_usec = gfe->atim->tv_usec;
- tv[1].tv_sec = gfe->mtim->tv_sec;
- tv[1].tv_usec = gfe->mtim->tv_usec;
- if (lutimes(path, tv)) {
- pr_perror("Can't set access and modufication times on ghost %s", path);
- goto err;
- }
- }
-
- ret = 0;
-err:
- return ret;
-}
-
-static int create_ghost(struct ghost_file *gf, GhostFileEntry *gfe, struct cr_img *img)
-{
- char path[PATH_MAX];
- int ret, root_len;
- char *msg;
-
- root_len = ret = rst_get_mnt_root(gf->remap.rmnt_id, path, sizeof(path));
- if (ret < 0) {
- pr_err("The %d mount is not found for ghost\n", gf->remap.rmnt_id);
- goto err;
- }
-
- snprintf(path + ret, sizeof(path) - ret, "/%s", gf->remap.rpath);
- ret = -1;
-again:
- if (S_ISFIFO(gfe->mode)) {
- if ((ret = mknod(path, gfe->mode, 0)) < 0)
- msg = "Can't create node for ghost file";
- } else if (S_ISCHR(gfe->mode) || S_ISBLK(gfe->mode)) {
- if (!gfe->has_rdev) {
- pr_err("No rdev for ghost device\n");
- goto err;
- }
- if ((ret = mknod(path, gfe->mode, gfe->rdev)) < 0)
- msg = "Can't create node for ghost dev";
- } else if (S_ISDIR(gfe->mode)) {
- if ((ret = mkdir(path, gfe->mode)) < 0) {
- pr_perror("Can't make ghost dir");
- goto err;
- }
- } else {
- if ((ret = mkreg_ghost(path, gfe->mode, gf, img)) < 0)
- msg = "Can't create ghost regfile\n";
- }
-
- if (ret < 0) {
- /* Use grand parent, if parent directory does not exist */
- if (errno == ENOENT) {
- if (trim_last_parent(path) < 0) {
- pr_err("trim failed: @%s@\n", path);
- goto err;
- }
- goto again;
- }
-
- pr_perror("%s", msg);
- goto err;
- }
-
- strcpy(gf->remap.rpath, path + root_len + 1);
- pr_debug("Remap rpath is %s\n", gf->remap.rpath);
-
- ret = -1;
- if (ghost_apply_metadata(path, gfe))
- goto err;
-
- ret = 0;
-err:
- return ret;
-}
-
-static inline void ghost_path(char *path, int plen,
- struct reg_file_info *rfi, RemapFilePathEntry *rfe)
-{
- snprintf(path, plen, "%s.cr.%x.ghost", rfi->path, rfe->remap_id);
-}
-
-static int open_remap_ghost(struct reg_file_info *rfi,
- RemapFilePathEntry *rfe)
-{
- struct ghost_file *gf;
- GhostFileEntry *gfe = NULL;
- struct cr_img *img;
-
- list_for_each_entry(gf, &ghost_files, list)
- if (gf->id == rfe->remap_id)
- goto gf_found;
-
- /*
- * Ghost not found. We will create one in the same dir
- * as the very first client of it thus resolving any
- * issues with cross-device links.
- */
-
- pr_info("Opening ghost file %#x for %s\n", rfe->remap_id, rfi->path);
-
- gf = shmalloc(sizeof(*gf));
- if (!gf)
- return -1;
-
- gf->remap.rpath = xmalloc(PATH_MAX);
- if (!gf->remap.rpath)
- goto err;
-
- img = open_image(CR_FD_GHOST_FILE, O_RSTR, rfe->remap_id);
- if (!img)
- goto err;
-
- if (pb_read_one(img, &gfe, PB_GHOST_FILE) < 0)
- goto close_ifd;
-
- /*
- * For old formats where optional has_[dev|ino] is
- * not present we will have zeros here which is quite
- * a sign for "absent" fields.
- */
- gf->dev = gfe->dev;
- gf->ino = gfe->ino;
- gf->remap.rmnt_id = rfi->rfe->mnt_id;
-
- if (S_ISDIR(gfe->mode))
- strncpy(gf->remap.rpath, rfi->path, PATH_MAX);
- else
- ghost_path(gf->remap.rpath, PATH_MAX, rfi, rfe);
-
- if (create_ghost(gf, gfe, img))
- goto close_ifd;
-
- ghost_file_entry__free_unpacked(gfe, NULL);
- close_image(img);
-
- gf->id = rfe->remap_id;
- gf->remap.users = 0;
- gf->remap.is_dir = S_ISDIR(gfe->mode);
- gf->remap.owner = gfe->uid;
- list_add_tail(&gf->list, &ghost_files);
-gf_found:
- rfi->remap = &gf->remap;
- return 0;
-
-close_ifd:
- close_image(img);
-err:
- if (gfe)
- ghost_file_entry__free_unpacked(gfe, NULL);
- xfree(gf->remap.rpath);
- shfree_last(gf);
- return -1;
-}
-
-static int open_remap_linked(struct reg_file_info *rfi,
- RemapFilePathEntry *rfe)
-{
- struct file_remap *rm;
- struct file_desc *rdesc;
- struct reg_file_info *rrfi;
- uid_t owner = -1;
-
- rdesc = find_file_desc_raw(FD_TYPES__REG, rfe->remap_id);
- if (!rdesc) {
- pr_err("Can't find target file %x\n", rfe->remap_id);
- return -1;
- }
-
- rm = xmalloc(sizeof(*rm));
- if (!rm)
- return -1;
-
- rrfi = container_of(rdesc, struct reg_file_info, d);
- pr_info("Remapped %s -> %s\n", rfi->path, rrfi->path);
-
- if (root_ns_mask & CLONE_NEWUSER) {
- int rfd;
- struct stat st;
-
- rfd = mntns_get_root_by_mnt_id(rfi->rfe->mnt_id);
- if (fstatat(rfd, rrfi->path, &st, AT_SYMLINK_NOFOLLOW)) {
- pr_perror("Can't get owner of link remap %s", rrfi->path);
- xfree(rm);
- return -1;
- }
-
- owner = st.st_uid;
- }
-
- rm->rpath = rrfi->path;
- rm->users = 0;
- rm->is_dir = false;
- rm->owner = owner;
- rm->rmnt_id = rfi->rfe->mnt_id;
- rfi->remap = rm;
- return 0;
-}
-
-static int open_remap_dead_process(struct reg_file_info *rfi,
- RemapFilePathEntry *rfe)
-{
- struct pstree_item *helper;
-
- for_each_pstree_item(helper) {
- /* don't need to add multiple tasks */
- if (helper->pid.virt == rfe->remap_id) {
- pr_info("Skipping helper for restoring /proc/%d; pid exists\n", rfe->remap_id);
- return 0;
- }
- }
-
- helper = alloc_pstree_helper();
- if (!helper)
- return -1;
-
- helper->sid = root_item->sid;
- helper->pgid = root_item->pgid;
- helper->pid.virt = rfe->remap_id;
- helper->parent = root_item;
- list_add_tail(&helper->sibling, &root_item->children);
-
- pr_info("Added a helper for restoring /proc/%d\n", helper->pid.virt);
-
- return 0;
-}
-
-struct remap_info {
- struct list_head list;
- RemapFilePathEntry *rfe;
- struct reg_file_info *rfi;
-};
-
-static int collect_one_remap(void *obj, ProtobufCMessage *msg)
-{
- struct remap_info *ri = obj;
- RemapFilePathEntry *rfe;
- struct file_desc *fdesc;
-
- ri->rfe = rfe = pb_msg(msg, RemapFilePathEntry);
-
- if (!rfe->has_remap_type) {
- rfe->has_remap_type = true;
- /* backward compatibility with images */
- if (rfe->remap_id & REMAP_GHOST) {
- rfe->remap_id &= ~REMAP_GHOST;
- rfe->remap_type = REMAP_TYPE__GHOST;
- } else
- rfe->remap_type = REMAP_TYPE__LINKED;
- }
-
- fdesc = find_file_desc_raw(FD_TYPES__REG, rfe->orig_id);
- if (fdesc == NULL) {
- pr_err("Remap for non existing file %#x\n", rfe->orig_id);
- return -1;
- }
-
- ri->rfi = container_of(fdesc, struct reg_file_info, d);
-
- list_add_tail(&ri->list, &remaps);
-
- return 0;
-}
-
-static int prepare_one_remap(struct remap_info *ri)
-{
- int ret = -1;
- RemapFilePathEntry *rfe = ri->rfe;
- struct reg_file_info *rfi = ri->rfi;
-
- pr_info("Configuring remap %#x -> %#x\n", rfi->rfe->id, rfe->remap_id);
-
- switch (rfe->remap_type) {
- case REMAP_TYPE__LINKED:
- ret = open_remap_linked(rfi, rfe);
- break;
- case REMAP_TYPE__GHOST:
- ret = open_remap_ghost(rfi, rfe);
- break;
- case REMAP_TYPE__PROCFS:
- /* handled earlier by prepare_procfs_remaps */
- ret = 0;
- break;
- default:
- pr_err("unknown remap type %u\n", rfe->remap_type);
- goto out;
- }
-
-out:
- return ret;
-}
-
-/* We separate the prepartion of PROCFS remaps because they allocate pstree
- * items, which need to be seen by the root task. We can't do all remaps here,
- * because the files haven't been loaded yet.
- */
-int prepare_procfs_remaps(void)
-{
- struct remap_info *ri;
-
- list_for_each_entry(ri, &remaps, list) {
- RemapFilePathEntry *rfe = ri->rfe;
- struct reg_file_info *rfi = ri->rfi;
-
- switch (rfe->remap_type) {
- case REMAP_TYPE__PROCFS:
- if (open_remap_dead_process(rfi, rfe) < 0)
- return -1;
- break;
- default:
- continue;
- }
- }
-
- return 0;
-}
-
-int prepare_remaps(void)
-{
- struct remap_info *ri;
- int ret = 0;
-
- list_for_each_entry(ri, &remaps, list) {
- ret = prepare_one_remap(ri);
- if (ret)
- break;
- }
-
- return ret;
-}
-
-static void try_clean_ghost(struct remap_info *ri)
-{
- char path[PATH_MAX];
- int mnt_id, ret;
-
- mnt_id = ri->rfi->rfe->mnt_id; /* rirfirfe %) */
- ret = rst_get_mnt_root(mnt_id, path, sizeof(path));
- if (ret < 0)
- return;
-
- ghost_path(path + ret, sizeof(path) - 1, ri->rfi, ri->rfe);
- if (!unlink(path)) {
- pr_info(" `- X [%s] ghost\n", path);
- return;
- }
-
- /*
- * We can also find out the ghost type by stat()-ing
- * it or by reading the ghost image, but this way
- * is the fastest one.
- */
-
- if ((errno == EISDIR)) {
- strncpy(path + ret, ri->rfi->path, sizeof(path) - 1);
- if (!rmdir(path)) {
- pr_info(" `- Xd [%s] ghost\n", path);
- return;
- }
- }
-
- pr_perror(" `- XFail [%s] ghost", path);
-}
-
-void try_clean_remaps(int ns_fd)
-{
- struct remap_info *ri;
- int old_ns = -1;
- int cwd_fd = -1;
-
- if (list_empty(&remaps))
- goto out;
-
- if (ns_fd >= 0) {
- pr_info("Switching to new ns to clean ghosts\n");
-
- old_ns = open_proc(PROC_SELF, "ns/mnt");
- if (old_ns < 0) {
- pr_perror("`- Can't keep old ns");
- return;
- }
-
- cwd_fd = open(".", O_DIRECTORY);
- if (cwd_fd < 0) {
- pr_perror("Unable to open cwd");
- return;
- }
-
- if (setns(ns_fd, CLONE_NEWNS) < 0) {
- close(old_ns);
- close(cwd_fd);
- pr_perror("`- Can't switch");
- return;
- }
- }
-
- list_for_each_entry(ri, &remaps, list)
- if (ri->rfe->remap_type == REMAP_TYPE__GHOST)
- try_clean_ghost(ri);
-
- if (old_ns >= 0) {
- if (setns(old_ns, CLONE_NEWNS) < 0)
- pr_perror("Fail to switch back!");
- close(old_ns);
- }
-
- if (cwd_fd >= 0) {
- if (fchdir(cwd_fd)) {
- pr_perror("Unable to restore cwd");
- close(cwd_fd);
- return;
- }
- close(cwd_fd);
- }
-
-out:
- if (ns_fd >= 0)
- close(ns_fd);
-}
-
-static struct collect_image_info remap_cinfo = {
- .fd_type = CR_FD_REMAP_FPATH,
- .pb_type = PB_REMAP_FPATH,
- .priv_size = sizeof(struct remap_info),
- .collect = collect_one_remap,
-};
-
-static int dump_ghost_file(int _fd, u32 id, const struct stat *st, dev_t phys_dev)
-{
- struct cr_img *img;
- GhostFileEntry gfe = GHOST_FILE_ENTRY__INIT;
- Timeval atim = TIMEVAL__INIT, mtim = TIMEVAL__INIT;
-
- pr_info("Dumping ghost file contents (id %#x)\n", id);
-
- img = open_image(CR_FD_GHOST_FILE, O_DUMP, id);
- if (!img)
- return -1;
-
- gfe.uid = userns_uid(st->st_uid);
- gfe.gid = userns_gid(st->st_gid);
- gfe.mode = st->st_mode;
-
- gfe.atim = &atim;
- gfe.mtim = &mtim;
- gfe.atim->tv_sec = st->st_atim.tv_sec;
- gfe.atim->tv_usec = st->st_atim.tv_nsec / 1000;
- gfe.mtim->tv_sec = st->st_mtim.tv_sec;
- gfe.mtim->tv_usec = st->st_mtim.tv_nsec / 1000;
-
- gfe.has_dev = gfe.has_ino = true;
- gfe.dev = phys_dev;
- gfe.ino = st->st_ino;
-
- if (S_ISCHR(st->st_mode) || S_ISBLK(st->st_mode)) {
- gfe.has_rdev = true;
- gfe.rdev = st->st_rdev;
- }
-
- if (pb_write_one(img, &gfe, PB_GHOST_FILE))
- return -1;
-
- if (S_ISREG(st->st_mode)) {
- int fd, ret;
- char lpath[PSFDS];
-
- /*
- * Reopen file locally since it may have no read
- * permissions when drained
- */
- sprintf(lpath, "/proc/self/fd/%d", _fd);
- fd = open(lpath, O_RDONLY);
- if (fd < 0) {
- pr_perror("Can't open ghost original file");
- return -1;
- }
- ret = copy_file(fd, img_raw_fd(img), st->st_size);
- close(fd);
- if (ret)
- return -1;
- }
-
- close_image(img);
- return 0;
-}
-
-void remap_put(struct file_remap *remap)
-{
- mutex_lock(ghost_file_mutex);
- if (--remap->users == 0) {
- int mntns_root;
-
- pr_info("Unlink the ghost %s\n", remap->rpath);
-
- mntns_root = mntns_get_root_by_mnt_id(remap->rmnt_id);
- unlinkat(mntns_root, remap->rpath, 0);
- }
- mutex_unlock(ghost_file_mutex);
-}
-
-struct file_remap *lookup_ghost_remap(u32 dev, u32 ino)
-{
- struct ghost_file *gf;
-
- mutex_lock(ghost_file_mutex);
- list_for_each_entry(gf, &ghost_files, list) {
- if (gf->ino == ino && (gf->dev == dev)) {
- gf->remap.users++;
- mutex_unlock(ghost_file_mutex);
- return &gf->remap;
- }
- }
- mutex_unlock(ghost_file_mutex);
-
- return NULL;
-}
-
-static int dump_ghost_remap(char *path, const struct stat *st,
- int lfd, u32 id, struct ns_id *nsid)
-{
- struct ghost_file *gf;
- RemapFilePathEntry rpe = REMAP_FILE_PATH_ENTRY__INIT;
- dev_t phys_dev;
-
- pr_info("Dumping ghost file for fd %d id %#x\n", lfd, id);
-
- if (st->st_size > opts.ghost_limit) {
- pr_err("Can't dump ghost file %s of %"PRIu64" size, increase limit\n",
- path, st->st_size);
- return -1;
- }
-
- phys_dev = phys_stat_resolve_dev(nsid, st->st_dev, path);
- list_for_each_entry(gf, &ghost_files, list)
- if ((gf->dev == phys_dev) && (gf->ino == st->st_ino))
- goto dump_entry;
-
- gf = xmalloc(sizeof(*gf));
- if (gf == NULL)
- return -1;
-
- gf->dev = phys_dev;
- gf->ino = st->st_ino;
- gf->id = ghost_file_ids++;
- list_add_tail(&gf->list, &ghost_files);
-
- if (dump_ghost_file(lfd, gf->id, st, phys_dev))
- return -1;
-
-dump_entry:
- rpe.orig_id = id;
- rpe.remap_id = gf->id;
- rpe.has_remap_type = true;
- rpe.remap_type = REMAP_TYPE__GHOST;
-
- return pb_write_one(img_from_set(glob_imgset, CR_FD_REMAP_FPATH),
- &rpe, PB_REMAP_FPATH);
-}
-
-static void __rollback_link_remaps(bool do_unlink)
-{
- struct link_remap_rlb *rlb, *tmp;
- int mntns_root;
-
- list_for_each_entry_safe(rlb, tmp, &remaps, list) {
- if (do_unlink) {
- mntns_root = mntns_get_root_fd(rlb->mnt_ns);
- if (mntns_root >= 0)
- unlinkat(mntns_root, rlb->path, 0);
- else
- pr_err("Failed to clenaup %s link remap\n", rlb->path);
- }
-
- list_del(&rlb->list);
- xfree(rlb->path);
- xfree(rlb);
- }
-}
-
-void delete_link_remaps(void) { __rollback_link_remaps(true); }
-void free_link_remaps(void) { __rollback_link_remaps(false); }
-
-static int create_link_remap(char *path, int len, int lfd,
- u32 *idp, struct ns_id *nsid)
-{
- char link_name[PATH_MAX], *tmp;
- RegFileEntry rfe = REG_FILE_ENTRY__INIT;
- FownEntry fwn = FOWN_ENTRY__INIT;
- int mntns_root;
-
- if (!opts.link_remap_ok) {
- pr_err("Can't create link remap for %s. "
- "Use " LREMAP_PARAM " option.\n", path);
- return -1;
- }
-
- /*
- * Linked remapping -- we create a hard link on a removed file
- * in the directory original file used to sit.
- *
- * Bad news is than we can't easily open lfd's parent dir. Thus
- * we have to just generate an absolute path and use it. The linkat
- * will fail if we chose the bad one.
- */
-
- link_name[0] = '.';
- memcpy(link_name + 1, path, len);
- tmp = link_name + len;
- while (*tmp != '/') {
- BUG_ON(tmp == link_name);
- tmp--;
- }
-
- fd_id_generate_special(NULL, idp);
- rfe.id = *idp;
- rfe.flags = 0;
- rfe.pos = 0;
- rfe.fown = &fwn;
- rfe.name = link_name + 1;
-
- /* Any 'unique' name works here actually. Remap works by reg-file ids. */
- snprintf(tmp + 1, sizeof(link_name) - (size_t)(tmp - link_name - 1), "link_remap.%d", rfe.id);
-
- mntns_root = mntns_get_root_fd(nsid);
-
- if (linkat(lfd, "", mntns_root, link_name, AT_EMPTY_PATH) < 0) {
- pr_perror("Can't link remap to %s", path);
- return -1;
- }
-
- if (note_link_remap(link_name, nsid))
- return -1;
-
- return pb_write_one(img_from_set(glob_imgset, CR_FD_REG_FILES), &rfe, PB_REG_FILE);
-}
-
-static int dump_linked_remap(char *path, int len, const struct stat *ost,
- int lfd, u32 id, struct ns_id *nsid)
-{
- u32 lid;
- RemapFilePathEntry rpe = REMAP_FILE_PATH_ENTRY__INIT;
-
- if (create_link_remap(path, len, lfd, &lid, nsid))
- return -1;
-
- rpe.orig_id = id;
- rpe.remap_id = lid;
-
- return pb_write_one(img_from_set(glob_imgset, CR_FD_REMAP_FPATH),
- &rpe, PB_REMAP_FPATH);
-}
-
-static int have_seen_dead_pid(pid_t pid)
-{
- static pid_t *dead_pids = NULL;
- static int n_dead_pids = 0;
- size_t i;
-
- for (i = 0; i < n_dead_pids; i++) {
- if (dead_pids[i] == pid)
- return 1;
- }
-
- if (xrealloc_safe(&dead_pids, sizeof(*dead_pids) * (n_dead_pids + 1)))
- return -1;
- dead_pids[n_dead_pids++] = pid;
-
- return 0;
-}
-
-static int dump_dead_process_remap(pid_t pid, u32 id)
-{
- RemapFilePathEntry rpe = REMAP_FILE_PATH_ENTRY__INIT;
- int ret;
-
- ret = have_seen_dead_pid(pid);
- if (ret < 0)
- return -1;
- if (ret) {
- pr_info("Found dead pid %d already, skipping remap\n", pid);
- return 0;
- }
-
- rpe.orig_id = id;
- rpe.remap_id = pid;
- rpe.has_remap_type = true;
- rpe.remap_type = REMAP_TYPE__PROCFS;
-
- return pb_write_one(img_from_set(glob_imgset, CR_FD_REMAP_FPATH),
- &rpe, PB_REMAP_FPATH);
-}
-
-static bool is_sillyrename_name(char *name)
-{
- int i;
-
- name = strrchr(name, '/');
- BUG_ON(name == NULL); /* see check in dump_one_reg_file */
- name++;
-
- /*
- * Strictly speaking this check is not bullet-proof. User
- * can create file with this name by hands and we have no
- * API to distinguish really-silly-renamed files from those
- * fake names :(
- *
- * But since NFS people expect .nfsXXX files to be unstable,
- * we treat them as such too.
- */
-
- if (strncmp(name, SILLYNAME_PREF, sizeof(SILLYNAME_PREF) - 1))
- return false;
-
- name += sizeof(SILLYNAME_PREF) - 1;
- for (i = 0; i < SILLYNAME_SUFF_LEN; i++)
- if (!isxdigit(name[i]))
- return false;
-
- return true;
-}
-
-static inline bool nfs_silly_rename(char *rpath, const struct fd_parms *parms)
-{
- return (parms->fs_type == NFS_SUPER_MAGIC) && is_sillyrename_name(rpath);
-}
-
-int strip_deleted(struct fd_link *link)
-{
- struct dcache_prepends {
- const char *str;
- size_t len;
- } static const prepends[] = {
- {
- .str = " (deleted)",
- .len = 10,
- }, {
- .str = "//deleted",
- .len = 9,
- }
- };
- size_t i;
-
- for (i = 0; i < ARRAY_SIZE(prepends); i++) {
- size_t at;
-
- if (link->len <= prepends[i].len)
- continue;
-
- at = link->len - prepends[i].len;
- if (!strcmp(&link->name[at], prepends[i].str)) {
- pr_debug("Strip '%s' tag from '%s'\n",
- prepends[i].str, link->name);
- link->name[at] = '\0';
- link->len -= prepends[i].len;
- return 1;
- }
- }
- return 0;
-}
-
-static int check_path_remap(struct fd_link *link, const struct fd_parms *parms,
- int lfd, u32 id, struct ns_id *nsid)
-{
- char *rpath = link->name;
- int plen = link->len;
- int ret, mntns_root;
- struct stat pst;
- const struct stat *ost = &parms->stat;
-
- if (parms->fs_type == PROC_SUPER_MAGIC) {
- /* The file points to /proc/pid/<foo> where pid is a dead
- * process. We remap this file by adding this pid to be
- * fork()ed into a TASK_HELPER state so that we can point to it
- * on restore.
- */
- pid_t pid;
- char *start, *end;
-
- /* skip "./proc/" */
- start = strstr(rpath, "/");
- if (!start)
- return -1;
- start = strstr(start + 1, "/");
- if (!start)
- return -1;
- pid = strtol(start + 1, &end, 10);
-
- /* If strtol didn't convert anything, then we are looking at
- * something like /proc/kmsg, which we shouldn't mess with.
- * Anything under /proc/<pid> (including that directory itself)
- * can be c/r'd with a dead pid remap, so let's allow all such
- * cases.
- */
- if (pid != 0) {
- bool is_dead = strip_deleted(link);
-
- /* /proc/<pid> will be "/proc/1 (deleted)" when it is
- * dead, but a path like /proc/1/mountinfo won't have
- * the suffix, since it isn't actually deleted (still
- * exists, but the parent dir is deleted). So, if we
- * have a path like /proc/1/mountinfo, test if /proc/1
- * exists instead, since this is what CRIU will need to
- * open on restore.
- */
- if (!is_dead) {
- *end = 0;
- is_dead = access(rpath, F_OK);
- *end = '/';
- }
-
- if (is_dead) {
- pr_info("Dumping dead process remap of %d\n", pid);
- return dump_dead_process_remap(pid, id);
- }
- }
-
- return 0;
- } else if (parms->fs_type == DEVPTS_SUPER_MAGIC) {
- /*
- * It's safe to call stripping here because
- * file paths are having predefined format for
- * this FS and can't have a valid " (deleted)"
- * postfix as a part of not deleted filename.
- */
- strip_deleted(link);
- /*
- * Devpts devices/files are generated by the
- * kernel itself so we should not try to generate
- * any kind of ghost files here even if file is
- * no longer exist.
- */
- return 0;
- }
-
- if (ost->st_nlink == 0) {
- /*
- * Unpleasant, but easy case. File is completely invisible
- * from the FS. Just dump its contents and that's it. But
- * be careful whether anybody still has any of its hardlinks
- * also open.
- */
- strip_deleted(link);
- return dump_ghost_remap(rpath + 1, ost, lfd, id, nsid);
- }
-
- if (nfs_silly_rename(rpath, parms)) {
- /*
- * If this is NFS silly-rename file the path we have at hands
- * will be accessible by fstat(), but once we kill the dumping
- * tasks it will disappear. So we just go ahead an dump it as
- * linked-remap file (NFS will allow us to create more hard
- * links on it) to have some persistent name at hands.
- */
- pr_debug("Dump silly-rename linked remap for %x\n", id);
- return dump_linked_remap(rpath + 1, plen - 1, ost, lfd, id, nsid);
- }
-
- mntns_root = mntns_get_root_fd(nsid);
- if (mntns_root < 0)
- return -1;
-
- ret = fstatat(mntns_root, rpath, &pst, 0);
- if (ret < 0) {
- /*
- * Linked file, but path is not accessible (unless any
- * other error occurred). We can create a temporary link to it
- * uning linkat with AT_EMPTY_PATH flag and remap it to this
- * name.
- */
-
- if (errno == ENOENT)
- return dump_linked_remap(rpath + 1, plen - 1,
- ost, lfd, id, nsid);
-
- pr_perror("Can't stat path");
- return -1;
- }
-
- if ((pst.st_ino != ost->st_ino) || (pst.st_dev != ost->st_dev)) {
- if (opts.evasive_devices &&
- (S_ISCHR(ost->st_mode) || S_ISBLK(ost->st_mode)) &&
- pst.st_rdev == ost->st_rdev)
- return 0;
- /*
- * FIXME linked file, but the name we see it by is reused
- * by somebody else. We can dump it with linked remaps, but
- * we'll have difficulties on restore -- we will have to
- * move the exisint file aside, then restore this one,
- * unlink, then move the original file back. It's fairly
- * easy to do, but we don't do it now, since unlinked files
- * have the "(deleted)" suffix in proc and name conflict
- * is unlikely :)
- */
- pr_err("Unaccessible path opened %u:%u, need %u:%u\n",
- (int)pst.st_dev, (int)pst.st_ino,
- (int)ost->st_dev, (int)ost->st_ino);
- return -1;
- }
-
- /*
- * File is linked and visible by the name it is opened by
- * this task. Go ahead and dump it.
- */
- return 0;
-}
-
-static bool should_check_size(int flags)
-{
- /* Skip size if file has O_APPEND and O_WRONLY flags (e.g. log file). */
- if (((flags & O_ACCMODE) == O_WRONLY) &&
- (flags & O_APPEND))
- return false;
-
- return true;
-}
-
-int dump_one_reg_file(int lfd, u32 id, const struct fd_parms *p)
-{
- struct fd_link _link, *link;
- struct ns_id *nsid;
- struct cr_img *rimg;
-
- RegFileEntry rfe = REG_FILE_ENTRY__INIT;
-
- if (!p->link) {
- if (fill_fdlink(lfd, p, &_link))
- return -1;
- link = &_link;
- } else
- link = p->link;
-
- nsid = lookup_nsid_by_mnt_id(p->mnt_id);
- if (nsid == NULL) {
- pr_err("Can't lookup mount=%d for fd=%d path=%s\n",
- p->mnt_id, p->fd, link->name + 1);
- return -1;
- }
-
- if (p->mnt_id >= 0 && (root_ns_mask & CLONE_NEWNS)) {
- rfe.mnt_id = p->mnt_id;
- rfe.has_mnt_id = true;
- }
-
- pr_info("Dumping path for %d fd via self %d [%s]\n",
- p->fd, lfd, &link->name[1]);
-
- /*
- * The regular path we can handle should start with slash.
- */
- if (link->name[1] != '/') {
- pr_err("The path [%s] is not supported\n", &link->name[1]);
- return -1;
- }
-
- if (check_path_remap(link, p, lfd, id, nsid))
- return -1;
-
- rfe.id = id;
- rfe.flags = p->flags;
- rfe.pos = p->pos;
- rfe.fown = (FownEntry *)&p->fown;
- rfe.name = &link->name[1];
-
- if (S_ISREG(p->stat.st_mode) && should_check_size(rfe.flags)) {
- rfe.has_size = true;
- rfe.size = p->stat.st_size;
- }
-
- rimg = img_from_set(glob_imgset, CR_FD_REG_FILES);
- return pb_write_one(rimg, &rfe, PB_REG_FILE);
-}
-
-const struct fdtype_ops regfile_dump_ops = {
- .type = FD_TYPES__REG,
- .dump = dump_one_reg_file,
-};
-
-static void convert_path_from_another_mp(char *src, char *dst, int dlen,
- struct mount_info *smi,
- struct mount_info *dmi)
-{
- int off;
-
- /*
- * mi->mountpoint ./foo/bar
- * mi->ns_mountpoint /foo/bar
- * rfi->path foo/bar/baz
- */
- off = strlen(smi->ns_mountpoint + 1);
- BUG_ON(strlen(smi->root) < strlen(dmi->root));
-
- /*
- * Create paths relative to this mount.
- * Absolute path to the mount point + difference between source
- * and destination roots + path relative to the mountpoint.
- */
- snprintf(dst, dlen, "%s/%s/%s",
- dmi->ns_mountpoint + 1,
- smi->root + strlen(dmi->root),
- src + off);
-}
-
-static int linkat_hard(int odir, char *opath, int ndir, char *npath, uid_t owner)
-{
- int ret, old_fsuid = -1;
- int errno_save;
-
- if (root_ns_mask & CLONE_NEWUSER)
- /*
- * Kernel has strange secutiry restrictions about
- * linkat. If the fsuid of the caller doesn't equals
- * the uid of the file and the file is not "safe"
- * one, then only global CAP_CHOWN will be allowed
- * to link().
- *
- * Next, when we're in user namespace we're ns root,
- * but not global CAP_CHOWN. Thus, even though we
- * ARE ns root, we will not be allowed to link() at
- * files that belong to regular users %)
- *
- * Fortunately, the setfsuid() requires ns-level
- * CAP_SETUID which we have.
- */
-
- old_fsuid = setfsuid(owner);
-
- ret = linkat(odir, opath, ndir, npath, 0);
- errno_save = errno;
- if (ret < 0)
- pr_perror("Can't link %s -> %s", opath, npath);
-
- if (root_ns_mask & CLONE_NEWUSER) {
- setfsuid(old_fsuid);
- if (setfsuid(-1) != old_fsuid) {
- pr_warn("Failed to restore old fsuid!\n");
- /*
- * Don't fail here. We still have chances to run till
- * the pie/restorer, and if _this_ guy fails to set
- * the proper fsuid, then we'll abort the restore.
- */
- }
-
- /*
- * Restoring PR_SET_DUMPABLE flag is required after setfsuid,
- * as if it not set, proc inode will be created with root cred
- * (see proc_pid_make_inode), which will result in permission
- * check fail when trying to access files in /proc/self/
- */
- prctl(PR_SET_DUMPABLE, 1, 0);
- }
- errno = errno_save;
-
- return ret;
-}
-
-static void rm_parent_dirs(int mntns_root, char *path, int count)
-{
- char *p, *prev = NULL;
-
- if (!count)
- return;
-
- while (count--) {
- p = strrchr(path, '/');
- if (p)
- *p = '\0';
- if (prev)
- *prev = '/';
-
- if (unlinkat(mntns_root, path, AT_REMOVEDIR))
- pr_perror("Can't remove %s AT %d", path, mntns_root);
- else
- pr_debug("Unlinked parent dir: %s AT %d\n", path, mntns_root);
- prev = p;
- }
-
- if (prev)
- *prev = '/';
-}
-
-/* Construct parent dir name and mkdir parent/grandparents if they're not exist */
-static int make_parent_dirs_if_need(int mntns_root, char *path)
-{
- char *p, *last_delim;
- int err, count = 0;
- struct stat st;
-
- p = last_delim = strrchr(path, '/');
- if (!p) {
- pr_err("Path %s has no parent dir", path);
- return -1;
- }
- *p = '\0';
-
- if (fstatat(mntns_root, path, &st, AT_EMPTY_PATH) == 0)
- goto out;
- if (errno != ENOENT) {
- pr_perror("Can't stat %s", path);
- count = -1;
- goto out;
- }
-
- p = path;
- do {
- p = strchr(p, '/');
- if (p)
- *p = '\0';
-
- err = mkdirat(mntns_root, path, 0777);
- if (err && errno != EEXIST) {
- pr_perror("Can't create dir: %s AT %d", path, mntns_root);
- rm_parent_dirs(mntns_root, path, count);
- count = -1;
- goto out;
- } else if (!err) {
- pr_debug("Created parent dir: %s AT %d\n", path, mntns_root);
- count++;
- }
-
- if (p)
- *p++ = '/';
- } while (p);
-out:
- *last_delim = '/';
- return count;
-}
-
-/*
- * This routine properly resolves d's path handling ghost/link-remaps.
- * The open_cb is a routine that does actual open, it differs for
- * files, directories, fifos, etc.
- */
-
-static int rfi_remap(struct reg_file_info *rfi, int *level)
-{
- struct mount_info *mi, *rmi, *tmi;
- char _path[PATH_MAX], *path = _path;
- char _rpath[PATH_MAX], *rpath = _rpath;
- int mntns_root;
-
- if (rfi->rfe->mnt_id == -1) {
- /* Know nothing about mountpoints */
- mntns_root = mntns_get_root_by_mnt_id(-1);
- path = rfi->path;
- rpath = rfi->remap->rpath;
- goto out_root;
- }
-
- mi = lookup_mnt_id(rfi->rfe->mnt_id);
- if (rfi->rfe->mnt_id == rfi->remap->rmnt_id) {
- /* Both links on the same mount point */
- tmi = mi;
- path = rfi->path;
- rpath = rfi->remap->rpath;
- goto out;
- }
-
- rmi = lookup_mnt_id(rfi->remap->rmnt_id);
-
- /*
- * Find the common bind-mount. We know that one mount point was
- * really mounted and all other were bind-mounted from it, so the
- * lowest mount must contains all bind-mounts.
- */
- for (tmi = mi; tmi->bind; tmi = tmi->bind)
- ;
-
- BUG_ON(tmi->s_dev != rmi->s_dev);
- BUG_ON(tmi->s_dev != mi->s_dev);
-
- /* Calcalate paths on the device (root mount) */
- convert_path_from_another_mp(rfi->path, path, sizeof(_path), mi, tmi);
- convert_path_from_another_mp(rfi->remap->rpath, rpath, sizeof(_rpath), rmi, tmi);
-
-out:
- pr_debug("%d: Link %s -> %s\n", tmi->mnt_id, rpath, path);
- mntns_root = mntns_get_root_fd(tmi->nsid);
-
-out_root:
- *level = make_parent_dirs_if_need(mntns_root, path);
- if (*level < 0)
- return -1;
-
- if (linkat_hard(mntns_root, rpath, mntns_root, path, rfi->remap->owner) < 0) {
- rm_parent_dirs(mntns_root, path, *level);
- return -1;
- }
-
- return 0;
-}
-
-int open_path(struct file_desc *d,
- int(*open_cb)(int mntns_root, struct reg_file_info *, void *), void *arg)
-{
- int tmp, mntns_root, level;
- struct reg_file_info *rfi;
- char *orig_path = NULL;
-
- if (inherited_fd(d, &tmp))
- return tmp;
-
- rfi = container_of(d, struct reg_file_info, d);
- if (rfi->remap) {
- mutex_lock(ghost_file_mutex);
- if (rfi->remap->is_dir) {
- /*
- * FIXME Can't make directory under new name.
- * Will have to open it under the ghost one :(
- */
- orig_path = rfi->path;
- rfi->path = rfi->remap->rpath;
- } else if (rfi_remap(rfi, &level) < 0) {
- static char tmp_path[PATH_MAX];
-
- if (errno != EEXIST) {
- pr_err("Can't link %s -> %s", rfi->path,
- rfi->remap->rpath);
- return -1;
- }
-
- /*
- * The file whose name we're trying to create
- * exists. Need to pick some other one, we're
- * going to remove it anyway.
- *
- * Strictly speaking, this is cheating, file
- * name shouldn't change. But since NFS with
- * its silly-rename doesn't care, why should we?
- */
-
- orig_path = rfi->path;
- rfi->path = tmp_path;
- snprintf(tmp_path, sizeof(tmp_path), "%s.cr_link", orig_path);
- pr_debug("Fake %s -> %s link\n", rfi->path, rfi->remap->rpath);
-
- if (rfi_remap(rfi, &level) < 0) {
- pr_perror("Can't create even fake link!");
- return -1;
- }
- }
- }
-
- mntns_root = mntns_get_root_by_mnt_id(rfi->rfe->mnt_id);
- tmp = open_cb(mntns_root, rfi, arg);
- if (tmp < 0) {
- pr_perror("Can't open file %s", rfi->path);
- return -1;
- }
-
- if (rfi->rfe->has_size && !rfi->size_checked) {
- struct stat st;
-
- if (fstat(tmp, &st) < 0) {
- pr_perror("Can't fstat opened file");
- return -1;
- }
-
- if (st.st_size != rfi->rfe->size) {
- pr_err("File %s has bad size %"PRIu64" (expect %"PRIu64")\n",
- rfi->path, st.st_size,
- rfi->rfe->size);
- return -1;
- }
-
- /*
- * This is only visible in the current process, so
- * change w/o locks. Other tasks sharing the same
- * file will get one via unix sockets.
- */
- rfi->size_checked = true;
- }
-
- if (rfi->remap) {
- if (!rfi->remap->is_dir) {
- unlinkat(mntns_root, rfi->path, 0);
- rm_parent_dirs(mntns_root, rfi->path, level);
- }
-
- BUG_ON(!rfi->remap->users);
- if (--rfi->remap->users == 0) {
- pr_info("Unlink the ghost %s\n", rfi->remap->rpath);
- mntns_root = mntns_get_root_by_mnt_id(rfi->remap->rmnt_id);
- unlinkat(mntns_root, rfi->remap->rpath, rfi->remap->is_dir ? AT_REMOVEDIR : 0);
- }
-
- if (orig_path)
- rfi->path = orig_path;
- mutex_unlock(ghost_file_mutex);
- }
-
- if (restore_fown(tmp, rfi->rfe->fown))
- return -1;
-
- return tmp;
-}
-
-int do_open_reg_noseek_flags(int ns_root_fd, struct reg_file_info *rfi, void *arg)
-{
- u32 flags = *(u32 *)arg;
- int fd;
-
- fd = openat(ns_root_fd, rfi->path, flags);
- if (fd < 0) {
- pr_perror("Can't open file %s on restore", rfi->path);
- return fd;
- }
-
- return fd;
-}
-
-static int do_open_reg_noseek(int ns_root_fd, struct reg_file_info *rfi, void *arg)
-{
- return do_open_reg_noseek_flags(ns_root_fd, rfi, &rfi->rfe->flags);
-}
-
-static int do_open_reg(int ns_root_fd, struct reg_file_info *rfi, void *arg)
-{
- int fd;
-
- fd = do_open_reg_noseek(ns_root_fd, rfi, arg);
- if (fd < 0)
- return fd;
-
- if ((rfi->rfe->pos != -1ULL) &&
- lseek(fd, rfi->rfe->pos, SEEK_SET) < 0) {
- pr_perror("Can't restore file pos");
- close(fd);
- return -1;
- }
-
- return fd;
-}
-
-int open_reg_fd(struct file_desc *fd)
-{
- return open_path(fd, do_open_reg_noseek, NULL);
-}
-
-int open_reg_by_id(u32 id)
-{
- struct file_desc *fd;
-
- /*
- * This one gets called by exe link, chroot and cwd
- * restoring code. No need in calling lseek on either
- * of them.
- */
-
- fd = find_file_desc_raw(FD_TYPES__REG, id);
- if (fd == NULL) {
- pr_err("Can't find regfile for %#x\n", id);
- return -1;
- }
-
- return open_reg_fd(fd);
-}
-
-int get_filemap_fd(struct vma_area *vma)
-{
- u32 flags;
-
- /*
- * Thevma->fd should have been assigned in collect_filemap
- *
- * We open file w/o lseek, as mappings don't care about it
- */
-
- BUG_ON(vma->vmfd == NULL);
- if (vma->e->has_fdflags)
- flags = vma->e->fdflags;
- else if ((vma->e->prot & PROT_WRITE) &&
- vma_area_is(vma, VMA_FILE_SHARED))
- flags = O_RDWR;
- else
- flags = O_RDONLY;
-
- return open_path(vma->vmfd, do_open_reg_noseek_flags, &flags);
-}
-
-static void remap_get(struct file_desc *fdesc, char typ)
-{
- struct reg_file_info *rfi;
-
- rfi = container_of(fdesc, struct reg_file_info, d);
- if (rfi->remap) {
- pr_debug("One more remap user (%c) for %s\n",
- typ, rfi->remap->rpath);
- /* No lock, we're still sngle-process here */
- rfi->remap->users++;
- }
-}
-
-static void collect_reg_fd(struct file_desc *fdesc,
- struct fdinfo_list_entry *fle, struct rst_info *ri)
-{
- if (list_empty(&fdesc->fd_info_head))
- remap_get(fdesc, 'f');
-
- collect_gen_fd(fle, ri);
-}
-
-static int open_fe_fd(struct file_desc *fd)
-{
- return open_path(fd, do_open_reg, NULL);
-}
-
-static char *reg_file_path(struct file_desc *d, char *buf, size_t s)
-{
- struct reg_file_info *rfi;
-
- rfi = container_of(d, struct reg_file_info, d);
- return rfi->path;
-}
-
-static struct file_desc_ops reg_desc_ops = {
- .type = FD_TYPES__REG,
- .open = open_fe_fd,
- .collect_fd = collect_reg_fd,
- .name = reg_file_path,
-};
-
-struct file_desc *try_collect_special_file(u32 id, int optional)
-{
- struct file_desc *fdesc;
-
- /*
- * Files dumped for vmas/exe links can have remaps
- * configured. Need to bump-up users for them, otherwise
- * the open_path() would unlink the remap file after
- * the very first open.
- */
-
- fdesc = find_file_desc_raw(FD_TYPES__REG, id);
- if (fdesc == NULL) {
- if (!optional)
- pr_err("No entry for reg-file-ID %#x\n", id);
- return NULL;
- }
-
- remap_get(fdesc, 's');
- return fdesc;
-}
-
-static int collect_one_regfile(void *o, ProtobufCMessage *base)
-{
- struct reg_file_info *rfi = o;
- static char dot[] = ".";
-
- rfi->rfe = pb_msg(base, RegFileEntry);
- /* change "/foo" into "foo" and "/" into "." */
- if (rfi->rfe->name[1] == '\0')
- rfi->path = dot;
- else
- rfi->path = rfi->rfe->name + 1;
- rfi->remap = NULL;
- rfi->size_checked = false;
-
- pr_info("Collected [%s] ID %#x\n", rfi->path, rfi->rfe->id);
- return file_desc_add(&rfi->d, rfi->rfe->id, ®_desc_ops);
-}
-
-static struct collect_image_info reg_file_cinfo = {
- .fd_type = CR_FD_REG_FILES,
- .pb_type = PB_REG_FILE,
- .priv_size = sizeof(struct reg_file_info),
- .collect = collect_one_regfile,
-};
-
-int prepare_shared_reg_files(void)
-{
- ghost_file_mutex = shmalloc(sizeof(*ghost_file_mutex));
- if (!ghost_file_mutex)
- return -1;
-
- mutex_init(ghost_file_mutex);
- return 0;
-}
-
-int collect_remaps_and_regfiles(void)
-{
- if (collect_image(®_file_cinfo))
- return -1;
-
- if (collect_image(&remap_cinfo))
- return -1;
-
- return 0;
-}
diff --git a/files.c b/files.c
deleted file mode 100644
index db15527e9ed9..000000000000
--- a/files.c
+++ /dev/null
@@ -1,1587 +0,0 @@
-#include <unistd.h>
-#include <fcntl.h>
-#include <errno.h>
-
-#include <linux/limits.h>
-#include <linux/major.h>
-
-#include <sys/types.h>
-#include <sys/prctl.h>
-#include <sys/mman.h>
-#include <sys/stat.h>
-#include <sys/socket.h>
-#include <sys/un.h>
-#include <stdlib.h>
-
-#include "files.h"
-#include "file-ids.h"
-#include "files-reg.h"
-#include "file-lock.h"
-#include "image.h"
-#include "list.h"
-#include "util.h"
-#include "util-pie.h"
-#include "lock.h"
-#include "sockets.h"
-#include "pstree.h"
-#include "tty.h"
-#include "pipes.h"
-#include "fifo.h"
-#include "eventfd.h"
-#include "eventpoll.h"
-#include "fsnotify.h"
-#include "mount.h"
-#include "signalfd.h"
-#include "namespaces.h"
-#include "tun.h"
-#include "timerfd.h"
-#include "imgset.h"
-#include "fs-magic.h"
-#include "proc_parse.h"
-#include "cr_options.h"
-
-#include "parasite.h"
-#include "parasite-syscall.h"
-
-#include "protobuf.h"
-#include "protobuf/fs.pb-c.h"
-#include "protobuf/ext-file.pb-c.h"
-
-#include "plugin.h"
-
-#define FDESC_HASH_SIZE 64
-static struct hlist_head file_desc_hash[FDESC_HASH_SIZE];
-
-int prepare_shared_fdinfo(void)
-{
- int i;
-
- for (i = 0; i < FDESC_HASH_SIZE; i++)
- INIT_HLIST_HEAD(&file_desc_hash[i]);
-
- return 0;
-}
-
-void file_desc_init(struct file_desc *d, u32 id, struct file_desc_ops *ops)
-{
- INIT_LIST_HEAD(&d->fd_info_head);
- INIT_HLIST_NODE(&d->hash);
-
- d->id = id;
- d->ops = ops;
-}
-
-int file_desc_add(struct file_desc *d, u32 id, struct file_desc_ops *ops)
-{
- file_desc_init(d, id, ops);
- hlist_add_head(&d->hash, &file_desc_hash[id % FDESC_HASH_SIZE]);
- return 0; /* this is to make tail-calls in collect_one_foo look nice */
-}
-
-struct file_desc *find_file_desc_raw(int type, u32 id)
-{
- struct file_desc *d;
- struct hlist_head *chain;
-
- chain = &file_desc_hash[id % FDESC_HASH_SIZE];
- hlist_for_each_entry(d, chain, hash)
- if (d->ops->type == type && d->id == id)
- return d;
-
- return NULL;
-}
-
-static inline struct file_desc *find_file_desc(FdinfoEntry *fe)
-{
- return find_file_desc_raw(fe->type, fe->id);
-}
-
-/*
- * A file may be shared between several file descriptors. E.g
- * when doing a fork() every fd of a forker and respective fds
- * of the child have such. Another way of getting shared files
- * is by dup()-ing them or sending them via unix sockets in
- * SCM_RIGHTS message.
- *
- * We restore this type of things in 3 steps (states[] below)
- *
- * 1. Prepare step.
- * Select which task will create the file (open() one, or
- * call any other syscall for than (socket, pipe, etc.). All
- * the others, that share one, create unix sockets under the
- * respective file descriptor (transport socket).
- * 2. Open step.
- * The one who creates the file (the 'master') creates one,
- * then creates one more unix socket (transport) and sends the
- * created file over this socket to the other recepients.
- * 3. Receive step.
- * Those, who wait for the file to appear, receive one via
- * the transport socket, then close the socket and dup() the
- * received file descriptor into its place.
- *
- * There's the 4th step in the states[] array -- the post_open
- * one. This one is not about file-sharing resolving, but about
- * doing something with a file using it's 'desired' fd. The
- * thing is that while going the 3-step process above, the file
- * may appear in variuos places in the task's fd table, and if
- * we want to do something with it's _final_ descriptor value,
- * we should wait for it to appear there. So the post_open is
- * called when the file is finally set into its place.
- */
-
-struct fdinfo_list_entry *file_master(struct file_desc *d)
-{
- if (list_empty(&d->fd_info_head)) {
- pr_err("Empty list on file desc id %#x(%d)\n", d->id,
- d->ops ? d->ops->type : -1);
- BUG();
- }
-
- return list_first_entry(&d->fd_info_head,
- struct fdinfo_list_entry, desc_list);
-}
-
-void show_saved_files(void)
-{
- int i;
- struct file_desc *fd;
-
- pr_info("File descs:\n");
- for (i = 0; i < FDESC_HASH_SIZE; i++)
- hlist_for_each_entry(fd, &file_desc_hash[i], hash) {
- struct fdinfo_list_entry *le;
-
- pr_info(" `- type %d ID %#x\n", fd->ops->type, fd->id);
- list_for_each_entry(le, &fd->fd_info_head, desc_list)
- pr_info(" `- FD %d pid %d\n", le->fe->fd, le->pid);
- }
-}
-
-/*
- * Workaround for the OverlayFS bug present before Kernel 4.2
- *
- * This is here only to support the Linux Kernel between versions
- * 3.18 and 4.2. After that, this workaround is not needed anymore,
- * but it will work properly on both a kernel with and withouth the bug.
- *
- * When a process has a file open in an OverlayFS directory,
- * the information in /proc/<pid>/fd/<fd> and /proc/<pid>/fdinfo/<fd>
- * is wrong. We can't even rely on stat()-ing /proc/<pid>/fd/<fd> since
- * this will show us the wrong filesystem type.
- *
- * So we grab that information from the mountinfo table instead. This is done
- * every time fill_fdlink is called. See lookup_overlayfs for more details.
- *
- */
-static int fixup_overlayfs(struct fd_parms *p, struct fd_link *link)
-{
- struct mount_info *m;
-
- if (!link)
- return 0;
-
- m = lookup_overlayfs(link->name, p->stat.st_dev, p->stat.st_ino, p->mnt_id);
- if (IS_ERR(m))
- return -1;
-
- if (!m)
- return 0;
-
- p->mnt_id = m->mnt_id;
-
- /*
- * If the bug is present, the file path from /proc/<pid>/fd
- * does not include the mountpoint, so we prepend it ourselves.
- */
- if (strcmp("./", m->mountpoint) != 0) {
- char buf[PATH_MAX];
- int n;
-
- strncpy(buf, link->name, PATH_MAX - 1);
- n = snprintf(link->name, PATH_MAX, "%s/%s", m->mountpoint, buf + 2);
- if (n >= PATH_MAX) {
- pr_err("Not enough space to replace %s\n", buf);
- return -1;
- }
- }
- return 0;
-}
-
-/*
- * The gen_id thing is used to optimize the comparison of shared files.
- * If two files have different gen_ids, then they are different for sure.
- * If it matches, we don't know it and have to call sys_kcmp().
- *
- * The kcmp-ids.c engine does this trick, see comments in it for more info.
- */
-
-static u32 make_gen_id(const struct fd_parms *p)
-{
- return ((u32)p->stat.st_dev) ^ ((u32)p->stat.st_ino) ^ ((u32)p->pos);
-}
-
-int do_dump_gen_file(struct fd_parms *p, int lfd,
- const struct fdtype_ops *ops, struct cr_img *img)
-{
- FdinfoEntry e = FDINFO_ENTRY__INIT;
- int ret = -1;
-
- e.type = ops->type;
- e.id = make_gen_id(p);
- e.fd = p->fd;
- e.flags = p->fd_flags;
-
- ret = fd_id_generate(p->pid, &e, p);
- if (ret == 1) /* new ID generated */
- ret = ops->dump(lfd, e.id, p);
-
- if (ret < 0)
- return ret;
-
- pr_info("fdinfo: type: 0x%2x flags: %#o/%#o pos: 0x%8"PRIx64" fd: %d\n",
- ops->type, p->flags, (int)p->fd_flags, p->pos, p->fd);
-
- return pb_write_one(img, &e, PB_FDINFO);
-}
-
-int fill_fdlink(int lfd, const struct fd_parms *p, struct fd_link *link)
-{
- int len;
-
- link->name[0] = '.';
-
- len = read_fd_link(lfd, &link->name[1], sizeof(link->name) - 1);
- if (len < 0) {
- pr_err("Can't read link for pid %d fd %d\n", p->pid, p->fd);
- return -1;
- }
-
- link->len = len + 1;
-
- if (opts.overlayfs)
- if (fixup_overlayfs((struct fd_parms *)p, link) < 0)
- return -1;
- return 0;
-}
-
-static int fill_fd_params(struct parasite_ctl *ctl, int fd, int lfd,
- struct fd_opts *opts, struct fd_parms *p)
-{
- int ret;
- struct statfs fsbuf;
- struct fdinfo_common fdinfo = { .mnt_id = -1, .owner = ctl->pid.virt };
-
- if (fstat(lfd, &p->stat) < 0) {
- pr_perror("Can't stat fd %d", lfd);
- return -1;
- }
-
- if (fstatfs(lfd, &fsbuf) < 0) {
- pr_perror("Can't statfs fd %d", lfd);
- return -1;
- }
-
- if (parse_fdinfo_pid(ctl->pid.real, fd, FD_TYPES__UND, NULL, &fdinfo))
- return -1;
-
- p->fs_type = fsbuf.f_type;
- p->ctl = ctl;
- p->fd = fd;
- p->pos = fdinfo.pos;
- p->flags = fdinfo.flags;
- p->mnt_id = fdinfo.mnt_id;
- p->pid = ctl->pid.real;
- p->fd_flags = opts->flags;
-
- fown_entry__init(&p->fown);
-
- pr_info("%d fdinfo %d: pos: 0x%16"PRIx64" flags: %16o/%#x\n",
- ctl->pid.real, fd, p->pos, p->flags, (int)p->fd_flags);
-
- ret = fcntl(lfd, F_GETSIG, 0);
- if (ret < 0) {
- pr_perror("Can't get owner signum on %d", lfd);
- return -1;
- }
- p->fown.signum = ret;
-
- if (opts->fown.pid == 0)
- return 0;
-
- p->fown.pid = opts->fown.pid;
- p->fown.pid_type = opts->fown.pid_type;
- p->fown.uid = opts->fown.uid;
- p->fown.euid = opts->fown.euid;
-
- return 0;
-}
-
-static const struct fdtype_ops *get_misc_dev_ops(int minor)
-{
- switch (minor) {
- case TUN_MINOR:
- return &tunfile_dump_ops;
- };
-
- return NULL;
-}
-
-static const struct fdtype_ops *get_mem_dev_ops(struct fd_parms *p, int minor)
-{
- const struct fdtype_ops *ops = NULL;
-
- switch (minor) {
- case 11:
- /*
- * If /dev/kmsg is opened in write-only mode the file position
- * should not be set up upon restore, kernel doesn't allow that.
- */
- if ((p->flags & O_ACCMODE) == O_WRONLY && p->pos == 0)
- p->pos = -1ULL;
- /*
- * Fallthrough.
- */
- default:
- ops = &regfile_dump_ops;
- break;
- };
-
- return ops;
-}
-
-static int dump_chrdev(struct fd_parms *p, int lfd, struct cr_img *img)
-{
- int maj = major(p->stat.st_rdev);
- const struct fdtype_ops *ops;
-
- switch (maj) {
- case MEM_MAJOR:
- ops = get_mem_dev_ops(p, minor(p->stat.st_rdev));
- break;
- case MISC_MAJOR:
- ops = get_misc_dev_ops(minor(p->stat.st_rdev));
- if (ops)
- break;
- /* fallthrough */
- default: {
- char more[32];
-
- if (is_tty(p->stat.st_rdev, p->stat.st_dev)) {
- struct fd_link link;
-
- if (fill_fdlink(lfd, p, &link))
- return -1;
- p->link = &link;
- ops = &tty_dump_ops;
- break;
- }
-
- sprintf(more, "%d:%d", maj, minor(p->stat.st_rdev));
- return dump_unsupp_fd(p, lfd, img, "chr", more);
- }
- }
-
- return do_dump_gen_file(p, lfd, ops, img);
-}
-
-static int dump_one_file(struct parasite_ctl *ctl, int fd, int lfd, struct fd_opts *opts,
- struct cr_img *img)
-{
- struct fd_parms p = FD_PARMS_INIT;
- const struct fdtype_ops *ops;
-
- if (fill_fd_params(ctl, fd, lfd, opts, &p) < 0) {
- pr_perror("Can't get stat on %d", fd);
- return -1;
- }
-
- if (note_file_lock(&ctl->pid, fd, lfd, &p))
- return -1;
-
- if (S_ISSOCK(p.stat.st_mode))
- return dump_socket(&p, lfd, img);
-
- if (S_ISCHR(p.stat.st_mode))
- return dump_chrdev(&p, lfd, img);
-
- if (p.fs_type == ANON_INODE_FS_MAGIC) {
- char link[32];
-
- if (read_fd_link(lfd, link, sizeof(link)) < 0)
- return -1;
-
- if (is_eventfd_link(link))
- ops = &eventfd_dump_ops;
- else if (is_eventpoll_link(link))
- ops = &eventpoll_dump_ops;
- else if (is_inotify_link(link))
- ops = &inotify_dump_ops;
- else if (is_fanotify_link(link))
- ops = &fanotify_dump_ops;
- else if (is_signalfd_link(link))
- ops = &signalfd_dump_ops;
- else if (is_timerfd_link(link))
- ops = &timerfd_dump_ops;
- else
- return dump_unsupp_fd(&p, lfd, img, "anon", link);
-
- return do_dump_gen_file(&p, lfd, ops, img);
- }
-
- if (S_ISREG(p.stat.st_mode) || S_ISDIR(p.stat.st_mode)) {
- struct fd_link link;
-
- if (fill_fdlink(lfd, &p, &link))
- return -1;
-
- p.link = &link;
- if (link.name[1] == '/')
- return do_dump_gen_file(&p, lfd, &regfile_dump_ops, img);
-
- if (check_ns_proc(&link))
- return do_dump_gen_file(&p, lfd, &nsfile_dump_ops, img);
-
- return dump_unsupp_fd(&p, lfd, img, "reg", link.name + 1);
- }
-
- if (S_ISFIFO(p.stat.st_mode)) {
- if (p.fs_type == PIPEFS_MAGIC)
- ops = &pipe_dump_ops;
- else
- ops = &fifo_dump_ops;
-
- return do_dump_gen_file(&p, lfd, ops, img);
- }
-
- return dump_unsupp_fd(&p, lfd, img, "unknown", NULL);
-}
-
-int dump_task_files_seized(struct parasite_ctl *ctl, struct pstree_item *item,
- struct parasite_drain_fd *dfds)
-{
- int *lfds;
- struct cr_img *img;
- struct fd_opts *opts;
- int i, ret = -1;
-
- pr_info("\n");
- pr_info("Dumping opened files (pid: %d)\n", ctl->pid.real);
- pr_info("----------------------------------------\n");
-
- lfds = xmalloc(dfds->nr_fds * sizeof(int));
- if (!lfds)
- goto err;
-
- opts = xmalloc(dfds->nr_fds * sizeof(struct fd_opts));
- if (!opts)
- goto err1;
-
- ret = parasite_drain_fds_seized(ctl, dfds, lfds, opts);
- if (ret)
- goto err2;
-
- img = open_image(CR_FD_FDINFO, O_DUMP, item->ids->files_id);
- if (!img)
- goto err2;
-
- for (i = 0; i < dfds->nr_fds; i++) {
- ret = dump_one_file(ctl, dfds->fds[i], lfds[i], opts + i, img);
- close(lfds[i]);
- if (ret)
- break;
- }
-
- close_image(img);
-
- pr_info("----------------------------------------\n");
-err2:
- xfree(opts);
-err1:
- xfree(lfds);
-err:
- return ret;
-}
-
-static int predump_one_fd(int pid, int fd)
-{
- const struct fdtype_ops *ops;
- char link[PATH_MAX], t[32];
- int ret = 0;
-
- snprintf(t, sizeof(t), "/proc/%d/fd/%d", pid, fd);
- ret = readlink(t, link, sizeof(link));
- if (ret < 0) {
- pr_perror("Can't read link of fd %d", fd);
- return -1;
- } else if ((size_t)ret == sizeof(link)) {
- pr_err("Buffer for read link of fd %d is too small\n", fd);
- return -1;
- }
- link[ret] = 0;
-
- ret = 0;
- if (is_inotify_link(link))
- ops = &inotify_dump_ops;
- else if (is_fanotify_link(link))
- ops = &fanotify_dump_ops;
- else
- goto out;
-
- pr_debug("Pre-dumping %d's %d fd\n", pid, fd);
- ret = ops->pre_dump(pid, fd);
-out:
- return ret;
-}
-
-int predump_task_files(int pid)
-{
- struct dirent *de;
- DIR *fd_dir;
- int ret = -1;
-
- pr_info("Pre-dump fds for %d)\n", pid);
-
- fd_dir = opendir_proc(pid, "fd");
- if (!fd_dir)
- return -1;
-
- while ((de = readdir(fd_dir))) {
- if (dir_dots(de))
- continue;
-
- if (predump_one_fd(pid, atoi(de->d_name)))
- goto out;
- }
-
- ret = 0;
-out:
- closedir(fd_dir);
- return ret;
-}
-
-int restore_fown(int fd, FownEntry *fown)
-{
- struct f_owner_ex owner;
- uid_t uids[3];
- pid_t pid = getpid();
-
- if (fown->signum) {
- if (fcntl(fd, F_SETSIG, fown->signum)) {
- pr_perror("%d: Can't set signal", pid);
- return -1;
- }
- }
-
- /* May be untouched */
- if (!fown->pid)
- return 0;
-
- if (getresuid(&uids[0], &uids[1], &uids[2])) {
- pr_perror("%d: Can't get current UIDs", pid);
- return -1;
- }
-
- if (setresuid(fown->uid, fown->euid, uids[2])) {
- pr_perror("%d: Can't set UIDs", pid);
- return -1;
- }
-
- owner.type = fown->pid_type;
- owner.pid = fown->pid;
-
- if (fcntl(fd, F_SETOWN_EX, &owner)) {
- pr_perror("%d: Can't setup %d file owner pid",
- pid, fd);
- return -1;
- }
-
- if (setresuid(uids[0], uids[1], uids[2])) {
- pr_perror("%d: Can't revert UIDs back", pid);
- return -1;
- }
-
- return 0;
-}
-
-int rst_file_params(int fd, FownEntry *fown, int flags)
-{
- if (set_fd_flags(fd, flags) < 0)
- return -1;
- if (restore_fown(fd, fown) < 0)
- return -1;
- return 0;
-}
-
-static int collect_fd(int pid, FdinfoEntry *e, struct rst_info *rst_info)
-{
- struct fdinfo_list_entry *le, *new_le;
- struct file_desc *fdesc;
-
- pr_info("Collect fdinfo pid=%d fd=%d id=%#x\n",
- pid, e->fd, e->id);
-
- new_le = shmalloc(sizeof(*new_le));
- if (!new_le)
- return -1;
-
- futex_init(&new_le->real_pid);
- new_le->pid = pid;
- new_le->fe = e;
-
- fdesc = find_file_desc(e);
- if (fdesc == NULL) {
- pr_err("No file for fd %d id %#x\n", e->fd, e->id);
- return -1;
- }
-
- list_for_each_entry(le, &fdesc->fd_info_head, desc_list)
- if (pid_rst_prio(new_le->pid, le->pid))
- break;
-
- if (fdesc->ops->collect_fd)
- fdesc->ops->collect_fd(fdesc, new_le, rst_info);
- else
- collect_gen_fd(new_le, rst_info);
-
- list_add_tail(&new_le->desc_list, &le->desc_list);
- new_le->desc = fdesc;
-
- return 0;
-}
-
-int prepare_ctl_tty(int pid, struct rst_info *rst_info, u32 ctl_tty_id)
-{
- FdinfoEntry *e;
-
- if (!ctl_tty_id)
- return 0;
-
- pr_info("Requesting for ctl tty %#x into service fd\n", ctl_tty_id);
-
- e = xmalloc(sizeof(*e));
- if (!e)
- return -1;
-
- fdinfo_entry__init(e);
-
- e->id = ctl_tty_id;
- e->fd = reserve_service_fd(CTL_TTY_OFF);
- e->type = FD_TYPES__TTY;
-
- if (collect_fd(pid, e, rst_info)) {
- xfree(e);
- return -1;
- }
-
- return 0;
-}
-
-int prepare_fd_pid(struct pstree_item *item)
-{
- int ret = 0;
- struct cr_img *img;
- pid_t pid = item->pid.virt;
- struct rst_info *rst_info = rsti(item);
-
- INIT_LIST_HEAD(&rst_info->fds);
- INIT_LIST_HEAD(&rst_info->eventpoll);
- INIT_LIST_HEAD(&rst_info->tty_slaves);
- INIT_LIST_HEAD(&rst_info->tty_ctty);
-
- if (!fdinfo_per_id) {
- img = open_image(CR_FD_FDINFO, O_RSTR, pid);
- if (!img)
- return -1;
- } else {
- if (item->ids == NULL) /* zombie */
- return 0;
-
- if (rsti(item)->fdt && rsti(item)->fdt->pid != item->pid.virt)
- return 0;
-
- img = open_image(CR_FD_FDINFO, O_RSTR, item->ids->files_id);
- if (!img)
- return -1;
- }
-
- while (1) {
- FdinfoEntry *e;
-
- ret = pb_read_one_eof(img, &e, PB_FDINFO);
- if (ret <= 0)
- break;
-
- ret = collect_fd(pid, e, rst_info);
- if (ret < 0) {
- fdinfo_entry__free_unpacked(e, NULL);
- break;
- }
- }
-
- close_image(img);
- return ret;
-}
-
-#define SETFL_MASK (O_APPEND | O_ASYNC | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME)
-int set_fd_flags(int fd, int flags)
-{
- int ret;
-
- ret = fcntl(fd, F_GETFL, 0);
- if (ret < 0)
- goto err;
-
- flags = (SETFL_MASK & flags) | (ret & ~SETFL_MASK);
-
- ret = fcntl(fd, F_SETFL, flags);
- if (ret < 0)
- goto err;
-
- /* Let's check, that now actual flags contains those we need */
- ret = fcntl(fd, F_GETFL, 0);
- if (ret < 0)
- goto err;
-
- if (ret != flags) {
- pr_err("fcntl call on fd %d (flags %#o) succeeded, "
- "but some flags were dropped: %#o\n", fd, flags, ret);
- return -1;
- }
- return 0;
-
-err:
- pr_perror("fcntl call on fd %d (flags %x) failed", fd, flags);
- return -1;
-}
-
-struct fd_open_state {
- char *name;
- int (*cb)(int, struct fdinfo_list_entry *);
-
- /*
- * Two last stages -- receive fds and post-open them -- are
- * not required always. E.g. if no fd sharing takes place
- * or task doens't have any files that need to be post-opened.
- *
- * Thus, in order not to scan through fdinfo-s lists in vain
- * and speed things up a little bit, we may want to skeep these.
- */
- bool required;
-};
-
-static int open_transport_fd(int pid, struct fdinfo_list_entry *fle);
-static int open_fd(int pid, struct fdinfo_list_entry *fle);
-static int receive_fd(int pid, struct fdinfo_list_entry *fle);
-static int post_open_fd(int pid, struct fdinfo_list_entry *fle);
-
-static struct fd_open_state states[] = {
- { "prepare", open_transport_fd, true,},
- { "create", open_fd, true,},
- { "receive", receive_fd, false,},
- { "post_create", post_open_fd, false,},
-};
-
-#define want_recv_stage() do { states[2].required = true; } while (0)
-#define want_post_open_stage() do { states[3].required = true; } while (0)
-
-static void transport_name_gen(struct sockaddr_un *addr, int *len,
- int pid, int fd)
-{
- addr->sun_family = AF_UNIX;
- snprintf(addr->sun_path, UNIX_PATH_MAX, "x/crtools-fd-%d-%d", pid, fd);
- *len = SUN_LEN(addr);
- *addr->sun_path = '\0';
-}
-
-static int should_open_transport(FdinfoEntry *fe, struct file_desc *fd)
-{
- if (fd->ops->want_transport)
- return fd->ops->want_transport(fe, fd);
- else
- return 0;
-}
-
-static int open_transport_fd(int pid, struct fdinfo_list_entry *fle)
-{
- struct fdinfo_list_entry *flem;
- struct sockaddr_un saddr;
- int sock;
- int ret, sun_len;
-
- flem = file_master(fle->desc);
-
- if (flem->pid == pid) {
- if (flem->fe->fd != fle->fe->fd)
- /* dup-ed file. Will be opened in the open_fd */
- return 0;
-
- if (!should_open_transport(fle->fe, fle->desc))
- /* pure master file */
- return 0;
-
- /*
- * some master file, that wants a transport, e.g.
- * a pipe or unix socket pair 'slave' end
- */
- }
-
- transport_name_gen(&saddr, &sun_len, getpid(), fle->fe->fd);
-
- pr_info("\t\tCreate transport fd %s\n", saddr.sun_path + 1);
-
-
- sock = socket(PF_UNIX, SOCK_DGRAM, 0);
- if (sock < 0) {
- pr_perror("Can't create socket");
- return -1;
- }
- ret = bind(sock, &saddr, sun_len);
- if (ret < 0) {
- pr_perror("Can't bind unix socket %s", saddr.sun_path + 1);
- goto err;
- }
-
- ret = reopen_fd_as(fle->fe->fd, sock);
- if (ret < 0)
- goto err;
-
- pr_info("\t\tWake up fdinfo pid=%d fd=%d\n", fle->pid, fle->fe->fd);
- futex_set_and_wake(&fle->real_pid, getpid());
- want_recv_stage();
-
- return 0;
-err:
- close(sock);
- return -1;
-}
-
-int send_fd_to_peer(int fd, struct fdinfo_list_entry *fle, int sock)
-{
- struct sockaddr_un saddr;
- int len;
-
- pr_info("\t\tWait fdinfo pid=%d fd=%d\n", fle->pid, fle->fe->fd);
- futex_wait_while(&fle->real_pid, 0);
- transport_name_gen(&saddr, &len,
- futex_get(&fle->real_pid), fle->fe->fd);
- pr_info("\t\tSend fd %d to %s\n", fd, saddr.sun_path + 1);
- return send_fd(sock, &saddr, len, fd);
-}
-
-static int send_fd_to_self(int fd, struct fdinfo_list_entry *fle, int *sock)
-{
- int dfd = fle->fe->fd;
-
- if (fd == dfd)
- return 0;
-
- /* make sure we won't clash with an inherit fd */
- if (inherit_fd_resolve_clash(dfd) < 0)
- return -1;
-
- pr_info("\t\t\tGoing to dup %d into %d\n", fd, dfd);
- if (move_img_fd(sock, dfd))
- return -1;
-
- if (dup2(fd, dfd) != dfd) {
- pr_perror("Can't dup local fd %d -> %d", fd, dfd);
- return -1;
- }
-
- if (fcntl(dfd, F_SETFD, fle->fe->flags) == -1) {
- pr_perror("Unable to set file descriptor flags");
- return -1;
- }
-
- return 0;
-}
-
-static int post_open_fd(int pid, struct fdinfo_list_entry *fle)
-{
- struct file_desc *d = fle->desc;
-
- if (!d->ops->post_open)
- return 0;
-
- if (is_service_fd(fle->fe->fd, CTL_TTY_OFF))
- return d->ops->post_open(d, fle->fe->fd);
-
- if (fle != file_master(d))
- return 0;
-
- return d->ops->post_open(d, fle->fe->fd);
-}
-
-
-static int serve_out_fd(int pid, int fd, struct file_desc *d)
-{
- int sock, ret;
- struct fdinfo_list_entry *fle;
-
- sock = socket(PF_UNIX, SOCK_DGRAM, 0);
- if (sock < 0) {
- pr_perror("Can't create socket");
- return -1;
- }
-
- pr_info("\t\tCreate fd for %d\n", fd);
-
- list_for_each_entry(fle, &d->fd_info_head, desc_list) {
- if (pid == fle->pid)
- ret = send_fd_to_self(fd, fle, &sock);
- else
- ret = send_fd_to_peer(fd, fle, sock);
-
- if (ret) {
- pr_err("Can't sent fd %d to %d\n", fd, fle->pid);
- goto out;
- }
- }
-
- ret = 0;
-out:
- close(sock);
- return ret;
-}
-
-static int open_fd(int pid, struct fdinfo_list_entry *fle)
-{
- struct file_desc *d = fle->desc;
- int new_fd;
-
- if (d->ops->post_open)
- want_post_open_stage();
-
- if (fle != file_master(d))
- return 0;
-
- new_fd = d->ops->open(d);
- if (new_fd < 0)
- return -1;
-
- if (reopen_fd_as(fle->fe->fd, new_fd))
- return -1;
-
- if (fcntl(fle->fe->fd, F_SETFD, fle->fe->flags) == -1) {
- pr_perror("Unable to set file descriptor flags");
- return -1;
- }
-
- return serve_out_fd(pid, fle->fe->fd, d);
-}
-
-static int receive_fd(int pid, struct fdinfo_list_entry *fle)
-{
- int tmp;
- struct fdinfo_list_entry *flem;
-
- flem = file_master(fle->desc);
- if (flem->pid == pid)
- return 0;
-
- pr_info("\tReceive fd for %d\n", fle->fe->fd);
-
- tmp = recv_fd(fle->fe->fd);
- if (tmp < 0) {
- pr_err("Can't get fd %d\n", tmp);
- return -1;
- }
- close(fle->fe->fd);
-
- if (reopen_fd_as(fle->fe->fd, tmp) < 0)
- return -1;
-
- if (fcntl(fle->fe->fd, F_SETFD, fle->fe->flags) == -1) {
- pr_perror("Unable to set file descriptor flags");
- return -1;
- }
-
- return 0;
-}
-
-static int open_fdinfo(int pid, struct fdinfo_list_entry *fle, int state)
-{
- pr_info("\tRestoring fd %d (state -> %s)\n",
- fle->fe->fd, states[state].name);
- return states[state].cb(pid, fle);
-}
-
-static int open_fdinfos(int pid, struct list_head *list, int state)
-{
- int ret = 0;
- struct fdinfo_list_entry *fle;
-
- list_for_each_entry(fle, list, ps_list) {
- ret = open_fdinfo(pid, fle, state);
- if (ret)
- break;
- }
-
- return ret;
-}
-
-static struct inherit_fd *inherit_fd_lookup_fd(int fd, const char *caller);
-
-int close_old_fds(void)
-{
- DIR *dir;
- struct dirent *de;
- int fd, ret;
-
- dir = opendir_proc(PROC_SELF, "fd");
- if (dir == NULL)
- return -1;
-
- while ((de = readdir(dir))) {
- if (dir_dots(de))
- continue;
-
- ret = sscanf(de->d_name, "%d", &fd);
- if (ret != 1) {
- pr_err("Can't parse %s\n", de->d_name);
- return -1;
- }
-
- if ((!is_any_service_fd(fd)) && (dirfd(dir) != fd) &&
- !inherit_fd_lookup_fd(fd, __FUNCTION__))
- close_safe(&fd);
- }
-
- closedir(dir);
- close_pid_proc();
-
- return 0;
-}
-
-int prepare_fds(struct pstree_item *me)
-{
- u32 ret = 0;
- int state;
-
- pr_info("Opening fdinfo-s\n");
-
- /*
- * This must be done after forking to allow child
- * to get the cgroup fd so it can move into the
- * correct /tasks file if it is in a different cgroup
- * set than its parent
- */
- close_service_fd(CGROUP_YARD);
- close_pid_proc(); /* flush any proc cached fds we may have */
-
- if (rsti(me)->fdt) {
- struct fdt *fdt = rsti(me)->fdt;
-
- /*
- * Wait all tasks, who share a current fd table.
- * We should be sure, that nobody use any file
- * descriptor while fdtable is being restored.
- */
- futex_inc_and_wake(&fdt->fdt_lock);
- futex_wait_while_lt(&fdt->fdt_lock, fdt->nr);
-
- if (fdt->pid != me->pid.virt) {
- pr_info("File descriptor table is shared with %d\n", fdt->pid);
- futex_wait_until(&fdt->fdt_lock, fdt->nr + 1);
- goto out;
- }
- }
-
- for (state = 0; state < ARRAY_SIZE(states); state++) {
- if (!states[state].required) {
- pr_debug("Skipping %s fd stage\n", states[state].name);
- continue;
- }
-
- ret = open_fdinfos(me->pid.virt, &rsti(me)->fds, state);
- if (ret)
- break;
-
- /*
- * Now handle TTYs. Slaves are delayed to be sure masters
- * are already opened.
- */
- ret = open_fdinfos(me->pid.virt, &rsti(me)->tty_slaves, state);
- if (ret)
- break;
-
- /*
- * The eventpoll descriptors require all the other ones
- * to be already restored, thus we store them in a separate
- * list and restore at the very end.
- */
- ret = open_fdinfos(me->pid.virt, &rsti(me)->eventpoll, state);
- if (ret)
- break;
- }
-
- if (ret)
- goto out_w;
-
- for (state = 0; state < ARRAY_SIZE(states); state++) {
- if (!states[state].required) {
- pr_debug("Skipping %s fd stage\n", states[state].name);
- continue;
- }
-
- /*
- * Opening current TTYs require session to be already set up,
- * thus slave peers already handled now it's time for cttys,
- */
- ret = open_fdinfos(me->pid.virt, &rsti(me)->tty_ctty, state);
- if (ret)
- break;
- }
-out_w:
- if (rsti(me)->fdt)
- futex_inc_and_wake(&rsti(me)->fdt->fdt_lock);
-out:
- close_service_fd(CR_PROC_FD_OFF);
- tty_fini_fds();
- return ret;
-}
-
-static int fchroot(int fd)
-{
- char fd_path[PSFDS];
- int proc;
-
- /*
- * There's no such thing in syscalls. We can emulate
- * it using the /proc/self/fd/ :)
- *
- * But since there might be no /proc mount in our mount
- * namespace, we will have to ... workaround it.
- */
-
- proc = get_service_fd(PROC_FD_OFF);
- if (fchdir(proc) < 0) {
- pr_perror("Can't chdir to proc");
- return -1;
- }
-
- sprintf(fd_path, "./self/fd/%d", fd);
- pr_debug("Going to chroot into %s\n", fd_path);
- return chroot(fd_path);
-}
-
-int restore_fs(struct pstree_item *me)
-{
- int dd_root = -1, dd_cwd = -1, ret, err = -1;
- struct rst_info *ri = rsti(me);
-
- /*
- * First -- open both descriptors. We will not
- * be able to open the cwd one after we chroot.
- */
-
- dd_root = open_reg_fd(ri->root);
- if (dd_root < 0) {
- pr_err("Can't open root\n");
- goto out;
- }
-
- dd_cwd = open_reg_fd(ri->cwd);
- if (dd_cwd < 0) {
- pr_err("Can't open cwd\n");
- goto out;
- }
-
- /*
- * Now do chroot/chdir. Chroot goes first as it
- * calls chdir into proc service descriptor so
- * we'd need to fix chdir after it anyway.
- */
-
- ret = fchroot(dd_root);
- if (ret < 0) {
- pr_perror("Can't change root");
- goto out;
- }
-
- ret = fchdir(dd_cwd);
- if (ret < 0) {
- pr_perror("Can't change cwd");
- goto out;
- }
-
- if (ri->has_umask) {
- pr_info("Restoring umask to %o\n", ri->umask);
- umask(ri->umask);
- }
-
- err = 0;
-out:
- if (dd_cwd >= 0)
- close(dd_cwd);
- if (dd_root >= 0)
- close(dd_root);
-
- return err;
-}
-
-int prepare_fs_pid(struct pstree_item *item)
-{
- pid_t pid = item->pid.virt;
- struct rst_info *ri = rsti(item);
- struct cr_img *img;
- FsEntry *fe;
- int ret = -1;
-
- img = open_image(CR_FD_FS, O_RSTR, pid);
- if (!img)
- goto out;
-
- ret = pb_read_one_eof(img, &fe, PB_FS);
- close_image(img);
- if (ret <= 0)
- goto out;
-
- ri->cwd = collect_special_file(fe->cwd_id);
- if (!ri->cwd) {
- pr_err("Can't find task cwd file\n");
- goto out_f;
- }
-
- ri->root = collect_special_file(fe->root_id);
- if (!ri->root) {
- pr_err("Can't find task root file\n");
- goto out_f;
- }
-
- ri->has_umask = fe->has_umask;
- ri->umask = fe->umask;
-
- ret = 0;
-out_f:
- fs_entry__free_unpacked(fe, NULL);
-out:
- return ret;
-}
-
-int shared_fdt_prepare(struct pstree_item *item)
-{
- struct pstree_item *parent = item->parent;
- struct fdt *fdt;
-
- if (!rsti(parent)->fdt) {
- fdt = shmalloc(sizeof(*rsti(item)->fdt));
- if (fdt == NULL)
- return -1;
-
- rsti(parent)->fdt = fdt;
-
- futex_init(&fdt->fdt_lock);
- fdt->nr = 1;
- fdt->pid = parent->pid.virt;
- } else
- fdt = rsti(parent)->fdt;
-
- rsti(item)->fdt = fdt;
- rsti(item)->service_fd_id = fdt->nr;
- fdt->nr++;
- if (pid_rst_prio(item->pid.virt, fdt->pid))
- fdt->pid = item->pid.virt;
-
- return 0;
-}
-
-/*
- * Inherit fd support.
- *
- * There are cases where a process's file descriptor cannot be restored
- * from the checkpointed image. For example, a pipe file descriptor with
- * one end in the checkpointed process and the other end in a separate
- * process (that was not part of the checkpointed process tree) cannot be
- * restored because after checkpoint the pipe would be broken and removed.
- *
- * There are also cases where the user wants to use a new file during
- * restore instead of the original file in the checkpointed image. For
- * example, the user wants to change the log file of a process from
- * /path/to/oldlog to /path/to/newlog.
- *
- * In these cases, criu's caller should set up a new file descriptor to be
- * inherited by the restored process and specify it with the --inherit-fd
- * command line option. The argument of --inherit-fd has the format
- * fd[%d]:%s, where %d tells criu which of its own file descriptor to use
- * for restoring file identified by %s.
- *
- * As a debugging aid, if the argument has the format debug[%d]:%s, it tells
- * criu to write out the string after colon to the file descriptor %d. This
- * can be used to leave a "restore marker" in the output stream of the process.
- *
- * It's important to note that inherit fd support breaks applications
- * that depend on the state of the file descriptor being inherited. So,
- * consider inherit fd only for specific use cases that you know for sure
- * won't break the application.
- *
- * For examples please visit http://criu.org/Category:HOWTO.
- */
-
-struct inherit_fd {
- struct list_head inh_list;
- char *inh_id; /* file identifier */
- int inh_fd; /* criu's descriptor to inherit */
- dev_t inh_dev;
- ino_t inh_ino;
- mode_t inh_mode;
- dev_t inh_rdev;
-};
-
-/*
- * Return 1 if inherit fd has been closed or reused, 0 otherwise.
- *
- * Some parts of the file restore engine can close an inherit fd
- * explicitly by close() or implicitly by dup2() to reuse that descriptor.
- * In some specific functions (for example, send_fd_to_self()), we
- * check for clashes at the beginning of the function and, therefore,
- * these specific functions will not reuse an inherit fd. However, to
- * avoid adding a ton of clash detect and resolve code everywhere we close()
- * and/or dup2(), we just make sure that when we're dup()ing or close()ing
- * our inherit fd we're still dealing with the same fd that we inherited.
- */
-static int inherit_fd_reused(struct inherit_fd *inh)
-{
- struct stat sbuf;
-
- if (fstat(inh->inh_fd, &sbuf) == -1) {
- if (errno == EBADF) {
- pr_debug("Inherit fd %s -> %d has been closed\n",
- inh->inh_id, inh->inh_fd);
- return 1;
- }
- pr_perror("Can't fstat inherit fd %d", inh->inh_fd);
- return -1;
- }
-
- if (inh->inh_dev != sbuf.st_dev || inh->inh_ino != sbuf.st_ino ||
- inh->inh_mode != sbuf.st_mode || inh->inh_rdev != sbuf.st_rdev) {
- pr_info("Inherit fd %s -> %d has been reused\n",
- inh->inh_id, inh->inh_fd);
- return 1;
- }
- return 0;
-}
-
-/*
- * We can't print diagnostics messages in this function because the
- * log file isn't initialized yet.
- */
-int inherit_fd_parse(char *optarg)
-{
- char *cp = NULL;
- int n = -1;
- int fd = -1;
- int dbg = 0;
-
- /*
- * Parse the argument.
- */
- if (!strncmp(optarg, "fd", 2))
- cp = &optarg[2];
- else if (!strncmp(optarg, "debug", 5)) {
- cp = &optarg[5];
- dbg = 1;
- }
- if (cp) {
- n = sscanf(cp, "[%d]:", &fd);
- cp = strchr(optarg, ':');
- }
- if (n != 1 || fd < 0 || !cp || !cp[1]) {
- pr_err("Invalid inherit fd argument: %s\n", optarg);
- return -1;
- }
-
- /*
- * If the argument is a debug string, write it to fd.
- * Otherwise, add it to the inherit fd list.
- */
- cp++;
- if (dbg) {
- n = strlen(cp);
- if (write(fd, cp, n) != n) {
- pr_err("Can't write debug message %s to inherit fd %d\n",
- cp, fd);
- return -1;
- }
- return 0;
- }
-
- return inherit_fd_add(fd, cp);
-}
-
-int inherit_fd_add(int fd, char *key)
-{
- struct inherit_fd *inh;
- struct stat sbuf;
-
- if (fstat(fd, &sbuf) == -1) {
- pr_perror("Can't fstat inherit fd %d", fd);
- return -1;
- }
-
- inh = xmalloc(sizeof *inh);
- if (inh == NULL)
- return -1;
-
- inh->inh_id = key;
- inh->inh_fd = fd;
- inh->inh_dev = sbuf.st_dev;
- inh->inh_ino = sbuf.st_ino;
- inh->inh_mode = sbuf.st_mode;
- inh->inh_rdev = sbuf.st_rdev;
- list_add_tail(&inh->inh_list, &opts.inherit_fds);
- return 0;
-}
-
-/*
- * Log the inherit fd list. Called for diagnostics purposes
- * after the log file is initialized.
- */
-void inherit_fd_log(void)
-{
- struct inherit_fd *inh;
-
- list_for_each_entry(inh, &opts.inherit_fds, inh_list) {
- pr_info("File %s will be restored from inherit fd %d\n",
- inh->inh_id, inh->inh_fd);
- }
-}
-
-/*
- * Look up the inherit fd list by a file identifier.
- */
-int inherit_fd_lookup_id(char *id)
-{
- int ret;
- struct inherit_fd *inh;
-
- ret = -1;
- list_for_each_entry(inh, &opts.inherit_fds, inh_list) {
- if (!strcmp(inh->inh_id, id)) {
- if (!inherit_fd_reused(inh)) {
- ret = inh->inh_fd;
- pr_debug("Found id %s (fd %d) in inherit fd list\n",
- id, ret);
- }
- break;
- }
- }
- return ret;
-}
-
-bool inherited_fd(struct file_desc *d, int *fd_p)
-{
- char buf[32], *id_str;
- int i_fd;
-
- if (!d->ops->name)
- return false;
-
- id_str = d->ops->name(d, buf, sizeof(buf));
- i_fd = inherit_fd_lookup_id(id_str);
- if (i_fd < 0)
- return false;
-
- if (fd_p == NULL)
- return true;
-
- *fd_p = dup(i_fd);
- if (*fd_p < 0)
- pr_perror("Inherit fd DUP failed");
- else
- pr_info("File %s will be restored from fd %d dumped "
- "from inherit fd %d\n", id_str, *fd_p, i_fd);
- return true;
-}
-
-/*
- * Look up the inherit fd list by a file descriptor.
- */
-static struct inherit_fd *inherit_fd_lookup_fd(int fd, const char *caller)
-{
- struct inherit_fd *ret;
- struct inherit_fd *inh;
-
- ret = NULL;
- list_for_each_entry(inh, &opts.inherit_fds, inh_list) {
- if (inh->inh_fd == fd) {
- if (!inherit_fd_reused(inh)) {
- ret = inh;
- pr_debug("Found fd %d (id %s) in inherit fd list (caller %s)\n",
- fd, inh->inh_id, caller);
- }
- break;
- }
- }
- return ret;
-}
-
-/*
- * If the specified fd clashes with an inherit fd,
- * move the inherit fd.
- */
-int inherit_fd_resolve_clash(int fd)
-{
- int newfd;
- struct inherit_fd *inh;
-
- inh = inherit_fd_lookup_fd(fd, __FUNCTION__);
- if (inh == NULL)
- return 0;
-
- newfd = dup(fd);
- if (newfd == -1) {
- pr_perror("Can't dup inherit fd %d", fd);
- return -1;
- }
-
- if (close(fd) == -1) {
- close(newfd);
- pr_perror("Can't close inherit fd %d", fd);
- return -1;
- }
-
- inh->inh_fd = newfd;
- pr_debug("Inherit fd %d moved to %d to resolve clash\n", fd, inh->inh_fd);
- return 0;
-}
-
-/*
- * Close all inherit fds.
- */
-int inherit_fd_fini()
-{
- int reused;
- struct inherit_fd *inh;
-
- list_for_each_entry(inh, &opts.inherit_fds, inh_list) {
- if (inh->inh_fd < 0) {
- pr_err("File %s in inherit fd list has invalid fd %d\n",
- inh->inh_id, inh->inh_fd);
- return -1;
- }
-
- reused = inherit_fd_reused(inh);
- if (reused < 0)
- return -1;
-
- if (!reused) {
- pr_debug("Closing inherit fd %d -> %s\n", inh->inh_fd,
- inh->inh_id);
- if (close_safe(&inh->inh_fd) < 0)
- return -1;
- }
- }
- return 0;
-}
-
-bool external_lookup_id(char *id)
-{
- struct external *ext;
-
- list_for_each_entry(ext, &opts.external, node)
- if (!strcmp(ext->id, id))
- return true;
- return false;
-}
diff --git a/fsnotify.c b/fsnotify.c
deleted file mode 100644
index 59259c13e2c3..000000000000
--- a/fsnotify.c
+++ /dev/null
@@ -1,940 +0,0 @@
-#include <unistd.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <stdlib.h>
-#include <signal.h>
-#include <string.h>
-#include <utime.h>
-#include <dirent.h>
-#include <limits.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <sys/inotify.h>
-#include <sys/vfs.h>
-#include <linux/magic.h>
-#include <sys/wait.h>
-#include <sys/poll.h>
-#include <sys/mman.h>
-#include <sys/mount.h>
-#include <aio.h>
-
-#include <sys/fanotify.h>
-
-#include "compiler.h"
-#include "asm/types.h"
-#include "imgset.h"
-#include "fsnotify.h"
-#include "proc_parse.h"
-#include "mount.h"
-#include "image.h"
-#include "util.h"
-#include "files.h"
-#include "files-reg.h"
-#include "file-ids.h"
-#include "log.h"
-#include "list.h"
-#include "lock.h"
-#include "irmap.h"
-#include "cr_options.h"
-#include "namespaces.h"
-#include "pstree.h"
-
-#include "protobuf.h"
-#include "protobuf/fsnotify.pb-c.h"
-#include "protobuf/mnt.pb-c.h"
-
-#undef LOG_PREFIX
-#define LOG_PREFIX "fsnotify: "
-
-struct fsnotify_mark_info {
- struct list_head list;
- union {
- InotifyWdEntry *iwe;
- FanotifyMarkEntry *fme;
- };
- struct file_remap *remap;
-};
-
-struct fsnotify_file_info {
- struct list_head list;
- union {
- InotifyFileEntry *ife;
- FanotifyFileEntry *ffe;
- };
- struct list_head marks;
- struct file_desc d;
-};
-
-/* File handle */
-typedef struct {
- u32 bytes;
- u32 type;
- u64 __handle[16];
-} fh_t;
-
-static LIST_HEAD(inotify_info_head);
-static LIST_HEAD(fanotify_info_head);
-
-/* Checks if the link string @link is of the anon "inotify" type */
-int is_inotify_link(char *link)
-{
- return is_anon_link_type(link, "inotify");
-}
-
-/* Checks if the link string @link is of the anon "[fanotify]" type */
-int is_fanotify_link(char *link)
-{
- return is_anon_link_type(link, "[fanotify]");
-}
-
-static void decode_handle(fh_t *handle, FhEntry *img)
-{
- memzero(handle, sizeof(*handle));
-
- handle->type = img->type;
- handle->bytes = img->bytes;
-
- memcpy(handle->__handle, img->handle,
- min(pb_repeated_size(img, handle),
- sizeof(handle->__handle)));
-}
-
-static int open_by_handle(void *arg, int fd, int pid)
-{
- return open_by_handle_at(fd, arg, O_PATH);
-}
-
-static char *alloc_openable(unsigned int s_dev, unsigned long i_ino, FhEntry *f_handle)
-{
- struct mount_info *m;
- fh_t handle;
- int fd = -1;
- char *path;
-
- decode_handle(&handle, f_handle);
-
- /*
- * We are going to try to open the handle and then,
- * depending on command line options and the type
- * of the filesystem (tmpfs/devtmpfs do not
- * preserve their inodes between mounts), we
- * might need to find an openable path to be
- * used on restore as a watch destination.
- */
- for (m = mntinfo; m; m = m->next) {
- char buf[PATH_MAX], *__path;
- int mntfd, openable_fd;
- struct stat st;
-
- if (m->s_dev != s_dev)
- continue;
-
- mntfd = __open_mountpoint(m, -1);
- pr_debug("\t\tTrying via mntid %d root %s ns_mountpoint @%s (%d)\n",
- m->mnt_id, m->root, m->ns_mountpoint, mntfd);
- if (mntfd < 0)
- continue;
-
- fd = userns_call(open_by_handle, UNS_FDOUT, &handle,
- sizeof(handle), mntfd);
- close(mntfd);
- if (fd < 0)
- continue;
-
- if (read_fd_link(fd, buf, sizeof(buf)) < 0) {
- close(fd);
- goto err;
- }
- close(fd);
-
- /*
- * Convert into a relative path.
- */
- __path = (buf[1] != '\0') ? buf + 1 : ".";
- pr_debug("\t\t\tlink as %s\n", __path);
-
- mntfd = mntns_get_root_fd(m->nsid);
- if (mntfd < 0)
- goto err;
-
- openable_fd = openat(mntfd, __path, O_PATH);
- if (openable_fd >= 0) {
- if (fstat(openable_fd, &st)) {
- pr_perror("Can't stat on %s\n", __path);
- close(openable_fd);
- return ERR_PTR(-errno);
- }
- close(openable_fd);
-
- pr_debug("\t\t\topenable (inode %s) as %s\n",
- st.st_ino == i_ino ?
- "match" : "don't match", __path);
-
- if (st.st_ino == i_ino) {
- path = xstrdup(buf);
- if (path == NULL)
- goto err;
-
- f_handle->has_mnt_id = true;
- f_handle->mnt_id = m->mnt_id;
- return path;
- }
- } else
- pr_debug("\t\t\tnot openable as %s (%m)\n", __path);
- }
-
- return ERR_PTR(-ENOENT);
-err:
- return ERR_PTR(-1);
-}
-
-static int open_handle(unsigned int s_dev, unsigned long i_ino,
- FhEntry *f_handle)
-{
- int mntfd, fd = -1;
- fh_t handle;
-
- decode_handle(&handle, f_handle);
-
- pr_debug("Opening fhandle %x:%Lx...\n",
- s_dev, (unsigned long long)handle.__handle[0]);
-
- mntfd = open_mount(s_dev);
- if (mntfd < 0) {
- pr_err("Mount root for 0x%08x not found\n", s_dev);
- goto out;
- }
-
- fd = userns_call(open_by_handle, UNS_FDOUT, &handle, sizeof(handle), mntfd);
- if (fd < 0) {
- errno = -fd;
- pr_perror("Can't open file handle for 0x%08x:0x%016lx",
- s_dev, i_ino);
- }
-
- close(mntfd);
-out:
- return fd;
-}
-
-/*
- * Pick what to record for the watchee behind @f_handle: on the "out" path
- * f_handle->path is set, on "out_nopath" it is left alone. Returns 0 or -1.
- */
-int check_open_handle(unsigned int s_dev, unsigned long i_ino,
-		FhEntry *f_handle)
-{
-	int fd = open_handle(s_dev, i_ino, f_handle);
-	char *path;
-
-	if (fd >= 0) {
-		struct mount_info *mi;
-
-		pr_debug("\tHandle 0x%x:0x%lx is openable\n", s_dev, i_ino);
-		mi = lookup_mnt_sdev(s_dev);
-		if (mi == NULL) {
-			pr_err("Unable to lookup a mount by dev 0x%x\n", s_dev);
-			goto err;
-		}
-
-		/*
-		 * Always try to fetch watchee path first. There are several reasons:
-		 * - tmpfs/devtmpfs do not save inode numbers between mounts,
-		 *   so it is critical to have the complete path under our
-		 *   hands for restore purpose;
-		 * - in case of migration the inodes might be changed as well
-		 *   so the only portable solution is to carry the whole path
-		 *   to the watchee inside image.
-		 */
-		path = alloc_openable(s_dev, i_ino, f_handle);
-		if (!IS_ERR_OR_NULL(path))
-			goto out;
-
-		if ((mi->fstype->code == FSTYPE__TMPFS) ||
-		    (mi->fstype->code == FSTYPE__DEVTMPFS)) {
-			pr_err("Can't find suitable path for handle (dev %#x ino %#lx): %d\n",
-					s_dev, i_ino, (int)PTR_ERR(path));
-			goto err;
-		}
-
-		if (!opts.force_irmap)
-			/*
-			 * If we're not forced to do irmap, then
-			 * say we have no path for watch. Otherwise
-			 * do irmap scan even if the handle is
-			 * working.
-			 *
-			 * FIXME -- no need to open-by-handle if
-			 * we are in force-irmap and not on tempfs
-			 */
-			goto out_nopath;
-	}
-
-	pr_warn("\tHandle 0x%x:0x%lx cannot be opened\n", s_dev, i_ino);
-	path = irmap_lookup(s_dev, i_ino);
-	if (!path) {
-		pr_err("\tCan't dump that handle\n");
-		goto err;	/* fd is still open when force_irmap brought us here */
-	}
-out:
-	pr_debug("\tDumping %s as path for handle\n", path);
-	f_handle->path = path;
-out_nopath:
-	close_safe(&fd);
-	return 0;
-err:
-	close_safe(&fd);
-	return -1;
-}
-
-struct watch_list {
- struct fsnotify_params fsn_params;
- struct list_head list;
- int n;
-};
-
-static int dump_inotify_entry(union fdinfo_entries *e, void *arg)
-{
- struct watch_list *wd_list = (struct watch_list *) arg;
- struct inotify_wd_entry *wd_entry = (struct inotify_wd_entry *) e;
- InotifyWdEntry *we = &wd_entry->e;
-
- pr_info("wd: wd 0x%08x s_dev 0x%08x i_ino 0x%16"PRIx64" mask 0x%08x\n",
- we->wd, we->s_dev, we->i_ino, we->mask);
- pr_info("\t[fhandle] bytes 0x%08x type 0x%08x __handle 0x%016"PRIx64":0x%016"PRIx64"\n",
- we->f_handle->bytes, we->f_handle->type,
- we->f_handle->handle[0], we->f_handle->handle[1]);
-
- if (we->mask & KERNEL_FS_EVENT_ON_CHILD)
- pr_warn_once("\t\tDetected FS_EVENT_ON_CHILD bit "
- "in mask (will be ignored on restore)\n");
-
- if (check_open_handle(we->s_dev, we->i_ino, we->f_handle)) {
- free_inotify_wd_entry(e);
- return -1;
- }
-
- list_add_tail(&wd_entry->node, &wd_list->list);
- wd_list->n++;
-
- return 0;
-}
-
-static int dump_one_inotify(int lfd, u32 id, const struct fd_parms *p)
-{
- struct watch_list wd_list = {.list = LIST_HEAD_INIT(wd_list.list), .n = 0};
- InotifyFileEntry ie = INOTIFY_FILE_ENTRY__INIT;
- union fdinfo_entries *we, *tmp;
- int exit_code = -1, i, ret;
-
- ret = fd_has_data(lfd);
- if (ret < 0)
- return -1;
- else if (ret > 0)
- pr_warn("The 0x%08x inotify events will be dropped\n", id);
-
- ie.id = id;
- ie.flags = p->flags;
- ie.fown = (FownEntry *)&p->fown;
-
- if (parse_fdinfo(lfd, FD_TYPES__INOTIFY, dump_inotify_entry, &wd_list))
- goto free;
-
- ie.wd = xmalloc(sizeof(*ie.wd) * wd_list.n);
- if (!ie.wd)
- goto free;
-
- i = 0;
- list_for_each_entry(we, &wd_list.list, ify.node)
- ie.wd[i++] = &we->ify.e;
- ie.n_wd = wd_list.n;
-
- pr_info("id 0x%08x flags 0x%08x\n", ie.id, ie.flags);
- if (pb_write_one(img_from_set(glob_imgset, CR_FD_INOTIFY_FILE), &ie, PB_INOTIFY_FILE))
- goto free;
-
- exit_code = 0;
-free:
- xfree(ie.wd);
- list_for_each_entry_safe(we, tmp, &wd_list.list, ify.node)
- free_inotify_wd_entry(we);
-
- return exit_code;
-}
-
-static int pre_dump_inotify_entry(union fdinfo_entries *e, void *arg)
-{
- InotifyWdEntry *we = &e->ify.e;
- int ret;
-
- ret = irmap_queue_cache(we->s_dev, we->i_ino, we->f_handle);
- free_inotify_wd_entry(e);
-
- return ret;
-}
-
-static int pre_dump_one_inotify(int pid, int lfd)
-{
- return parse_fdinfo_pid(pid, lfd, FD_TYPES__INOTIFY, pre_dump_inotify_entry, NULL);
-}
-
-const struct fdtype_ops inotify_dump_ops = {
- .type = FD_TYPES__INOTIFY,
- .dump = dump_one_inotify,
- .pre_dump = pre_dump_one_inotify,
-};
-
-static int dump_fanotify_entry(union fdinfo_entries *e, void *arg)
-{
- struct watch_list *wd_list = (struct watch_list *) arg;
- FanotifyMarkEntry *fme = &e->ffy.e;
-
- if (fme->type == MARK_TYPE__INODE) {
-
- BUG_ON(!fme->ie);
-
- pr_info("mark: s_dev 0x%08x i_ino 0x%016"PRIx64" mask 0x%08x\n",
- fme->s_dev, fme->ie->i_ino, fme->mask);
-
- pr_info("\t[fhandle] bytes 0x%08x type 0x%08x __handle 0x%016"PRIx64":0x%016"PRIx64"\n",
- fme->ie->f_handle->bytes, fme->ie->f_handle->type,
- fme->ie->f_handle->handle[0], fme->ie->f_handle->handle[1]);
-
- if (check_open_handle(fme->s_dev, fme->ie->i_ino, fme->ie->f_handle))
- goto out;
- }
-
- if (fme->type == MARK_TYPE__MOUNT) {
- struct mount_info *m;
-
- BUG_ON(!fme->me);
-
- m = lookup_mnt_id(fme->me->mnt_id);
- if (!m) {
- pr_err("Can't find mnt_id 0x%x\n", fme->me->mnt_id);
- goto out;
- }
- fme->s_dev = m->s_dev;
-
- pr_info("mark: s_dev 0x%08x mnt_id 0x%08x mask 0x%08x\n",
- fme->s_dev, fme->me->mnt_id, fme->mask);
-
- }
-
- list_add_tail(&e->ffy.node, &wd_list->list);
- wd_list->n++;
-
- return 0;
-out:
- free_fanotify_mark_entry(e);
- return -1;
-}
-
-/*
- * Dump one fanotify file: gather its marks via parse_fdinfo(), write the image entry.
- */
-static int dump_one_fanotify(int lfd, u32 id, const struct fd_parms *p)
-{
-	struct watch_list wd_list = {.list = LIST_HEAD_INIT(wd_list.list), .n = 0};
-	FanotifyFileEntry fe = FANOTIFY_FILE_ENTRY__INIT;
-	union fdinfo_entries *we, *tmp;
-	int ret, i = 0;
-
-	ret = fd_has_data(lfd);
-	if (ret < 0)
-		return -1;
-	else if (ret > 0)
-		pr_warn("The 0x%08x fanotify events will be dropped\n", id);
-	ret = -1;
-
-	fe.id = id;
-	fe.flags = p->flags;
-	fe.fown = (FownEntry *)&p->fown;
-
-	if (parse_fdinfo(lfd, FD_TYPES__FANOTIFY, dump_fanotify_entry, &wd_list) < 0)
-		goto free;
-
-	fe.mark = xmalloc(sizeof(*fe.mark) * wd_list.n);
-	if (!fe.mark)
-		goto free;
-
-	/* dump_fanotify_entry() links entries through ffy.node, so walk that member */
-	list_for_each_entry(we, &wd_list.list, ffy.node)
-		fe.mark[i++] = &we->ffy.e;
-	fe.n_mark = wd_list.n;
-
-	pr_info("id 0x%08x flags 0x%08x\n", fe.id, fe.flags);
-	fe.faflags = wd_list.fsn_params.faflags;
-	fe.evflags = wd_list.fsn_params.evflags;
-	ret = pb_write_one(img_from_set(glob_imgset, CR_FD_FANOTIFY_FILE), &fe, PB_FANOTIFY_FILE);
-free:
-	xfree(fe.mark);
-	list_for_each_entry_safe(we, tmp, &wd_list.list, ffy.node)
-		free_fanotify_mark_entry(we);
-	return ret;
-}
-
-static int pre_dump_fanotify_entry(union fdinfo_entries *e, void *arg)
-{
- FanotifyMarkEntry *fme = &e->ffy.e;
- int ret = 0;
-
- if (fme->type == MARK_TYPE__INODE)
- ret = irmap_queue_cache(fme->s_dev, fme->ie->i_ino,
- fme->ie->f_handle);
-
- free_fanotify_mark_entry(e);
- return ret;
-}
-
-static int pre_dump_one_fanotify(int pid, int lfd)
-{
- struct fsnotify_params fsn_params = { };
- return parse_fdinfo_pid(pid, lfd, FD_TYPES__FANOTIFY, pre_dump_fanotify_entry, &fsn_params);
-}
-
-const struct fdtype_ops fanotify_dump_ops = {
- .type = FD_TYPES__FANOTIFY,
- .dump = dump_one_fanotify,
- .pre_dump = pre_dump_one_fanotify,
-};
-
-static char *get_mark_path(const char *who, struct file_remap *remap,
- FhEntry *f_handle, unsigned long i_ino,
- unsigned int s_dev, char *buf, int *target)
-{
- char *path = NULL;
-
- if (remap) {
- int mntns_root;
-
- mntns_root = mntns_get_root_by_mnt_id(remap->rmnt_id);
-
- pr_debug("\t\tRestore %s watch for 0x%08x:0x%016lx (via %s)\n",
- who, s_dev, i_ino, remap->rpath);
- *target = openat(mntns_root, remap->rpath, O_PATH);
- } else if (f_handle->path) {
- int mntns_root;
- char *path = ".";
- uint32_t mnt_id = f_handle->has_mnt_id ? f_handle->mnt_id : -1;
-
-
- /* irmap cache is collected in the root namespaces. */
- mntns_root = mntns_get_root_by_mnt_id(mnt_id);
-
- /* change "/foo" into "foo" and "/" into "." */
- if (f_handle->path[1] != '\0')
- path = f_handle->path + 1;
-
- pr_debug("\t\tRestore with path hint %d:%s\n", mnt_id, path);
- *target = openat(mntns_root, path, O_PATH);
- } else
- *target = open_handle(s_dev, i_ino, f_handle);
-
- if (*target < 0) {
- pr_perror("Unable to open %s", f_handle->path);
- goto err;
- }
-
- /*
- * fanotify/inotify open syscalls want path to attach
- * watch to. But the only thing we have is an FD obtained
- * via fhandle. Fortunately, when trying to attach the
- * /proc/pid/fd/ link, we will watch the inode the link
- * points to, i.e. -- just what we want.
- */
-
- sprintf(buf, "/proc/self/fd/%d", *target);
- path = buf;
-
- if (!pr_quelled(LOG_DEBUG)) {
- char link[PATH_MAX];
-
- if (read_fd_link(*target, link, sizeof(link)) < 0)
- link[0] = '\0';
-
- pr_debug("\t\tRestore %s watch for 0x%08x:0x%016lx (via %s -> %s)\n",
- who, s_dev, i_ino, path, link);
- }
-err:
- return path;
-}
-
-static int restore_one_inotify(int inotify_fd, struct fsnotify_mark_info *info)
-{
- InotifyWdEntry *iwe = info->iwe;
- int ret = -1, target = -1;
- char buf[PSFDS], *path;
-
- path = get_mark_path("inotify", info->remap, iwe->f_handle,
- iwe->i_ino, iwe->s_dev, buf, &target);
- if (!path)
- goto err;
-
- /*
- * FIXME The kernel allocates wd-s sequentially,
- * this is suboptimal, but the kernel doesn't
- * provide an API for this yet :(
- */
- while (1) {
- int wd;
-
- wd = inotify_add_watch(inotify_fd, path, iwe->mask);
- if (wd < 0) {
- pr_perror("Can't add watch for 0x%x with 0x%x", inotify_fd, iwe->wd);
- break;
- } else if (wd == iwe->wd) {
- ret = 0;
- break;
- } else if (wd > iwe->wd) {
- pr_err("Unsorted watch 0x%x found for 0x%x with 0x%x\n", wd, inotify_fd, iwe->wd);
- break;
- }
-
- pr_debug("\t\tWatch got 0x%x but 0x%x expected\n", wd, iwe->wd);
- inotify_rm_watch(inotify_fd, wd);
- }
-
-err:
- if (info->remap)
- remap_put(info->remap);
-
- close_safe(&target);
- return ret;
-}
-
-static int restore_one_fanotify(int fd, struct fsnotify_mark_info *mark)
-{
- FanotifyMarkEntry *fme = mark->fme;
- unsigned int flags = FAN_MARK_ADD;
- int ret = -1, target = -1;
- char buf[PSFDS], *path = NULL;
-
- if (fme->type == MARK_TYPE__MOUNT) {
- struct mount_info *m;
- int mntns_root;
-
- m = lookup_mnt_id(fme->me->mnt_id);
- if (!m) {
- pr_err("Can't find mount mnt_id 0x%x\n", fme->me->mnt_id);
- return -1;
- }
-
- mntns_root = mntns_get_root_fd(m->nsid);
-
- target = openat(mntns_root, m->ns_mountpoint, O_PATH);
- if (target == -1) {
- pr_perror("Unable to open %s", m->ns_mountpoint);
- goto err;
- }
-
- flags |= FAN_MARK_MOUNT;
- snprintf(buf, sizeof(buf), "/proc/self/fd/%d", target);
- path = buf;
- } else if (fme->type == MARK_TYPE__INODE) {
- path = get_mark_path("fanotify", mark->remap,
- fme->ie->f_handle, fme->ie->i_ino,
- fme->s_dev, buf, &target);
- if (!path)
- goto err;
- } else {
- pr_err("Bad fsnotify mark type 0x%x\n", fme->type);
- goto err;
- }
-
- flags |= fme->mflags;
-
- if (mark->fme->mask) {
- ret = fanotify_mark(fd, flags, fme->mask, AT_FDCWD, path);
- if (ret) {
- pr_err("Adding fanotify mask 0x%x on 0x%x/%s failed (%d)\n",
- fme->mask, fme->id, path, ret);
- goto err;
- }
- }
-
- if (fme->ignored_mask) {
- ret = fanotify_mark(fd, flags | FAN_MARK_IGNORED_MASK,
- fme->ignored_mask, AT_FDCWD, path);
- if (ret) {
- pr_err("Adding fanotify ignored-mask 0x%x on 0x%x/%s failed (%d)\n",
- fme->ignored_mask, fme->id, path, ret);
- goto err;
- }
- }
-
- if (mark->remap)
- remap_put(mark->remap);
-
-err:
- close_safe(&target);
- return ret;
-}
-
-/*
- * Re-create an inotify descriptor from its image entry and re-add its watches.
- */
-static int open_inotify_fd(struct file_desc *d)
-{
-	struct fsnotify_file_info *info = container_of(d, struct fsnotify_file_info, d);
-	struct fsnotify_mark_info *wd_info;
-	int tmp;
-
-	tmp = inotify_init1(info->ife->flags);
-	if (tmp < 0) {
-		pr_perror("Can't create inotify for 0x%08x", info->ife->id);
-		return -1;
-	}
-
-	list_for_each_entry(wd_info, &info->marks, list) {
-		pr_info("\tRestore 0x%x wd for 0x%08x\n", wd_info->iwe->wd, wd_info->iwe->id);
-		if (restore_one_inotify(tmp, wd_info)) {
-			close_safe(&tmp);
-			return -1;	/* don't run restore_fown() on a closed fd */
-		}
-	}
-
-	if (restore_fown(tmp, info->ife->fown))
-		close_safe(&tmp);
-	return tmp;
-}
-
-/*
- * Re-create a fanotify descriptor from its image entry and re-add its marks.
- */
-static int open_fanotify_fd(struct file_desc *d)
-{
-	struct fsnotify_file_info *info = container_of(d, struct fsnotify_file_info, d);
-	struct fsnotify_mark_info *mark;
-	unsigned int flags = info->ffe->faflags;
-	int ret;
-
-	if (info->ffe->flags & O_CLOEXEC)
-		flags |= FAN_CLOEXEC;
-	if (info->ffe->flags & O_NONBLOCK)
-		flags |= FAN_NONBLOCK;
-
-	ret = fanotify_init(flags, info->ffe->evflags);
-	if (ret < 0) {
-		/* fanotify_init() reports failure via errno; don't derive it from ret */
-		pr_perror("Can't init fanotify mark (%d)", ret);
-		return -1;
-	}
-
-	list_for_each_entry(mark, &info->marks, list) {
-		pr_info("\tRestore fanotify for 0x%08x\n", mark->fme->id);
-		if (restore_one_fanotify(ret, mark)) {
-			close_safe(&ret);
-			return -1;	/* don't run restore_fown() on a closed fd */
-		}
-	}
-
-	if (restore_fown(ret, info->ffe->fown))
-		close_safe(&ret);
-
-	return ret;
-}
-
-static struct file_desc_ops inotify_desc_ops = {
- .type = FD_TYPES__INOTIFY,
- .open = open_inotify_fd,
-};
-
-static struct file_desc_ops fanotify_desc_ops = {
- .type = FD_TYPES__FANOTIFY,
- .open = open_fanotify_fd,
-};
-
-static struct fsnotify_file_info *find_inotify_info(unsigned id)
-{
- struct fsnotify_file_info *p;
- static struct fsnotify_file_info *last = NULL;
-
- if (last && last->ife->id == id) {
- /*
- * An optimization for clean dump image -- criu puts
- * wd-s for one inotify in one row, thus sometimes
- * we can avoid scanning the inotify_info_head.
- */
- pr_debug("\t\tlast ify for 0x%08x found\n", id);
- return last;
- }
-
- list_for_each_entry(p, &inotify_info_head, list)
- if (p->ife->id == id) {
- last = p;
- return p;
- }
-
- pr_err("Can't find inotify with id 0x%08x\n", id);
- return NULL;
-}
-
-static int __collect_inotify_mark(struct fsnotify_file_info *p, struct fsnotify_mark_info *mark)
-{
- struct fsnotify_mark_info *m;
-
- /*
- * We should put marks in wd ascending order. See comment
- * in restore_one_inotify() for explanation.
- */
- list_for_each_entry(m, &p->marks, list)
- if (m->iwe->wd > mark->iwe->wd)
- break;
-
- list_add_tail(&mark->list, &m->list);
- mark->remap = lookup_ghost_remap(mark->iwe->s_dev, mark->iwe->i_ino);
- return 0;
-}
-
-static int collect_inotify_mark(struct fsnotify_mark_info *mark)
-{
- struct fsnotify_file_info *p;
-
- p = find_inotify_info(mark->iwe->id);
- if (!p)
- return -1;
-
- return __collect_inotify_mark(p, mark);
-}
-
-static int __collect_fanotify_mark(struct fsnotify_file_info *p,
- struct fsnotify_mark_info *mark)
-{
- list_add(&mark->list, &p->marks);
- if (mark->fme->type == MARK_TYPE__INODE)
- mark->remap = lookup_ghost_remap(mark->fme->s_dev,
- mark->fme->ie->i_ino);
- return 0;
-}
-
-/* Attach @mark to the collected fanotify file that carries the same id. */
-static int collect_fanotify_mark(struct fsnotify_mark_info *mark)
-{
-	struct fsnotify_file_info *p;
-
-	list_for_each_entry(p, &fanotify_info_head, list)
-		if (p->ffe->id == mark->fme->id)	/* @mark holds fme, so use the fanotify helper */
-			return __collect_fanotify_mark(p, mark);
-
-	pr_err("Can't find fanotify with id 0x%08x\n", mark->fme->id);
-	return -1;
-}
-
-static int collect_one_inotify(void *o, ProtobufCMessage *msg)
-{
- struct fsnotify_file_info *info = o;
- int i;
-
- info->ife = pb_msg(msg, InotifyFileEntry);
- INIT_LIST_HEAD(&info->marks);
- list_add(&info->list, &inotify_info_head);
- pr_info("Collected id 0x%08x flags 0x%08x\n", info->ife->id, info->ife->flags);
-
- for (i = 0; i < info->ife->n_wd; i++) {
- struct fsnotify_mark_info *mark;
-
- mark = xmalloc(sizeof(*mark));
- if (!mark)
- return -1;
-
- mark->iwe = info->ife->wd[i];
- INIT_LIST_HEAD(&mark->list);
- mark->remap = NULL;
-
- if (__collect_inotify_mark(info, mark))
- return -1;
- }
-
- return file_desc_add(&info->d, info->ife->id, &inotify_desc_ops);
-}
-
-struct collect_image_info inotify_cinfo = {
- .fd_type = CR_FD_INOTIFY_FILE,
- .pb_type = PB_INOTIFY_FILE,
- .priv_size = sizeof(struct fsnotify_file_info),
- .collect = collect_one_inotify,
-};
-
-static int collect_one_fanotify(void *o, ProtobufCMessage *msg)
-{
- struct fsnotify_file_info *info = o;
- int i;
-
- info->ffe = pb_msg(msg, FanotifyFileEntry);
- INIT_LIST_HEAD(&info->marks);
- list_add(&info->list, &fanotify_info_head);
- pr_info("Collected id 0x%08x flags 0x%08x\n", info->ffe->id, info->ffe->flags);
-
- for (i = 0; i < info->ffe->n_mark; i++) {
- struct fsnotify_mark_info *mark;
-
- mark = xmalloc(sizeof(*mark));
- if (!mark)
- return -1;
-
- mark->fme = info->ffe->mark[i];
- INIT_LIST_HEAD(&mark->list);
- mark->remap = NULL;
-
- if (__collect_fanotify_mark(info, mark))
- return -1;
- }
-
- return file_desc_add(&info->d, info->ffe->id, &fanotify_desc_ops);
-}
-
-struct collect_image_info fanotify_cinfo = {
- .fd_type = CR_FD_FANOTIFY_FILE,
- .pb_type = PB_FANOTIFY_FILE,
- .priv_size = sizeof(struct fsnotify_file_info),
- .collect = collect_one_fanotify,
-};
-
-static int collect_one_inotify_mark(void *o, ProtobufCMessage *msg)
-{
- struct fsnotify_mark_info *mark = o;
-
- mark->iwe = pb_msg(msg, InotifyWdEntry);
- INIT_LIST_HEAD(&mark->list);
- mark->remap = NULL;
-
- /*
- * The kernel prior 4.3 might export internal event
- * mask bits which are not part of user-space API. It
- * is fixed in kernel but we have to keep backward
- * compatibility with old images. So mask out
- * inappropriate bits (in particular fdinfo might
- * have FS_EVENT_ON_CHILD bit set).
- */
- mark->iwe->mask &= ~KERNEL_FS_EVENT_ON_CHILD;
-
- return collect_inotify_mark(mark);
-}
-
-struct collect_image_info inotify_mark_cinfo = {
- .fd_type = CR_FD_INOTIFY_WD,
- .pb_type = PB_INOTIFY_WD,
- .priv_size = sizeof(struct fsnotify_mark_info),
- .collect = collect_one_inotify_mark,
-};
-
-static int collect_one_fanotify_mark(void *o, ProtobufCMessage *msg)
-{
- struct fsnotify_mark_info *mark = o;
-
- mark->fme = pb_msg(msg, FanotifyMarkEntry);
- INIT_LIST_HEAD(&mark->list);
- mark->remap = NULL;
-
- return collect_fanotify_mark(mark);
-}
-
-struct collect_image_info fanotify_mark_cinfo = {
- .fd_type = CR_FD_FANOTIFY_MARK,
- .pb_type = PB_FANOTIFY_MARK,
- .priv_size = sizeof(struct fsnotify_mark_info),
- .collect = collect_one_fanotify_mark,
-};
diff --git a/image-desc.c b/image-desc.c
deleted file mode 100644
index 677067538120..000000000000
--- a/image-desc.c
+++ /dev/null
@@ -1,117 +0,0 @@
-#include <stdlib.h>
-
-#include "image-desc.h"
-#include "cr-show.h"
-#include "magic.h"
-#include "image.h"
-
-/*
- * The cr fd set is the set of files where the information
- * about dumped processes is stored. Each file carries some
- * small portion of info about the whole picture, see below
- * for more details.
- */
-
-#define FD_ENTRY(_name, _fmt) \
- [CR_FD_##_name] = { \
- .fmt = _fmt ".img", \
- .magic = _name##_MAGIC, \
- }
-
-#define FD_ENTRY_F(_name, _fmt, _f) \
- [CR_FD_##_name] = { \
- .fmt = _fmt ".img", \
- .magic = _name##_MAGIC, \
- .oflags = _f, \
- }
-
-struct cr_fd_desc_tmpl imgset_template[CR_FD_MAX] = {
- FD_ENTRY(INVENTORY, "inventory"),
- FD_ENTRY(FDINFO, "fdinfo-%d"),
- FD_ENTRY(PAGEMAP, "pagemap-%ld"),
- FD_ENTRY(SHMEM_PAGEMAP, "pagemap-shmem-%ld"),
- FD_ENTRY(REG_FILES, "reg-files"),
- FD_ENTRY(EXT_FILES, "ext-files"),
- FD_ENTRY(NS_FILES, "ns-files"),
- FD_ENTRY(EVENTFD_FILE, "eventfd"),
- FD_ENTRY(EVENTPOLL_FILE,"eventpoll"),
- FD_ENTRY(EVENTPOLL_TFD, "eventpoll-tfd"),
- FD_ENTRY(SIGNALFD, "signalfd"),
- FD_ENTRY(INOTIFY_FILE, "inotify"),
- FD_ENTRY(INOTIFY_WD, "inotify-wd"),
- FD_ENTRY(FANOTIFY_FILE, "fanotify"),
- FD_ENTRY(FANOTIFY_MARK, "fanotify-mark"),
- FD_ENTRY(CORE, "core-%d"),
- FD_ENTRY(IDS, "ids-%d"),
- FD_ENTRY(MM, "mm-%d"),
- FD_ENTRY(VMAS, "vmas-%d"),
- FD_ENTRY(PIPES, "pipes"),
- FD_ENTRY_F(PIPES_DATA, "pipes-data", O_NOBUF), /* splices data */
- FD_ENTRY(FIFO, "fifo"),
- FD_ENTRY_F(FIFO_DATA, "fifo-data", O_NOBUF), /* the same */
- FD_ENTRY(PSTREE, "pstree"),
- FD_ENTRY(SIGACT, "sigacts-%d"),
- FD_ENTRY(UNIXSK, "unixsk"),
- FD_ENTRY(INETSK, "inetsk"),
- FD_ENTRY(PACKETSK, "packetsk"),
- FD_ENTRY(NETLINK_SK, "netlinksk"),
- FD_ENTRY_F(SK_QUEUES, "sk-queues", O_NOBUF), /* lseeks the image */
- FD_ENTRY(ITIMERS, "itimers-%d"),
- FD_ENTRY(POSIX_TIMERS, "posix-timers-%d"),
- FD_ENTRY(CREDS, "creds-%d"),
- FD_ENTRY(UTSNS, "utsns-%d"),
- FD_ENTRY(IPC_VAR, "ipcns-var-%d"),
- FD_ENTRY(IPCNS_SHM, "ipcns-shm-%d"),
- FD_ENTRY(IPCNS_MSG, "ipcns-msg-%d"),
- FD_ENTRY(IPCNS_SEM, "ipcns-sem-%d"),
- FD_ENTRY(FS, "fs-%d"),
- FD_ENTRY(REMAP_FPATH, "remap-fpath"),
- FD_ENTRY_F(GHOST_FILE, "ghost-file-%x", O_NOBUF),
- FD_ENTRY(TCP_STREAM, "tcp-stream-%x"),
- FD_ENTRY(MNTS, "mountpoints-%d"),
- FD_ENTRY(NETDEV, "netdev-%d"),
- FD_ENTRY(NETNS, "netns-%d"),
- FD_ENTRY_F(IFADDR, "ifaddr-%d", O_NOBUF),
- FD_ENTRY_F(ROUTE, "route-%d", O_NOBUF),
- FD_ENTRY_F(ROUTE6, "route6-%d", O_NOBUF),
- FD_ENTRY_F(RULE, "rule-%d", O_NOBUF),
- FD_ENTRY_F(IPTABLES, "iptables-%d", O_NOBUF),
- FD_ENTRY_F(IP6TABLES, "ip6tables-%d", O_NOBUF),
- FD_ENTRY_F(TMPFS_IMG, "tmpfs-%d.tar.gz", O_NOBUF),
- FD_ENTRY_F(TMPFS_DEV, "tmpfs-dev-%d.tar.gz", O_NOBUF),
- FD_ENTRY(BINFMT_MISC, "binfmt-misc-%d"),
- FD_ENTRY(TTY_FILES, "tty"),
- FD_ENTRY(TTY_INFO, "tty-info"),
- FD_ENTRY(FILE_LOCKS, "filelocks"),
- FD_ENTRY(RLIMIT, "rlimit-%d"),
- FD_ENTRY_F(PAGES, "pages-%u", O_NOBUF),
- FD_ENTRY_F(PAGES_OLD, "pages-%d", O_NOBUF),
- FD_ENTRY_F(SHM_PAGES_OLD, "pages-shmem-%ld", O_NOBUF),
- FD_ENTRY(SIGNAL, "signal-s-%d"),
- FD_ENTRY(PSIGNAL, "signal-p-%d"),
- FD_ENTRY(TUNFILE, "tunfile"),
- FD_ENTRY(CGROUP, "cgroup"),
- FD_ENTRY(TIMERFD, "timerfd"),
- FD_ENTRY(CPUINFO, "cpuinfo"),
- FD_ENTRY(SECCOMP, "seccomp"),
- FD_ENTRY(USERNS, "userns-%d"),
- FD_ENTRY(NETNF_CT, "netns-ct-%d"),
- FD_ENTRY(NETNF_EXP, "netns-exp-%d"),
-
- [CR_FD_STATS] = {
- .fmt = "stats-%s",
- .magic = STATS_MAGIC,
- .oflags = O_SERVICE,
- },
-
- [CR_FD_IRMAP_CACHE] = {
- .fmt = "irmap-cache",
- .magic = IRMAP_CACHE_MAGIC,
- .oflags = O_SERVICE,
- },
-
- [CR_FD_FILE_LOCKS_PID] = {
- .fmt = "filelocks-%d.img",
- .magic = FILE_LOCKS_MAGIC,
- },
-};
diff --git a/image.c b/image.c
deleted file mode 100644
index a164722bba5e..000000000000
--- a/image.c
+++ /dev/null
@@ -1,561 +0,0 @@
-#include <unistd.h>
-#include <stdarg.h>
-#include <fcntl.h>
-#include "crtools.h"
-#include "cr_options.h"
-#include "imgset.h"
-#include "image.h"
-#include "pstree.h"
-#include "stats.h"
-#include "cgroup.h"
-#include "lsm.h"
-#include "protobuf.h"
-#include "protobuf/inventory.pb-c.h"
-#include "protobuf/pagemap.pb-c.h"
-
-bool fdinfo_per_id = false;
-bool ns_per_id = false;
-bool img_common_magic = true;
-TaskKobjIdsEntry *root_ids;
-u32 root_cg_set;
-Lsmtype image_lsm;
-
-int check_img_inventory(void)
-{
- int ret = -1;
- struct cr_img *img;
- InventoryEntry *he;
-
- img = open_image(CR_FD_INVENTORY, O_RSTR);
- if (!img)
- return -1;
-
- if (pb_read_one(img, &he, PB_INVENTORY) < 0)
- goto out_close;
-
- fdinfo_per_id = he->has_fdinfo_per_id ? he->fdinfo_per_id : false;
- ns_per_id = he->has_ns_per_id ? he->ns_per_id : false;
-
- if (he->root_ids) {
- root_ids = xmalloc(sizeof(*root_ids));
- if (!root_ids)
- goto out_err;
-
- memcpy(root_ids, he->root_ids, sizeof(*root_ids));
- }
-
- if (he->has_root_cg_set) {
- if (he->root_cg_set == 0) {
- pr_err("Corrupted root cgset\n");
- goto out_err;
- }
-
- root_cg_set = he->root_cg_set;
- }
-
- image_lsm = he->lsmtype;
-
- switch (he->img_version) {
- case CRTOOLS_IMAGES_V1:
- /* good old images. OK */
- img_common_magic = false;
- break;
- case CRTOOLS_IMAGES_V1_1:
- /* newer images with extra magic in the head */
- break;
- default:
- pr_err("Not supported images version %u\n", he->img_version);
- goto out_err;
- }
-
- ret = 0;
-
-out_err:
- inventory_entry__free_unpacked(he, NULL);
-out_close:
- close_image(img);
- return ret;
-}
-
-int write_img_inventory(InventoryEntry *he)
-{
- struct cr_img *img;
-
- pr_info("Writing image inventory (version %u)\n", CRTOOLS_IMAGES_V1);
-
- img = open_image(CR_FD_INVENTORY, O_DUMP);
- if (!img)
- return -1;
-
- if (pb_write_one(img, he, PB_INVENTORY) < 0)
- return -1;
-
- xfree(he->root_ids);
- close_image(img);
- return 0;
-}
-
-int prepare_inventory(InventoryEntry *he)
-{
- struct {
- struct pstree_item i;
- struct dmp_info d;
- } crt = { };
-
- pr_info("Perparing image inventory (version %u)\n", CRTOOLS_IMAGES_V1);
-
- he->img_version = CRTOOLS_IMAGES_V1_1;
- he->fdinfo_per_id = true;
- he->has_fdinfo_per_id = true;
- he->ns_per_id = true;
- he->has_ns_per_id = true;
- he->lsmtype = host_lsm_type();
-
- crt.i.state = TASK_ALIVE;
- crt.i.pid.real = getpid();
- if (get_task_ids(&crt.i))
- return -1;
-
- he->has_root_cg_set = true;
- if (dump_task_cgroup(NULL, &he->root_cg_set))
- return -1;
-
- he->root_ids = crt.i.ids;
-
- return 0;
-}
-
-static struct cr_imgset *alloc_cr_imgset(int nr)
-{
- struct cr_imgset *cr_imgset;
- unsigned int i;
-
- cr_imgset = xmalloc(sizeof(*cr_imgset));
- if (cr_imgset == NULL)
- return NULL;
-
- cr_imgset->_imgs = xmalloc(nr * sizeof(struct cr_img *));
- if (cr_imgset->_imgs == NULL) {
- xfree(cr_imgset);
- return NULL;
- }
-
- for (i = 0; i < nr; i++)
- cr_imgset->_imgs[i] = NULL;
- cr_imgset->fd_nr = nr;
- return cr_imgset;
-}
-
-static void __close_cr_imgset(struct cr_imgset *cr_imgset)
-{
- unsigned int i;
-
- if (!cr_imgset)
- return;
-
- for (i = 0; i < cr_imgset->fd_nr; i++) {
- if (!cr_imgset->_imgs[i])
- continue;
- close_image(cr_imgset->_imgs[i]);
- cr_imgset->_imgs[i] = NULL;
- }
-}
-
-void close_cr_imgset(struct cr_imgset **cr_imgset)
-{
- if (!cr_imgset || !*cr_imgset)
- return;
-
- __close_cr_imgset(*cr_imgset);
-
- xfree((*cr_imgset)->_imgs);
- xfree(*cr_imgset);
- *cr_imgset = NULL;
-}
-
-struct cr_imgset *cr_imgset_open_range(int pid, int from, int to,
- unsigned long flags)
-{
- struct cr_imgset *imgset;
- unsigned int i;
-
- imgset = alloc_cr_imgset(to - from);
- if (!imgset)
- goto err;
-
- from++;
- imgset->fd_off = from;
- for (i = from; i < to; i++) {
- struct cr_img *img;
-
- img = open_image(i, flags, pid);
- if (!img) {
- if (!(flags & O_CREAT))
- /* caller should check himself */
- continue;
- goto err;
- }
-
- imgset->_imgs[i - from] = img;
- }
-
- return imgset;
-
-err:
- close_cr_imgset(&imgset);
- return NULL;
-}
-
-struct cr_imgset *cr_task_imgset_open(int pid, int mode)
-{
- return cr_imgset_open(pid, TASK, mode);
-}
-
-struct cr_imgset *cr_glob_imgset_open(int mode)
-{
- return cr_imgset_open(-1 /* ignored */, GLOB, mode);
-}
-
-static int do_open_image(struct cr_img *img, int dfd, int type, unsigned long flags, char *path);
-
-struct cr_img *open_image_at(int dfd, int type, unsigned long flags, ...)
-{
- struct cr_img *img;
- unsigned long oflags;
- char path[PATH_MAX];
- va_list args;
- bool lazy = false;
-
- if (dfd == -1) {
- dfd = get_service_fd(IMG_FD_OFF);
- lazy = (flags & O_CREAT);
- }
-
- img = xmalloc(sizeof(*img));
- if (!img)
- return NULL;
-
- oflags = flags | imgset_template[type].oflags;
-
- va_start(args, flags);
- vsnprintf(path, PATH_MAX, imgset_template[type].fmt, args);
- va_end(args);
-
- if (lazy) {
- img->fd = LAZY_IMG_FD;
- img->type = type;
- img->oflags = oflags;
- img->path = xstrdup(path);
- return img;
- } else
- img->fd = EMPTY_IMG_FD;
-
- if (do_open_image(img, dfd, type, oflags, path)) {
- close_image(img);
- return NULL;
- }
-
- return img;
-}
-
-static inline u32 head_magic(int oflags)
-{
- return oflags & O_SERVICE ? IMG_SERVICE_MAGIC : IMG_COMMON_MAGIC;
-}
-
-static int img_check_magic(struct cr_img *img, int oflags, int type, char *path)
-{
- u32 magic;
-
- if (read_img(img, &magic) < 0)
- return -1;
-
- if (img_common_magic && (type != CR_FD_INVENTORY)) {
- if (magic != head_magic(oflags)) {
- pr_err("Head magic doesn't match for %s\n", path);
- return -1;
- }
-
- if (read_img(img, &magic) < 0)
- return -1;
- }
-
- if (magic != imgset_template[type].magic) {
- pr_err("Magic doesn't match for %s\n", path);
- return -1;
- }
-
- return 0;
-}
-
-static int img_write_magic(struct cr_img *img, int oflags, int type)
-{
- if (img_common_magic && (type != CR_FD_INVENTORY)) {
- u32 cmagic;
-
- cmagic = head_magic(oflags);
- if (write_img(img, &cmagic))
- return -1;
- }
-
- return write_img(img, &imgset_template[type].magic);
-}
-
-static int do_open_image(struct cr_img *img, int dfd, int type, unsigned long oflags, char *path)
-{
- int ret, flags;
-
- flags = oflags & ~(O_NOBUF | O_SERVICE);
-
- ret = openat(dfd, path, flags, CR_FD_PERM);
- if (ret < 0) {
- if (!(flags & O_CREAT) && (errno == ENOENT)) {
- pr_info("No %s image\n", path);
- img->_x.fd = EMPTY_IMG_FD;
- goto skip_magic;
- }
-
- pr_perror("Unable to open %s", path);
- goto err;
- }
-
- img->_x.fd = ret;
- if (oflags & O_NOBUF)
- bfd_setraw(&img->_x);
- else {
- if (flags == O_RDONLY)
- ret = bfdopenr(&img->_x);
- else
- ret = bfdopenw(&img->_x);
-
- if (ret)
- goto err;
- }
-
- if (imgset_template[type].magic == RAW_IMAGE_MAGIC)
- goto skip_magic;
-
- if (flags == O_RDONLY)
- ret = img_check_magic(img, oflags, type, path);
- else
- ret = img_write_magic(img, oflags, type);
- if (ret)
- goto err;
-
-skip_magic:
- return 0;
-
-err:
- return -1;
-}
-
-int open_image_lazy(struct cr_img *img)
-{
- int dfd;
- char *path = img->path;
-
- img->path = NULL;
-
- dfd = get_service_fd(IMG_FD_OFF);
- if (do_open_image(img, dfd, img->type, img->oflags, path)) {
- xfree(path);
- return -1;
- }
-
- xfree(path);
- return 0;
-}
-
-void close_image(struct cr_img *img)
-{
- if (lazy_image(img)) {
- /*
- * Remove the image file if it's there so that
- * subsequent restore doesn't read wrong or fake
- * data from it.
- */
- unlinkat(get_service_fd(IMG_FD_OFF), img->path, 0);
- xfree(img->path);
- } else if (!empty_image(img))
- bclose(&img->_x);
-
- xfree(img);
-}
-
-struct cr_img *img_from_fd(int fd)
-{
- struct cr_img *img;
-
- img = xmalloc(sizeof(*img));
- if (img) {
- img->_x.fd = fd;
- bfd_setraw(&img->_x);
- }
-
- return img;
-}
-
-int open_image_dir(char *dir)
-{
- int fd, ret;
-
- fd = open(dir, O_RDONLY);
- if (fd < 0) {
- pr_perror("Can't open dir %s", dir);
- return -1;
- }
-
- ret = install_service_fd(IMG_FD_OFF, fd);
- close(fd);
- fd = ret;
-
- if (opts.img_parent) {
- ret = symlinkat(opts.img_parent, fd, CR_PARENT_LINK);
- if (ret < 0 && errno != EEXIST) {
- pr_perror("Can't link parent snapshot");
- goto err;
- }
- }
-
- return 0;
-
-err:
- close_image_dir();
- return -1;
-}
-
-void close_image_dir(void)
-{
- close_service_fd(IMG_FD_OFF);
-}
-
-static unsigned long page_ids = 1;
-
-void up_page_ids_base(void)
-{
- /*
- * When page server and criu dump work on
- * the same dir, the shmem pagemaps and regular
- * pagemaps may have IDs conflicts. Fix this by
- * making page server produce page images with
- * higher IDs.
- */
-
- BUG_ON(page_ids != 1);
- page_ids += 0x10000;
-}
-
-struct cr_img *open_pages_image_at(int dfd, unsigned long flags, struct cr_img *pmi)
-{
- unsigned id;
-
- if (flags == O_RDONLY || flags == O_RDWR) {
- PagemapHead *h;
- if (pb_read_one(pmi, &h, PB_PAGEMAP_HEAD) < 0)
- return NULL;
- id = h->pages_id;
- pagemap_head__free_unpacked(h, NULL);
- } else {
- PagemapHead h = PAGEMAP_HEAD__INIT;
- id = h.pages_id = page_ids++;
- if (pb_write_one(pmi, &h, PB_PAGEMAP_HEAD) < 0)
- return NULL;
- }
-
- return open_image_at(dfd, CR_FD_PAGES, flags, id);
-}
-
-struct cr_img *open_pages_image(unsigned long flags, struct cr_img *pmi)
-{
- return open_pages_image_at(get_service_fd(IMG_FD_OFF), flags, pmi);
-}
-
-/*
- * Write buffer @ptr of @size bytes into @fd file
- * Returns
- * 0 on success
- * -1 on error (error message is printed)
- */
-int write_img_buf(struct cr_img *img, const void *ptr, int size)
-{
- int ret;
-
- ret = bwrite(&img->_x, ptr, size);
- if (ret == size)
- return 0;
-
- if (ret < 0)
- pr_perror("Can't write img file");
- else
- pr_err("Img trimmed %d/%d\n", ret, size);
- return -1;
-}
-
-/*
- * Read buffer @ptr of @size bytes from @fd file
- * Returns
- * 1 on success
- * 0 on EOF (silently)
- * -1 on error (error message is printed)
- */
-int read_img_buf_eof(struct cr_img *img, void *ptr, int size)
-{
- int ret;
-
- ret = bread(&img->_x, ptr, size);
- if (ret == size)
- return 1;
- if (ret == 0)
- return 0;
-
- if (ret < 0)
- pr_perror("Can't read img file");
- else
- pr_err("Img trimmed %d/%d\n", ret, size);
- return -1;
-}
-
-/*
- * Read buffer @ptr of @size bytes from @fd file
- * Returns
- * 1 on success
- * -1 on error or EOF (error message is printed)
- */
-int read_img_buf(struct cr_img *img, void *ptr, int size)
-{
- int ret;
-
- ret = read_img_buf_eof(img, ptr, size);
- if (ret == 0) {
- pr_err("Unexpected EOF\n");
- ret = -1;
- }
-
- return ret;
-}
-
-/*
- * read_img_str -- same as read_img_buf, but allocates memory for
- * the buffer and puts the '\0' at the end
- */
-
-int read_img_str(struct cr_img *img, char **pstr, int size)
-{
- int ret;
- char *str;
-
- str = xmalloc(size + 1);
- if (!str)
- return -1;
-
- ret = read_img_buf(img, str, size);
- if (ret < 0) {
- xfree(str);
- return -1;
- }
-
- str[size] = '\0';
- *pstr = str;
- return 0;
-}
-
diff --git a/images/Makefile b/images/Makefile
index 12089a2192ab..16ecb6ab54ef 100644
--- a/images/Makefile
+++ b/images/Makefile
@@ -59,7 +59,7 @@ proto-obj-y += seccomp.o
proto-obj-y += binfmt-misc.o
proto-obj-y += time.o
-CFLAGS += -I$(obj)/
+CFLAGS += -iquote $(obj)/
#
# Generates a set of names from protobuf "import" directive.
@@ -101,8 +101,6 @@ $(obj)/built-in.o: $(addprefix $(obj)/,$(proto-obj-y))
$(E) " LINK " $@
$(Q) $(LD) $(ldflags-y) -r -o $@ $^
-_all += $(obj)/built-in.o
-
ifneq ($(MAKECMDGOALS),clean)
-include $(addprefix $(obj)/,$(proto-obj-y:.o=.pb-c.d))
endif
@@ -112,4 +110,3 @@ cleanup-y += $(obj)/google/protobuf/*.d
cleanup-y += $(obj)/google/protobuf/*.h
cleanup-y += $(obj)/google/protobuf/*.c
cleanup-y += $(obj)/google/protobuf/*.o
-cleanup-y += $(obj)/*.d $(obj)/*.i $(obj)/*.s $(obj)/*.o
diff --git a/include/action-scripts.h b/include/action-scripts.h
deleted file mode 100644
index 8ffc2c58bad6..000000000000
--- a/include/action-scripts.h
+++ /dev/null
@@ -1,29 +0,0 @@
-#ifndef __CR_ACTION_SCRIPTS_H__
-#define __CR_ACTION_SCRIPTS_H__
-
-struct script {
- struct list_head node;
- char *path;
- int arg;
-};
-
-#define SCRIPT_RPC_NOTIFY (char *)0x1
-
-enum script_actions {
- ACT_PRE_DUMP = 0,
- ACT_POST_DUMP = 1,
- ACT_PRE_RESTORE = 2,
- ACT_POST_RESTORE = 3,
- ACT_NET_LOCK = 4,
- ACT_NET_UNLOCK = 5,
- ACT_SETUP_NS = 6,
- ACT_POST_SETUP_NS = 7,
-
- ACT_MAX
-};
-
-extern int add_script(char *path, int arg);
-extern int run_scripts(enum script_actions);
-extern int send_criu_rpc_script(enum script_actions act, char *name, int arg);
-
-#endif /* __CR_ACTION_SCRIPTS_H__ */
diff --git a/include/aio.h b/include/aio.h
deleted file mode 100644
index e839ec693da9..000000000000
--- a/include/aio.h
+++ /dev/null
@@ -1,15 +0,0 @@
-#ifndef __CR_AIO_H__
-#define __CR_AIO_H__
-#include "protobuf/mm.pb-c.h"
-int dump_aio_ring(MmEntry *mme, struct vma_area *vma);
-void free_aios(MmEntry *mme);
-struct parasite_ctl;
-int parasite_check_aios(struct parasite_ctl *, struct vm_area_list *);
-unsigned long aio_rings_args_size(struct vm_area_list *);
-
-struct rst_aio_ring {
- unsigned long addr;
- unsigned long len;
- unsigned int nr_req;
-};
-#endif /* __CR_AIO_H__ */
diff --git a/include/asm-generic/bitops.h b/include/asm-generic/bitops.h
deleted file mode 100644
index 190e1ab638c0..000000000000
--- a/include/asm-generic/bitops.h
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * Generic bits operations.
- *
- * Architectures that don't want their own implementation of those,
- * should include this file into the arch/$ARCH/include/asm/bitops.h
- */
-
-#ifndef __CR_GENERIC_BITOPS_H__
-#define __CR_GENERIC_BITOPS_H__
-
-#include "asm/bitsperlong.h"
-
-#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
-#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_LONG)
-
-#define DECLARE_BITMAP(name, bits) \
- unsigned long name[BITS_TO_LONGS(bits)]
-
-#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 1)
-/* Technically wrong, but this avoids compilation errors on some gcc
- versions. */
-#define BITOP_ADDR(x) "=m" (*(volatile long *) (x))
-#else
-#define BITOP_ADDR(x) "+m" (*(volatile long *) (x))
-#endif
-
-#define ADDR BITOP_ADDR(addr)
-
-static inline void set_bit(int nr, volatile unsigned long *addr) {
- addr += nr / BITS_PER_LONG;
- *addr |= (1 << (nr % BITS_PER_LONG));
-}
-
-static inline void change_bit(int nr, volatile unsigned long *addr)
-{
- addr += nr / BITS_PER_LONG;
- *addr ^= (1 << (nr % BITS_PER_LONG));
-}
-
-static inline int test_bit(int nr, volatile const unsigned long *addr)
-{
- addr += nr / BITS_PER_LONG;
- return (*addr & (1 << (nr % BITS_PER_LONG))) ? -1 : 0;
-}
-
-static inline void clear_bit(int nr, volatile unsigned long *addr)
-{
- addr += nr / BITS_PER_LONG;
- *addr &= ~(1 << (nr % BITS_PER_LONG));
-}
-
-/**
- * __ffs - find first set bit in word
- * @word: The word to search
- *
- * Undefined if no bit exists, so code should check against 0 first.
- */
-static inline unsigned long __ffs(unsigned long word)
-{
- int p = 0;
-
- for (; p < 8*sizeof(word); ++p) {
- if (word & 1) {
- break;
- }
-
- word >>= 1;
- }
-
- return p;
-}
-
-#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG)
-
-/*
- * Find the next set bit in a memory region.
- */
-static inline
-unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
- unsigned long offset)
-{
- const unsigned long *p = addr + BITOP_WORD(offset);
- unsigned long result = offset & ~(BITS_PER_LONG-1);
- unsigned long tmp;
-
- if (offset >= size)
- return size;
- size -= result;
- offset %= BITS_PER_LONG;
- if (offset) {
- tmp = *(p++);
- tmp &= (~0UL << offset);
- if (size < BITS_PER_LONG)
- goto found_first;
- if (tmp)
- goto found_middle;
- size -= BITS_PER_LONG;
- result += BITS_PER_LONG;
- }
- while (size & ~(BITS_PER_LONG-1)) {
- if ((tmp = *(p++)))
- goto found_middle;
- result += BITS_PER_LONG;
- size -= BITS_PER_LONG;
- }
- if (!size)
- return result;
- tmp = *p;
-
-found_first:
- tmp &= (~0UL >> (BITS_PER_LONG - size));
- if (tmp == 0UL) /* Are any bits set? */
- return result + size; /* Nope. */
-found_middle:
- return result + __ffs(tmp);
-}
-
-#define for_each_bit(i, bitmask) \
- for (i = find_next_bit(bitmask, sizeof(bitmask), 0); \
- i < sizeof(bitmask); \
- i = find_next_bit(bitmask, sizeof(bitmask), i + 1))
-
-#endif /* __CR_GENERIC_BITOPS_H__ */
diff --git a/include/asm-generic/int.h b/include/asm-generic/int.h
deleted file mode 100644
index ac3088d5ac3b..000000000000
--- a/include/asm-generic/int.h
+++ /dev/null
@@ -1,15 +0,0 @@
-#ifndef __CR_INT_H__
-#define __CR_INT_H__
-
-#include <stdint.h>
-
-typedef uint64_t u64;
-typedef int64_t s64;
-typedef uint32_t u32;
-typedef int32_t s32;
-typedef uint16_t u16;
-typedef int16_t s16;
-typedef uint8_t u8;
-typedef int8_t s8;
-
-#endif /* __CR_INT_H__ */
diff --git a/include/asm-generic/string.h b/include/asm-generic/string.h
deleted file mode 100644
index 0a545e65960d..000000000000
--- a/include/asm-generic/string.h
+++ /dev/null
@@ -1,51 +0,0 @@
-#ifndef __CR_ASM_GENERIC_STRING_H__
-#define __CR_ASM_GENERIC_STRING_H__
-
-#include "compiler.h"
-
-#ifndef HAS_BUILTIN_MEMCPY
-static always_inline void *builtin_memcpy(void *to, const void *from, unsigned int n)
-{
- int i;
- unsigned char *cto = to;
- const unsigned char *cfrom = from;
-
- for (i = 0; i < n; ++i, ++cto, ++cfrom) {
- *cto = *cfrom;
- }
-
- return to;
-}
-#endif
-
-#ifndef HAS_BUILTIN_MEMCMP
-static always_inline int builtin_memcmp(const void *cs, const void *ct, size_t count)
-{
- const unsigned char *su1, *su2;
- int res = 0;
-
- for (su1 = cs, su2 = ct; 0 < count; ++su1, ++su2, count--)
- if ((res = *su1 - *su2) != 0)
- break;
- return res;
-}
-#endif
-
-#ifndef HAS_BUILTIN_STRCMP
-static always_inline int builtin_strcmp(const char *cs, const char *ct)
-{
- unsigned char c1, c2;
-
- while (1) {
- c1 = *cs++;
- c2 = *ct++;
- if (c1 != c2)
- return c1 < c2 ? -1 : 1;
- if (!c1)
- break;
- }
- return 0;
-}
-#endif
-
-#endif /* __CR_ASM_GENERIC_STRING_H__ */
diff --git a/include/asm-generic/vdso.h b/include/asm-generic/vdso.h
deleted file mode 100644
index bb746055416b..000000000000
--- a/include/asm-generic/vdso.h
+++ /dev/null
@@ -1,12 +0,0 @@
-#ifndef __CR_ASM_GENERIC_VDSO_H__
-#define __CR_ASM_GENERIC_VDSO_H__
-
-#define VDSO_PROT (PROT_READ | PROT_EXEC)
-#define VVAR_PROT (PROT_READ)
-
-#define VDSO_BAD_ADDR (-1ul)
-#define VVAR_BAD_ADDR VDSO_BAD_ADDR
-#define VDSO_BAD_PFN (-1ull)
-#define VVAR_BAD_PFN VDSO_BAD_PFN
-
-#endif /* __CR_ASM_GENERIC_VDSO_H__ */
diff --git a/include/bfd.h b/include/bfd.h
deleted file mode 100644
index e9b4d53a43c4..000000000000
--- a/include/bfd.h
+++ /dev/null
@@ -1,40 +0,0 @@
-#ifndef __CR_BFD_H__
-#define __CR_BFD_H__
-
-#include "err.h"
-
-struct bfd_buf;
-struct xbuf {
- char *mem; /* buffer */
- char *data; /* position we see bytes at */
- unsigned int sz; /* bytes sitting after b->pos */
- struct bfd_buf *buf;
-};
-
-struct bfd {
- int fd;
- bool writable;
- struct xbuf b;
-};
-
-static inline bool bfd_buffered(struct bfd *b)
-{
- return b->b.mem != NULL;
-}
-
-static inline void bfd_setraw(struct bfd *b)
-{
- b->b.mem = NULL;
-}
-
-int bfdopenr(struct bfd *f);
-int bfdopenw(struct bfd *f);
-void bclose(struct bfd *f);
-char *breadline(struct bfd *f);
-char *breadchr(struct bfd *f, char c);
-int bwrite(struct bfd *f, const void *buf, int sz);
-struct iovec;
-int bwritev(struct bfd *f, const struct iovec *iov, int cnt);
-int bread(struct bfd *f, void *buf, int sz);
-int bfd_flush_images(void);
-#endif
diff --git a/include/bitmap.h b/include/bitmap.h
deleted file mode 100644
index 9e701b66cc2c..000000000000
--- a/include/bitmap.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef __CR_BITMAP_H__
-#define __CR_BITMAP_H__
-
-extern void bitmap_set(unsigned long *map, int start, int nr);
-extern void bitmap_clear(unsigned long *map, int start, int nr);
-
-#endif /* __CR_BITMAP_H__ */
diff --git a/include/bug.h b/include/bug.h
deleted file mode 100644
index a479c673bda9..000000000000
--- a/include/bug.h
+++ /dev/null
@@ -1,39 +0,0 @@
-#ifndef __CR_BUG_H__
-#define __CR_BUG_H__
-
-#include <signal.h>
-#include <stdbool.h>
-
-#include "compiler.h"
-#include "log.h"
-
-#ifndef BUG_ON_HANDLER
-
-#ifdef CR_NOGLIBC
-# define __raise()
-#else
-# define __raise() raise(SIGABRT)
-#endif
-
-#ifndef __clang_analyzer__
-# define BUG_ON_HANDLER(condition) \
- do { \
- if ((condition)) { \
- pr_err("BUG at %s:%d\n", __FILE__, __LINE__); \
- __raise(); \
- *(volatile unsigned long *)NULL = 0xdead0000 + __LINE__; \
- } \
- } while (0)
-#else
-# define BUG_ON_HANDLER(condition) \
- do { \
- assert(!condition); \
- } while (0)
-#endif
-
-#endif /* BUG_ON_HANDLER */
-
-#define BUG_ON(condition) BUG_ON_HANDLER((condition))
-#define BUG() BUG_ON(true)
-
-#endif /* __CR_BUG_H__ */
diff --git a/include/cgroup.h b/include/cgroup.h
deleted file mode 100644
index 393ee3d9cc05..000000000000
--- a/include/cgroup.h
+++ /dev/null
@@ -1,65 +0,0 @@
-#ifndef __CR_CGROUP_H__
-#define __CR_CGROUP_H__
-#include "asm/int.h"
-struct pstree_item;
-extern u32 root_cg_set;
-int dump_task_cgroup(struct pstree_item *, u32 *);
-int dump_cgroups(void);
-int prepare_task_cgroup(struct pstree_item *);
-int prepare_cgroup(void);
-/* Restore things like cpu_limit in known cgroups. */
-int prepare_cgroup_properties(void);
-int restore_freezer_state(void);
-void fini_cgroup(void);
-
-struct cg_controller;
-
-struct cgroup_prop {
- char *name;
- char *value;
- mode_t mode;
- uid_t uid;
- gid_t gid;
- struct list_head list;
-};
-
-/* This describes a particular cgroup path, e.g. the '/lxc/u1' part of
- * 'blkio/lxc/u1' and any properties it has.
- */
-struct cgroup_dir {
- char *path;
- mode_t mode;
- uid_t uid;
- gid_t gid;
-
- struct list_head properties;
- unsigned int n_properties;
-
- /* this is how children are linked together */
- struct list_head siblings;
-
- /* more cgroup_dirs */
- struct list_head children;
- unsigned int n_children;
-};
-
-/* This describes a particular cgroup controller, e.g. blkio or cpuset.
- * The heads are subdirectories organized in their tree format.
- */
-struct cg_controller {
- unsigned int n_controllers;
- char **controllers;
-
- /* cgroup_dirs */
- struct list_head heads;
- unsigned int n_heads;
-
- /* for cgroup list in cgroup.c */
- struct list_head l;
-};
-struct cg_controller *new_controller(const char *name);
-
-/* parse all global cgroup information into structures */
-int parse_cg_info(void);
-int new_cg_root_add(char *controller, char *newroot);
-#endif /* __CR_CGROUP_H__ */
diff --git a/include/compiler.h b/include/compiler.h
deleted file mode 100644
index 6bce93562a48..000000000000
--- a/include/compiler.h
+++ /dev/null
@@ -1,87 +0,0 @@
-#ifndef __CR_COMPILER_H__
-#define __CR_COMPILER_H__
-
-/*
- * Various definitions for success build,
- * picked from various places, mostly from
- * the linux kernel.
- */
-
-#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
-#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
-
-#define __stringify_1(x...) #x
-#define __stringify(x...) __stringify_1(x)
-
-#define NORETURN __attribute__((__noreturn__))
-#define __packed __attribute__((__packed__))
-#define __used __attribute__((__used__))
-#define __maybe_unused __attribute__((unused))
-#define __always_unused __attribute__((unused))
-
-#define __section(S) __attribute__ ((__section__(#S)))
-
-#ifndef __always_inline
-# define __always_inline inline __attribute__((always_inline))
-#endif
-
-#define likely(x) __builtin_expect(!!(x), 1)
-#define unlikely(x) __builtin_expect(!!(x), 0)
-
-#ifndef always_inline
-# define always_inline __always_inline
-#endif
-
-#ifndef noinline
-# define noinline __attribute__((noinline))
-#endif
-
-#define __aligned(x) __attribute__((aligned(x)))
-
-/*
- * Macro to define stack alignment.
- * aarch64 requires stack to be aligned to 16 bytes.
- */
-#define __stack_aligned__ __attribute__((aligned(16)))
-
-#ifndef offsetof
-# define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
-#endif
-
-#define barrier() asm volatile("" ::: "memory")
-
-#define container_of(ptr, type, member) ({ \
- const typeof( ((type *)0)->member ) *__mptr = (ptr); \
- (type *)( (char *)__mptr - offsetof(type,member) );})
-
-#define __round_mask(x, y) ((__typeof__(x))((y) - 1))
-#define round_up(x, y) ((((x) - 1) | __round_mask(x, y)) + 1)
-#define round_down(x, y) ((x) & ~__round_mask(x, y))
-#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
-#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))
-
-#define min(x, y) ({ \
- typeof(x) _min1 = (x); \
- typeof(y) _min2 = (y); \
- (void) (&_min1 == &_min2); \
- _min1 < _min2 ? _min1 : _min2; })
-
-#define max(x, y) ({ \
- typeof(x) _max1 = (x); \
- typeof(y) _max2 = (y); \
- (void) (&_max1 == &_max2); \
- _max1 > _max2 ? _max1 : _max2; })
-
-#define min_t(type, x, y) ({ \
- type __min1 = (x); \
- type __min2 = (y); \
- __min1 < __min2 ? __min1: __min2; })
-
-#define max_t(type, x, y) ({ \
- type __max1 = (x); \
- type __max2 = (y); \
- __max1 > __max2 ? __max1: __max2; })
-
-#define is_log2(v) (((v) & ((v) - 1)) == 0)
-
-#endif /* __CR_COMPILER_H__ */
diff --git a/include/config-base.h b/include/config-base.h
deleted file mode 100644
index 5e26859658e1..000000000000
--- a/include/config-base.h
+++ /dev/null
@@ -1,40 +0,0 @@
-#ifndef __CR_CONFIG_BASE_H__
-#define __CR_CONFIG_BASE_H__
-
-#define PAGE_ALLOC_COSTLY_ORDER 3 /* from the kernel source code */
-struct kernel_pipe_buffer {
- struct page *page;
- unsigned int offset, len;
- const struct pipe_buf_operations *ops;
- unsigned int flags;
- unsigned long private;
-};
-
-/*
- * The kernel allocates the linear chunk of memory for pipe buffers.
- * Allocation of chunks with size more than PAGE_ALLOC_COSTLY_ORDER
- * fails very often, so we need to restrict the pipe capacity to not
- * allocate big chunks.
- */
-#define PIPE_MAX_SIZE ((1 << PAGE_ALLOC_COSTLY_ORDER) * PAGE_SIZE / \
- sizeof(struct kernel_pipe_buffer))
-
-/* The number of pipes for one chunk */
-#define NR_PIPES_PER_CHUNK 8
-
-/*
- * These things are required to compile on CentOS-6
- */
-#ifndef F_LINUX_SPECIFIC_BASE
-# define F_LINUX_SPECIFIC_BASE 1024
-#endif
-
-#ifndef F_SETPIPE_SZ
-# define F_SETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 7)
-#endif
-
-#ifndef F_GETPIPE_SZ
-# define F_GETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 8)
-#endif
-
-#endif /* __CR_CONFIG_BASE_H__ */
diff --git a/include/cpu.h b/include/cpu.h
deleted file mode 100644
index e94525a9e780..000000000000
--- a/include/cpu.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef __CR_CPU_H__
-#define __CR_CPU_H__
-
-#include "asm/cpu.h"
-
-extern bool cpu_has_feature(unsigned int feature);
-extern int cpu_init(void);
-extern int cpu_dump_cpuinfo(void);
-extern int cpu_validate_cpuinfo(void);
-extern int cpuinfo_dump(void);
-extern int cpuinfo_check(void);
-
-#endif /* __CR_CPU_H__ */
diff --git a/include/cr-errno.h b/include/cr-errno.h
deleted file mode 100644
index 1f94988cf37e..000000000000
--- a/include/cr-errno.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#ifndef __CR_ERRNO_H__
-#define __CR_ERRNO_H__
-
-void set_cr_errno(int err);
-int get_cr_errno(void);
-
-/*
- * List of symbolic error names:
- * ESRCH - no process can be found corresponding to that specified by pid
- * EEXIST - process with such pid already exists
- * EBADRQC - bad options
- */
-
-#define set_task_cr_err(new_err) atomic_cmpxchg(&task_entries->cr_err, 0, new_err)
-#define get_task_cr_err() atomic_read(&task_entries->cr_err)
-
-#endif /* __CR_ERRNO_H__ */
diff --git a/include/cr-service-const.h b/include/cr-service-const.h
deleted file mode 100644
index c6d2e398f1f9..000000000000
--- a/include/cr-service-const.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef __CR_SERVICE_CONST_H__
-#define __CR_SERVICE_CONST_H__
-
-#define CR_DEFAULT_SERVICE_ADDRESS "./criu_service.socket"
-
-#endif /* __CR_SERVICE_CONST_H__ */
diff --git a/include/cr-service.h b/include/cr-service.h
deleted file mode 100644
index 621cedbe7827..000000000000
--- a/include/cr-service.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#ifndef __CR_SERVICE_H__
-#define __CR_SERVICE_H__
-
-#include "protobuf/rpc.pb-c.h"
-
-extern int cr_service(bool deamon_mode);
-int cr_service_work(int sk);
-
-extern int send_criu_dump_resp(int socket_fd, bool success, bool restored);
-
-extern struct _cr_service_client *cr_service_client;
-extern unsigned int service_sk_ino;
-
-#endif /* __CR_SERVICE_H__ */
diff --git a/include/cr-show.h b/include/cr-show.h
deleted file mode 100644
index 6ebdb4c2ac92..000000000000
--- a/include/cr-show.h
+++ /dev/null
@@ -1,25 +0,0 @@
-#ifndef __CR_SHOW_H__
-#define __CR_SHOW_H__
-
-#include <stdbool.h>
-#include "asm/types.h"
-
-struct cr_img;
-
-struct show_image_info {
- u32 magic;
- int pb_type;
- bool single;
- void (*payload)(struct cr_img *, void *);
- char *fmt;
-};
-
-extern void show_siginfo(struct cr_img *);
-extern void sk_queue_data_handler(struct cr_img *, void *obj);
-extern void ipc_shm_handler(struct cr_img *, void *obj);
-extern void ipc_msg_handler(struct cr_img *, void *obj);
-extern void ipc_sem_handler(struct cr_img *, void *obj);
-extern int cr_parse_fd(struct cr_img *, u32 magic);
-extern void show_tcp_stream(struct cr_img *, void *obj);
-
-#endif /* __CR_SHOW_H__ */
diff --git a/include/cr_options.h b/include/cr_options.h
deleted file mode 100644
index 5c0e6332e279..000000000000
--- a/include/cr_options.h
+++ /dev/null
@@ -1,117 +0,0 @@
-#ifndef __CR_OPTIONS_H__
-#define __CR_OPTIONS_H__
-
-#include <stdbool.h>
-
-#include "list.h"
-
-/*
- * CPU capability options.
- */
-#define CPU_CAP_NONE (0u)
-#define CPU_CAP_ALL (-1u)
-#define CPU_CAP_FPU (1u) /* Only FPU capability required */
-#define CPU_CAP_CPU (2u) /* Strict CPU capability required */
-#define CPU_CAP_INS (4u) /* Instructions CPU capatibility */
-#define CPU_CAP_DEFAULT (CPU_CAP_FPU)
-
-struct cg_root_opt {
- struct list_head node;
- char *controller;
- char *newroot;
-};
-
-/*
- * Cgroup management options.
- */
-#define CG_MODE_IGNORE (0u << 0) /* Zero is important here */
-#define CG_MODE_NONE (1u << 0)
-#define CG_MODE_PROPS (1u << 1)
-#define CG_MODE_SOFT (1u << 2)
-#define CG_MODE_FULL (1u << 3)
-#define CG_MODE_STRICT (1u << 4)
-
-#define CG_MODE_DEFAULT (CG_MODE_SOFT)
-
-/*
- * Ghost file size we allow to carry by default.
- */
-#define DEFAULT_GHOST_LIMIT (1 << 20)
-
-#define DEFAULT_TIMEOUT 5
-
-struct irmap;
-
-struct irmap_path_opt {
- struct list_head node;
- struct irmap *ir;
-};
-
-struct external {
- struct list_head node;
- char *id;
-};
-
-struct cr_options {
- int final_state;
- char *show_dump_file;
- char *show_fmt;
- bool check_ms_kernel;
- bool show_pages_content;
- union {
- bool restore_detach;
- bool daemon_mode;
- };
- bool restore_sibling;
- bool ext_unix_sk;
- struct list_head ext_unixsk_ids;
- bool shell_job;
- bool handle_file_locks;
- bool tcp_established_ok;
- bool evasive_devices;
- bool link_remap_ok;
- unsigned int rst_namespaces_flags;
- bool log_file_per_pid;
- bool swrk_restore;
- char *output;
- char *root;
- char *pidfile;
- char *freeze_cgroup;
- struct list_head veth_pairs;
- struct list_head scripts;
- struct list_head ext_mounts;
- struct list_head inherit_fds;
- struct list_head external;
- char *libdir;
- bool use_page_server;
- unsigned short port;
- char *addr;
- int ps_socket;
- bool track_mem;
- char *img_parent;
- bool auto_dedup;
- unsigned int cpu_cap;
- bool force_irmap;
- char **exec_cmd;
- unsigned int manage_cgroups;
- char *new_global_cg_root;
- struct list_head new_cgroup_roots;
- bool autodetect_ext_mounts;
- bool enable_external_sharing;
- bool enable_external_masters;
- bool aufs; /* auto-deteced, not via cli */
- bool overlayfs;
- size_t ghost_limit;
- struct list_head irmap_scan_paths;
- bool lsm_supplied;
- char *lsm_profile;
- unsigned int timeout;
-};
-
-extern struct cr_options opts;
-
-extern void init_opts(void);
-
-extern int add_external(char *key);
-
-#endif /* __CR_OPTIONS_H__ */
diff --git a/include/criu-log.h b/include/criu-log.h
deleted file mode 100644
index fd5d6349d887..000000000000
--- a/include/criu-log.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- This file defines types and macros for CRIU plugins.
- Copyright (C) 2013 Parallels, Inc
-
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef __CRIU_LOG_H__
-#define __CRIU_LOG_H__
-
-#ifndef CR_NOGLIBC
-
-#include <string.h>
-#include <errno.h>
-
-#endif /* CR_NOGLIBC */
-
-#define LOG_UNSET (-1)
-#define LOG_MSG (0) /* Print message regardless of log level */
-#define LOG_ERROR (1) /* Errors only, when we're in trouble */
-#define LOG_WARN (2) /* Warnings, dazen and confused but trying to continue */
-#define LOG_INFO (3) /* Informative, everything is fine */
-#define LOG_DEBUG (4) /* Debug only */
-
-extern void print_on_level(unsigned int loglevel, const char *format, ...)
- __attribute__ ((__format__ (__printf__, 2, 3)));
-
-#ifndef LOG_PREFIX
-# define LOG_PREFIX
-#endif
-
-#define print_once(loglevel, fmt, ...) \
- do { \
- static bool __printed; \
- if (!__printed) { \
- print_on_level(loglevel, fmt, ##__VA_ARGS__); \
- __printed = 1; \
- } \
- } while (0)
-
-#define pr_msg(fmt, ...) \
- print_on_level(LOG_MSG, \
- fmt, ##__VA_ARGS__)
-
-#define pr_info(fmt, ...) \
- print_on_level(LOG_INFO, \
- LOG_PREFIX fmt, ##__VA_ARGS__)
-
-#define pr_err(fmt, ...) \
- print_on_level(LOG_ERROR, \
- "Error (%s:%d): " LOG_PREFIX fmt, \
- __FILE__, __LINE__, ##__VA_ARGS__)
-
-#define pr_err_once(fmt, ...) \
- print_once(LOG_ERROR, fmt, ##__VA_ARGS__)
-
-#define pr_warn(fmt, ...) \
- print_on_level(LOG_WARN, \
- "Warn (%s:%d): " LOG_PREFIX fmt, \
- __FILE__, __LINE__, ##__VA_ARGS__)
-
-#define pr_warn_once(fmt, ...) \
- print_once(LOG_WARN, fmt, ##__VA_ARGS__)
-
-#define pr_debug(fmt, ...) \
- print_on_level(LOG_DEBUG, \
- LOG_PREFIX fmt, ##__VA_ARGS__)
-
-#ifndef CR_NOGLIBC
-
-#define pr_perror(fmt, ...) \
- pr_err(fmt ": %s\n", ##__VA_ARGS__, strerror(errno))
-
-#endif /* CR_NOGLIBC */
-
-#endif /* __CR_LOG_LEVELS_H__ */
diff --git a/include/criu-plugin.h b/include/criu-plugin.h
deleted file mode 100644
index b76f5f83900f..000000000000
--- a/include/criu-plugin.h
+++ /dev/null
@@ -1,132 +0,0 @@
-/*
- * This file defines types and macros for CRIU plugins.
- * Copyright (C) 2013-2014 Parallels, Inc
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-#ifndef __CRIU_PLUGIN_H__
-#define __CRIU_PLUGIN_H__
-
-#include <limits.h>
-#include <stdbool.h>
-
-#define CRIU_PLUGIN_GEN_VERSION(a,b,c) (((a) << 16) + ((b) << 8) + (c))
-#define CRIU_PLUGIN_VERSION_MAJOR 0
-#define CRIU_PLUGIN_VERSION_MINOR 2
-#define CRIU_PLUGIN_VERSION_SUBLEVEL 0
-
-#define CRIU_PLUGIN_VERSION_OLD CRIU_PLUGIN_GEN_VERSION(0,1,0)
-
-#define CRIU_PLUGIN_VERSION \
- CRIU_PLUGIN_GEN_VERSION(CRIU_PLUGIN_VERSION_MAJOR, \
- CRIU_PLUGIN_VERSION_MINOR, \
- CRIU_PLUGIN_VERSION_SUBLEVEL)
-
-/*
- * Plugin hook points and their arguments in hooks.
- */
-enum {
- CR_PLUGIN_HOOK__DUMP_UNIX_SK = 0,
- CR_PLUGIN_HOOK__RESTORE_UNIX_SK = 1,
-
- CR_PLUGIN_HOOK__DUMP_EXT_FILE = 2,
- CR_PLUGIN_HOOK__RESTORE_EXT_FILE = 3,
-
- CR_PLUGIN_HOOK__DUMP_EXT_MOUNT = 4,
- CR_PLUGIN_HOOK__RESTORE_EXT_MOUNT = 5,
-
- CR_PLUGIN_HOOK__DUMP_EXT_LINK = 6,
-
- CR_PLUGIN_HOOK__MAX
-};
-
-#define DECLARE_PLUGIN_HOOK_ARGS(__hook, ...) \
- typedef int (__hook ##_t)(__VA_ARGS__)
-
-DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_UNIX_SK, int fd, int id);
-DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_UNIX_SK, int id);
-DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_EXT_FILE, int fd, int id);
-DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_EXT_FILE, int id);
-DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_EXT_MOUNT, char *mountpoint, int id);
-DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESTORE_EXT_MOUNT, int id, char *mountpoint, char *old_root, int *is_file);
-DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__DUMP_EXT_LINK, int index, int type, char *kind);
-
-enum {
- CR_PLUGIN_STAGE__DUMP,
- CR_PLUGIN_STAGE__PRE_DUMP,
- CR_PLUGIN_STAGE__RESTORE,
-
- CR_PLUGIN_STAGE_MAX
-};
-
-/*
- * Plugin descriptor.
- */
-typedef struct {
- const char *name;
- int (*init)(int stage);
- void (*exit)(int stage, int ret);
- unsigned int version;
- unsigned int max_hooks;
- void *hooks[CR_PLUGIN_HOOK__MAX];
-} cr_plugin_desc_t;
-
-extern cr_plugin_desc_t CR_PLUGIN_DESC;
-
-#define CR_PLUGIN_REGISTER(___name, ___init, ___exit) \
- cr_plugin_desc_t CR_PLUGIN_DESC = { \
- .name = ___name, \
- .init = ___init, \
- .exit = ___exit, \
- .version = CRIU_PLUGIN_VERSION, \
- .max_hooks = CR_PLUGIN_HOOK__MAX, \
- };
-
-static inline int cr_plugin_dummy_init(int stage) { return 0; }
-static inline void cr_plugin_dummy_exit(int stage, int ret) { }
-
-#define CR_PLUGIN_REGISTER_DUMMY(___name) \
- cr_plugin_desc_t CR_PLUGIN_DESC = { \
- .name = ___name, \
- .init = cr_plugin_dummy_init, \
- .exit = cr_plugin_dummy_exit, \
- .version = CRIU_PLUGIN_VERSION, \
- .max_hooks = CR_PLUGIN_HOOK__MAX, \
- };
-
-#define CR_PLUGIN_REGISTER_HOOK(__hook, __func) \
-static void __attribute__((constructor)) cr_plugin_register_hook_##__func (void) \
-{ \
- CR_PLUGIN_DESC.hooks[__hook] = (void *)__func; \
-}
-
-/* Public API */
-extern int criu_get_image_dir(void);
-
-/*
- * Deprecated, will be removed in next version.
- */
-typedef int (cr_plugin_init_t)(void);
-typedef void (cr_plugin_fini_t)(void);
-typedef int (cr_plugin_dump_unix_sk_t)(int fd, int id);
-typedef int (cr_plugin_restore_unix_sk_t)(int id);
-typedef int (cr_plugin_dump_file_t)(int fd, int id);
-typedef int (cr_plugin_restore_file_t)(int id);
-typedef int (cr_plugin_dump_ext_mount_t)(char *mountpoint, int id);
-typedef int (cr_plugin_restore_ext_mount_t)(int id, char *mountpoint, char *old_root, int *is_file);
-typedef int (cr_plugin_dump_ext_link_t)(int index, int type, char *kind);
-
-#endif /* __CRIU_PLUGIN_H__ */
diff --git a/include/crtools.h b/include/crtools.h
deleted file mode 100644
index eaa70dcf4e4e..000000000000
--- a/include/crtools.h
+++ /dev/null
@@ -1,32 +0,0 @@
-#ifndef __CR_CRTOOLS_H__
-#define __CR_CRTOOLS_H__
-
-#include <sys/types.h>
-
-#include "list.h"
-#include "asm/types.h"
-#include "servicefd.h"
-
-#include "protobuf.h"
-#include "protobuf/inventory.pb-c.h"
-
-#define CR_FD_PERM (S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)
-
-extern int check_img_inventory(void);
-extern int write_img_inventory(InventoryEntry *he);
-extern int prepare_inventory(InventoryEntry *he);
-
-#define LAST_PID_PATH "sys/kernel/ns_last_pid"
-
-extern int cr_dump_tasks(pid_t pid);
-extern int cr_pre_dump_tasks(pid_t pid);
-extern int cr_restore_tasks(void);
-extern int cr_show(int pid);
-extern int convert_to_elf(char *elf_path, int fd_core);
-extern int cr_check(void);
-extern int cr_exec(int pid, char **opts);
-extern int cr_dedup(void);
-
-extern int check_add_feature(char *arg);
-
-#endif /* __CR_CRTOOLS_H__ */
diff --git a/include/err.h b/include/err.h
deleted file mode 100644
index c5b6165a57ab..000000000000
--- a/include/err.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Adopted from linux kernel
- */
-#ifndef __CR_ERR_H__
-#define __CR_ERR_H__
-
-#include "compiler.h"
-
-/*
- * The address of a block returned by malloc or realloc in GNU
- * systems is always a multiple of eight (or sixteen on 64-bit systems).
- *
- * Thus we may encode error number in low bits.
- */
-#define MAX_ERRNO 4095
-
-#define IS_ERR_VALUE(x) unlikely((x) >= (unsigned long)-MAX_ERRNO)
-
-static inline void *ERR_PTR(long error)
-{
- return (void *)error;
-}
-
-static inline long PTR_ERR(const void *ptr)
-{
- return (long)ptr;
-}
-
-static inline long IS_ERR(const void *ptr)
-{
- return IS_ERR_VALUE((unsigned long)ptr);
-}
-
-static inline long IS_ERR_OR_NULL(const void *ptr)
-{
- return !ptr || IS_ERR_VALUE((unsigned long)ptr);
-}
-
-static inline void *ERR_CAST(const void *ptr)
-{
- /* cast away the const */
- return (void *)ptr;
-}
-
-static inline int PTR_RET(const void *ptr)
-{
- if (IS_ERR(ptr))
- return PTR_ERR(ptr);
- else
- return 0;
-}
-
-#endif /* __CR_ERR_H__ */
diff --git a/include/errno.h b/include/errno.h
deleted file mode 100644
index 5c2322e9fae9..000000000000
--- a/include/errno.h
+++ /dev/null
@@ -1,9 +0,0 @@
-#ifndef __CR_ERRNO_H__
-#define __CR_ERRNO_H__
-
-#define ERESTARTSYS 512
-#define ERESTARTNOINTR 513
-#define ERESTARTNOHAND 514
-#define ERESTART_RESTARTBLOCK 516
-
-#endif /* __CR_ERRNO_H__ */
diff --git a/include/eventfd.h b/include/eventfd.h
deleted file mode 100644
index 65e0af7cdc13..000000000000
--- a/include/eventfd.h
+++ /dev/null
@@ -1,10 +0,0 @@
-#ifndef __CR_EVENTFD_H__
-#define __CR_EVENTFD_H__
-
-#include "files.h"
-
-extern int is_eventfd_link(char *link);
-extern const struct fdtype_ops eventfd_dump_ops;
-extern struct collect_image_info eventfd_cinfo;
-
-#endif /* __CR_EVENTFD_H__ */
diff --git a/include/eventpoll.h b/include/eventpoll.h
deleted file mode 100644
index 96a77bc984c0..000000000000
--- a/include/eventpoll.h
+++ /dev/null
@@ -1,11 +0,0 @@
-#ifndef __CR_EVENTPOLL_H__
-#define __CR_EVENTPOLL_H__
-
-#include "files.h"
-
-extern int is_eventpoll_link(char *link);
-extern const struct fdtype_ops eventpoll_dump_ops;
-extern struct collect_image_info epoll_tfd_cinfo;
-extern struct collect_image_info epoll_cinfo;
-
-#endif /* __CR_EVENTPOLL_H__ */
diff --git a/include/fault-injection.h b/include/fault-injection.h
deleted file mode 100644
index 989f654b2f51..000000000000
--- a/include/fault-injection.h
+++ /dev/null
@@ -1,19 +0,0 @@
-#ifndef __CR_FAULT_INJECTION_H__
-#define __CR_FAULT_INJECTION_H__
-#include <stdbool.h>
-
-enum faults {
- FI_NONE = 0,
- FI_DUMP_EARLY,
- FI_RESTORE_ROOT_ONLY,
- FI_MAX,
-};
-
-extern enum faults fi_strategy;
-extern int fault_injection_init(void);
-
-static inline bool fault_injected(enum faults f)
-{
- return fi_strategy == f;
-}
-#endif
diff --git a/include/fcntl.h b/include/fcntl.h
deleted file mode 100644
index 6f85c5ee6923..000000000000
--- a/include/fcntl.h
+++ /dev/null
@@ -1,36 +0,0 @@
-#ifndef __CR_ASM_GENERIC_FCNTL_H__
-#define __CR_ASM_GENERIC_FCNTL_H__
-
-#include <sys/types.h>
-#include <fcntl.h>
-
-#ifndef F_SETOWN_EX
-#define F_SETOWN_EX 15
-#define F_GETOWN_EX 16
-
-struct f_owner_ex {
- int type;
- pid_t pid;
-};
-
-#endif
-
-#ifndef F_GETOWNER_UIDS
-#define F_GETOWNER_UIDS 17
-#endif
-
-#ifndef F_LINUX_SPECIFIC_BASE
-#define F_LINUX_SPECIFIC_BASE 1024
-#endif
-#ifndef F_SETPIPE_SZ
-# define F_SETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 7)
-#endif
-#ifndef F_GETPIPE_SZ
-# define F_GETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 8)
-#endif
-
-#ifndef O_PATH
-# define O_PATH 010000000
-#endif
-
-#endif /* __CR_ASM_GENERIC_FCNTL_H__ */
diff --git a/include/fifo.h b/include/fifo.h
deleted file mode 100644
index 776265450612..000000000000
--- a/include/fifo.h
+++ /dev/null
@@ -1,11 +0,0 @@
-#ifndef __CR_FIFO_H__
-#define __CR_FIFO_H__
-
-struct fd_parms;
-struct cr_imgset;
-
-extern const struct fdtype_ops fifo_dump_ops;
-extern struct collect_image_info fifo_cinfo;
-extern int collect_fifo(void);
-
-#endif /* __CR_FIFO_H__ */
diff --git a/include/file-ids.h b/include/file-ids.h
deleted file mode 100644
index 2da4ceffde07..000000000000
--- a/include/file-ids.h
+++ /dev/null
@@ -1,20 +0,0 @@
-#ifndef __CR_FILE_IDS_H__
-#define __CR_FILE_IDS_H__
-
-#include "compiler.h"
-#include "asm/types.h"
-#include "rbtree.h"
-
-#include "protobuf/fdinfo.pb-c.h"
-
-#define FD_PID_INVALID (-2U)
-#define FD_DESC_INVALID (-3U)
-
-struct fdinfo_entry;
-struct stat;
-
-struct fd_parms;
-extern int fd_id_generate(pid_t pid, FdinfoEntry *fe, struct fd_parms *p);
-extern int fd_id_generate_special(struct fd_parms *p, u32 *id);
-
-#endif /* __CR_FILE_IDS_H__ */
diff --git a/include/file-lock.h b/include/file-lock.h
deleted file mode 100644
index e771c0ee584c..000000000000
--- a/include/file-lock.h
+++ /dev/null
@@ -1,64 +0,0 @@
-#ifndef __FILE_LOCK_H__
-#define __FILE_LOCK_H__
-
-#include "list.h"
-
-#include "protobuf.h"
-#include "protobuf/file-lock.pb-c.h"
-
-#define FL_UNKNOWN -1
-#define FL_POSIX 1
-#define FL_FLOCK 2
-
-/* for posix fcntl() and lockf() */
-#ifndef F_RDLCK
-#define F_RDLCK 0
-#define F_WRLCK 1
-#define F_UNLCK 2
-#endif
-
-/* operations for bsd flock(), also used by the kernel implementation */
-#define LOCK_SH 1 /* shared lock */
-#define LOCK_EX 2 /* exclusive lock */
-#define LOCK_NB 4 /* or'd with one of the above to prevent
- blocking */
-#define LOCK_UN 8 /* remove lock */
-
-#define LOCK_MAND 32 /* This is a mandatory flock ... */
-#define LOCK_READ 64 /* which allows concurrent read operations */
-#define LOCK_WRITE 128 /* which allows concurrent write operations */
-#define LOCK_RW 192 /* which allows concurrent read & write ops */
-
-struct file_lock {
- long long fl_id;
- int fl_kind;
- int fl_ltype;
-
- pid_t fl_owner;
- int maj, min;
- unsigned long i_no;
- long long start;
- char end[32];
-
- struct list_head list; /* list of all file locks */
-
- int real_owner;
- int owners_fd;
-};
-
-extern struct list_head file_lock_list;
-
-extern struct file_lock *alloc_file_lock(void);
-extern void free_file_locks(void);
-
-extern int prepare_file_locks(int pid);
-extern struct collect_image_info file_locks_cinfo;
-
-struct pid;
-struct fd_parms;
-extern int note_file_lock(struct pid *, int fd, int lfd, struct fd_parms *);
-extern int dump_file_locks(void);
-
-#define OPT_FILE_LOCKS "file-locks"
-
-#endif /* __FILE_LOCK_H__ */
diff --git a/include/files-reg.h b/include/files-reg.h
deleted file mode 100644
index e2f611535294..000000000000
--- a/include/files-reg.h
+++ /dev/null
@@ -1,59 +0,0 @@
-#ifndef __CR_FILES_REG_H__
-#define __CR_FILES_REG_H__
-
-#include "asm/types.h"
-#include "files.h"
-#include "image.h"
-
-#include "protobuf/regfile.pb-c.h"
-#include "protobuf/ghost-file.pb-c.h"
-
-struct cr_imgset;
-struct fd_parms;
-
-struct file_remap {
- char *rpath;
- bool is_dir;
- int rmnt_id;
- unsigned int users;
- uid_t owner;
-};
-
-struct reg_file_info {
- struct file_desc d;
- RegFileEntry *rfe;
- struct file_remap *remap;
- bool size_checked;
- char *path;
-};
-
-extern int open_reg_by_id(u32 id);
-extern int open_reg_fd(struct file_desc *);
-extern int open_path(struct file_desc *, int (*open_cb)(int ns_root_fd,
- struct reg_file_info *, void *), void *arg);
-extern void clear_ghost_files(void);
-
-extern int prepare_shared_reg_files(void);
-
-extern const struct fdtype_ops regfile_dump_ops;
-extern int do_open_reg_noseek_flags(int ns_root_fd, struct reg_file_info *rfi, void *arg);
-extern int dump_one_reg_file(int lfd, u32 id, const struct fd_parms *p);
-
-extern struct file_remap *lookup_ghost_remap(u32 dev, u32 ino);
-extern void remap_put(struct file_remap *remap);
-
-extern struct file_desc *try_collect_special_file(u32 id, int optional);
-#define collect_special_file(id) try_collect_special_file(id, 0)
-
-extern int collect_remaps_and_regfiles(void);
-
-extern void delete_link_remaps(void);
-extern void free_link_remaps(void);
-extern int prepare_remaps(void);
-extern void try_clean_remaps(int ns_fd);
-
-extern int strip_deleted(struct fd_link *link);
-
-extern int prepare_procfs_remaps(void);
-
-#endif /* __CR_FILES_REG_H__ */
diff --git a/include/files.h b/include/files.h
deleted file mode 100644
index 9ea234440a1e..000000000000
--- a/include/files.h
+++ /dev/null
@@ -1,183 +0,0 @@
-#ifndef __CR_FILES_H__
-#define __CR_FILES_H__
-
-#include "compiler.h"
-#include "asm/types.h"
-#include "fcntl.h"
-#include "lock.h"
-#include "list.h"
-#include "image.h"
-#include "pid.h"
-#include "rst_info.h"
-
-#include "protobuf/fdinfo.pb-c.h"
-#include "protobuf/fown.pb-c.h"
-#include "protobuf/vma.pb-c.h"
-
-struct pstree_item;
-struct file_desc;
-struct cr_imgset;
-struct rst_info;
-struct parasite_ctl;
-
-struct fd_link {
- union {
- /* Link info for generic file (path) */
- struct {
- char name[PATH_MAX + 1];
- size_t len;
- };
-
- /* Link info for proc-ns file */
- struct {
- struct ns_desc *ns_d;
- unsigned int ns_kid;
- };
- };
-};
-
-struct fd_parms {
- int fd;
- off_t pos;
- unsigned int flags;
- char fd_flags;
- struct stat stat;
- pid_t pid;
- FownEntry fown;
- struct fd_link *link;
- long fs_type;
- int mnt_id;
-
- struct parasite_ctl *ctl;
-};
-
-#define FD_PARMS_INIT \
-(struct fd_parms) { \
- .fd = FD_DESC_INVALID, \
- .fown = FOWN_ENTRY__INIT, \
- .link = NULL, \
- .mnt_id = -1, \
-}
-
-extern int fill_fdlink(int lfd, const struct fd_parms *p, struct fd_link *link);
-
-struct file_desc;
-
-struct fdinfo_list_entry {
- struct list_head desc_list; /* To chain on @fd_info_head */
- struct file_desc *desc; /* Associated file descriptor */
- struct list_head ps_list; /* To chain per-task files */
- int pid;
- futex_t real_pid;
- FdinfoEntry *fe;
-};
-
-/* reports whether fd_a takes prio over fd_b */
-static inline int fdinfo_rst_prio(struct fdinfo_list_entry *fd_a, struct fdinfo_list_entry *fd_b)
-{
- return pid_rst_prio(fd_a->pid, fd_b->pid) ||
- ((fd_a->pid == fd_b->pid) && (fd_a->fe->fd < fd_b->fe->fd));
-}
-
-struct file_desc_ops {
- /* fd_types from protobuf/fdinfo.proto */
- unsigned int type;
- /*
- * Opens a file by whatever syscall is required for that.
- * The returned descriptor may be closed (dup2-ed to another)
- * so it shouldn't be saved for any post-actions.
- */
- int (*open)(struct file_desc *d);
- /*
- * Called on a file when all files of that type are opened
- * and with the fd being the "restored" one.
- */
- int (*post_open)(struct file_desc *d, int fd);
- /*
- * Report whether the fd in question wants a transport socket
- * in it instead of a real file. See file_master for details.
- */
- int (*want_transport)(FdinfoEntry *fe, struct file_desc *d);
- /*
- * Called to collect a new fd before adding it on desc. Clients
- * may chose to collect it to some specific rst_info list. See
- * prepare_fds() for details.
- */
- void (*collect_fd)(struct file_desc *, struct fdinfo_list_entry *,
- struct rst_info *);
- char * (*name)(struct file_desc *, char *b, size_t s);
-};
-
-static inline void collect_gen_fd(struct fdinfo_list_entry *fle, struct rst_info *ri)
-{
- list_add_tail(&fle->ps_list, &ri->fds);
-}
-
-struct file_desc {
- u32 id; /* File id, unique */
- struct hlist_node hash; /* Descriptor hashing and lookup */
- struct list_head fd_info_head; /* Chain of fdinfo_list_entry-s with same ID and type but different pids */
- struct file_desc_ops *ops; /* Associated operations */
-};
-
-struct fdtype_ops {
- unsigned int type;
- int (*dump)(int lfd, u32 id, const struct fd_parms *p);
- int (*pre_dump)(int pid, int lfd);
-};
-
-struct cr_img;
-
-extern int do_dump_gen_file(struct fd_parms *p, int lfd,
- const struct fdtype_ops *ops,
- struct cr_img *);
-struct parasite_drain_fd;
-int dump_task_files_seized(struct parasite_ctl *ctl, struct pstree_item *item,
- struct parasite_drain_fd *dfds);
-int predump_task_files(int pid);
-
-extern void file_desc_init(struct file_desc *d, u32 id, struct file_desc_ops *ops);
-extern int file_desc_add(struct file_desc *d, u32 id, struct file_desc_ops *ops);
-extern struct fdinfo_list_entry *file_master(struct file_desc *d);
-extern struct file_desc *find_file_desc_raw(int type, u32 id);
-
-extern int send_fd_to_peer(int fd, struct fdinfo_list_entry *fle, int sock);
-extern int restore_fown(int fd, FownEntry *fown);
-extern int rst_file_params(int fd, FownEntry *fown, int flags);
-
-extern void show_saved_files(void);
-
-extern int prepare_fds(struct pstree_item *me);
-extern int prepare_fd_pid(struct pstree_item *me);
-extern int prepare_ctl_tty(int pid, struct rst_info *rst_info, u32 ctl_tty_id);
-extern int prepare_shared_fdinfo(void);
-extern int get_filemap_fd(struct vma_area *);
-extern int restore_fs(struct pstree_item *);
-extern int prepare_fs_pid(struct pstree_item *);
-extern int set_fd_flags(int fd, int flags);
-
-extern int close_old_fds(void);
-#ifndef AT_EMPTY_PATH
-#define AT_EMPTY_PATH 0x1000
-#endif
-
-#define LREMAP_PARAM "link-remap"
-
-extern int shared_fdt_prepare(struct pstree_item *item);
-
-extern struct collect_image_info ext_file_cinfo;
-extern int dump_unsupp_fd(struct fd_parms *p, int lfd,
- struct cr_img *, char *more, char *info);
-
-extern int inherit_fd_parse(char *optarg);
-extern int inherit_fd_add(int fd, char *key);
-extern void inherit_fd_log(void);
-extern int inherit_fd_resolve_clash(int fd);
-extern int inherit_fd_fini(void);
-
-extern bool external_lookup_id(char *id);
-extern int inherit_fd_lookup_id(char *id);
-
-extern bool inherited_fd(struct file_desc *, int *fdp);
-
-#endif /* __CR_FILES_H__ */
diff --git a/include/fs-magic.h b/include/fs-magic.h
deleted file mode 100644
index d6e9e54d181d..000000000000
--- a/include/fs-magic.h
+++ /dev/null
@@ -1,52 +0,0 @@
-#ifndef __CR_FS_MAGIC_H__
-#define __CR_FS_MAGIC_H__
-
-#include <sys/vfs.h>
-
-/*
- * Gather magic numbers in case if distros
- * do not provide appropriate entry in
- * linux/magic.h.
- */
-
-#ifndef NFS_SUPER_MAGIC
-# define NFS_SUPER_MAGIC 0x6969
-#endif
-
-#ifndef PIPEFS_MAGIC
-# define PIPEFS_MAGIC 0x50495045
-#endif
-
-#ifndef ANON_INODE_FS_MAGIC
-# define ANON_INODE_FS_MAGIC 0x09041934
-#endif
-
-#ifndef TMPFS_MAGIC
-# define TMPFS_MAGIC 0x01021994
-#endif
-
-#ifndef SOCKFS_MAGIC
-# define SOCKFS_MAGIC 0x534f434b
-#endif
-
-#ifndef DEVPTS_SUPER_MAGIC
-#define DEVPTS_SUPER_MAGIC 0x1cd1
-#endif
-
-#ifndef BTRFS_SUPER_MAGIC
-#define BTRFS_SUPER_MAGIC 0x9123683E
-#endif
-
-#ifndef AUFS_SUPER_MAGIC
-#define AUFS_SUPER_MAGIC 0x61756673
-#endif
-
-#ifndef PROC_SUPER_MAGIC
-#define PROC_SUPER_MAGIC 0x9fa0
-#endif
-
-#ifndef BINFMTFS_MAGIC
-#define BINFMTFS_MAGIC 0x42494e4d
-#endif
-
-#endif /* __CR_FS_MAGIC_H__ */
diff --git a/include/fsnotify.h b/include/fsnotify.h
deleted file mode 100644
index 48e3982cf7aa..000000000000
--- a/include/fsnotify.h
+++ /dev/null
@@ -1,26 +0,0 @@
-#ifndef __CR_FSNOTIFY_H__
-#define __CR_FSNOTIFY_H__
-
-#include "asm/types.h"
-#include "files.h"
-
-#include "protobuf.h"
-#include "protobuf/fsnotify.pb-c.h"
-
-#define KERNEL_FS_EVENT_ON_CHILD 0x08000000
-
-struct fsnotify_params {
- u32 faflags;
- u32 evflags;
-};
-
-extern int is_inotify_link(char *link);
-extern int is_fanotify_link(char *link);
-extern const struct fdtype_ops inotify_dump_ops;
-extern const struct fdtype_ops fanotify_dump_ops;
-extern struct collect_image_info inotify_cinfo;
-extern struct collect_image_info inotify_mark_cinfo;
-extern struct collect_image_info fanotify_cinfo;
-extern struct collect_image_info fanotify_mark_cinfo;
-
-#endif /* __CR_FSNOTIFY_H__ */
diff --git a/include/image-desc.h b/include/image-desc.h
deleted file mode 100644
index 532ced8b4167..000000000000
--- a/include/image-desc.h
+++ /dev/null
@@ -1,119 +0,0 @@
-#ifndef __CR_IMAGE_DESC_H__
-#define __CR_IMAGE_DESC_H__
-
-#include "asm/int.h"
-
-enum {
- CR_FD_INVENTORY,
- CR_FD_STATS,
- /*
- * Task entries
- */
-
- _CR_FD_TASK_FROM,
- CR_FD_CORE,
- CR_FD_IDS,
- CR_FD_MM,
- CR_FD_SIGACT,
- CR_FD_CREDS,
- CR_FD_FS,
- _CR_FD_TASK_TO,
-
- CR_FD_PAGEMAP,
-
- /*
- * NS entries
- */
- CR_FD_UTSNS,
- CR_FD_MNTS,
- CR_FD_USERNS,
-
- _CR_FD_IPCNS_FROM,
- CR_FD_IPC_VAR,
- CR_FD_IPCNS_SHM,
- CR_FD_IPCNS_MSG,
- CR_FD_IPCNS_SEM,
- _CR_FD_IPCNS_TO,
-
- _CR_FD_NETNS_FROM,
- CR_FD_NETDEV,
- CR_FD_IFADDR,
- CR_FD_ROUTE,
- CR_FD_ROUTE6,
- CR_FD_RULE,
- CR_FD_IPTABLES,
- CR_FD_IP6TABLES,
- CR_FD_NETNS,
- CR_FD_NETNF_CT,
- CR_FD_NETNF_EXP,
- _CR_FD_NETNS_TO,
-
- CR_FD_PSTREE,
- CR_FD_SHMEM_PAGEMAP,
- CR_FD_GHOST_FILE,
- CR_FD_TCP_STREAM,
- CR_FD_FDINFO,
-
- _CR_FD_GLOB_FROM,
- CR_FD_SK_QUEUES,
- CR_FD_REG_FILES,
- CR_FD_EXT_FILES,
- CR_FD_NS_FILES,
- CR_FD_INETSK,
- CR_FD_UNIXSK,
- CR_FD_PACKETSK,
- CR_FD_NETLINK_SK,
- CR_FD_PIPES,
- CR_FD_PIPES_DATA,
- CR_FD_FIFO,
- CR_FD_FIFO_DATA,
- CR_FD_TTY_FILES,
- CR_FD_TTY_INFO,
- CR_FD_REMAP_FPATH,
- CR_FD_EVENTFD_FILE,
- CR_FD_EVENTPOLL_FILE,
- CR_FD_SIGNALFD,
- CR_FD_INOTIFY_FILE,
- CR_FD_FANOTIFY_FILE,
- CR_FD_TUNFILE,
- CR_FD_CGROUP,
- CR_FD_TIMERFD,
- CR_FD_FILE_LOCKS,
- CR_FD_SECCOMP,
- _CR_FD_GLOB_TO,
-
- CR_FD_TMPFS_IMG,
- CR_FD_TMPFS_DEV,
- CR_FD_BINFMT_MISC,
- CR_FD_PAGES,
-
- CR_FD_VMAS,
- CR_FD_PAGES_OLD,
- CR_FD_SHM_PAGES_OLD,
- CR_FD_RLIMIT,
- CR_FD_ITIMERS,
- CR_FD_POSIX_TIMERS,
- CR_FD_FILE_LOCKS_PID,
-
- CR_FD_IRMAP_CACHE,
- CR_FD_CPUINFO,
-
- CR_FD_SIGNAL,
- CR_FD_PSIGNAL,
- CR_FD_INOTIFY_WD,
- CR_FD_FANOTIFY_MARK,
- CR_FD_EVENTPOLL_TFD,
-
- CR_FD_MAX
-};
-
-/* file descriptors template */
-struct cr_fd_desc_tmpl {
- const char *fmt; /* format for the name */
- u32 magic; /* magic in the header */
- int oflags; /* flags for image_open */
-};
-
-extern struct cr_fd_desc_tmpl imgset_template[CR_FD_MAX];
-
-#endif /* __CR_IMAGE_DESC_H__ */
diff --git a/include/image.h b/include/image.h
deleted file mode 100644
index 305febf5adb6..000000000000
--- a/include/image.h
+++ /dev/null
@@ -1,190 +0,0 @@
-#ifndef __CR_IMAGE_H__
-#define __CR_IMAGE_H__
-
-#include <stdbool.h>
-
-#include "compiler.h"
-#include "servicefd.h"
-#include "image-desc.h"
-#include "fcntl.h"
-#include "magic.h"
-#include "bfd.h"
-#include "bug.h"
-
-#ifdef _ARCH_PPC64
-#define PAGE_IMAGE_SIZE 65536
-#else
-#define PAGE_IMAGE_SIZE 4096
-#endif /* _ARCH_PPC64 */
-#define PAGE_RSS 1
-#define PAGE_ANON 2
-
-/*
- * Top bit set in the tgt id means we've remapped
- * to a ghost file.
- */
-#define REMAP_GHOST (1 << 31)
-
-/*
- * By-default, when dumping a unix socket, we should dump its peer
- * as well. Which in turn means, we should dump the task(s) that have
- * this peer opened.
- *
- * Sometimes, we can break this rule and dump only one end of the
- * unix sockets pair, and on restore time connect() this end back to
- * its peer.
- *
- * So, to resolve this situation we mark the peers we don't dump
- * as "external" and require the --ext-unix-sk option.
- */
-
-#define USK_EXTERN (1 << 0)
-#define USK_SERVICE (1 << 1)
-#define USK_CALLBACK (1 << 2)
-#define USK_INHERIT (1 << 3)
-
-/*
- * VMA_AREA status:
- *
- * - none
- * VmaEntry is just allocated and has not been used
- * for anything yet
- * - regular
- * VmaEntry represent some memory area which should be
- * dumped and restored; this is a general sign that we
- * should not skip the area content from processing in
- * compare with special areas such as vsyscall
- * - stack
- * the memory area is used in application stack so we
- * should be careful about guard page here
- * - vsyscall
- * special memory area injected into the task memory
- * space by the kernel itself, represent virtual syscall
- * implementation and it is specific to every kernel version,
- * its contents should not be dumped ever
- * - vdso,vvar
- * the vDSO area, it might reqire additional memory
- * contents modification especially when tasks are
- * migrating between different kernel versions
- * - heap
- * "heap" area in application, currently for inforamtion only
- * - file private
- * stands for privately memory mapped files
- * - file shared
- * stands for shared memory mapped files
- * - anon shared
- * represent shared anonymous memory areas
- * - anon private
- * represent private anonymous memory areas
- * - SysV IPC
- * IPC shared memory area
- * - socket
- * memory map for socket
- * - AIO ring
- * memory area serves AIO buffers
- * - unsupported
- * stands for any unknown memory areas, usually means
- * we don't know how to work with it and should stop
- * processing exiting with error; while the rest of bits
- * are part of image ABI, this particular one must never
- * be used in image.
- */
-#define VMA_AREA_NONE (0 << 0)
-#define VMA_AREA_REGULAR (1 << 0)
-#define VMA_AREA_STACK (1 << 1)
-#define VMA_AREA_VSYSCALL (1 << 2)
-#define VMA_AREA_VDSO (1 << 3)
-#define VMA_AREA_HEAP (1 << 5)
-
-#define VMA_FILE_PRIVATE (1 << 6)
-#define VMA_FILE_SHARED (1 << 7)
-#define VMA_ANON_SHARED (1 << 8)
-#define VMA_ANON_PRIVATE (1 << 9)
-
-#define VMA_AREA_SYSVIPC (1 << 10)
-#define VMA_AREA_SOCKET (1 << 11)
-#define VMA_AREA_VVAR (1 << 12)
-#define VMA_AREA_AIORING (1 << 13)
-
-#define VMA_UNSUPP (1 << 31)
-
-#define CR_CAP_SIZE 2
-
-#define TASK_COMM_LEN 16
-
-#define TASK_ALIVE 0x1
-#define TASK_DEAD 0x2
-#define TASK_STOPPED 0x3
-#define TASK_HELPER 0x4
-
-#define CR_PARENT_LINK "parent"
-
-extern bool fdinfo_per_id;
-extern bool ns_per_id;
-extern bool img_common_magic;
-
-#define O_NOBUF (O_DIRECT)
-#define O_SERVICE (O_DIRECTORY)
-#define O_DUMP (O_WRONLY | O_CREAT | O_TRUNC)
-#define O_SHOW (O_RDONLY | O_NOBUF)
-#define O_RSTR (O_RDONLY)
-
-struct cr_img {
- union {
- struct bfd _x;
- struct {
- int fd; /* should be first to coincide with _x.fd */
- int type;
- unsigned long oflags;
- char *path;
- };
- };
-};
-
-#define EMPTY_IMG_FD (-404)
-#define LAZY_IMG_FD (-505)
-
-static inline bool empty_image(struct cr_img *img)
-{
- return img && img->_x.fd == EMPTY_IMG_FD;
-}
-
-static inline bool lazy_image(struct cr_img *img)
-{
- return img->_x.fd == LAZY_IMG_FD;
-}
-
-extern int open_image_lazy(struct cr_img *img);
-
-static inline int img_raw_fd(struct cr_img *img)
-{
- if (lazy_image(img) && open_image_lazy(img))
- return -1;
-
- BUG_ON(bfd_buffered(&img->_x));
- return img->_x.fd;
-}
-
-extern int open_image_dir(char *dir);
-extern void close_image_dir(void);
-
-extern struct cr_img *open_image_at(int dfd, int type, unsigned long flags, ...);
-#define open_image(typ, flags, ...) open_image_at(-1, typ, flags, ##__VA_ARGS__)
-extern int open_image_lazy(struct cr_img *img);
-extern struct cr_img *open_pages_image(unsigned long flags, struct cr_img *pmi);
-extern struct cr_img *open_pages_image_at(int dfd, unsigned long flags, struct cr_img *pmi);
-extern void up_page_ids_base(void);
-
-extern struct cr_img *img_from_fd(int fd); /* for cr-show mostly */
-
-extern int write_img_buf(struct cr_img *, const void *ptr, int size);
-#define write_img(img, ptr) write_img_buf((img), (ptr), sizeof(*(ptr)))
-extern int read_img_buf_eof(struct cr_img *, void *ptr, int size);
-#define read_img_eof(img, ptr) read_img_buf_eof((img), (ptr), sizeof(*(ptr)))
-extern int read_img_buf(struct cr_img *, void *ptr, int size);
-#define read_img(img, ptr) read_img_buf((img), (ptr), sizeof(*(ptr)))
-extern int read_img_str(struct cr_img *, char **pstr, int size);
-
-extern void close_image(struct cr_img *);
-
-#endif /* __CR_IMAGE_H__ */
diff --git a/include/imgset.h b/include/imgset.h
deleted file mode 100644
index 04be917e2dac..000000000000
--- a/include/imgset.h
+++ /dev/null
@@ -1,37 +0,0 @@
-#ifndef __CR_IMGSET_H__
-#define __CR_IMGSET_H__
-
-#include "image-desc.h"
-#include "bug.h"
-#include "image.h"
-
-struct cr_imgset {
- int fd_off;
- int fd_nr;
- struct cr_img **_imgs;
-};
-
-static inline struct cr_img *img_from_set(const struct cr_imgset *imgset, int type)
-{
- int idx;
-
- idx = type - imgset->fd_off;
- BUG_ON(idx > imgset->fd_nr);
-
- return imgset->_imgs[idx];
-}
-
-extern struct cr_imgset *glob_imgset;
-
-extern struct cr_fd_desc_tmpl imgset_template[CR_FD_MAX];
-
-extern struct cr_imgset *cr_task_imgset_open(int pid, int mode);
-extern struct cr_imgset *cr_imgset_open_range(int pid, int from, int to,
- unsigned long flags);
-#define cr_imgset_open(pid, type, flags) cr_imgset_open_range(pid, \
- _CR_FD_##type##_FROM, _CR_FD_##type##_TO, flags)
-extern struct cr_imgset *cr_glob_imgset_open(int mode);
-
-extern void close_cr_imgset(struct cr_imgset **cr_imgset);
-
-#endif /* __CR_IMGSET_H__ */
diff --git a/include/inet_diag.h b/include/inet_diag.h
deleted file mode 100644
index 95be2c19df84..000000000000
--- a/include/inet_diag.h
+++ /dev/null
@@ -1,136 +0,0 @@
-#ifndef __CR_INET_DIAG_H__
-#define __CR_INET_DIAG_H__
-
-#include <linux/types.h>
-
-/* Just some random number */
-#define TCPDIAG_GETSOCK 18
-#define DCCPDIAG_GETSOCK 19
-
-#define INET_DIAG_GETSOCK_MAX 24
-
-/* Socket identity */
-struct inet_diag_sockid {
- __be16 idiag_sport;
- __be16 idiag_dport;
- __be32 idiag_src[4];
- __be32 idiag_dst[4];
- __u32 idiag_if;
- __u32 idiag_cookie[2];
-#define INET_DIAG_NOCOOKIE (~0U)
-};
-
-/* Request structure */
-
-struct inet_diag_req_compat {
- __u8 idiag_family; /* Family of addresses. */
- __u8 idiag_src_len;
- __u8 idiag_dst_len;
- __u8 idiag_ext; /* Query extended information */
-
- struct inet_diag_sockid id;
-
- __u32 idiag_states; /* States to dump */
- __u32 idiag_dbs; /* Tables to dump (NI) */
-};
-
-struct inet_diag_req_v2 {
- __u8 sdiag_family;
- __u8 sdiag_protocol;
- __u8 idiag_ext;
- __u8 pad;
- __u32 idiag_states;
- struct inet_diag_sockid id;
-};
-
-enum {
- INET_DIAG_REQ_NONE,
- INET_DIAG_REQ_BYTECODE,
-};
-
-#define INET_DIAG_REQ_MAX INET_DIAG_REQ_BYTECODE
-
-/* Bytecode is sequence of 4 byte commands followed by variable arguments.
- * All the commands identified by "code" are conditional jumps forward:
- * to offset cc+"yes" or to offset cc+"no". "yes" is supposed to be
- * length of the command and its arguments.
- */
-
-struct inet_diag_bc_op {
- unsigned char code;
- unsigned char yes;
- unsigned short no;
-};
-
-enum {
- INET_DIAG_BC_NOP,
- INET_DIAG_BC_JMP,
- INET_DIAG_BC_S_GE,
- INET_DIAG_BC_S_LE,
- INET_DIAG_BC_D_GE,
- INET_DIAG_BC_D_LE,
- INET_DIAG_BC_AUTO,
- INET_DIAG_BC_S_COND,
- INET_DIAG_BC_D_COND,
-};
-
-struct inet_diag_hostcond {
- __u8 family;
- __u8 prefix_len;
- int port;
- __be32 addr[0];
-};
-
-/* Base info structure. It contains socket identity (addrs/ports/cookie)
- * and, alas, the information shown by netstat. */
-struct inet_diag_msg {
- __u8 idiag_family;
- __u8 idiag_state;
- __u8 idiag_timer;
- __u8 idiag_retrans;
-
- struct inet_diag_sockid id;
-
- __u32 idiag_expires;
- __u32 idiag_rqueue;
- __u32 idiag_wqueue;
- __u32 idiag_uid;
- __u32 idiag_inode;
-};
-
-/* Extensions */
-
-enum {
- INET_DIAG_NONE,
- INET_DIAG_MEMINFO,
- INET_DIAG_INFO,
- INET_DIAG_VEGASINFO,
- INET_DIAG_CONG,
- INET_DIAG_TOS,
- INET_DIAG_TCLASS,
- INET_DIAG_SKMEMINFO,
- INET_DIAG_SHUTDOWN,
-};
-
-#define INET_DIAG_MAX INET_DIAG_SHUTDOWN
-
-
-/* INET_DIAG_MEM */
-
-struct inet_diag_meminfo {
- __u32 idiag_rmem;
- __u32 idiag_wmem;
- __u32 idiag_fmem;
- __u32 idiag_tmem;
-};
-
-/* INET_DIAG_VEGASINFO */
-
-struct tcpvegas_info {
- __u32 tcpv_enabled;
- __u32 tcpv_rttcnt;
- __u32 tcpv_rtt;
- __u32 tcpv_minrtt;
-};
-
-#endif /* __CR_INET_DIAG_H__ */
diff --git a/include/ipc_ns.h b/include/ipc_ns.h
deleted file mode 100644
index c8909892637c..000000000000
--- a/include/ipc_ns.h
+++ /dev/null
@@ -1,9 +0,0 @@
-#ifndef __CR_IPC_NS_H__
-#define __CR_IPC_NS_H__
-
-extern int dump_ipc_ns(int ns_id);
-extern int prepare_ipc_ns(int pid);
-
-extern struct ns_desc ipc_ns_desc;
-
-#endif /* __CR_IPC_NS_H__ */
diff --git a/include/irmap.h b/include/irmap.h
deleted file mode 100644
index 033f71e3722a..000000000000
--- a/include/irmap.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef __CR_IRMAP__H__
-#define __CR_IRMAP__H__
-char *irmap_lookup(unsigned int s_dev, unsigned long i_ino);
-struct _FhEntry;
-int irmap_queue_cache(unsigned int dev, unsigned long ino,
- struct _FhEntry *fh);
-int irmap_predump_prep(void);
-int irmap_predump_run(void);
-int check_open_handle(unsigned int s_dev, unsigned long i_ino,
- struct _FhEntry *f_handle);
-int irmap_load_cache(void);
-int irmap_scan_path_add(char *path);
-#endif
diff --git a/include/kcmp-ids.h b/include/kcmp-ids.h
deleted file mode 100644
index afe68d6d3285..000000000000
--- a/include/kcmp-ids.h
+++ /dev/null
@@ -1,29 +0,0 @@
-#ifndef __CR_KCMP_IDS_H__
-#define __CR_KCMP_IDS_H__
-
-#include "kcmp.h"
-
-struct kid_tree {
- struct rb_root root;
- unsigned kcmp_type;
- unsigned long subid;
-
-};
-
-#define DECLARE_KCMP_TREE(name, type) \
- struct kid_tree name = { \
- .root = RB_ROOT, \
- .kcmp_type = type, \
- .subid = 1, \
- }
-
-struct kid_elem {
- int pid;
- unsigned genid;
- unsigned idx;
-};
-
-extern u32 kid_generate_gen(struct kid_tree *tree,
- struct kid_elem *elem, int *new_id);
-
-#endif /* __CR_KCMP_IDS_H__ */
diff --git a/include/kcmp.h b/include/kcmp.h
deleted file mode 100644
index 76f557bff047..000000000000
--- a/include/kcmp.h
+++ /dev/null
@@ -1,16 +0,0 @@
-#ifndef __CR_KCMP_H__
-#define __CR_KCMP_H__
-
-enum kcmp_type {
- KCMP_FILE,
- KCMP_VM,
- KCMP_FILES,
- KCMP_FS,
- KCMP_SIGHAND,
- KCMP_IO,
- KCMP_SYSVSEM,
-
- KCMP_TYPES,
-};
-
-#endif /* __CR_KCMP_H__ */
diff --git a/include/kerndat.h b/include/kerndat.h
deleted file mode 100644
index a02d15bc169c..000000000000
--- a/include/kerndat.h
+++ /dev/null
@@ -1,58 +0,0 @@
-#ifndef __CR_KERNDAT_H__
-#define __CR_KERNDAT_H__
-
-#include "asm/types.h"
-
-struct stat;
-
-/*
- * kerndat stands for "kernel data" and is a collection
- * of run-time information about current kernel
- */
-
-extern int kerndat_init(void);
-extern int kerndat_init_rst(void);
-extern int kerndat_get_dirty_track(void);
-extern int kerndat_fdinfo_has_lock(void);
-extern int kerndat_loginuid(bool only_dump);
-
-enum pagemap_func {
- PM_UNKNOWN,
- PM_DISABLED, /* /proc/pid/pagemap doesn't open (user mode) */
- PM_FLAGS_ONLY, /* pagemap zeroes pfn part (user mode) */
- PM_FULL,
-};
-
-struct kerndat_s {
- dev_t shmem_dev;
- int tcp_max_rshare;
- int last_cap;
- u64 zero_page_pfn;
- bool has_dirty_track;
- bool has_memfd;
- bool has_fdinfo_lock;
- unsigned long task_size;
- bool ipv6;
- bool has_loginuid;
- enum pagemap_func pmap;
-};
-
-extern struct kerndat_s kdat;
-
-enum {
- KERNDAT_FS_STAT_DEVPTS,
- KERNDAT_FS_STAT_DEVTMPFS,
- KERNDAT_FS_STAT_BINFMT_MISC,
-
- KERNDAT_FS_STAT_MAX
-};
-
-/*
- * Check whether the fs @which with kdevice @kdev
- * is the same as host's. If yes, this means that
- * the fs mount is shared with host, if no -- it's
- * a new (likely virtuzlized) fs instance.
- */
-extern int kerndat_fs_virtualized(unsigned int which, u32 kdev);
-
-#endif /* __CR_KERNDAT_H__ */
diff --git a/include/libnetlink.h b/include/libnetlink.h
deleted file mode 100644
index 92eded420b06..000000000000
--- a/include/libnetlink.h
+++ /dev/null
@@ -1,20 +0,0 @@
-#ifndef __CR_LIBNETLINK_H__
-#define __CR_LIBNETLINK_H__
-
-#define CR_NLMSG_SEQ 24680 /* arbitrary chosen */
-
-extern int parse_rtattr(struct rtattr *tb[], int max, struct rtattr *rta, int len);
-#define parse_rtattr_nested(tb, max, rta) \
- (parse_rtattr((tb), (max), RTA_DATA(rta), RTA_PAYLOAD(rta)))
-extern int do_rtnl_req(int nl, void *req, int size,
- int (*receive_callback)(struct nlmsghdr *h, void *),
- int (*error_callback)(int err, void *), void *);
-
-extern int addattr_l(struct nlmsghdr *n, int maxlen, int type,
- const void *data, int alen);
-
-#define NLMSG_TAIL(nmsg) \
- ((struct rtattr *) (((void *) (nmsg)) + NLMSG_ALIGN((nmsg)->nlmsg_len)))
-
-
-#endif /* __CR_LIBNETLINK_H__ */
diff --git a/include/list.h b/include/list.h
deleted file mode 100644
index ce3a3c0cd757..000000000000
--- a/include/list.h
+++ /dev/null
@@ -1,423 +0,0 @@
-#ifndef __CR_LIST_H__
-#define __CR_LIST_H__
-
-/*
- * Double linked lists.
- */
-
-#include "compiler.h"
-
-#define POISON_POINTER_DELTA 0
-#define LIST_POISON1 ((void *) 0x00100100 + POISON_POINTER_DELTA)
-#define LIST_POISON2 ((void *) 0x00200200 + POISON_POINTER_DELTA)
-
-struct list_head {
- struct list_head *prev, *next;
-};
-
-#define LIST_HEAD_INIT(name) { &(name), &(name) }
-#define LIST_HEAD(name) struct list_head name = LIST_HEAD_INIT(name)
-
-static inline void INIT_LIST_HEAD(struct list_head *list)
-{
- list->next = list;
- list->prev = list;
-}
-
-static inline void __list_add(struct list_head *new,
- struct list_head *prev,
- struct list_head *next)
-{
- next->prev = new;
- new->next = next;
- new->prev = prev;
- prev->next = new;
-}
-
-static inline void list_add(struct list_head *new, struct list_head *head)
-{
- __list_add(new, head, head->next);
-}
-
-static inline void list_add_tail(struct list_head *new, struct list_head *head)
-{
- __list_add(new, head->prev, head);
-}
-
-static inline void __list_del(struct list_head * prev, struct list_head * next)
-{
- next->prev = prev;
- prev->next = next;
-}
-
-static inline void __list_del_entry(struct list_head *entry)
-{
- __list_del(entry->prev, entry->next);
-}
-
-static inline void list_del(struct list_head *entry)
-{
- __list_del(entry->prev, entry->next);
- entry->next = LIST_POISON1;
- entry->prev = LIST_POISON2;
-}
-
-static inline void list_replace(struct list_head *old,
- struct list_head *new)
-{
- new->next = old->next;
- new->next->prev = new;
- new->prev = old->prev;
- new->prev->next = new;
-}
-
-static inline void list_replace_init(struct list_head *old,
- struct list_head *new)
-{
- list_replace(old, new);
- INIT_LIST_HEAD(old);
-}
-
-static inline void list_del_init(struct list_head *entry)
-{
- __list_del_entry(entry);
- INIT_LIST_HEAD(entry);
-}
-
-static inline void list_move(struct list_head *list, struct list_head *head)
-{
- __list_del_entry(list);
- list_add(list, head);
-}
-
-static inline void list_move_tail(struct list_head *list,
- struct list_head *head)
-{
- __list_del_entry(list);
- list_add_tail(list, head);
-}
-
-static inline int list_is_last(const struct list_head *list,
- const struct list_head *head)
-{
- return list->next == head;
-}
-
-static inline int list_is_first(const struct list_head *list,
- const struct list_head *head)
-{
- return list->prev == head;
-}
-
-static inline int list_empty(const struct list_head *head)
-{
- return head->next == head;
-}
-
-static inline int list_empty_careful(const struct list_head *head)
-{
- struct list_head *next = head->next;
- return (next == head) && (next == head->prev);
-}
-static inline void list_rotate_left(struct list_head *head)
-{
- struct list_head *first;
-
- if (!list_empty(head)) {
- first = head->next;
- list_move_tail(first, head);
- }
-}
-
-static inline int list_is_singular(const struct list_head *head)
-{
- return !list_empty(head) && (head->next == head->prev);
-}
-
-static inline void __list_cut_position(struct list_head *list,
- struct list_head *head, struct list_head *entry)
-{
- struct list_head *new_first = entry->next;
- list->next = head->next;
- list->next->prev = list;
- list->prev = entry;
- entry->next = list;
- head->next = new_first;
- new_first->prev = head;
-}
-
-static inline void list_cut_position(struct list_head *list,
- struct list_head *head, struct list_head *entry)
-{
- if (list_empty(head))
- return;
- if (list_is_singular(head) &&
- (head->next != entry && head != entry))
- return;
- if (entry == head)
- INIT_LIST_HEAD(list);
- else
- __list_cut_position(list, head, entry);
-}
-
-static inline void __list_splice(const struct list_head *list,
- struct list_head *prev,
- struct list_head *next)
-{
- struct list_head *first = list->next;
- struct list_head *last = list->prev;
-
- first->prev = prev;
- prev->next = first;
-
- last->next = next;
- next->prev = last;
-}
-
-static inline void list_splice(const struct list_head *list,
- struct list_head *head)
-{
- if (!list_empty(list))
- __list_splice(list, head, head->next);
-}
-
-static inline void list_splice_tail(struct list_head *list,
- struct list_head *head)
-{
- if (!list_empty(list))
- __list_splice(list, head->prev, head);
-}
-
-static inline void list_splice_init(struct list_head *list,
- struct list_head *head)
-{
- if (!list_empty(list)) {
- __list_splice(list, head, head->next);
- INIT_LIST_HEAD(list);
- }
-}
-
-static inline void list_splice_tail_init(struct list_head *list,
- struct list_head *head)
-{
- if (!list_empty(list)) {
- __list_splice(list, head->prev, head);
- INIT_LIST_HEAD(list);
- }
-}
-
-#define list_entry(ptr, type, member) \
- container_of(ptr, type, member)
-
-#define list_first_entry(ptr, type, member) \
- list_entry((ptr)->next, type, member)
-
-#define list_for_each(pos, head) \
- for (pos = (head)->next; pos != (head); pos = pos->next)
-
-#define __list_for_each(pos, head) \
- for (pos = (head)->next; pos != (head); pos = pos->next)
-
-#define list_for_each_prev(pos, head) \
- for (pos = (head)->prev; pos != (head); pos = pos->prev)
-
-#define list_for_each_safe(pos, n, head) \
- for (pos = (head)->next, n = pos->next; pos != (head); \
- pos = n, n = pos->next)
-
-#define list_for_each_prev_safe(pos, n, head) \
- for (pos = (head)->prev, n = pos->prev; \
- pos != (head); \
- pos = n, n = pos->prev)
-
-#define list_for_each_entry(pos, head, member) \
- for (pos = list_entry((head)->next, typeof(*pos), member); \
- &pos->member != (head); \
- pos = list_entry(pos->member.next, typeof(*pos), member))
-
-#define list_for_each_entry_reverse(pos, head, member) \
- for (pos = list_entry((head)->prev, typeof(*pos), member); \
- &pos->member != (head); \
- pos = list_entry(pos->member.prev, typeof(*pos), member))
-
-#define list_prepare_entry(pos, head, member) \
- ((pos) ? : list_entry(head, typeof(*pos), member))
-
-#define list_for_each_entry_continue(pos, head, member) \
- for (pos = list_entry(pos->member.next, typeof(*pos), member); \
- &pos->member != (head); \
- pos = list_entry(pos->member.next, typeof(*pos), member))
-
-#define list_for_each_entry_continue_reverse(pos, head, member) \
- for (pos = list_entry(pos->member.prev, typeof(*pos), member); \
- &pos->member != (head); \
- pos = list_entry(pos->member.prev, typeof(*pos), member))
-
-#define list_for_each_entry_from(pos, head, member) \
- for (; &pos->member != (head); \
- pos = list_entry(pos->member.next, typeof(*pos), member))
-
-#define list_for_each_entry_safe(pos, n, head, member) \
- for (pos = list_entry((head)->next, typeof(*pos), member), \
- n = list_entry(pos->member.next, typeof(*pos), member); \
- &pos->member != (head); \
- pos = n, n = list_entry(n->member.next, typeof(*n), member))
-
-#define list_for_each_entry_safe_continue(pos, n, head, member) \
- for (pos = list_entry(pos->member.next, typeof(*pos), member), \
- n = list_entry(pos->member.next, typeof(*pos), member); \
- &pos->member != (head); \
- pos = n, n = list_entry(n->member.next, typeof(*n), member))
-
-#define list_for_each_entry_safe_from(pos, n, head, member) \
- for (n = list_entry(pos->member.next, typeof(*pos), member); \
- &pos->member != (head); \
- pos = n, n = list_entry(n->member.next, typeof(*n), member))
-
-#define list_for_each_entry_safe_reverse(pos, n, head, member) \
- for (pos = list_entry((head)->prev, typeof(*pos), member), \
- n = list_entry(pos->member.prev, typeof(*pos), member); \
- &pos->member != (head); \
- pos = n, n = list_entry(n->member.prev, typeof(*n), member))
-
-#define list_safe_reset_next(pos, n, member) \
- n = list_entry(pos->member.next, typeof(*pos), member)
-
-/*
- * Double linked lists with a single pointer list head.
- */
-
-struct hlist_head {
- struct hlist_node *first;
-};
-
-struct hlist_node {
- struct hlist_node *next, **pprev;
-};
-
-#define HLIST_HEAD_INIT { .first = NULL }
-#define HLIST_HEAD(name) struct hlist_head name = { .first = NULL }
-#define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL)
-
-static inline void INIT_HLIST_NODE(struct hlist_node *h)
-{
- h->next = NULL;
- h->pprev = NULL;
-}
-
-static inline int hlist_unhashed(const struct hlist_node *h)
-{
- return !h->pprev;
-}
-
-static inline int hlist_empty(const struct hlist_head *h)
-{
- return !h->first;
-}
-
-static inline void __hlist_del(struct hlist_node *n)
-{
- struct hlist_node *next = n->next;
- struct hlist_node **pprev = n->pprev;
- *pprev = next;
- if (next)
- next->pprev = pprev;
-}
-
-static inline void hlist_del(struct hlist_node *n)
-{
- __hlist_del(n);
- n->next = LIST_POISON1;
- n->pprev = LIST_POISON2;
-}
-
-static inline void hlist_del_init(struct hlist_node *n)
-{
- if (!hlist_unhashed(n)) {
- __hlist_del(n);
- INIT_HLIST_NODE(n);
- }
-}
-
-static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h)
-{
- struct hlist_node *first = h->first;
- n->next = first;
- if (first)
- first->pprev = &n->next;
- h->first = n;
- n->pprev = &h->first;
-}
-
-/* next must be != NULL */
-static inline void hlist_add_before(struct hlist_node *n,
- struct hlist_node *next)
-{
- n->pprev = next->pprev;
- n->next = next;
- next->pprev = &n->next;
- *(n->pprev) = n;
-}
-
-static inline void hlist_add_after(struct hlist_node *n,
- struct hlist_node *next)
-{
- next->next = n->next;
- n->next = next;
- next->pprev = &n->next;
-
- if (next->next)
- next->next->pprev = &next->next;
-}
-
-/* after that we'll appear to be on some hlist and hlist_del will work */
-static inline void hlist_add_fake(struct hlist_node *n)
-{
- n->pprev = &n->next;
-}
-
-/*
- * Move a list from one list head to another. Fixup the pprev
- * reference of the first entry if it exists.
- */
-static inline void hlist_move_list(struct hlist_head *old,
- struct hlist_head *new)
-{
- new->first = old->first;
- if (new->first)
- new->first->pprev = &new->first;
- old->first = NULL;
-}
-
-#define hlist_entry(ptr, type, member) container_of(ptr,type,member)
-
-#define hlist_for_each(pos, head) \
- for (pos = (head)->first; pos ; pos = pos->next)
-
-#define hlist_for_each_safe(pos, n, head) \
- for (pos = (head)->first; pos && ({ n = pos->next; 1; }); \
- pos = n)
-
-#define hlist_entry_safe(ptr, type, member) \
- (ptr) ? hlist_entry(ptr, type, member) : NULL
-
-#define hlist_for_each_entry(pos, head, member) \
- for (pos = hlist_entry_safe((head)->first, typeof(*(pos)), member); \
- pos; \
- pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member))
-
-#define hlist_for_each_entry_continue(pos, member) \
- for (pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member);\
- pos; \
- pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member))
-
-#define hlist_for_each_entry_from(pos, member) \
- for (; pos; \
- pos = hlist_entry_safe((pos)->member.next, typeof(*(pos)), member))
-
-#define hlist_for_each_entry_safe(pos, n, head, member) \
- for (pos = hlist_entry_safe((head)->first, typeof(*pos), member); \
- pos && ({ n = pos->member.next; 1; }); \
- pos = hlist_entry_safe(n, typeof(*pos), member))
-
-#endif /* __CR_LIST_H__ */
diff --git a/include/lock.h b/include/lock.h
deleted file mode 100644
index 1678d10ef988..000000000000
--- a/include/lock.h
+++ /dev/null
@@ -1,157 +0,0 @@
-#ifndef __CR_LOCK_H__
-#define __CR_LOCK_H__
-
-#include <linux/futex.h>
-#include <sys/time.h>
-#include <limits.h>
-#include <errno.h>
-
-#include "asm/types.h"
-#include "asm/atomic.h"
-#include "bug.h"
-
-#ifdef CR_NOGLIBC
-# include "syscall.h"
-#else
-# include <sys/syscall.h>
-static inline long sys_futex(void *addr1, int op, int val1,
- struct timespec *timeout, void *addr2, int val3)
-{
- int rc = syscall(SYS_futex, addr1, op, val1, timeout, addr2, val3);
- if (rc == -1) rc = -errno;
- return rc;
-}
-#endif
-
-typedef struct {
- atomic_t raw;
-} futex_t;
-
-#define FUTEX_ABORT_FLAG (0x80000000)
-#define FUTEX_ABORT_RAW (-1U)
-
-/* Get current futex @f value */
-static inline u32 futex_get(futex_t *f)
-{
- return atomic_read(&f->raw);
-}
-
-/* Set futex @f value to @v */
-static inline void futex_set(futex_t *f, u32 v)
-{
- atomic_set(&f->raw, (int)v);
-}
-
-#define futex_init(f) futex_set(f, 0)
-
-/* Wait on futex @__f value @__v become in condition @__c */
-#define futex_wait_if_cond(__f, __v, __cond) \
- do { \
- int ret; \
- u32 tmp; \
- \
- while (1) { \
- struct timespec to = {.tv_sec = 120}; \
- tmp = (u32)atomic_read(&(__f)->raw); \
- if ((tmp & FUTEX_ABORT_FLAG) || \
- (tmp __cond (__v))) \
- break; \
- ret = sys_futex((u32 *)&(__f)->raw.counter, FUTEX_WAIT,\
- tmp, &to, NULL, 0); \
- if (ret == -ETIMEDOUT) { \
- pr_warn("blocked for more than 120 seconds\n"); \
- continue; \
- } \
- if (ret == -EINTR || ret == -EWOULDBLOCK) \
- continue; \
- if (ret < 0) { \
- pr_err("futex() returned an unexpected error: %d\n", ret); \
- BUG(); \
- } \
- } \
- } while (0)
-
-/* Set futex @f to @v and wake up all waiters */
-static inline void futex_set_and_wake(futex_t *f, u32 v)
-{
- atomic_set(&f->raw, (int)v);
- BUG_ON(sys_futex((u32 *)&f->raw.counter, FUTEX_WAKE, INT_MAX, NULL, NULL, 0) < 0);
-}
-
-/* Mark futex @f as wait abort needed and wake up all waiters */
-static inline void futex_abort_and_wake(futex_t *f)
-{
- BUILD_BUG_ON(!(FUTEX_ABORT_RAW & FUTEX_ABORT_FLAG));
- futex_set_and_wake(f, FUTEX_ABORT_RAW);
-}
-
-/* Decrement futex @f value and wake up all waiters */
-static inline void futex_dec_and_wake(futex_t *f)
-{
- atomic_dec(&f->raw);
- BUG_ON(sys_futex((u32 *)&f->raw.counter, FUTEX_WAKE, INT_MAX, NULL, NULL, 0) < 0);
-}
-
-/* Increment futex @f value and wake up all waiters */
-static inline void futex_inc_and_wake(futex_t *f)
-{
- atomic_inc(&f->raw);
- BUG_ON(sys_futex((u32 *)&f->raw.counter, FUTEX_WAKE, INT_MAX, NULL, NULL, 0) < 0);
-}
-
-/* Plain increment futex @f value */
-static inline void futex_inc(futex_t *f) { atomic_inc(&f->raw); }
-
-/* Plain decrement futex @f value */
-static inline void futex_dec(futex_t *f) { atomic_dec(&f->raw); }
-
-/* Wait until futex @f value become @v */
-#define futex_wait_until(f, v) futex_wait_if_cond(f, v, ==)
-
-/* Wait while futex @f value is greater than @v */
-#define futex_wait_while_gt(f, v) futex_wait_if_cond(f, v, <=)
-
-/* Wait while futex @f value is less than @v */
-#define futex_wait_while_lt(f, v) futex_wait_if_cond(f, v, >=)
-
-/* Wait while futex @f value is equal to @v */
-#define futex_wait_while_eq(f, v) futex_wait_if_cond(f, v, !=)
-
-/* Wait while futex @f value is @v */
-static inline void futex_wait_while(futex_t *f, u32 v)
-{
- while ((u32)atomic_read(&f->raw) == v) {
- int ret = sys_futex((u32 *)&f->raw.counter, FUTEX_WAIT, v, NULL, NULL, 0);
- BUG_ON(ret < 0 && ret != -EWOULDBLOCK);
- }
-}
-
-typedef struct {
- atomic_t raw;
-} mutex_t;
-
-static inline void mutex_init(mutex_t *m)
-{
- u32 c = 0;
- atomic_set(&m->raw, (int)c);
-}
-
-static inline void mutex_lock(mutex_t *m)
-{
- u32 c;
- int ret;
-
- while ((c = (u32)atomic_inc_return(&m->raw)) != 1) {
- ret = sys_futex((u32 *)&m->raw.counter, FUTEX_WAIT, c, NULL, NULL, 0);
- BUG_ON(ret < 0 && ret != -EWOULDBLOCK);
- }
-}
-
-static inline void mutex_unlock(mutex_t *m)
-{
- u32 c = 0;
- atomic_set(&m->raw, (int)c);
- BUG_ON(sys_futex((u32 *)&m->raw.counter, FUTEX_WAKE, 1, NULL, NULL, 0) < 0);
-}
-
-#endif /* __CR_LOCK_H__ */
diff --git a/include/log.h b/include/log.h
deleted file mode 100644
index fe53a7c928b1..000000000000
--- a/include/log.h
+++ /dev/null
@@ -1,41 +0,0 @@
-#ifndef __CR_LOG_H__
-#define __CR_LOG_H__
-
-#include <inttypes.h>
-
-#include "criu-log.h"
-
-extern int log_init(const char *output);
-extern void log_fini(void);
-extern int log_init_by_pid(void);
-extern void log_closedir(void);
-
-extern void log_set_fd(int fd);
-extern int log_get_fd(void);
-
-extern void log_set_loglevel(unsigned int loglevel);
-extern unsigned int log_get_loglevel(void);
-
-#define LOG_SIMPLE_CHUNK 72
-
-extern int vprint_num(char *buf, int blen, int num, char **ps);
-extern void simple_sprintf(char output[LOG_SIMPLE_CHUNK], const char *format, ...)
- __attribute__ ((__format__ (__printf__, 2, 3)));
-
-extern int write_pidfile(int pid);
-
-#define DEFAULT_LOGLEVEL LOG_WARN
-
-#define DEFAULT_LOG_FILENAME "criu.log"
-
-struct cr_img;
-
-extern void print_data(unsigned long addr, unsigned char *data, size_t size);
-extern void print_image_data(struct cr_img *, unsigned int length, int show);
-
-static inline int pr_quelled(unsigned int loglevel)
-{
- return log_get_loglevel() < loglevel && loglevel != LOG_MSG;
-}
-
-#endif /* __CR_LOG_H__ */
diff --git a/include/lsm.h b/include/lsm.h
deleted file mode 100644
index bd13ef70b4c6..000000000000
--- a/include/lsm.h
+++ /dev/null
@@ -1,35 +0,0 @@
-#ifndef __CR_LSM_H__
-#define __CR_LSM_H__
-
-#include "protobuf/inventory.pb-c.h"
-#include "protobuf/creds.pb-c.h"
-
-/*
- * Get the Lsmtype for the current host.
- */
-extern Lsmtype host_lsm_type(void);
-
-/*
- * Initilize the Lsmtype for the current host
- */
-extern void kerndat_lsm(void);
-
-/*
- * Read the LSM profile for the pstree item
- */
-extern int collect_lsm_profile(pid_t, CredsEntry *);
-
-/*
- * Validate that the LSM profiles can be correctly applied (must happen after
- * pstree is set up).
- */
-int validate_lsm(char *profile);
-
-/*
- * Render the profile name in the way that the LSM wants it written to
- * /proc/<pid>/attr/current.
- */
-int render_lsm_profile(char *profile, char **val);
-
-extern int parse_lsm_arg(char *arg);
-#endif /* __CR_LSM_H__ */
diff --git a/include/magic.h b/include/magic.h
deleted file mode 100644
index b11a70eddebe..000000000000
--- a/include/magic.h
+++ /dev/null
@@ -1,115 +0,0 @@
-#ifndef __CR_MAGIC_H__
-#define __CR_MAGIC_H__
-
-/*
- * Basic multi-file images
- */
-
-#define CRTOOLS_IMAGES_V1 1
-/*
- * v1.1 has common magic in the head of each image file,
- * except for inventory
- */
-#define CRTOOLS_IMAGES_V1_1 2
-
-/*
- * Raw images are images in which data is stored in some
- * non-crtool format (ip tool dumps, tarballs, etc.)
- */
-
-#define RAW_IMAGE_MAGIC 0x0
-
-/*
- * Images have the IMG_COMMON_MAGIC in the head. Service files
- * such as stats and irmap-cache have the IMG_SERVICE_MAGIC.
- */
-
-#define IMG_COMMON_MAGIC 0x54564319 /* Sarov (a.k.a. Arzamas-16) */
-#define IMG_SERVICE_MAGIC 0x55105940 /* Zlatoust */
-
-/*
- * The magic-s below correspond to coordinates
- * of various Russian towns in the NNNNEEEE form.
- */
-
-#define INVENTORY_MAGIC 0x58313116 /* Veliky Novgorod */
-#define PSTREE_MAGIC 0x50273030 /* Kyiv */
-#define FDINFO_MAGIC 0x56213732 /* Dmitrov */
-#define PAGEMAP_MAGIC 0x56084025 /* Vladimir */
-#define SHMEM_PAGEMAP_MAGIC PAGEMAP_MAGIC
-#define PAGES_MAGIC RAW_IMAGE_MAGIC
-#define CORE_MAGIC 0x55053847 /* Kolomna */
-#define IDS_MAGIC 0x54432030 /* Konigsberg */
-#define VMAS_MAGIC 0x54123737 /* Tula */
-#define PIPES_MAGIC 0x56513555 /* Tver */
-#define PIPES_DATA_MAGIC 0x56453709 /* Dubna */
-#define FIFO_MAGIC 0x58364939 /* Kirov */
-#define FIFO_DATA_MAGIC 0x59333054 /* Tosno */
-#define SIGACT_MAGIC 0x55344201 /* Murom */
-#define UNIXSK_MAGIC 0x54373943 /* Ryazan */
-#define INETSK_MAGIC 0x56443851 /* Pereslavl */
-#define PACKETSK_MAGIC 0x60454618 /* Veliky Ustyug */
-#define ITIMERS_MAGIC 0x57464056 /* Kostroma */
-#define POSIX_TIMERS_MAGIC 0x52603957 /* Lipetsk */
-#define SK_QUEUES_MAGIC 0x56264026 /* Suzdal */
-#define UTSNS_MAGIC 0x54473203 /* Smolensk */
-#define CREDS_MAGIC 0x54023547 /* Kozelsk */
-#define IPC_VAR_MAGIC 0x53115007 /* Samara */
-#define IPCNS_SHM_MAGIC 0x46283044 /* Odessa */
-#define IPCNS_MSG_MAGIC 0x55453737 /* Moscow */
-#define IPCNS_SEM_MAGIC 0x59573019 /* St. Petersburg */
-#define REG_FILES_MAGIC 0x50363636 /* Belgorod */
-#define EXT_FILES_MAGIC 0x59255641 /* Usolye */
-#define FS_MAGIC 0x51403912 /* Voronezh */
-#define MM_MAGIC 0x57492820 /* Pskov */
-#define REMAP_FPATH_MAGIC 0x59133954 /* Vologda */
-#define GHOST_FILE_MAGIC 0x52583605 /* Oryol */
-#define TCP_STREAM_MAGIC 0x51465506 /* Orenburg */
-#define EVENTFD_FILE_MAGIC 0x44523722 /* Anapa */
-#define EVENTPOLL_FILE_MAGIC 0x45023858 /* Krasnodar */
-#define EVENTPOLL_TFD_MAGIC 0x44433746 /* Novorossiysk */
-#define SIGNALFD_MAGIC 0x57323820 /* Uglich */
-#define INOTIFY_FILE_MAGIC 0x48424431 /* Volgograd */
-#define INOTIFY_WD_MAGIC 0x54562009 /* Svetlogorsk (Rauschen) */
-#define MNTS_MAGIC 0x55563928 /* Petushki */
-#define NETDEV_MAGIC 0x57373951 /* Yaroslavl */
-#define NETNS_MAGIC 0x55933752 /* Dolgoprudny */
-#define TTY_FILES_MAGIC 0x59433025 /* Pushkin */
-#define TTY_INFO_MAGIC 0x59453036 /* Kolpino */
-#define FILE_LOCKS_MAGIC 0x54323616 /* Kaluga */
-#define RLIMIT_MAGIC 0x57113925 /* Rostov */
-#define FANOTIFY_FILE_MAGIC 0x55096122 /* Chelyabinsk */
-#define FANOTIFY_MARK_MAGIC 0x56506035 /* Yekaterinburg */
-#define SIGNAL_MAGIC 0x59255647 /* Berezniki */
-#define PSIGNAL_MAGIC SIGNAL_MAGIC
-#define NETLINK_SK_MAGIC 0x58005614 /* Perm */
-#define NS_FILES_MAGIC 0x61394011 /* Nyandoma */
-#define TUNFILE_MAGIC 0x57143751 /* Kalyazin */
-#define CGROUP_MAGIC 0x59383330 /* Tikhvin */
-#define TIMERFD_MAGIC 0x50493712 /* Korocha */
-#define CPUINFO_MAGIC 0x61404013 /* Nyandoma */
-#define USERNS_MAGIC 0x55474906 /* Kazan */
-#define SECCOMP_MAGIC 0x64413049 /* Kostomuksha */
-#define BINFMT_MISC_MAGIC 0x67343323 /* Apatity */
-
-#define IFADDR_MAGIC RAW_IMAGE_MAGIC
-#define ROUTE_MAGIC RAW_IMAGE_MAGIC
-#define ROUTE6_MAGIC RAW_IMAGE_MAGIC
-#define RULE_MAGIC RAW_IMAGE_MAGIC
-#define TMPFS_IMG_MAGIC RAW_IMAGE_MAGIC
-#define TMPFS_DEV_MAGIC RAW_IMAGE_MAGIC
-#define IPTABLES_MAGIC RAW_IMAGE_MAGIC
-#define IP6TABLES_MAGIC RAW_IMAGE_MAGIC
-#define NETNF_CT_MAGIC RAW_IMAGE_MAGIC
-#define NETNF_EXP_MAGIC RAW_IMAGE_MAGIC
-
-#define PAGES_OLD_MAGIC PAGEMAP_MAGIC
-#define SHM_PAGES_OLD_MAGIC PAGEMAP_MAGIC
-
-/*
- * These are special files, not exactly images
- */
-#define STATS_MAGIC 0x57093306 /* Ostashkov */
-#define IRMAP_CACHE_MAGIC 0x57004059 /* Ivanovo */
-
-#endif /* __CR_MAGIC_H__ */
diff --git a/include/mem.h b/include/mem.h
deleted file mode 100644
index 5269cad029cf..000000000000
--- a/include/mem.h
+++ /dev/null
@@ -1,27 +0,0 @@
-#ifndef __CR_MEM_H__
-#define __CR_MEM_H__
-
-struct parasite_ctl;
-struct vm_area_list;
-struct page_pipe;
-struct pstree_item;
-
-extern int prepare_mm_pid(struct pstree_item *i);
-extern int do_task_reset_dirty_track(int pid);
-extern unsigned int dump_pages_args_size(struct vm_area_list *vmas);
-extern int parasite_dump_pages_seized(struct parasite_ctl *ctl,
- struct vm_area_list *vma_area_list,
- struct page_pipe **pp);
-
-#define PME_PRESENT (1ULL << 63)
-#define PME_SWAP (1ULL << 62)
-#define PME_FILE (1ULL << 61)
-#define PME_SOFT_DIRTY (1ULL << 55)
-#define PME_PSHIFT_BITS (6)
-#define PME_STATUS_BITS (3)
-#define PME_STATUS_OFFSET (64 - PME_STATUS_BITS)
-#define PME_PSHIFT_OFFSET (PME_STATUS_OFFSET - PME_PSHIFT_BITS)
-#define PME_PFRAME_MASK ((1ULL << PME_PSHIFT_OFFSET) - 1)
-#define PME_PFRAME(x) ((x) & PME_PFRAME_MASK)
-
-#endif /* __CR_MEM_H__ */
diff --git a/include/mman.h b/include/mman.h
deleted file mode 100644
index 340d36927152..000000000000
--- a/include/mman.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#ifndef __CR_MMAN_H__
-#define __CR_MMAN_H__
-
-#ifndef MAP_HUGETLB
-# define MAP_HUGETLB 0x40000
-#endif
-#ifndef MADV_HUGEPAGE
-# define MADV_HUGEPAGE 14
-#endif
-#ifndef MADV_NOHUGEPAGE
-# define MADV_NOHUGEPAGE 15
-#endif
-#ifndef MADV_DONTDUMP
-# define MADV_DONTDUMP 16
-#endif
-
-#endif /* __CR_MMAN_H__ */
diff --git a/include/mount.h b/include/mount.h
deleted file mode 100644
index b3bbdcea53c7..000000000000
--- a/include/mount.h
+++ /dev/null
@@ -1,129 +0,0 @@
-#ifndef __CR_MOUNT_H__
-#define __CR_MOUNT_H__
-
-#include <sys/types.h>
-
-#include "asm/types.h"
-#include "list.h"
-
-struct proc_mountinfo;
-struct pstree_item;
-struct fstype;
-struct ns_id;
-
-/*
- * Structure to keep external mount points resolving info.
- *
- * On dump the key is the mountpoint as seen from the mount
- * namespace, the val is some name that will be put into image
- * instead of the mount point's root path.
- *
- * On restore the key is the name from the image (the one
- * mentioned above) and the val is the path in criu's mount
- * namespace that will become the mount point's root, i.e. --
- * be bind mounted to the respective mountpoint.
- */
-struct ext_mount {
- struct list_head list;
- char *key;
- char *val;
-};
-
-#define MOUNT_INVALID_DEV (0)
-
-struct mount_info {
- int mnt_id;
- int parent_mnt_id;
- unsigned int s_dev;
- unsigned int s_dev_rt;
- char *root;
- /*
- * During dump mountpoint contains path with dot at the
- * beginning. It allows to use openat, statat, etc without
- * creating a temporary copy of the path.
- *
- * On restore mountpoint is prepended with so called ns
- * root path -- it's a place in fs where the namespace
- * mount tree is constructed. Check mnt_roots for details.
- * The ns_mountpoint contains path w/o this prefix.
- */
- char *mountpoint;
- char *ns_mountpoint;
- unsigned flags;
- unsigned sb_flags;
- int master_id;
- int shared_id;
- struct fstype *fstype;
- char *source;
- char *options;
- union {
- bool mounted;
- bool dumped;
- };
- bool need_plugin;
- bool is_ns_root;
- bool deleted;
- struct mount_info *next;
- struct ns_id *nsid;
-
- struct ext_mount *external;
- bool internal_sharing;
-
- /* tree linkage */
- struct mount_info *parent;
- struct mount_info *bind;
- struct list_head children;
- struct list_head siblings;
-
- struct list_head mnt_bind; /* circular list of derivatives of one real mount */
- struct list_head mnt_share; /* circular list of shared mounts */
- struct list_head mnt_slave_list; /* list of slave mounts */
- struct list_head mnt_slave; /* slave list entry */
- struct mount_info *mnt_master; /* slave is on master->mnt_slave_list */
-
- struct list_head postpone;
-
- void *private; /* associated filesystem data */
-};
-
-extern struct mount_info *mntinfo;
-extern struct ns_desc mnt_ns_desc;
-
-extern struct mount_info *mnt_entry_alloc();
-extern void mnt_entry_free(struct mount_info *mi);
-
-extern int __mntns_get_root_fd(pid_t pid);
-extern int mntns_get_root_fd(struct ns_id *ns);
-extern int mntns_get_root_by_mnt_id(int mnt_id);
-extern struct ns_id *lookup_nsid_by_mnt_id(int mnt_id);
-
-extern int open_mount(unsigned int s_dev);
-extern int __open_mountpoint(struct mount_info *pm, int mnt_fd);
-extern struct fstype *find_fstype_by_name(char *fst);
-extern bool add_fsname_auto(const char *names);
-
-extern struct mount_info *collect_mntinfo(struct ns_id *ns, bool for_dump);
-extern int prepare_mnt_ns(void);
-
-extern int pivot_root(const char *new_root, const char *put_old);
-
-extern struct mount_info *lookup_overlayfs(char *rpath, unsigned int s_dev,
- unsigned int st_ino, unsigned int mnt_id);
-extern struct mount_info *lookup_mnt_id(unsigned int id);
-extern struct mount_info *lookup_mnt_sdev(unsigned int s_dev);
-
-extern dev_t phys_stat_resolve_dev(struct ns_id *, dev_t st_dev, const char *path);
-extern bool phys_stat_dev_match(dev_t st_dev, dev_t phys_dev,
- struct ns_id *, const char *path);
-
-extern int restore_task_mnt_ns(struct pstree_item *current);
-extern void fini_restore_mntns(void);
-extern int depopulate_roots_yard(void);
-
-extern int rst_get_mnt_root(int mnt_id, char *path, int plen);
-extern int ext_mount_add(char *key, char *val);
-extern int mntns_maybe_create_roots(void);
-extern int read_mnt_ns_img(void);
-extern void cleanup_mnt_ns(void);
-
-#endif /* __CR_MOUNT_H__ */
diff --git a/include/namespaces.h b/include/namespaces.h
deleted file mode 100644
index 4ce5a3470b98..000000000000
--- a/include/namespaces.h
+++ /dev/null
@@ -1,130 +0,0 @@
-#ifndef __CR_NS_H__
-#define __CR_NS_H__
-
-#include "compiler.h"
-#include "files.h"
-
-/* Nested namespaces are supported only for these types */
-#define CLONE_SUBNS (CLONE_NEWNS)
-
-struct ns_desc {
- unsigned int cflag;
- char *str;
- size_t len;
-};
-
-enum ns_type {
- NS_UNKNOWN = 0,
- NS_CRIU,
- NS_ROOT,
- NS_OTHER,
-};
-
-struct ns_id {
- unsigned int kid;
- unsigned int id;
- pid_t ns_pid;
- struct ns_desc *nd;
- struct ns_id *next;
- enum ns_type type;
-
- /*
- * For mount namespaces on restore -- indicates that
- * the namespace in question is created (all mounts
- * are mounted) and other tasks may do setns on it
- * and proceed.
- */
- futex_t ns_populated;
-
- union {
- struct {
- struct mount_info *mntinfo_list;
- struct mount_info *mntinfo_tree;
- int ns_fd;
- int root_fd;
- } mnt;
-
- struct {
- int nlsk; /* for sockets collection */
- int seqsk; /* to talk to parasite daemons */
- } net;
- };
-};
-extern struct ns_id *ns_ids;
-
-#define NS_DESC_ENTRY(_cflag, _str) \
- { \
- .cflag = _cflag, \
- .str = _str, \
- .len = sizeof(_str) - 1, \
- }
-
-extern bool check_ns_proc(struct fd_link *link);
-
-extern struct ns_desc pid_ns_desc;
-extern struct ns_desc user_ns_desc;
-extern unsigned long root_ns_mask;
-
-extern const struct fdtype_ops nsfile_dump_ops;
-extern struct collect_image_info nsfile_cinfo;
-
-extern int walk_namespaces(struct ns_desc *nd, int (*cb)(struct ns_id *, void *), void *oarg);
-extern int collect_namespaces(bool for_dump);
-extern int collect_mnt_namespaces(bool for_dump);
-extern int dump_mnt_namespaces(void);
-extern int dump_namespaces(struct pstree_item *item, unsigned int ns_flags);
-extern int prepare_namespace_before_tasks(void);
-extern int prepare_namespace(struct pstree_item *item, unsigned long clone_flags);
-extern int try_show_namespaces(int pid);
-
-extern int switch_ns(int pid, struct ns_desc *nd, int *rst);
-extern int restore_ns(int rst, struct ns_desc *nd);
-
-extern int dump_task_ns_ids(struct pstree_item *);
-extern int predump_task_ns_ids(struct pstree_item *);
-extern struct ns_id *rst_new_ns_id(unsigned int id, pid_t pid, struct ns_desc *nd, enum ns_type t);
-extern int rst_add_ns_id(unsigned int id, struct pstree_item *, struct ns_desc *nd);
-extern struct ns_id *lookup_ns_by_id(unsigned int id, struct ns_desc *nd);
-
-extern int collect_user_namespaces(bool for_dump);
-extern int prepare_userns(struct pstree_item *item);
-extern int stop_usernsd(void);
-extern int userns_uid(int uid);
-extern int userns_gid(int gid);
-extern int dump_user_ns(pid_t pid, int ns_id);
-extern void free_userns_maps(void);
-
-typedef int (*uns_call_t)(void *arg, int fd, pid_t pid);
-/*
- * Async call -- The call is guaranteed to be done till the
- * CR_STATE_COMPLETE happens. The function may return even
- * before the call starts.
- * W/o flag the call is synchronous -- this function returns
- * strictly after the call finishes.
- */
-#define UNS_ASYNC 0x1
-/*
- * The call returns an FD which should be sent back. Conflicts
- * with UNS_ASYNC.
- */
-#define UNS_FDOUT 0x2
-
-#define MAX_UNSFD_MSG_SIZE 4096
-
-/*
- * When we're restoring inside user namespace, some things are
- * not allowed to be done there due to insufficient capabilities.
- * If the operation in question can be offloaded to another process,
- * this call allows to do that.
- *
- * In case we're not in userns, just call the callback immediately
- * in the context of calling task.
- */
-extern int __userns_call(const char *func_name, uns_call_t call, int flags,
- void *arg, size_t arg_size, int fd);
-
-#define userns_call(__call, __flags, __arg, __arg_size, __fd) \
- __userns_call(__stringify(__call), __call, __flags, \
- __arg, __arg_size, __fd)
-
-#endif /* __CR_NS_H__ */
diff --git a/include/net.h b/include/net.h
deleted file mode 100644
index 900b1365634e..000000000000
--- a/include/net.h
+++ /dev/null
@@ -1,33 +0,0 @@
-#ifndef __CR_NET_H__
-#define __CR_NET_H__
-
-#include "list.h"
-
-struct cr_imgset;
-extern int dump_net_ns(int ns_id);
-extern int prepare_net_ns(int pid);
-extern int netns_keep_nsfd(void);
-
-struct veth_pair {
- struct list_head node;
- char *inside;
- char *outside;
- char *bridge;
-};
-
-extern int collect_net_namespaces(bool for_dump);
-
-extern int network_lock(void);
-extern void network_unlock(void);
-
-extern struct ns_desc net_ns_desc;
-
-#include "protobuf/netdev.pb-c.h"
-extern int write_netdev_img(NetDeviceEntry *nde, struct cr_imgset *fds);
-extern int read_ns_sys_file(char *path, char *buf, int len);
-extern int restore_link_parms(NetDeviceEntry *nde, int nlsk);
-
-extern int veth_pair_add(char *in, char *out);
-extern int move_veth_to_bridge(void);
-
-#endif /* __CR_NET_H__ */
diff --git a/include/netfilter.h b/include/netfilter.h
deleted file mode 100644
index f3667fc81ea4..000000000000
--- a/include/netfilter.h
+++ /dev/null
@@ -1,11 +0,0 @@
-#ifndef __CR_NETFILTER_H__
-#define __CR_NETFILTER_H__
-
-struct inet_sk_desc;
-extern int nf_lock_connection(struct inet_sk_desc *);
-extern int nf_unlock_connection(struct inet_sk_desc *);
-
-struct inet_sk_info;
-extern int nf_unlock_connection_info(struct inet_sk_info *);
-
-#endif /* __CR_NETFILTER_H__ */
diff --git a/include/netlink_diag.h b/include/netlink_diag.h
deleted file mode 100644
index 14ca403b8b3d..000000000000
--- a/include/netlink_diag.h
+++ /dev/null
@@ -1,42 +0,0 @@
-#ifndef __CR_NETLINK_DIAG_H__
-#define __CR_NETLINK_DIAG_H__
-
-#include <linux/types.h>
-
-struct netlink_diag_req {
- __u8 sdiag_family;
- __u8 sdiag_protocol;
- __u16 pad;
- __u32 ndiag_ino;
- __u32 ndiag_show;
- __u32 ndiag_cookie[2];
-};
-
-struct netlink_diag_msg {
- __u8 ndiag_family;
- __u8 ndiag_type;
- __u8 ndiag_protocol;
- __u8 ndiag_state;
-
- __u32 ndiag_portid;
- __u32 ndiag_dst_portid;
- __u32 ndiag_dst_group;
- __u32 ndiag_ino;
- __u32 ndiag_cookie[2];
-};
-
-enum {
- NETLINK_DIAG_MEMINFO,
- NETLINK_DIAG_GROUPS,
-
- __NETLINK_DIAG_MAX,
-};
-
-#define NETLINK_DIAG_MAX (__NETLINK_DIAG_MAX - 1)
-
-#define NDIAG_PROTO_ALL ((__u8) ~0)
-
-#define NDIAG_SHOW_MEMINFO 0x00000001 /* show memory info of a socket */
-#define NDIAG_SHOW_GROUPS 0x00000002 /* show groups of a netlink socket */
-
-#endif /* __CR_NETLINK_DIAG_H__ */
diff --git a/include/packet_diag.h b/include/packet_diag.h
deleted file mode 100644
index e5d9193a8c42..000000000000
--- a/include/packet_diag.h
+++ /dev/null
@@ -1,76 +0,0 @@
-#ifndef __CR_PACKET_DIAG_H__
-#define __CR_PACKET_DIAG_H__
-
-#include <linux/types.h>
-
-struct packet_diag_req {
- __u8 sdiag_family;
- __u8 sdiag_protocol;
- __u16 pad;
- __u32 pdiag_ino;
- __u32 pdiag_show;
- __u32 pdiag_cookie[2];
-};
-
-#define PACKET_SHOW_INFO 0x00000001 /* Basic packet_sk information */
-#define PACKET_SHOW_MCLIST 0x00000002 /* A set of packet_diag_mclist-s */
-#define PACKET_SHOW_RING_CFG 0x00000004 /* Rings configuration parameters */
-#define PACKET_SHOW_FANOUT 0x00000008
-
-struct packet_diag_msg {
- __u8 pdiag_family;
- __u8 pdiag_type;
- __u16 pdiag_num;
-
- __u32 pdiag_ino;
- __u32 pdiag_cookie[2];
-};
-
-enum {
- PACKET_DIAG_INFO,
- PACKET_DIAG_MCLIST,
- PACKET_DIAG_RX_RING,
- PACKET_DIAG_TX_RING,
- PACKET_DIAG_FANOUT,
-
- PACKET_DIAG_MAX,
-};
-
-struct packet_diag_info {
- __u32 pdi_index;
- __u32 pdi_version;
- __u32 pdi_reserve;
- __u32 pdi_copy_thresh;
- __u32 pdi_tstamp;
- __u32 pdi_flags;
-
-#define PDI_RUNNING 0x1
-#define PDI_AUXDATA 0x2
-#define PDI_ORIGDEV 0x4
-#define PDI_VNETHDR 0x8
-#define PDI_LOSS 0x10
-};
-
-#ifndef MAX_ADDR_LEN
-#define MAX_ADDR_LEN 32
-#endif
-
-struct packet_diag_mclist {
- __u32 pdmc_index;
- __u32 pdmc_count;
- __u16 pdmc_type;
- __u16 pdmc_alen;
- __u8 pdmc_addr[MAX_ADDR_LEN];
-};
-
-struct packet_diag_ring {
- __u32 pdr_block_size;
- __u32 pdr_block_nr;
- __u32 pdr_frame_size;
- __u32 pdr_frame_nr;
- __u32 pdr_retire_tmo;
- __u32 pdr_sizeof_priv;
- __u32 pdr_features;
-};
-
-#endif /* __CR_PACKET_DIAG_H__ */
diff --git a/include/page-pipe.h b/include/page-pipe.h
deleted file mode 100644
index a2dc26852dd8..000000000000
--- a/include/page-pipe.h
+++ /dev/null
@@ -1,107 +0,0 @@
-#ifndef __CR_PAGE_PIPE_H__
-#define __CR_PAGE_PIPE_H__
-
-#include <sys/uio.h>
-#include "list.h"
-
-/*
- * page_pipe is a descriptor of task's virtual memory
- * with pipes, containing pages.
- *
- * A page-pipe may contain holes -- these are pagemap
- * entries without pages. Holes are stored in separate
- * array to optimize paged iovs feed into vmsplice --
- * they will be sent there in one go.
- *
- * A hole is a pagemap entry that doesn't have pages
- * in it, since they are present in previous (parent)
- * snapshot.
- *
- *
- * This page-pipe vs holes vs task vmem vs image layout
- * is described below.
- *
- * Task memory: (+ present, - not present pages)
- * 0 0 0 0 1 1 1
- * 0 3 6 B 1 8 C
- * ---+++-----++++++-------++++----
- *
- * Page-pipe iovs:
- *
- * bufs = 03:3,0B:6,18:4
- * holes = <empty>
- *
- * The pagemap.img would purely contain page-pipe bufs.
- *
- * Pages image will contain pages at
- *
- * 03,04,05,0B,0C,0D,0E,0F,10,18,19,1A,1B
- *
- * stored one by one.
- *
- * Now let's imagine task touches some pages and its mem
- * looks like: (+ present, = old present, - non present)
- *
- * 0 0 0 0 11 11 1
- * 0 3 6 B 12 78 C
- * ---==+-----====+++-----++===----
- *
- * (note new pages at 11 and 17 vaddrs)
- *
- * The new --snapshot'ed page-pipe would look like
- *
- * bufs = 05:1,0F:3,17:2
- * holes = 03:2,0B:4,19:3
- *
- * So the pagemap.img would look like
- *
- * 03:2:P,05:1,0B:4:P,0F:3,17:2,19:3:P
- *
- * (the page_xfer_dump_pages generates one)
- *
- * where P means "in parent", i.e. respective pages should
- * be looked up in the parent pagemap (not pages.img, but
- * the pagemap, and then the offset in previous pages.img
- * should be calculated, see the read_pagemap_page routine).
- *
- * New pages.img file would contain only pages for
- *
- * 05,0F,10,11,17,18
- */
-
-struct page_pipe_buf {
- int p[2]; /* pipe with pages */
- unsigned int pipe_size; /* how many pages can be fit into pipe */
- unsigned int pages_in; /* how many pages are there */
- unsigned int nr_segs; /* how many iov-s are busy */
- struct iovec *iov; /* vaddr:len map */
- struct list_head l; /* links into page_pipe->bufs */
-};
-
-struct page_pipe {
- unsigned int nr_pipes; /* how many page_pipe_bufs in there */
- struct list_head bufs; /* list of bufs */
- struct list_head free_bufs; /* list of free bufs */
- unsigned int nr_iovs; /* number of iovs */
- unsigned int free_iov; /* first free iov */
- struct iovec *iovs; /* iovs. They are provided into create_page_pipe
- and all bufs have their iov-s in there */
-
- unsigned int nr_holes; /* number of holes allocated */
- unsigned int free_hole; /* number of holes in use */
- struct iovec *holes; /* holes */
-
- bool chunk_mode; /* Restrict the maximum buffer size of pipes
- and dump memory for a few iterations */
-};
-
-extern struct page_pipe *create_page_pipe(unsigned int nr,
- struct iovec *, bool chunk_mode);
-extern void destroy_page_pipe(struct page_pipe *p);
-extern int page_pipe_add_page(struct page_pipe *p, unsigned long addr);
-extern int page_pipe_add_hole(struct page_pipe *p, unsigned long addr);
-
-extern void debug_show_page_pipe(struct page_pipe *pp);
-void page_pipe_reinit(struct page_pipe *pp);
-
-#endif /* __CR_PAGE_PIPE_H__ */
diff --git a/include/page-read.h b/include/page-read.h
deleted file mode 100644
index 827e4acd5d47..000000000000
--- a/include/page-read.h
+++ /dev/null
@@ -1,90 +0,0 @@
-#ifndef __CR_PAGE_READ_H__
-#define __CR_PAGE_READ_H__
-
-#include "protobuf/pagemap.pb-c.h"
-
-/*
- * page_read -- engine, that reads pages from image file(s)
- *
- * Several page-read's can be arranged in a chain to read
- * pages from a series of snapshots.
- *
- * A task's address space vs pagemaps+page image pairs can
- * look like this (taken from comment in page-pipe.h):
- *
- * task:
- *
- * 0 0 0 0 1 1 1
- * 0 3 6 B 2 7 C
- * ---+++-----+++++++-----+++++----
- * pm1: ---+++-----++++++-------++++----
- * pm2: ---==+-----====+++-----++===----
- *
- * Here + is present page, - is non present, = is present,
- * but is not modified from last snapshot.
- *
- * Thus pagemap.img and pages.img entries are
- *
- * pm1: 03:3,0B:6,18:4
- * pm2: 03:2:P,05:1,0B:4:P,0F:3,17:2,19:3:P
- *
- * where P means "page is in parent pagemap".
- *
- * pg1: 03,04,05,0B,0C,0D,0E,0F,10,18,19,1A,1B
- * pg2: 05,0F,10,11,17,18
- *
- * When trying to restore from these 4 files we'd have
- * to carefully scan pagemap.img's one by one and read or
- * skip pages from pages.img where appropriate.
- *
- * All this is implemented in read_pagemap_page.
- */
-
-struct page_read {
- /*
- * gets next vaddr:len pair to work on.
- * Pagemap entries should be returned in sorted order.
- */
- int (*get_pagemap)(struct page_read *, struct iovec *iov);
- /* reads page from current pagemap */
- int (*read_pages)(struct page_read *, unsigned long vaddr, int nr, void *);
- /* stop working on current pagemap */
- void (*put_pagemap)(struct page_read *);
- void (*close)(struct page_read *);
-
- /* Private data of reader */
- struct cr_img *pmi;
- struct cr_img *pi;
-
- PagemapEntry *pe; /* current pagemap we are on */
- struct page_read *parent; /* parent pagemap (if ->in_parent
- pagemap is met in image, then
- go to this guy for page, see
- read_pagemap_page */
- unsigned long cvaddr; /* vaddr we are on */
-
- struct iovec bunch; /* record consecutive neighbour
- iovecs to punch together */
- unsigned id; /* for logging */
-};
-
-#define PR_SHMEM 0x1
-#define PR_TASK 0x2
-
-#define PR_TYPE_MASK 0x3
-#define PR_MOD 0x4 /* Will need to modify */
-
-/*
- * -1 -- error
- * 0 -- no images
- * 1 -- opened
- */
-extern int open_page_read(int pid, struct page_read *, int pr_flags);
-extern int open_page_read_at(int dfd, int pid, struct page_read *pr, int pr_flags);
-extern void pagemap2iovec(PagemapEntry *pe, struct iovec *iov);
-extern void iovec2pagemap(struct iovec *iov, PagemapEntry *pe);
-extern int seek_pagemap_page(struct page_read *pr, unsigned long vaddr, bool warn);
-
-extern int dedup_one_iovec(struct page_read *pr, struct iovec *iov);
-extern int punch_hole(struct page_read *pr, unsigned long off, unsigned long len, bool cleanup);
-#endif /* __CR_PAGE_READ_H__ */
diff --git a/include/page-xfer.h b/include/page-xfer.h
deleted file mode 100644
index 8492daaff974..000000000000
--- a/include/page-xfer.h
+++ /dev/null
@@ -1,47 +0,0 @@
-#ifndef __CR_PAGE_XFER__H__
-#define __CR_PAGE_XFER__H__
-#include "page-read.h"
-
-extern int cr_page_server(bool daemon_mode, int cfd);
-
-/*
- * page_xfer -- transfer pages into image file.
- * Two images backends are implemented -- local image file
- * and page-server image file.
- */
-
-struct page_xfer {
- /* transfers one vaddr:len entry */
- int (*write_pagemap)(struct page_xfer *self, struct iovec *iov);
- /* transfers pages related to previous pagemap */
- int (*write_pages)(struct page_xfer *self, int pipe, unsigned long len);
- /* transfers one hole -- vaddr:len entry w/o pages */
- int (*write_hole)(struct page_xfer *self, struct iovec *iov);
- void (*close)(struct page_xfer *self);
-
- /* private data for every page-xfer engine */
- union {
- struct /* local */ {
- struct cr_img *pmi; /* pagemaps */
- struct cr_img *pi; /* pages */
- };
-
- struct /* page-server */ {
- int sk;
- u64 dst_id;
- };
- };
-
- struct page_read *parent;
-};
-
-extern int open_page_xfer(struct page_xfer *xfer, int fd_type, long id);
-struct page_pipe;
-extern int page_xfer_dump_pages(struct page_xfer *, struct page_pipe *,
- unsigned long off);
-extern int connect_to_page_server(void);
-extern int disconnect_from_page_server(void);
-
-extern int check_parent_page_xfer(int fd_type, long id);
-
-#endif /* __CR_PAGE_XFER__H__ */
diff --git a/include/pagemap-cache.h b/include/pagemap-cache.h
deleted file mode 100644
index e0880906d74f..000000000000
--- a/include/pagemap-cache.h
+++ /dev/null
@@ -1,30 +0,0 @@
-#ifndef __CR_PAGEMAP_H__
-#define __CR_PAGEMAP_H__
-
-#include <sys/types.h>
-#include "asm/page.h"
-#include "asm/int.h"
-
-#include "list.h"
-
-struct vma_area;
-
-#define PAGEMAP_PFN_OFF(addr) (PAGE_PFN(addr) * sizeof(u64))
-
-typedef struct {
- pid_t pid; /* which process it belongs */
- unsigned long start; /* start of area */
- unsigned long end; /* end of area */
- const struct list_head *vma_head; /* list head of VMAs we're serving */
- u64 *map; /* local buffer */
- size_t map_len; /* length of a buffer */
- int fd; /* file to read PMs from */
-} pmc_t;
-
-#define PMC_INIT (pmc_t){ }
-
-extern int pmc_init(pmc_t *pmc, pid_t pid, const struct list_head *vma_head, size_t size);
-extern u64 *pmc_get_map(pmc_t *pmc, const struct vma_area *vma);
-extern void pmc_fini(pmc_t *pmc);
-
-#endif /* __CR_PAGEMAP_H__ */
diff --git a/include/parasite-syscall.h b/include/parasite-syscall.h
deleted file mode 100644
index 57612df7478e..000000000000
--- a/include/parasite-syscall.h
+++ /dev/null
@@ -1,139 +0,0 @@
-#ifndef __CR_PARASITE_SYSCALL_H__
-#define __CR_PARASITE_SYSCALL_H__
-
-#include "asm/types.h"
-#include "pid.h"
-#include "list.h"
-#include "config.h"
-
-#define BUILTIN_SYSCALL_SIZE 8
-
-struct parasite_dump_thread;
-struct parasite_dump_misc;
-struct parasite_drain_fd;
-struct vm_area_list;
-struct pstree_item;
-struct _CredsEntry;
-struct _CoreEntry;
-struct list_head;
-struct cr_imgset;
-struct fd_opts;
-struct pid;
-
-struct thread_ctx {
- k_rtsigset_t sigmask;
- user_regs_struct_t regs;
-};
-
-/* parasite control block */
-struct parasite_ctl {
- struct pid pid;
- void *remote_map;
- void *local_map;
- void *sigreturn_addr; /* A place for the breakpoint */
- unsigned long map_length;
-
- /* thread leader data */
- bool daemonized;
-
- struct thread_ctx orig;
-
- void *rstack; /* thread leader stack*/
- struct rt_sigframe *sigframe;
- struct rt_sigframe *rsigframe; /* address in a parasite */
-
- void *r_thread_stack; /* stack for non-leader threads */
-
- unsigned long parasite_ip; /* service routine start ip */
- unsigned long syscall_ip; /* entry point of infection */
-
- unsigned int *addr_cmd; /* addr for command */
- void *addr_args; /* address for arguments */
- unsigned long args_size;
- int tsock; /* transport socket for transferring fds */
-
- struct list_head pre_list;
- struct page_pipe *mem_pp;
-};
-
-extern int parasite_dump_sigacts_seized(struct parasite_ctl *ctl, struct cr_imgset *cr_imgset);
-extern int parasite_dump_itimers_seized(struct parasite_ctl *ctl, struct pstree_item *);
-
-struct proc_posix_timers_stat;
-extern int parasite_dump_posix_timers_seized(struct proc_posix_timers_stat *proc_args,
- struct parasite_ctl *ctl, struct pstree_item *);
-
-#define parasite_args(ctl, type) \
- ({ \
- BUILD_BUG_ON(sizeof(type) > PARASITE_ARG_SIZE_MIN); \
- ctl->addr_args; \
- })
-
-extern void *parasite_args_s(struct parasite_ctl *ctl, int args_size);
-extern int parasite_send_fd(struct parasite_ctl *ctl, int fd);
-
-/*
- * Execute a command in parasite when it's in daemon mode.
- * The __-ed version is asynchronous (doesn't wait for ack).
- */
-extern int parasite_execute_daemon(unsigned int cmd, struct parasite_ctl *ctl);
-extern int __parasite_execute_daemon(unsigned int cmd, struct parasite_ctl *ctl);
-
-extern int __parasite_wait_daemon_ack(unsigned int cmd,
- struct parasite_ctl *ctl);
-
-extern int parasite_dump_misc_seized(struct parasite_ctl *ctl, struct parasite_dump_misc *misc);
-extern int parasite_dump_creds(struct parasite_ctl *ctl, struct _CredsEntry *ce);
-extern int parasite_dump_thread_leader_seized(struct parasite_ctl *ctl, int pid, struct _CoreEntry *core);
-extern int parasite_dump_thread_seized(struct parasite_ctl *ctl, int id,
- struct pid *tid, struct _CoreEntry *core);
-extern int dump_thread_core(int pid, CoreEntry *core, const struct parasite_dump_thread *dt);
-
-extern int parasite_drain_fds_seized(struct parasite_ctl *ctl,
- struct parasite_drain_fd *dfds,
- int *lfds, struct fd_opts *flags);
-extern int parasite_get_proc_fd_seized(struct parasite_ctl *ctl);
-
-extern int parasite_cure_remote(struct parasite_ctl *ctl);
-extern int parasite_cure_local(struct parasite_ctl *ctl);
-extern int parasite_cure_seized(struct parasite_ctl *ctl);
-extern struct parasite_ctl *parasite_infect_seized(pid_t pid,
- struct pstree_item *item,
- struct vm_area_list *vma_area_list);
-extern void parasite_ensure_args_size(unsigned long sz);
-extern struct parasite_ctl *parasite_prep_ctl(pid_t pid,
- struct vm_area_list *vma_area_list);
-extern int parasite_map_exchange(struct parasite_ctl *ctl, unsigned long size);
-
-extern struct parasite_tty_args *parasite_dump_tty(struct parasite_ctl *ctl, int fd, int type);
-
-extern int parasite_init_threads_seized(struct parasite_ctl *ctl, struct pstree_item *item);
-extern int parasite_fini_threads_seized(struct parasite_ctl *ctl);
-
-extern int syscall_seized(struct parasite_ctl *ctl, int nr, unsigned long *ret,
- unsigned long arg1, unsigned long arg2,
- unsigned long arg3, unsigned long arg4,
- unsigned long arg5, unsigned long arg6);
-
-extern int __parasite_execute_syscall(struct parasite_ctl *ctl,
- user_regs_struct_t *regs);
-extern bool arch_can_dump_task(pid_t pid);
-
-/*
- * The PTRACE_SYSCALL will trap task twice -- on
- * enter into and on exit from syscall. If we trace
- * a single task, we may skip half of all getregs
- * calls -- on exit we don't need them.
- */
-enum trace_flags {
- TRACE_ALL,
- TRACE_ENTER,
- TRACE_EXIT,
-};
-
-extern int parasite_stop_daemon(struct parasite_ctl *ctl);
-extern int parasite_stop_on_syscall(int tasks, int sys_nr, enum trace_flags trace);
-extern int parasite_unmap(struct parasite_ctl *ctl, unsigned long addr);
-extern int ptrace_stop_pie(pid_t pid, void *addr, enum trace_flags *tf);
-
-#endif /* __CR_PARASITE_SYSCALL_H__ */
diff --git a/include/parasite-vdso.h b/include/parasite-vdso.h
deleted file mode 100644
index d4dc89b47ade..000000000000
--- a/include/parasite-vdso.h
+++ /dev/null
@@ -1,93 +0,0 @@
-#ifndef __CR_PARASITE_VDSO_H__
-#define __CR_PARASITE_VDSO_H__
-
-#include "config.h"
-
-#ifdef CONFIG_VDSO
-
-#include "util-vdso.h"
-#include "protobuf/vma.pb-c.h"
-
-struct parasite_ctl;
-struct vm_area_list;
-
-/* Check if symbol present in symtable */
-static inline bool vdso_symbol_empty(struct vdso_symbol *s)
-{
- return s->offset == VDSO_BAD_ADDR && s->name[0] == '\0';
-}
-
-/*
- * Special mark which allows to identify runtime vdso where
- * calls from proxy vdso are redirected. This mark usually
- * placed at the start of vdso area where Elf header lives.
- * Since such runtime vdso is solely used by proxy and
- * nobody else is supposed to access it, it's more-less
- * safe to screw the Elf header with @signature and
- * @proxy_addr.
- *
- * The @proxy_addr deserves a few comments. When we redirect
- * the calls from proxy to runtime vdso, on next checkpoint
- * it won't be possible to find which VMA is proxy, thus
- * we save its address in the member.
- */
-struct vdso_mark {
- u64 signature;
- unsigned long proxy_vdso_addr;
-
- unsigned long version;
-
- /*
- * In case of new vDSO format the VVAR area address
- * is needed for easier discovery of where it lives without
- * relying on procfs output.
- */
- unsigned long proxy_vvar_addr;
-};
-
-#define VDSO_MARK_SIGNATURE (0x6f73647675697263ULL) /* Magic number (criuvdso) */
-#define VDSO_MARK_SIGNATURE_V2 (0x4f53447675697263ULL) /* Magic number (criuvDSO) */
-#define VDSO_MARK_CUR_VERSION (2)
-
-static inline void vdso_put_mark(void *where, unsigned long proxy_vdso_addr, unsigned long proxy_vvar_addr)
-{
- struct vdso_mark *m = where;
-
- m->signature = VDSO_MARK_SIGNATURE_V2;
- m->proxy_vdso_addr = proxy_vdso_addr;
- m->version = VDSO_MARK_CUR_VERSION;
- m->proxy_vvar_addr = proxy_vvar_addr;
-}
-
-static inline bool is_vdso_mark(void *addr)
-{
- struct vdso_mark *m = addr;
-
- if (m->signature == VDSO_MARK_SIGNATURE_V2) {
- /*
- * New format
- */
- return true;
- } else if (m->signature == VDSO_MARK_SIGNATURE) {
- /*
- * Old format -- simply extend the mark up
- * to the version we support.
- */
- vdso_put_mark(m, m->proxy_vdso_addr, VVAR_BAD_ADDR);
- return true;
- }
- return false;
-}
-
-extern int vdso_do_park(struct vdso_symtable *sym_rt, unsigned long park_at, unsigned long park_size);
-extern int vdso_fill_symtable(char *mem, size_t size, struct vdso_symtable *t);
-extern int vdso_proxify(char *who, struct vdso_symtable *sym_rt,
- unsigned long vdso_rt_parked_at, size_t index,
- VmaEntry *vmas, size_t nr_vmas);
-
-#else /* CONFIG_VDSO */
-#define vdso_do_park(sym_rt, park_at, park_size) (0)
-
-#endif /* CONFIG_VDSO */
-
-#endif /* __CR_PARASITE_VDSO_H__ */
diff --git a/include/parasite.h b/include/parasite.h
deleted file mode 100644
index 063903b84874..000000000000
--- a/include/parasite.h
+++ /dev/null
@@ -1,253 +0,0 @@
-#ifndef __CR_PARASITE_H__
-#define __CR_PARASITE_H__
-
-#define PARASITE_STACK_SIZE (16 << 10)
-#define PARASITE_ARG_SIZE_MIN ( 1 << 12)
-
-#define PARASITE_MAX_SIZE (64 << 10)
-
-#ifndef __ASSEMBLY__
-
-#include <sys/un.h>
-#include <sys/time.h>
-#include <time.h>
-#include <signal.h>
-
-#include "image.h"
-#include "util-pie.h"
-
-#include "protobuf/vma.pb-c.h"
-#include "protobuf/tty.pb-c.h"
-
-#define __head __used __section(.head.text)
-
-enum {
- PARASITE_CMD_IDLE = 0,
- PARASITE_CMD_ACK,
-
- PARASITE_CMD_INIT_DAEMON,
- PARASITE_CMD_DUMP_THREAD,
- PARASITE_CMD_UNMAP,
-
- /*
- * These two must be greater than INITs.
- */
- PARASITE_CMD_DAEMONIZED,
-
- PARASITE_CMD_FINI,
-
- PARASITE_CMD_MPROTECT_VMAS,
- PARASITE_CMD_DUMPPAGES,
-
- PARASITE_CMD_DUMP_SIGACTS,
- PARASITE_CMD_DUMP_ITIMERS,
- PARASITE_CMD_DUMP_POSIX_TIMERS,
- PARASITE_CMD_DUMP_MISC,
- PARASITE_CMD_DRAIN_FDS,
- PARASITE_CMD_GET_PROC_FD,
- PARASITE_CMD_DUMP_TTY,
- PARASITE_CMD_CHECK_VDSO_MARK,
- PARASITE_CMD_CHECK_AIOS,
-
- PARASITE_CMD_MAX,
-};
-
-struct ctl_msg {
- unsigned int cmd; /* command itself */
- unsigned int ack; /* ack on command */
- int err; /* error code on reply */
-};
-
-#define ctl_msg_cmd(_cmd) \
- (struct ctl_msg){.cmd = _cmd, }
-
-#define ctl_msg_ack(_cmd, _err) \
- (struct ctl_msg){.cmd = _cmd, .ack = _cmd, .err = _err, }
-
-struct parasite_init_args {
- int h_addr_len;
- struct sockaddr_un h_addr;
-
- int log_level;
-
- struct rt_sigframe *sigframe;
-
- void *sigreturn_addr;
-};
-
-struct parasite_unmap_args {
- void *parasite_start;
- unsigned long parasite_len;
-};
-
-struct parasite_vma_entry
-{
- unsigned long start;
- unsigned long len;
- int prot;
-};
-
-struct parasite_vdso_vma_entry {
- unsigned long start;
- unsigned long len;
- unsigned long proxy_vdso_addr;
- unsigned long proxy_vvar_addr;
- int is_marked;
- bool try_fill_symtable;
- bool is_vdso;
-};
-
-struct parasite_dump_pages_args {
- unsigned int nr_vmas;
- unsigned int add_prot;
- unsigned int off;
- unsigned int nr_segs;
- unsigned int nr_pages;
-};
-
-static inline struct parasite_vma_entry *pargs_vmas(struct parasite_dump_pages_args *a)
-{
- return (struct parasite_vma_entry *)(a + 1);
-}
-
-static inline struct iovec *pargs_iovs(struct parasite_dump_pages_args *a)
-{
- return (struct iovec *)(pargs_vmas(a) + a->nr_vmas);
-}
-
-struct parasite_dump_sa_args {
- rt_sigaction_t sas[SIGMAX];
-};
-
-struct parasite_dump_itimers_args {
- struct itimerval real;
- struct itimerval virt;
- struct itimerval prof;
-};
-
-struct posix_timer {
- int it_id;
- struct itimerspec val;
- int overrun;
-};
-
-struct parasite_dump_posix_timers_args {
- int timer_n;
- struct posix_timer timer[0];
-};
-
-struct parasite_aio {
- unsigned long ctx;
- unsigned int max_reqs;
- unsigned int *vma_nr_reqs;
-};
-
-struct parasite_check_aios_args {
- unsigned nr_rings;
- struct parasite_aio ring[0];
-};
-
-static inline int posix_timers_dump_size(int timer_n)
-{
- return sizeof(int) + sizeof(struct posix_timer) * timer_n;
-}
-
-/*
- * Misc stuff, that is too small for separate file, but cannot
- * be read w/o using parasite
- */
-
-struct parasite_dump_misc {
- unsigned long brk;
-
- u32 pid;
- u32 sid;
- u32 pgid;
- u32 umask;
-
- int dumpable;
-};
-
-/*
- * Calculate how long we can make the groups array in parasite_dump_creds
- * and still fit the struct in one page
- */
-#define PARASITE_MAX_GROUPS \
- ((PAGE_SIZE - sizeof(struct parasite_dump_thread) - \
- offsetof(struct parasite_dump_creds, groups)) / sizeof(unsigned int)) /* groups */
-
-struct parasite_dump_creds {
- unsigned int cap_last_cap;
-
- u32 cap_inh[CR_CAP_SIZE];
- u32 cap_prm[CR_CAP_SIZE];
- u32 cap_eff[CR_CAP_SIZE];
- u32 cap_bnd[CR_CAP_SIZE];
-
- int uids[4];
- int gids[4];
- unsigned int secbits;
- unsigned int ngroups;
- /*
- * FIXME -- this structure is passed to parasite code
- * through parasite args area so in parasite_dump_creds()
- * call we check for size of this data fits the size of
- * the area. Unfortunately, we _actually_ use more bytes
- * than the sizeof() -- we put PARASITE_MAX_GROUPS int-s
- * in there, so the size check is not correct.
- *
- * However, all this works simply because we make sure
- * the PARASITE_MAX_GROUPS is so, that the total amount
- * of memory in use doesn't exceed the PAGE_SIZE and the
- * args area is at least one page (PARASITE_ARG_SIZE_MIN).
- */
- unsigned int groups[0];
-};
-
-struct parasite_dump_thread {
- unsigned int *tid_addr;
- pid_t tid;
- tls_t tls;
- stack_t sas;
- int pdeath_sig;
- struct parasite_dump_creds creds[0];
-};
-
-static inline void copy_sas(ThreadSasEntry *dst, const stack_t *src)
-{
- dst->ss_sp = encode_pointer(src->ss_sp);
- dst->ss_size = (u64)src->ss_size;
- dst->ss_flags = src->ss_flags;
-}
-
-#define PARASITE_MAX_FDS (PAGE_SIZE / sizeof(int))
-
-struct parasite_drain_fd {
- int nr_fds;
- int fds[PARASITE_MAX_FDS];
-};
-
-static inline int drain_fds_size(struct parasite_drain_fd *dfds)
-{
- return sizeof(dfds->nr_fds) + dfds->nr_fds * sizeof(dfds->fds[0]);
-}
-
-struct parasite_tty_args {
- int fd;
- int type;
-
- int sid;
- int pgrp;
- bool hangup;
-
- int st_pckt;
- int st_lock;
- int st_excl;
-};
-
-/* the parasite prefix is added by gen_offsets.sh */
-#define parasite_sym(pblob, name) ((void *)(pblob) + parasite_blob_offset__##name)
-
-#endif /* !__ASSEMBLY__ */
-
-#endif /* __CR_PARASITE_H__ */
diff --git a/include/pid.h b/include/pid.h
deleted file mode 100644
index d073944cea7d..000000000000
--- a/include/pid.h
+++ /dev/null
@@ -1,32 +0,0 @@
-#ifndef __CR_PID_H__
-#define __CR_PID_H__
-
-#include "stdbool.h"
-
-struct pid {
- /*
- * The @real pid is used to fetch tasks during dumping stage,
- * This is a global pid seen from the context where the dumping
- * is running.
- */
- pid_t real;
-
- /*
- * The @virt pid is one which used in the image itself and keeps
- * the pid value to be restored. This pid fetched from the
- * dumpee context, because the dumpee might have own pid namespace.
- */
- pid_t virt;
-};
-
-/*
- * When we have to restore a shared resource, we mush select which
- * task should do it, and make other(s) wait for it. In order to
- * avoid deadlocks, always make task with lower pid be the restorer.
- */
-static inline bool pid_rst_prio(unsigned pid_a, unsigned pid_b)
-{
- return pid_a < pid_b;
-}
-
-#endif /* __CR_PID_H__ */
diff --git a/include/pipes.h b/include/pipes.h
deleted file mode 100644
index c8786164c4ec..000000000000
--- a/include/pipes.h
+++ /dev/null
@@ -1,57 +0,0 @@
-#ifndef __CR_PIPES_H__
-#define __CR_PIPES_H__
-
-#include "protobuf/pipe-data.pb-c.h"
-#include "protobuf/pipe.pb-c.h"
-
-extern struct collect_image_info pipe_cinfo;
-extern int collect_pipes(void);
-extern void mark_pipe_master(void);
-extern const struct fdtype_ops pipe_dump_ops;
-
-static inline u32 pipe_id(const struct fd_parms *p)
-{
- return p->stat.st_ino;
-}
-
-#define NR_PIPES_WITH_DATA 1024
-
-struct pipe_data_dump {
- int img_type;
- unsigned int nr;
- u32 ids[NR_PIPES_WITH_DATA];
-};
-
-extern int dump_one_pipe_data(struct pipe_data_dump *pd, int lfd, const struct fd_parms *p);
-
-struct pipe_data_rst {
- PipeDataEntry *pde;
- void *data;
- struct pipe_data_rst *next;
-};
-
-#define PIPE_DATA_HASH_BITS 5
-#define PIPE_DATA_HASH_SIZE (1 << PIPE_DATA_HASH_BITS)
-#define PIPE_DATA_HASH_MASK (PIPE_DATA_HASH_SIZE - 1)
-
-extern int collect_pipe_data(int img_type, struct pipe_data_rst **hash);
-extern int restore_pipe_data(int img_type, int pfd, u32 id, struct pipe_data_rst **hash);
-
-/*
- * The sequence of objects which should be restored:
- * pipe -> files struct-s -> fd-s.
- * pipe_entry describes pipe's file structs-s.
- * A pipe doesn't have own properties, so it has no object.
- */
-
-struct pipe_info {
- PipeEntry *pe;
- struct list_head pipe_list; /* All pipe_info with the same pipe_id
- * This is pure circular list without head */
- struct list_head list; /* list head for fdinfo_list_entry-s */
- struct file_desc d;
- unsigned int create : 1,
- reopen : 1;
-};
-
-#endif /* __CR_PIPES_H__ */
diff --git a/include/plugin.h b/include/plugin.h
deleted file mode 100644
index 2855836206d9..000000000000
--- a/include/plugin.h
+++ /dev/null
@@ -1,46 +0,0 @@
-#ifndef __CR_PLUGIN_H__
-#define __CR_PLUGIN_H__
-
-#include "criu-plugin.h"
-#include "compiler.h"
-#include "list.h"
-
-#define CR_PLUGIN_DEFAULT "/var/lib/criu/"
-
-void cr_plugin_fini(int stage, int err);
-int cr_plugin_init(int stage);
-
-typedef struct {
- struct list_head head;
- struct list_head hook_chain[CR_PLUGIN_HOOK__MAX];
-} cr_plugin_ctl_t;
-
-extern cr_plugin_ctl_t cr_plugin_ctl;
-
-typedef struct {
- cr_plugin_desc_t *d;
- struct list_head list;
- void *dlhandle;
- struct list_head link[CR_PLUGIN_HOOK__MAX];
-} plugin_desc_t;
-
-#define run_plugins(__hook, ...) \
-({ \
- plugin_desc_t *this; \
- int __ret = -ENOTSUP; \
- \
- list_for_each_entry(this, &cr_plugin_ctl.hook_chain[CR_PLUGIN_HOOK__ ##__hook], \
- link[CR_PLUGIN_HOOK__ ##__hook]) { \
- pr_debug("plugin: `%s' hook %u -> %p\n", \
- this->d->name, CR_PLUGIN_HOOK__ ##__hook, \
- this->d->hooks[CR_PLUGIN_HOOK__ ##__hook]); \
- __ret = ((CR_PLUGIN_HOOK__ ##__hook ##_t *) \
- this->d->hooks[CR_PLUGIN_HOOK__ ##__hook])(__VA_ARGS__); \
- if (__ret == -ENOTSUP) \
- continue; \
- break; \
- } \
- __ret; \
-})
-
-#endif
diff --git a/include/posix-timer.h b/include/posix-timer.h
deleted file mode 100644
index 568bf4a27e9d..000000000000
--- a/include/posix-timer.h
+++ /dev/null
@@ -1,27 +0,0 @@
-#ifndef __CR_PROC_POSIX_TIMER_H__
-#define __CR_PROC_POSIX_TIMER_H__
-
-#include "list.h"
-
-struct str_posix_timer {
- long it_id;
- int clock_id;
- int si_signo;
- int it_sigev_notify;
- void * sival_ptr;
-};
-
-struct proc_posix_timer {
- struct list_head list;
- struct str_posix_timer spt;
-};
-
-struct proc_posix_timers_stat {
- int timer_n;
- struct list_head timers;
-};
-
-extern int parse_posix_timers(pid_t pid, struct proc_posix_timers_stat * args);
-void free_posix_timers(struct proc_posix_timers_stat *st);
-
-#endif /* __CR_PROC_POSIX_TIMER_H__ */
diff --git a/include/prctl.h b/include/prctl.h
deleted file mode 100644
index b48d95286277..000000000000
--- a/include/prctl.h
+++ /dev/null
@@ -1,77 +0,0 @@
-#ifndef __CR_PRCTL_H__
-#define __CR_PRCTL_H__
-
-#include "asm/int.h"
-
-#ifndef PR_SET_NAME
-# define PR_SET_NAME 15
-#endif
-#ifndef PR_GET_NAME
-# define PR_GET_NAME 16
-#endif
-#ifndef PR_SET_SECCOMP
-# define PR_SET_SECCOMP 22
-#endif
-#ifndef PR_CAPBSET_READ
-# define PR_CAPBSET_READ 23
-#endif
-#ifndef PR_CAPBSET_DROP
-# define PR_CAPBSET_DROP 24
-#endif
-#ifndef PR_GET_SECUREBITS
-# define PR_GET_SECUREBITS 27
-#endif
-#ifndef PR_SET_SECUREBITS
-# define PR_SET_SECUREBITS 28
-#endif
-#ifndef PR_GET_DUMPABLE
-# define PR_GET_DUMPABLE 3
-#endif
-#ifndef PR_SET_DUMPABLE
-# define PR_SET_DUMPABLE 4
-#endif
-
-#ifndef PR_SET_MM
-#define PR_SET_MM 35
-# define PR_SET_MM_START_CODE 1
-# define PR_SET_MM_END_CODE 2
-# define PR_SET_MM_START_DATA 3
-# define PR_SET_MM_END_DATA 4
-# define PR_SET_MM_START_STACK 5
-# define PR_SET_MM_START_BRK 6
-# define PR_SET_MM_BRK 7
-# define PR_SET_MM_ARG_START 8
-# define PR_SET_MM_ARG_END 9
-# define PR_SET_MM_ENV_START 10
-# define PR_SET_MM_ENV_END 11
-# define PR_SET_MM_AUXV 12
-# define PR_SET_MM_EXE_FILE 13
-#endif
-
-#ifndef PR_SET_MM_MAP
-# define PR_SET_MM_MAP 14
-# define PR_SET_MM_MAP_SIZE 15
-
-struct prctl_mm_map {
- u64 start_code;
- u64 end_code;
- u64 start_data;
- u64 end_data;
- u64 start_brk;
- u64 brk;
- u64 start_stack;
- u64 arg_start;
- u64 arg_end;
- u64 env_start;
- u64 env_end;
- u64 *auxv;
- u32 auxv_size;
- u32 exe_fd;
-};
-#endif
-
-#ifndef PR_GET_TID_ADDRESS
-# define PR_GET_TID_ADDRESS 40
-#endif
-
-#endif /* __CR_PRCTL_H__ */
diff --git a/include/proc_parse.h b/include/proc_parse.h
deleted file mode 100644
index 33cd07712c89..000000000000
--- a/include/proc_parse.h
+++ /dev/null
@@ -1,217 +0,0 @@
-#ifndef __CR_PROC_PARSE_H__
-#define __CR_PROC_PARSE_H__
-
-#include <sys/types.h>
-#include "asm/types.h"
-#include "image.h"
-#include "list.h"
-#include "cgroup.h"
-#include "mount.h"
-
-#include "protobuf/eventfd.pb-c.h"
-#include "protobuf/eventpoll.pb-c.h"
-#include "protobuf/signalfd.pb-c.h"
-#include "protobuf/fsnotify.pb-c.h"
-#include "protobuf/timerfd.pb-c.h"
-#include "protobuf/seccomp.pb-c.h"
-
-#define PROC_TASK_COMM_LEN 32
-#define PROC_TASK_COMM_LEN_FMT "(%31s"
-
-struct proc_pid_stat {
- int pid;
- char comm[PROC_TASK_COMM_LEN];
- char state;
- int ppid;
- int pgid;
- int sid;
- int tty_nr;
- int tty_pgrp;
- unsigned int flags;
- unsigned long min_flt;
- unsigned long cmin_flt;
- unsigned long maj_flt;
- unsigned long cmaj_flt;
- unsigned long utime;
- unsigned long stime;
- long cutime;
- long cstime;
- long priority;
- long nice;
- int num_threads;
- int zero0;
- unsigned long long start_time;
- unsigned long vsize;
- long mm_rss;
- unsigned long rsslim;
- unsigned long start_code;
- unsigned long end_code;
- unsigned long start_stack;
- unsigned long esp;
- unsigned long eip;
- unsigned long sig_pending;
- unsigned long sig_blocked;
- unsigned long sig_ignored;
- unsigned long sig_handled;
- unsigned long wchan;
- unsigned long zero1;
- unsigned long zero2;
- int exit_signal;
- int task_cpu;
- unsigned int rt_priority;
- unsigned int policy;
- unsigned long long delayacct_blkio_ticks;
- unsigned long gtime;
- long cgtime;
- unsigned long start_data;
- unsigned long end_data;
- unsigned long start_brk;
- unsigned long arg_start;
- unsigned long arg_end;
- unsigned long env_start;
- unsigned long env_end;
- int exit_code;
-};
-
-struct seccomp_info {
- SeccompFilter filter;
- int id;
- struct seccomp_info *prev;
-};
-
-#define PROC_CAP_SIZE 2
-
-struct proc_status_creds {
- unsigned int uids[4];
- unsigned int gids[4];
-
- char state;
- int ppid;
- unsigned long long sigpnd;
- unsigned long long shdpnd;
-
- int seccomp_mode;
- u32 last_filter;
-
- /*
- * Keep them at the end of structure
- * for fast comparision reason.
- */
- u32 cap_inh[PROC_CAP_SIZE];
- u32 cap_prm[PROC_CAP_SIZE];
- u32 cap_eff[PROC_CAP_SIZE];
- u32 cap_bnd[PROC_CAP_SIZE];
-};
-
-bool proc_status_creds_dumpable(struct proc_status_creds *parent,
- struct proc_status_creds *child);
-
-typedef int (*mount_fn_t)(struct mount_info *mi, const char *src, const
- char *fstype, unsigned long mountflags);
-
-struct fstype {
- char *name;
- int code;
- int (*dump)(struct mount_info *pm);
- int (*restore)(struct mount_info *pm);
- int (*parse)(struct mount_info *pm);
- mount_fn_t mount;
-};
-
-struct vm_area_list;
-
-#define INVALID_UID ((uid_t)-1)
-
-extern bool add_skip_mount(const char *mountpoint);
-extern struct mount_info *parse_mountinfo(pid_t pid, struct ns_id *nsid, bool for_dump);
-extern int parse_pid_stat(pid_t pid, struct proc_pid_stat *s);
-extern unsigned int parse_pid_loginuid(pid_t pid, int *err, bool ignore_noent);
-extern int parse_pid_oom_score_adj(pid_t pid, int *err);
-extern int prepare_loginuid(unsigned int value, unsigned int loglevel);
-extern int parse_smaps(pid_t pid, struct vm_area_list *vma_area_list);
-extern int parse_self_maps_lite(struct vm_area_list *vms);
-extern int parse_pid_status(pid_t pid, struct proc_status_creds *);
-
-struct inotify_wd_entry {
- InotifyWdEntry e;
- FhEntry f_handle;
- struct list_head node;
-};
-
-struct fanotify_mark_entry {
- FanotifyMarkEntry e;
- FhEntry f_handle;
- struct list_head node;
- union {
- FanotifyInodeMarkEntry ie;
- FanotifyMountMarkEntry me;
- };
-};
-
-struct eventpoll_tfd_entry {
- EventpollTfdEntry e;
- struct list_head node;
-};
-
-union fdinfo_entries {
- EventfdFileEntry efd;
- SignalfdEntry sfd;
- struct inotify_wd_entry ify;
- struct fanotify_mark_entry ffy;
- struct eventpoll_tfd_entry epl;
- TimerfdEntry tfy;
-};
-
-extern void free_inotify_wd_entry(union fdinfo_entries *e);
-extern void free_fanotify_mark_entry(union fdinfo_entries *e);
-extern void free_event_poll_entry(union fdinfo_entries *e);
-
-struct fdinfo_common {
- off64_t pos;
- int flags;
- int mnt_id;
- int owner;
-};
-
-extern int parse_fdinfo(int fd, int type,
- int (*cb)(union fdinfo_entries *e, void *arg), void *arg);
-extern int parse_fdinfo_pid(int pid, int fd, int type,
- int (*cb)(union fdinfo_entries *e, void *arg), void *arg);
-extern int parse_file_locks(void);
-extern int get_fd_mntid(int fd, int *mnt_id);
-
-struct pid;
-extern int parse_threads(int pid, struct pid **_t, int *_n);
-
-extern int check_mnt_id(void);
-
-/*
- * This struct describes a group controlled by one controller.
- * The @name is the controller name or 'name=...' for named cgroups.
- * The @path is the path from the hierarchy root.
- */
-
-struct cg_ctl {
- struct list_head l;
- char *name;
- char *path;
-};
-
-/*
- * Returns the list of cg_ctl-s sorted by name
- */
-
-extern int parse_task_cgroup(int pid, struct list_head *l, unsigned int *n);
-extern void put_ctls(struct list_head *);
-
-int collect_controllers(struct list_head *cgroups, unsigned int *n_cgroups);
-
-/* callback for AUFS support */
-extern int aufs_parse(struct mount_info *mi);
-
-/* callback for OverlayFS support */
-extern int overlayfs_parse(struct mount_info *mi);
-
-int parse_children(pid_t pid, pid_t **_c, int *_n);
-
-#endif /* __CR_PROC_PARSE_H__ */
diff --git a/include/protobuf-desc.h b/include/protobuf-desc.h
deleted file mode 100644
index bb66a868958d..000000000000
--- a/include/protobuf-desc.h
+++ /dev/null
@@ -1,91 +0,0 @@
-#ifndef __CR_PROTOBUF_DESC_H__
-#define __CR_PROTOBUF_DESC_H__
-
-#include <sys/types.h>
-#include <google/protobuf-c/protobuf-c.h>
-
-enum {
- /* PB_AUTOGEN_START */
- PB_INVENTORY, /* 0 */
- PB_STATS,
- PB_FDINFO,
- PB_CORE,
- PB_MM,
- PB_VMA,
- PB_ITIMER,
- PB_POSIX_TIMER,
- PB_CREDS,
- PB_FS,
- PB_UTSNS, /* 10 */
- PB_IPC_VAR,
- PB_IPC_SHM,
- PB_IPC_SEM,
- PB_MNT,
- PB_PSTREE,
- PB_GHOST_FILE,
- PB_TCP_STREAM,
- PB_REG_FILE,
- PB_EXT_FILE,
- PB_NS_FILE, /* 20 */
- PB_INET_SK,
- PB_UNIX_SK,
- PB_PACKET_SOCK,
- PB_NETLINK_SK,
- PB_PIPE,
- PB_FIFO,
- PB_PIPE_DATA,
- PB_EVENTFD_FILE,
- PB_EVENTPOLL_FILE,
- PB_EVENTPOLL_TFD, /* 30 */
- PB_SIGNALFD,
- PB_INOTIFY_FILE,
- PB_INOTIFY_WD,
- PB_FANOTIFY_FILE,
- PB_FANOTIFY_MARK,
- PB_TTY_FILE,
- PB_TTY_INFO,
- PB_FILE_LOCK,
- PB_RLIMIT,
- PB_PAGEMAP, /* 40 */
- PB_SIGINFO,
- PB_TUNFILE,
- PB_IRMAP_CACHE,
- PB_CGROUP,
- PB_SECCOMP,
- PB_TIMERFD,
- PB_CPUINFO,
- PB_USERNS,
- PB_NETNS,
- PB_BINFMT_MISC, /* 50 */
-
- /* PB_AUTOGEN_STOP */
-
- PB_PAGEMAP_HEAD,
- PB_IDS,
- PB_SIGACT,
- PB_NETDEV,
- PB_REMAP_FPATH,
- PB_SK_QUEUES,
- PB_IPCNS_MSG,
- PB_IPCNS_MSG_ENT,
-
- PB_MAX,
-};
-
-typedef size_t (*pb_getpksize_t)(void *obj);
-typedef size_t (*pb_pack_t)(void *obj, void *where);
-typedef void *(*pb_unpack_t)(void *allocator, size_t size, void *from);
-typedef void (*pb_free_t)(void *obj, void *allocator);
-
-struct cr_pb_message_desc {
- pb_getpksize_t getpksize;
- pb_pack_t pack;
- pb_unpack_t unpack;
- pb_free_t free;
- const ProtobufCMessageDescriptor *pb_desc;
-};
-
-extern void cr_pb_init(void);
-extern struct cr_pb_message_desc cr_pb_descs[PB_MAX];
-
-#endif /* __CR_PROTOBUF_DESC_H__ */
diff --git a/include/protobuf.h b/include/protobuf.h
deleted file mode 100644
index 3d76b13eda32..000000000000
--- a/include/protobuf.h
+++ /dev/null
@@ -1,57 +0,0 @@
-#ifndef __CR_PROTOBUF_H__
-#define __CR_PROTOBUF_H__
-
-#include "protobuf-desc.h"
-
-#include "asm/types.h"
-#include "compiler.h"
-#include "util.h"
-
-struct cr_img;
-
-extern int do_pb_read_one(struct cr_img *, void **objp, int type, bool eof);
-
-#define pb_read_one(fd, objp, type) do_pb_read_one(fd, (void **)objp, type, false)
-#define pb_read_one_eof(fd, objp, type) do_pb_read_one(fd, (void **)objp, type, true)
-
-extern int pb_write_one(struct cr_img *, void *obj, int type);
-
-#define pb_pksize(__obj, __proto_message_name) \
- (__proto_message_name ##__get_packed_size(__obj) + sizeof(u32))
-
-#define pb_repeated_size(__obj, __member) \
- ((size_t)(sizeof(*(__obj)->__member) * (__obj)->n_ ##__member))
-
-#define pb_msg(__base, __type) \
- container_of(__base, __type, base)
-
-#include <google/protobuf-c/protobuf-c.h>
-
-extern void do_pb_show_plain(struct cr_img *, int type, int single_entry,
- void (*payload_hadler)(struct cr_img *, void *obj),
- const char *pretty_fmt);
-
-/* Don't have objects at hands to also do typechecking here */
-#define pb_show_plain_payload_pretty(__fd, __type, payload_hadler, pretty) \
- do_pb_show_plain(__fd, __type, 0, payload_hadler, pretty)
-
-#define pb_show_plain_payload(__fd, __proto_message_name, payload_hadler) \
- pb_show_plain_payload_pretty(__fd, __proto_message_name, payload_hadler, NULL)
-
-#define pb_show_plain_pretty(__fd, __proto_message_name, __pretty) \
- pb_show_plain_payload_pretty(__fd, __proto_message_name, NULL, __pretty)
-
-struct collect_image_info {
- int fd_type;
- int pb_type;
- unsigned int priv_size;
- int (*collect)(void *, ProtobufCMessage *);
- unsigned flags;
-};
-
-#define COLLECT_SHARED 0x1 /* use shared memory for obj-s */
-#define COLLECT_HAPPENED 0x4 /* image was opened and collected */
-
-extern int collect_image(struct collect_image_info *);
-
-#endif /* __CR_PROTOBUF_H__ */
diff --git a/include/pstree.h b/include/pstree.h
deleted file mode 100644
index 47ce676a9eba..000000000000
--- a/include/pstree.h
+++ /dev/null
@@ -1,102 +0,0 @@
-#ifndef __CR_PSTREE_H__
-#define __CR_PSTREE_H__
-
-#include "list.h"
-#include "pid.h"
-#include "image.h"
-#include "rst_info.h"
-#include "protobuf/core.pb-c.h"
-
-/*
- * That's the init process which usually inherit
- * all orphaned children in the system.
- */
-#define INIT_PID (1)
-struct pstree_item {
- struct pstree_item *parent;
- struct list_head children; /* list of my children */
- struct list_head sibling; /* linkage in my parent's children list */
-
- struct pid pid;
- pid_t pgid;
- pid_t sid;
- pid_t born_sid;
-
- int state; /* TASK_XXX constants */
-
- int nr_threads; /* number of threads */
- struct pid *threads; /* array of threads */
- CoreEntry **core;
- TaskKobjIdsEntry *ids;
-};
-
-/* See alloc_pstree_item() for details */
-static inline struct rst_info *rsti(struct pstree_item *i)
-{
- return (struct rst_info *)(i + 1);
-}
-
-struct ns_id;
-struct dmp_info {
- struct ns_id *netns;
- /*
- * We keep the creds here so that we can compare creds while seizing
- * threads. Dumping tasks with different creds is not supported.
- */
- struct proc_status_creds *pi_creds;
-};
-
-static inline struct dmp_info *dmpi(struct pstree_item *i)
-{
- return (struct dmp_info *)(i + 1);
-}
-
-/* ids is alocated and initialized for all alive tasks */
-static inline int shared_fdtable(struct pstree_item *item)
-{
- return (item->parent &&
- item->ids->files_id == item->parent->ids->files_id);
-}
-
-static inline bool task_alive(struct pstree_item *i)
-{
- return (i->state == TASK_ALIVE) || (i->state == TASK_STOPPED);
-}
-
-extern void free_pstree(struct pstree_item *root_item);
-extern struct pstree_item *__alloc_pstree_item(bool rst);
-#define alloc_pstree_item() __alloc_pstree_item(false)
-#define alloc_pstree_item_with_rst() __alloc_pstree_item(true)
-extern struct pstree_item *alloc_pstree_helper(void);
-
-extern struct pstree_item *root_item;
-extern struct pstree_item *pstree_item_next(struct pstree_item *item);
-#define for_each_pstree_item(pi) \
- for (pi = root_item; pi != NULL; pi = pstree_item_next(pi))
-
-extern bool restore_before_setsid(struct pstree_item *child);
-extern int prepare_pstree(void);
-
-extern int dump_pstree(struct pstree_item *root_item);
-
-struct pstree_item *pstree_item_by_real(pid_t virt);
-struct pstree_item *pstree_item_by_virt(pid_t virt);
-
-extern int pid_to_virt(pid_t pid);
-extern bool pid_in_pstree(pid_t pid);
-
-struct task_entries;
-extern struct task_entries *task_entries;
-
-extern int get_task_ids(struct pstree_item *);
-extern struct _TaskKobjIdsEntry *root_ids;
-
-extern void core_entry_free(CoreEntry *core);
-extern CoreEntry *core_entry_alloc(int alloc_thread_info, int alloc_tc);
-extern int pstree_alloc_cores(struct pstree_item *item);
-extern void pstree_free_cores(struct pstree_item *item);
-
-extern int collect_pstree_ids(void);
-
-extern int preorder_pstree_traversal(struct pstree_item *item, int (*f)(struct pstree_item *));
-#endif /* __CR_PSTREE_H__ */
diff --git a/include/ptrace.h b/include/ptrace.h
deleted file mode 100644
index 047b1e2ab008..000000000000
--- a/include/ptrace.h
+++ /dev/null
@@ -1,84 +0,0 @@
-#ifndef __CR_PTRACE_H__
-#define __CR_PTRACE_H__
-
-#include <linux/types.h>
-#include <sys/ptrace.h>
-
-#include "config.h"
-#include "proc_parse.h"
-
-/* some constants for ptrace */
-#ifndef PTRACE_SEIZE
-# define PTRACE_SEIZE 0x4206
-#endif
-
-#ifndef PTRACE_O_SUSPEND_SECCOMP
-# define PTRACE_O_SUSPEND_SECCOMP (1 << 21)
-#endif
-
-#ifndef PTRACE_INTERRUPT
-# define PTRACE_INTERRUPT 0x4207
-#endif
-
-#ifndef PTRACE_LISTEN
-#define PTRACE_LISTEN 0x4208
-#endif
-
-#ifndef PTRACE_PEEKSIGINFO
-#define PTRACE_PEEKSIGINFO 0x4209
-
-/* Read signals from a shared (process wide) queue */
-#define PTRACE_PEEKSIGINFO_SHARED (1 << 0)
-#endif
-
-#ifndef CONFIG_HAS_PEEKSIGINFO_ARGS
-struct ptrace_peeksiginfo_args {
- __u64 off; /* from which siginfo to start */
- __u32 flags;
- __u32 nr; /* how may siginfos to take */
-};
-#endif
-
-#ifndef PTRACE_GETREGSET
-# define PTRACE_GETREGSET 0x4204
-# define PTRACE_SETREGSET 0x4205
-#endif
-
-#define PTRACE_GETSIGMASK 0x420a
-#define PTRACE_SETSIGMASK 0x420b
-
-#ifndef PTRACE_SECCOMP_GET_FILTER
-#define PTRACE_SECCOMP_GET_FILTER 0x420c
-#endif
-
-#define PTRACE_SEIZE_DEVEL 0x80000000
-
-#define PTRACE_EVENT_FORK 1
-#define PTRACE_EVENT_VFORK 2
-#define PTRACE_EVENT_CLONE 3
-#define PTRACE_EVENT_EXEC 4
-#define PTRACE_EVENT_VFORK_DONE 5
-#define PTRACE_EVENT_EXIT 6
-#define PTRACE_EVENT_STOP 128
-
-#define PTRACE_O_TRACESYSGOOD 0x00000001
-#define PTRACE_O_TRACEFORK 0x00000002
-#define PTRACE_O_TRACEVFORK 0x00000004
-#define PTRACE_O_TRACECLONE 0x00000008
-#define PTRACE_O_TRACEEXEC 0x00000010
-#define PTRACE_O_TRACEVFORKDONE 0x00000020
-#define PTRACE_O_TRACEEXIT 0x00000040
-
-#define SI_EVENT(_si_code) (((_si_code) & 0xFFFF) >> 8)
-
-extern int processes_to_wait;
-
-extern int seize_catch_task(pid_t pid);
-extern int seize_wait_task(pid_t pid, pid_t ppid, struct proc_status_creds **creds);
-extern int suspend_seccomp(pid_t pid);
-extern int unseize_task(pid_t pid, int orig_state, int state);
-extern int ptrace_peek_area(pid_t pid, void *dst, void *addr, long bytes);
-extern int ptrace_poke_area(pid_t pid, void *src, void *addr, long bytes);
-extern int ptrace_swap_area(pid_t pid, void *dst, void *src, long bytes);
-
-#endif /* __CR_PTRACE_H__ */
diff --git a/include/rbtree.h b/include/rbtree.h
deleted file mode 100644
index f6082103298f..000000000000
--- a/include/rbtree.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * RBtree implementation adopted from the Linux kernel sources.
- */
-
-#ifndef __CR_RBTREE_H__
-#define __CR_RBTREE_H__
-
-#include <stddef.h>
-
-#include "compiler.h"
-#include "asm/types.h"
-
-#define RB_RED 0
-#define RB_BLACK 1
-#define RB_MASK 3
-
-struct rb_node {
- unsigned long rb_parent_color; /* Keeps both parent anc color */
- struct rb_node *rb_right;
- struct rb_node *rb_left;
-} __aligned(sizeof(long));
-
-struct rb_root {
- struct rb_node *rb_node;
-};
-
-#define rb_parent(r) ((struct rb_node *)((r)->rb_parent_color & ~RB_MASK))
-#define rb_color(r) ((r)->rb_parent_color & RB_BLACK)
-#define rb_is_red(r) (!rb_color(r))
-#define rb_is_black(r) (rb_color(r))
-#define rb_set_red(r) do { (r)->rb_parent_color &= ~RB_BLACK; } while (0)
-#define rb_set_black(r) do { (r)->rb_parent_color |= RB_BLACK; } while (0)
-
-static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p)
-{
- rb->rb_parent_color = (rb->rb_parent_color & RB_MASK) | (unsigned long)p;
-}
-
-static inline void rb_set_color(struct rb_node *rb, int color)
-{
- rb->rb_parent_color = (rb->rb_parent_color & ~RB_BLACK) | color;
-}
-
-#define RB_ROOT (struct rb_root){ NULL, }
-#define rb_entry(ptr, type, member) container_of(ptr, type, member)
-
-#define RB_EMPTY_ROOT(root) ((root)->rb_node == NULL)
-#define RB_EMPTY_NODE(node) (rb_parent(node) == node)
-#define RB_CLEAR_NODE(node) (rb_set_parent(node, node))
-
-static inline void rb_init_node(struct rb_node *node)
-{
- *node = (struct rb_node){ };
-
- RB_CLEAR_NODE(node);
-}
-
-extern void rb_insert_color(struct rb_node *node, struct rb_root *root);
-extern void rb_erase(struct rb_node *node, struct rb_root *root);
-
-/* Find logical next and previous nodes in a tree */
-extern struct rb_node *rb_first(const struct rb_root *root);
-extern struct rb_node *rb_last(const struct rb_root *root);
-extern struct rb_node *rb_next(const struct rb_node *node);
-extern struct rb_node *rb_prev(const struct rb_node *node);
-
-/* Fast replacement of a single node without remove/rebalance/add/rebalance */
-extern void rb_replace_node(struct rb_node *victim, struct rb_node *new,
- struct rb_root *root);
-
-static inline void rb_link_node(struct rb_node *node, struct rb_node *parent,
- struct rb_node **rb_link)
-{
- node->rb_parent_color = (unsigned long)parent;
- node->rb_left = node->rb_right = NULL;
-
- *rb_link = node;
-}
-
-static inline void rb_link_and_balance(struct rb_root *root,
- struct rb_node *node,
- struct rb_node *parent,
- struct rb_node **rb_link)
-{
- rb_link_node(node, parent, rb_link);
- rb_insert_color(node, root);
-}
-
-#endif /* __CR_RBTREE_H__ */
diff --git a/include/restorer.h b/include/restorer.h
deleted file mode 100644
index 4c4377cdaf67..000000000000
--- a/include/restorer.h
+++ /dev/null
@@ -1,241 +0,0 @@
-#ifndef __CR_RESTORER_H__
-#define __CR_RESTORER_H__
-
-#include <signal.h>
-#include <limits.h>
-#include <sys/resource.h>
-
-#include "compiler.h"
-#include "asm/types.h"
-#include "asm/fpu.h"
-#include "image.h"
-#include "lock.h"
-#include "util.h"
-#include "asm/restorer.h"
-#include "rst_info.h"
-#include "config.h"
-
-#include "posix-timer.h"
-#include "timerfd.h"
-#include "shmem.h"
-#include "sigframe.h"
-#include "parasite-vdso.h"
-
-#include <time.h>
-
-#include "protobuf/mm.pb-c.h"
-#include "protobuf/vma.pb-c.h"
-#include "protobuf/creds.pb-c.h"
-#include "protobuf/core.pb-c.h"
-
-struct task_restore_core_args;
-struct thread_restore_args;
-
-typedef long (*task_restore_fcall_t) (struct task_restore_core_args *args);
-typedef long (*thread_restore_fcall_t) (struct thread_restore_args *args);
-
-#define RESTORE_CMD__NONE 0
-#define RESTORE_CMD__GET_SELF_LEN 1
-#define RESTORE_CMD__RESTORE_CORE 2
-#define RESTORE_CMD__RESTORE_THREAD 3
-
-/*
- * These *must* be power of two values.
- */
-#define RESTORE_ARGS_SIZE (512)
-#define RESTORE_STACK_REDZONE (128)
-#define RESTORE_STACK_SIZE (KILO(32))
-
-struct restore_mem_zone {
- u8 redzone[RESTORE_STACK_REDZONE];
- u8 stack[RESTORE_STACK_SIZE];
- u8 rt_sigframe[RESTORE_STACK_SIGFRAME];
-} __stack_aligned__;
-
-struct rst_sched_param {
- int policy;
- int nice;
- int prio;
-};
-
-struct restore_posix_timer {
- struct str_posix_timer spt;
- struct itimerspec val;
- int overrun;
-};
-
-struct task_restore_core_args;
-
-/*
- * We should be able to construct fpu sigframe in sigreturn_prep_fpu_frame,
- * so the mem_zone.rt_sigframe should be 64-bytes aligned. To make things
- * simpler, force both _args alignment be 64 bytes.
- */
-
-struct thread_creds_args {
- CredsEntry creds;
-
- unsigned int cap_last_cap;
-
- u32 cap_inh[CR_CAP_SIZE];
- u32 cap_prm[CR_CAP_SIZE];
- u32 cap_eff[CR_CAP_SIZE];
- u32 cap_bnd[CR_CAP_SIZE];
-
- unsigned int secbits;
- char *lsm_profile;
- unsigned int *groups;
-
- unsigned long mem_lsm_profile_pos;
- unsigned long mem_groups_pos;
-
- unsigned long mem_pos_next;
-};
-
-struct thread_restore_args {
- struct restore_mem_zone mem_zone;
-
- int pid;
- UserRegsEntry gpregs;
- u64 clear_tid_addr;
-
- bool has_futex;
- u64 futex_rla;
- u32 futex_rla_len;
-
- struct rst_sched_param sp;
-
- struct task_restore_args *ta;
-
- tls_t tls;
-
- siginfo_t *siginfo;
- unsigned int siginfo_n;
-
- int pdeath_sig;
-
- struct thread_creds_args *creds_args;
-} __aligned(64);
-
-struct task_restore_args {
- struct thread_restore_args *t; /* thread group leader */
-
- int fd_exe_link; /* opened self->exe file */
- int logfd;
- unsigned int loglevel;
-
- /* threads restoration */
- int nr_threads; /* number of threads */
- thread_restore_fcall_t clone_restore_fn; /* helper address for clone() call */
- struct thread_restore_args *thread_args; /* array of thread arguments */
- struct task_entries *task_entries;
- void *rst_mem;
- unsigned long rst_mem_size;
-
- /* Below arrays get remapped from RM_PRIVATE in sigreturn_restore */
- VmaEntry *vmas;
- unsigned int vmas_n;
-
- struct restore_posix_timer *posix_timers;
- unsigned int posix_timers_n;
-
- struct restore_timerfd *timerfd;
- unsigned int timerfd_n;
-
- siginfo_t *siginfo;
- unsigned int siginfo_n;
-
- struct rst_tcp_sock *tcp_socks;
- unsigned int tcp_socks_n;
-
- struct rst_aio_ring *rings;
- unsigned int rings_n;
-
- struct rlimit *rlims;
- unsigned int rlims_n;
-
- pid_t *helpers /* the TASK_HELPERS to wait on at the end of restore */;
- unsigned int helpers_n;
-
- pid_t *zombies;
- unsigned int zombies_n;
-
- struct sock_fprog *seccomp_filters;
- unsigned int seccomp_filters_n;
-
- /* * * * * * * * * * * * * * * * * * * * */
-
- unsigned long task_size;
- unsigned long premmapped_addr;
- unsigned long premmapped_len;
- rt_sigaction_t sigchld_act;
-
- void *bootstrap_start;
- unsigned long bootstrap_len;
-
- struct itimerval itimers[3];
-
- MmEntry mm;
- auxv_t mm_saved_auxv[AT_VECTOR_SIZE];
- u32 mm_saved_auxv_size;
- char comm[TASK_COMM_LEN];
-
- /*
- * proc_fd is a handle to /proc that the restorer blob can use to open
- * files there, because some of them can't be opened before the
- * restorer blob is called.
- */
- int proc_fd;
-
- int seccomp_mode;
-
-#ifdef CONFIG_VDSO
- unsigned long vdso_rt_size;
- struct vdso_symtable vdso_sym_rt; /* runtime vdso symbols */
- unsigned long vdso_rt_parked_at; /* safe place to keep vdso */
-#endif
- void **breakpoint;
-} __aligned(64);
-
-/*
- * For arm64 stack needs to aligned to 16 bytes.
- * Hence align to 16 bytes for all
-*/
-#define RESTORE_ALIGN_STACK(start, size) \
- (ALIGN((start) + (size) - 16, 16))
-
-static inline unsigned long restorer_stack(struct thread_restore_args *a)
-{
- return RESTORE_ALIGN_STACK((long)a->mem_zone.stack, RESTORE_STACK_SIZE);
-}
-
-enum {
- CR_STATE_FAIL = -1,
- CR_STATE_RESTORE_NS = 0, /* is used for executing "setup-namespace" scripts */
- CR_STATE_RESTORE_SHARED,
- CR_STATE_FORKING,
- CR_STATE_RESTORE,
- CR_STATE_RESTORE_SIGCHLD,
- /*
- * For security reason processes can be resumed only when all
- * credentials are restored. Otherwise someone can attach to a
- * process, which are not restored credentials yet and execute
- * some code.
- */
- CR_STATE_RESTORE_CREDS,
- CR_STATE_COMPLETE
-};
-
-#define restore_finish_stage(__stage) ({ \
- futex_dec_and_wake(&task_entries->nr_in_progress); \
- futex_wait_while(&task_entries->start, __stage); \
- (s32) futex_get(&task_entries->start); \
- })
-
-
-/* the restorer_blob_offset__ prefix is added by gen_offsets.sh */
-#define __blob_offset(name) restorer_blob_offset__ ## name
-#define _blob_offset(name) __blob_offset(name)
-#define restorer_sym(rblob, name) (void*)(rblob + _blob_offset(name))
-
-#endif /* __CR_RESTORER_H__ */
diff --git a/include/rst-malloc.h b/include/rst-malloc.h
deleted file mode 100644
index 001fa4183c60..000000000000
--- a/include/rst-malloc.h
+++ /dev/null
@@ -1,74 +0,0 @@
-#ifndef __CR_RST_MALLOC__H__
-#define __CR_RST_MALLOC__H__
-
-/*
- * On restore we need differetn types of memory allocation.
- * Here's an engine that tries to generalize them all. The
- * main difference is in how the buffer with objects is being
- * grown up.
- *
- * Buffers, that are to be used by restorer will be remapped
- * into restorer address space with rst_mem_remap() call. Thus
- * we have to either keep track of all the buffers and objects,
- * or keep objects one-by-one in a plain linear buffer. The
- * engine uses the 2nd approach.
- */
-
-enum {
- /*
- * Shared non-remapable allocations. These can happen only
- * in "global" context, i.e. when objects are allocated to
- * be used by any process to be restored. The objects are
- * not going to be used in restorer blob, thus allocation
- * engine grows buffers in a simple manner.
- */
- RM_SHARED,
- /*
- * Shared objects, that are about to be used in restorer
- * blob. For these the *_remap_* stuff below is used to get
- * the actual pointer on any object. Growing a buffer is
- * done with mremap, so that we don't have to keep track
- * of all the buffer chunks and can remap them in restorer
- * in one call.
- */
- RM_SHREMAP,
- /*
- * Privately used objects. Buffer grow and remap is the
- * same as for SHREMAP, but memory regions are MAP_PRIVATE.
- */
- RM_PRIVATE,
-
- RST_MEM_TYPES,
-};
-
-/*
- * Disables SHARED and SHREMAP allocations, turns on PRIVATE
- */
-extern void rst_mem_switch_to_private(void);
-/*
- * Reports a cookie of a current shared buffer position, that
- * can later be used in rst_mem_remap_ptr() to find out the object
- * pointer in the restorer blob.
- */
-extern unsigned long rst_mem_align_cpos(int type);
-extern void *rst_mem_remap_ptr(unsigned long pos, int type);
-/*
- * Allocate and free objects. We don't need to free arbitrary
- * object, thus allocation is simple (linear) and only the
- * last object can be freed (pop-ed from buffer).
- */
-extern void *rst_mem_alloc(unsigned long size, int type);
-extern void rst_mem_free_last(int type);
-
-/* Word-align the current freelist pointer for the next allocation. If we don't
- * align pointers, some futex and atomic operations can fail.
- */
-extern void rst_mem_align(int type);
-
-/*
- * Routines to remap SHREMAP and PRIVATE into restorer address space
- */
-extern unsigned long rst_mem_lock(void);
-extern int rst_mem_remap(void *to);
-
-#endif /* __CR_RST_MALLOC__H__ */
diff --git a/include/rst_info.h b/include/rst_info.h
deleted file mode 100644
index b72e5d0c868e..000000000000
--- a/include/rst_info.h
+++ /dev/null
@@ -1,71 +0,0 @@
-#ifndef __CR_RST_INFO_H__
-#define __CR_RST_INFO_H__
-
-#include "lock.h"
-#include "list.h"
-#include "vma.h"
-
-struct task_entries {
- int nr_threads, nr_tasks, nr_helpers;
- atomic_t nr_zombies;
- futex_t nr_in_progress;
- futex_t start;
- atomic_t cr_err;
- mutex_t userns_sync_lock;
-};
-
-struct fdt {
- int nr; /* How many tasks share this fd table */
- pid_t pid; /* Who should restore this fd table */
- /*
- * The fd table is ready for restoing, if fdt_lock is equal to nr
- * The fdt table was restrored, if fdt_lock is equal to nr + 1
- */
- futex_t fdt_lock;
-};
-
-struct _MmEntry;
-
-struct rst_info {
- struct list_head fds;
- struct list_head eventpoll;
- struct list_head tty_slaves;
- struct list_head tty_ctty;
-
- void *premmapped_addr;
- unsigned long premmapped_len;
- unsigned long clone_flags;
-
- void *munmap_restorer;
-
- int nr_zombies;
-
- int service_fd_id;
- struct fdt *fdt;
-
- struct vm_area_list vmas;
- struct _MmEntry *mm;
-
- u32 cg_set;
-
- union {
- struct pstree_item *pgrp_leader;
- futex_t pgrp_set;
- };
-
- struct file_desc *cwd;
- struct file_desc *root;
- bool has_umask;
- u32 umask;
-
- /*
- * We set this flag when process has seccomp filters
- * so that we know to suspend them before we unmap the
- * restorer blob.
- */
- bool has_seccomp;
-
- void *breakpoint;
-};
-
-#endif /* __CR_RST_INFO_H__ */
diff --git a/include/seccomp.h b/include/seccomp.h
deleted file mode 100644
index b5b26c80996d..000000000000
--- a/include/seccomp.h
+++ /dev/null
@@ -1,32 +0,0 @@
-#ifndef __CR_SECCOMP_H__
-#define __CR_SECCOMP_H__
-
-#include <linux/seccomp.h>
-#include <linux/filter.h>
-
-#include "protobuf/core.pb-c.h"
-
-#ifndef SECCOMP_MODE_DISABLED
-#define SECCOMP_MODE_DISABLED 0
-#endif
-
-#ifndef SECCOMP_MODE_STRICT
-#define SECCOMP_MODE_STRICT 1
-#endif
-
-#ifndef SECCOMP_MODE_FILTER
-#define SECCOMP_MODE_FILTER 2
-#endif
-
-#ifndef SECCOMP_SET_MODE_FILTER
-#define SECCOMP_SET_MODE_FILTER 1
-#endif
-
-#ifndef SECCOMP_FILTER_FLAG_TSYNC
-#define SECCOMP_FILTER_FLAG_TSYNC 1
-#endif
-
-extern int collect_seccomp_filters(void);
-extern int prepare_seccomp_filters(void);
-extern int seccomp_filters_get_rst_pos(CoreEntry *item, int *count, unsigned long *pos);
-#endif
diff --git a/include/seize.h b/include/seize.h
deleted file mode 100644
index 315fab2d36d9..000000000000
--- a/include/seize.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef __CR_SEIZE_H__
-#define __CR_SEIZE_H__
-
-extern int collect_pstree(pid_t pid);
-extern void pstree_switch_state(struct pstree_item *root_item, int st);
-extern const char *get_real_freezer_state(void);
-
-#endif
diff --git a/include/servicefd.h b/include/servicefd.h
deleted file mode 100644
index a9e35a223420..000000000000
--- a/include/servicefd.h
+++ /dev/null
@@ -1,35 +0,0 @@
-#ifndef __CR_SERVICE_FD_H__
-#define __CR_SERVICE_FD_H__
-
-#include <stdbool.h>
-
-enum sfd_type {
- SERVICE_FD_MIN,
-
- LOG_FD_OFF,
- IMG_FD_OFF,
- PROC_FD_OFF, /* fd with /proc for all proc_ calls */
- CTL_TTY_OFF,
- SELF_STDIN_OFF,
- CR_PROC_FD_OFF, /* some other's proc fd.
- * For dump -- target ns' proc
- * For restore -- CRIU ns' proc
- */
- ROOT_FD_OFF, /* Root of the namespace we dump/restore */
- CGROUP_YARD,
- USERNSD_SK, /* Socket for usernsd */
- NS_FD_OFF, /* Node's net namespace fd */
-
- SERVICE_FD_MAX
-};
-
-extern int clone_service_fd(int id);
-extern int init_service_fd(void);
-extern int get_service_fd(enum sfd_type type);
-extern int reserve_service_fd(enum sfd_type type);
-extern int install_service_fd(enum sfd_type type, int fd);
-extern int close_service_fd(enum sfd_type type);
-extern bool is_service_fd(int fd, enum sfd_type type);
-extern bool is_any_service_fd(int fd);
-
-#endif /* __CR_SERVICE_FD_H__ */
diff --git a/include/setproctitle.h b/include/setproctitle.h
deleted file mode 100644
index bc634331bde4..000000000000
--- a/include/setproctitle.h
+++ /dev/null
@@ -1,19 +0,0 @@
-#ifndef __CR_SETPROCTITLE_H__
-#define __CR_SETPROCTITLE_H__
-
-#ifdef CONFIG_HAS_LIBBSD
-#include <bsd/unistd.h>
-#else
-
-/*
- * setproctitle_init is in the libbsd since v0.6.0. This macro allows to
- * compile criu with libbsd<0.6.0.
- */
-#ifndef CONFIG_HAS_SETPROCTITLE_INIT
-#define setproctitle_init(argc, argv, envp)
-#endif
-
-#define setproctitle(fmt, ...)
-#endif
-
-#endif /* __CR_SETPROCTITLE_H__ */
diff --git a/include/shmem.h b/include/shmem.h
deleted file mode 100644
index 47dd0fd3b396..000000000000
--- a/include/shmem.h
+++ /dev/null
@@ -1,15 +0,0 @@
-#ifndef __CR_SHMEM_H__
-#define __CR_SHMEM_H__
-
-#include "lock.h"
-#include "protobuf/vma.pb-c.h"
-
-struct _VmaEntry;
-extern int collect_shmem(int pid, struct _VmaEntry *vi);
-extern void show_saved_shmems(void);
-extern int get_shmem_fd(int pid, VmaEntry *vi);
-
-extern int cr_dump_shmem(void);
-extern int add_shmem_area(pid_t pid, VmaEntry *vma);
-
-#endif /* __CR_SHMEM_H__ */
diff --git a/include/sigframe.h b/include/sigframe.h
deleted file mode 100644
index 5ab09b1fd662..000000000000
--- a/include/sigframe.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Generic sigframe bits.
- */
-
-#ifndef __CR_SIGFRAME_H__
-#define __CR_SIGFRAME_H__
-
-#include "asm/types.h"
-#include "protobuf/core.pb-c.h"
-
-struct rt_sigframe;
-
-/* sigframe should be aligned on 64 byte for x86 and 8 bytes for arm */
-#define RESTORE_STACK_SIGFRAME ALIGN(sizeof(struct rt_sigframe) + SIGFRAME_OFFSET, 64)
-
-#ifndef __ARCH_SI_PREAMBLE_SIZE
-#define __ARCH_SI_PREAMBLE_SIZE (3 * sizeof(int))
-#endif
-
-#define SI_MAX_SIZE 128
-#ifndef SI_PAD_SIZE
-#define SI_PAD_SIZE ((SI_MAX_SIZE - __ARCH_SI_PREAMBLE_SIZE) / sizeof(int))
-#endif
-
-typedef struct rt_siginfo {
- int si_signo;
- int si_errno;
- int si_code;
- int _pad[SI_PAD_SIZE];
-} rt_siginfo_t;
-
-typedef struct rt_sigaltstack {
- void *ss_sp;
- int ss_flags;
- size_t ss_size;
-} rt_stack_t;
-
-struct rt_ucontext {
- unsigned long uc_flags;
- struct rt_ucontext *uc_link;
- rt_stack_t uc_stack;
- struct rt_sigcontext uc_mcontext;
- k_rtsigset_t uc_sigmask; /* mask last for extensibility */
- int __unused[32 - (sizeof (k_rtsigset_t) / sizeof (int))];
- unsigned long uc_regspace[128] __attribute__((__aligned__(8)));
-};
-
-extern int construct_sigframe(struct rt_sigframe *sigframe,
- struct rt_sigframe *rsigframe,
- CoreEntry *core);
-
-/*
- * FIXME Convert it to inline helper, which requires
- * to unweave types mess we've generated for
- * run-time data.
- */
-#define setup_sas(sigframe, sas) \
-do { \
- if ((sas)) { \
- RT_SIGFRAME_UC((sigframe)).uc_stack.ss_sp = (void *)decode_pointer((sas)->ss_sp); \
- RT_SIGFRAME_UC((sigframe)).uc_stack.ss_flags = (int)(sas)->ss_flags; \
- RT_SIGFRAME_UC((sigframe)).uc_stack.ss_size = (size_t)(sas)->ss_size; \
- } \
-} while (0)
-
-#endif /* __CR_SIGFRAME_H__ */
diff --git a/include/signalfd.h b/include/signalfd.h
deleted file mode 100644
index c7af81977b29..000000000000
--- a/include/signalfd.h
+++ /dev/null
@@ -1,10 +0,0 @@
-#ifndef __CR_SIGNALFD_H__
-#define __CR_SIGNALFD_H__
-
-struct cr_imgset;
-struct fd_parms;
-extern int is_signalfd_link(char *link);
-extern const struct fdtype_ops signalfd_dump_ops;
-extern struct collect_image_info signalfd_cinfo;
-
-#endif /* __CR_SIGNALFD_H__ */
diff --git a/include/sk-inet.h b/include/sk-inet.h
deleted file mode 100644
index 5b5fca63870c..000000000000
--- a/include/sk-inet.h
+++ /dev/null
@@ -1,88 +0,0 @@
-#ifndef __CR_SK_INET_H__
-#define __CR_SK_INET_H__
-
-#include <netinet/tcp.h>
-
-#include "sockets.h"
-#include "files.h"
-#include "list.h"
-#include "protobuf.h"
-#include "protobuf/sk-inet.pb-c.h"
-
-#define INET_ADDR_LEN 40
-#ifndef TCP_REPAIR
-#define TCP_REPAIR 19 /* TCP sock is under repair right now */
-#define TCP_REPAIR_QUEUE 20
-#define TCP_QUEUE_SEQ 21
-#define TCP_REPAIR_OPTIONS 22
-#endif
-
-struct inet_sk_desc {
- struct socket_desc sd;
- unsigned int type;
- unsigned int src_port;
- unsigned int dst_port;
- unsigned int state;
- unsigned int rqlen;
- unsigned int wqlen; /* sent + unsent data */
- unsigned int uwqlen; /* unsent data */
- unsigned int src_addr[4];
- unsigned int dst_addr[4];
- unsigned short shutdown;
-
- int rfd;
- int cpt_reuseaddr;
- struct list_head rlist;
-};
-
-struct inet_port;
-struct inet_sk_info {
- InetSkEntry *ie;
- struct file_desc d;
- struct inet_port *port;
- /*
- * This is an fd by which the socket is opened.
- * It will be carried down to restorer code to
- * repair-off the socket at the very end.
- */
- int sk_fd;
- struct list_head rlist;
-};
-
-extern int inet_bind(int sk, struct inet_sk_info *);
-extern int inet_connect(int sk, struct inet_sk_info *);
-
-#ifdef CR_NOGLIBC
-#define setsockopt sys_setsockopt
-#endif
-static inline void tcp_repair_off(int fd)
-{
- int aux = 0, ret;
-
- ret = setsockopt(fd, SOL_TCP, TCP_REPAIR, &aux, sizeof(aux));
- if (ret < 0)
- pr_err("Failed to turn off repair mode on socket (%d)\n", ret);
-}
-
-extern void tcp_locked_conn_add(struct inet_sk_info *);
-extern void rst_unlock_tcp_connections(void);
-extern void cpt_unlock_tcp_connections(void);
-
-extern int dump_one_tcp(int sk, struct inet_sk_desc *sd);
-extern int restore_one_tcp(int sk, struct inet_sk_info *si);
-
-#define SK_EST_PARAM "tcp-established"
-
-extern int check_tcp(void);
-extern mutex_t *inet_get_reuseaddr_lock(struct inet_sk_info *ii);
-
-int rst_tcp_socks_prep(void);
-extern unsigned long rst_tcp_socks_cpos;
-extern unsigned int rst_tcp_socks_nr;
-
-struct rst_tcp_sock {
- int sk;
- bool reuseaddr;
-};
-
-#endif /* __CR_SK_INET_H__ */
diff --git a/include/sk-packet.h b/include/sk-packet.h
deleted file mode 100644
index 6c4398c604e4..000000000000
--- a/include/sk-packet.h
+++ /dev/null
@@ -1,39 +0,0 @@
-#ifndef __CR_SK_PACKET_H__
-#define __CR_SK_PACKET_H__
-
-#ifndef PACKET_TIMESTAMP
-#define PACKET_TIMESTAMP 17
-#endif
-
-struct cr_imgset;
-struct fd_parms;
-struct vma_area;
-
-extern struct collect_image_info packet_sk_cinfo;
-
-extern int dump_socket_map(struct vma_area *vma);
-extern int get_socket_fd(int pid, VmaEntry *vma);
-
-extern int packet_receive_one(struct nlmsghdr *h, void *arg);
-
-#ifndef PACKET_VNET_HDR
-#define PACKET_VNET_HDR 15
-#endif
-
-#ifndef PACKET_FANOUT
-#define PACKET_FANOUT 18
-#endif
-
-#ifndef TPACKET3_HDRLEN
-struct tpacket_req3 {
- unsigned int tp_block_size;
- unsigned int tp_block_nr;
- unsigned int tp_frame_size;
- unsigned int tp_frame_nr;
- unsigned int tp_retire_blk_tov;
- unsigned int tp_sizeof_priv;
- unsigned int tp_feature_req_word;
-};
-#endif
-
-#endif /* __CR_SK_PACKET_H__ */
diff --git a/include/sk-queue.h b/include/sk-queue.h
deleted file mode 100644
index 9044de0b0fb6..000000000000
--- a/include/sk-queue.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef __CR_SK_QUEUE_H__
-#define __CR_SK_QUEUE_H__
-
-extern int read_sk_queues(void);
-extern int dump_sk_queue(int sock_fd, int sock_id);
-extern int restore_sk_queue(int fd, unsigned int peer_id);
-
-#endif /* __CR_SK_QUEUE_H__ */
diff --git a/include/sockets.h b/include/sockets.h
deleted file mode 100644
index b726e2f7a0b3..000000000000
--- a/include/sockets.h
+++ /dev/null
@@ -1,89 +0,0 @@
-#ifndef __CR_SOCKETS_H__
-#define __CR_SOCKETS_H__
-
-#include <stdbool.h>
-#include <sys/socket.h>
-
-#include "asm/types.h"
-
-#include "protobuf.h"
-#include "protobuf/sk-opts.pb-c.h"
-
-struct fdinfo_list_entry;
-struct sk_opts_entry;
-struct file_desc;
-struct fd_parms;
-struct cr_imgset;
-struct nlmsghdr;
-struct cr_img;
-
-struct socket_desc {
- unsigned int family;
- unsigned int ino;
- struct socket_desc *next;
- int already_dumped;
-};
-
-extern int dump_socket(struct fd_parms *p, int lfd, struct cr_img *);
-extern int dump_socket_opts(int sk, SkOptsEntry *soe);
-extern int restore_socket_opts(int sk, SkOptsEntry *soe);
-extern void release_skopts(SkOptsEntry *);
-extern int restore_prepare_socket(int sk);
-extern void preload_socket_modules();
-
-extern bool socket_test_collect_bit(unsigned int family, unsigned int proto);
-
-extern int sk_collect_one(int ino, int family, struct socket_desc *d);
-struct ns_id;
-extern int collect_sockets(struct ns_id *);
-extern int collect_inet_sockets(void);
-extern struct collect_image_info unix_sk_cinfo;
-extern int collect_unix_sockets(void);
-extern int fix_external_unix_sockets(void);
-extern int resolve_unix_peers(void);
-
-extern struct collect_image_info netlink_sk_cinfo;
-
-extern struct socket_desc *lookup_socket(int ino, int family, int proto);
-
-extern const struct fdtype_ops unix_dump_ops;
-extern const struct fdtype_ops inet_dump_ops;
-extern const struct fdtype_ops inet6_dump_ops;
-extern const struct fdtype_ops netlink_dump_ops;
-extern const struct fdtype_ops packet_dump_ops;
-
-extern int inet_collect_one(struct nlmsghdr *h, int family, int type);
-extern int unix_receive_one(struct nlmsghdr *h, void *);
-extern int netlink_receive_one(struct nlmsghdr *hdr, void *arg);
-
-extern int unix_sk_id_add(ino_t ino);
-extern int unix_sk_ids_parse(char *optarg);
-
-extern int do_dump_opt(int sk, int level, int name, void *val, int len);
-#define dump_opt(s, l, n, f) do_dump_opt(s, l, n, f, sizeof(*f))
-extern int do_restore_opt(int sk, int level, int name, void *val, int len);
-#define restore_opt(s, l, n, f) do_restore_opt(s, l, n, f, sizeof(*f))
-
-#define sk_encode_shutdown(img, mask) do { \
- /* \
- * protobuf SK_SHUTDOWN__ bits match those \
- * reported by kernel \
- */ \
- (img)->shutdown = mask; \
- if ((img)->shutdown != SK_SHUTDOWN__NONE) \
- (img)->has_shutdown = true; \
- } while (0)
-
-static inline int sk_decode_shutdown(int val)
-{
- static const int hows[] = {-1, SHUT_RD, SHUT_WR, SHUT_RDWR};
- return hows[val];
-}
-
-#define USK_EXT_PARAM "ext-unix-sk"
-
-#ifndef NETLINK_SOCK_DIAG
-#define NETLINK_SOCK_DIAG NETLINK_INET_DIAG
-#endif
-
-#endif /* __CR_SOCKETS_H__ */
diff --git a/include/stats.h b/include/stats.h
deleted file mode 100644
index e417636e6d1e..000000000000
--- a/include/stats.h
+++ /dev/null
@@ -1,48 +0,0 @@
-#ifndef __CR_STATS_H__
-#define __CR_STATS_H__
-
-enum {
- TIME_FREEZING,
- TIME_FROZEN,
- TIME_MEMDUMP,
- TIME_MEMWRITE,
- TIME_IRMAP_RESOLVE,
-
- DUMP_TIME_NR_STATS,
-};
-
-enum {
- TIME_FORK,
- TIME_RESTORE,
-
- RESTORE_TIME_NS_STATS,
-};
-
-extern void timing_start(int t);
-extern void timing_stop(int t);
-
-enum {
- CNT_PAGES_SCANNED,
- CNT_PAGES_SKIPPED_PARENT,
- CNT_PAGES_WRITTEN,
-
- DUMP_CNT_NR_STATS,
-};
-
-enum {
- CNT_PAGES_COMPARED,
- CNT_PAGES_SKIPPED_COW,
- CNT_PAGES_RESTORED,
-
- RESTORE_CNT_NR_STATS,
-};
-
-extern void cnt_add(int c, unsigned long val);
-
-#define DUMP_STATS 1
-#define RESTORE_STATS 2
-
-extern int init_stats(int what);
-extern void write_stats(int what);
-
-#endif /* __CR_STATS_H__ */
diff --git a/include/string.h b/include/string.h
deleted file mode 100644
index b469bfe55a84..000000000000
--- a/include/string.h
+++ /dev/null
@@ -1,21 +0,0 @@
-#ifndef __CR_STRING_H__
-#define __CR_STRING_H__
-
-#include <sys/types.h>
-#include <string.h>
-
-#ifdef CONFIG_HAS_LIBBSD
-# include <bsd/string.h>
-#endif
-
-#include "config.h"
-
-#ifndef CONFIG_HAS_STRLCPY
-extern size_t strlcpy(char *dest, const char *src, size_t size);
-#endif
-
-#ifndef CONFIG_HAS_STRLCAT
-extern size_t strlcat(char *dest, const char *src, size_t count);
-#endif
-
-#endif /* __CR_STRING_H__ */
diff --git a/include/syscall-types.h b/include/syscall-types.h
deleted file mode 100644
index e3a114d6c280..000000000000
--- a/include/syscall-types.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Please add here type definitions if
- * syscall prototypes need them.
- *
- * Anything else should go to plain type.h
- */
-
-#ifndef __CR_SYSCALL_TYPES_H__
-#define __CR_SYSCALL_TYPES_H__
-
-#include <sys/time.h>
-#include <arpa/inet.h>
-#include <sched.h>
-
-#include "asm/types.h"
-
-struct cap_header {
- u32 version;
- int pid;
-};
-
-struct cap_data {
- u32 eff;
- u32 prm;
- u32 inh;
-};
-
-struct sockaddr;
-struct msghdr;
-struct rusage;
-struct file_handle;
-struct robust_list_head;
-struct io_event;
-struct timespec;
-
-typedef unsigned long aio_context_t;
-
-struct itimerspec;
-
-#ifndef F_GETFD
-#define F_GETFD 1
-#endif
-
-#ifndef CLONE_NEWNS
-#define CLONE_NEWNS 0x00020000
-#endif
-
-#ifndef CLONE_NEWPID
-#define CLONE_NEWPID 0x20000000
-#endif
-
-#ifndef CLONE_NEWUTS
-#define CLONE_NEWUTS 0x04000000
-#endif
-
-#ifndef CLONE_NEWIPC
-#define CLONE_NEWIPC 0x08000000
-#endif
-
-#ifndef CLONE_NEWNET
-#define CLONE_NEWNET 0x40000000
-#endif
-
-#ifndef CLONE_NEWUSER
-#define CLONE_NEWUSER 0x10000000
-#endif
-
-#define CLONE_ALLNS (CLONE_NEWPID | CLONE_NEWNET | CLONE_NEWIPC | CLONE_NEWUTS | CLONE_NEWNS | CLONE_NEWUSER)
-
-#define setns sys_setns
-
-struct rlimit;
-struct rlimit64;
-
-struct krlimit {
- unsigned long rlim_cur;
- unsigned long rlim_max;
-};
-
-struct siginfo;
-
-/* Type of timers in the kernel. */
-typedef int kernel_timer_t;
-
-#endif /* __CR_SYSCALL_TYPES_H__ */
diff --git a/include/sysctl.h b/include/sysctl.h
deleted file mode 100644
index b949a409eeb3..000000000000
--- a/include/sysctl.h
+++ /dev/null
@@ -1,39 +0,0 @@
-#ifndef __CR_SYSCTL_H__
-#define __CR_SYSCTL_H__
-
-struct sysctl_req {
- char *name;
- void *arg;
- int type;
- int flags;
-};
-
-extern int sysctl_op(struct sysctl_req *req, size_t nr_req, int op, unsigned int ns);
-
-enum {
- CTL_READ,
- CTL_WRITE,
-};
-
-#define CTL_SHIFT 4 /* Up to 16 types */
-
-#define CTL_U32 1 /* Single u32 */
-#define CTL_U64 2 /* Single u64 */
-#define __CTL_U32A 3 /* Array of u32 */
-#define __CTL_U64A 4 /* Array of u64 */
-#define __CTL_STR 5 /* String */
-#define CTL_32 6 /* Single s32 */
-
-#define CTL_U32A(n) (__CTL_U32A | ((n) << CTL_SHIFT))
-#define CTL_U64A(n) (__CTL_U64A | ((n) << CTL_SHIFT))
-#define CTL_STR(len) (__CTL_STR | ((len) << CTL_SHIFT))
-
-#define CTL_LEN(t) ((t) >> CTL_SHIFT)
-#define CTL_TYPE(t) ((t) & ((1 << CTL_SHIFT) - 1))
-
-/*
- * Some entries might be missing mark them as optional.
- */
-#define CTL_FLAGS_OPTIONAL 1
-
-#endif /* __CR_SYSCTL_H__ */
diff --git a/include/sysfs_parse.h b/include/sysfs_parse.h
deleted file mode 100644
index 4d74c4ee5422..000000000000
--- a/include/sysfs_parse.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#ifndef __CR_SYSFS_PARSE_H__
-#define __CR_SYSFS_PARSE_H__
-
-#define SYSFS_AUFS "/sys/fs/aufs/"
-#define SBINFO_LEN (3 + 16 + 1) /* si_%lx */
-#define SBINFO_PATH_LEN (sizeof SYSFS_AUFS + SBINFO_LEN) /* /sys/fs/aufs/<sbinfo> */
-#define AUFSBR_PATH_LEN (SBINFO_PATH_LEN + 6 + 1) /* /sys/fs/aufs/<sbinfo>/br%3d */
-
-extern int parse_aufs_branches(struct mount_info *mi);
-extern int fixup_aufs_vma_fd(struct vma_area *vma);
-extern void free_aufs_branches(void);
-
-#endif /* __CR_SYSFS_PARSE_H__ */
-
diff --git a/include/timerfd.h b/include/timerfd.h
deleted file mode 100644
index 67b9187179cf..000000000000
--- a/include/timerfd.h
+++ /dev/null
@@ -1,39 +0,0 @@
-#ifndef __CR_TIMERFD_H__
-#define __CR_TIMERFD_H__
-
-#include <time.h>
-#include <sys/ioctl.h>
-
-#include "files.h"
-
-struct pstree_item;
-
-struct restore_timerfd {
- int id;
- int fd;
- int clockid;
- int settime_flags;
- unsigned long ticks;
- struct itimerspec val;
-};
-
-extern const struct fdtype_ops timerfd_dump_ops;
-extern struct collect_image_info timerfd_cinfo;
-
-int rst_timerfd_prep(void);
-extern unsigned long rst_timerfd_cpos;
-extern unsigned int rst_timerfd_nr;
-
-
-extern int check_timerfd(void);
-extern int is_timerfd_link(char *link);
-
-#ifndef TFD_TIMER_ABSTIME
-# define TFD_TIMER_ABSTIME (1 << 0)
-#endif
-
-#ifndef TFD_IOC_SET_TICKS
-# define TFD_IOC_SET_TICKS _IOW('T', 0, u64)
-#endif
-
-#endif /* __CR_TIMERFD_H__ */
diff --git a/include/tty.h b/include/tty.h
deleted file mode 100644
index c8b620992d75..000000000000
--- a/include/tty.h
+++ /dev/null
@@ -1,34 +0,0 @@
-#ifndef __CR_TTY_H__
-#define __CR_TTY_H__
-
-#include <linux/major.h>
-#include <linux/vt.h>
-
-#include "files.h"
-
-/* Kernel's limit */
-#define TERMIOS_NCC 19
-
-extern const struct fdtype_ops tty_dump_ops;
-
-struct tty_driver;
-struct tty_driver *get_tty_driver(dev_t rdev, dev_t dev);
-static inline int is_tty(dev_t rdev, dev_t dev)
-{
- return get_tty_driver(rdev, dev) != NULL;
-}
-
-extern int dump_verify_tty_sids(void);
-extern struct collect_image_info tty_info_cinfo;
-extern struct collect_image_info tty_cinfo;
-extern int prepare_shared_tty(void);
-extern int tty_setup_slavery(void);
-
-extern int tty_verify_active_pairs(void);
-
-extern int tty_prep_fds(void);
-extern void tty_fini_fds(void);
-
-#define OPT_SHELL_JOB "shell-job"
-
-#endif /* __CR_TTY_H__ */
diff --git a/include/tun.h b/include/tun.h
deleted file mode 100644
index d70f8f2103c4..000000000000
--- a/include/tun.h
+++ /dev/null
@@ -1,16 +0,0 @@
-#ifndef __CR_TUN_H__
-#define __CR_TUN_H__
-
-#ifndef TUN_MINOR
-#define TUN_MINOR 200
-#endif
-
-#include "protobuf/netdev.pb-c.h"
-
-extern const struct fdtype_ops tunfile_dump_ops;
-extern int dump_tun_link(NetDeviceEntry *nde, struct cr_imgset *fds);
-extern int restore_one_tun(NetDeviceEntry *nde, int nlsk);
-extern struct collect_image_info tunfile_cinfo;
-extern int check_tun_cr(int no_tun_err);
-
-#endif /* __CR_TUN_H__ */
diff --git a/include/unix_diag.h b/include/unix_diag.h
deleted file mode 100644
index 3f2468330e2b..000000000000
--- a/include/unix_diag.h
+++ /dev/null
@@ -1,67 +0,0 @@
-#ifndef __CR_UNIX_DIAG_H__
-#define __CR_UNIX_DIAG_H__
-
-#include "asm/types.h"
-
-struct unix_diag_req {
- u8 sdiag_family;
- u8 sdiag_protocol;
- u16 pad;
- u32 udiag_states;
- u32 udiag_ino;
- u32 udiag_show;
- u32 udiag_cookie[2];
-};
-
-#define UDIAG_SHOW_NAME 0x00000001 /* show name (not path) */
-#define UDIAG_SHOW_VFS 0x00000002 /* show VFS inode info */
-#define UDIAG_SHOW_PEER 0x00000004 /* show peer socket info */
-#define UDIAG_SHOW_ICONS 0x00000008 /* show pending connections */
-#define UDIAG_SHOW_RQLEN 0x00000010 /* show skb receive queue len */
-#define UDIAG_SHOW_MEMINFO 0x00000020 /* show memory info of a socket */
-
-struct unix_diag_msg {
- u8 udiag_family;
- u8 udiag_type;
- u8 udiag_state;
- u8 pad;
-
- u32 udiag_ino;
- u32 udiag_cookie[2];
-};
-
-enum {
- SK_MEMINFO_RMEM_ALLOC,
- SK_MEMINFO_RCVBUF,
- SK_MEMINFO_WMEM_ALLOC,
- SK_MEMINFO_SNDBUF,
- SK_MEMINFO_FWD_ALLOC,
- SK_MEMINFO_WMEM_QUEUED,
- SK_MEMINFO_OPTMEM,
-
- SK_MEMINFO_VARS,
-};
-
-enum {
- UNIX_DIAG_NAME,
- UNIX_DIAG_VFS,
- UNIX_DIAG_PEER,
- UNIX_DIAG_ICONS,
- UNIX_DIAG_RQLEN,
- UNIX_DIAG_MEMINFO,
- UNIX_DIAG_SHUTDOWN,
-
- UNIX_DIAG_MAX,
-};
-
-struct unix_diag_vfs {
- u32 udiag_vfs_ino;
- u32 udiag_vfs_dev;
-};
-
-struct unix_diag_rqlen {
- u32 udiag_rqueue;
- u32 udiag_wqueue;
-};
-
-#endif /* __CR_UNIX_DIAG_H__ */
diff --git a/include/util-pie.h b/include/util-pie.h
deleted file mode 100644
index cbaed4224cfd..000000000000
--- a/include/util-pie.h
+++ /dev/null
@@ -1,66 +0,0 @@
-#ifndef __CR_UTIL_NET_H__
-#define __CR_UTIL_NET_H__
-
-#include <sys/socket.h>
-#include <sys/un.h>
-
-#include "asm/types.h"
-
-#define UNIX_PATH_MAX (sizeof(struct sockaddr_un) - \
- (size_t)((struct sockaddr_un *) 0)->sun_path)
-
-#ifndef SO_PEEK_OFF
-#define SO_PEEK_OFF 42
-#endif
-
-/*
- * Because of kernel doing kmalloc for user data passed
- * in SCM messages, and there is kernel's SCM_MAX_FD as a limit
- * for descriptors passed at once we're trying to reduce
- * the pressue on kernel memory manager and use predefined
- * known to work well size of the message buffer.
- */
-#define CR_SCM_MSG_SIZE (1024)
-#define CR_SCM_MAX_FD (252)
-
-struct fd_opts {
- char flags;
- struct {
- u32 uid;
- u32 euid;
- u32 signum;
- u32 pid_type;
- u32 pid;
- } fown;
-};
-
-struct scm_fdset {
- struct msghdr hdr;
- struct iovec iov;
- char msg_buf[CR_SCM_MSG_SIZE];
- struct fd_opts opts[CR_SCM_MAX_FD];
-};
-
-extern int send_fds(int sock, struct sockaddr_un *saddr, int saddr_len,
- int *fds, int nr_fds, bool with_flags);
-extern int recv_fds(int sock, int *fds, int nr_fds, struct fd_opts *opts);
-
-static inline int send_fd(int sock, struct sockaddr_un *saddr, int saddr_len, int fd)
-{
- return send_fds(sock, saddr, saddr_len, &fd, 1, false);
-}
-
-static inline int recv_fd(int sock)
-{
- int fd, ret;
-
- ret = recv_fds(sock, &fd, 1, NULL);
- if (ret)
- return -1;
-
- return fd;
-}
-
-extern int open_detach_mount(char *dir);
-
-#endif /* __CR_UTIL_NET_H__ */
diff --git a/include/util-vdso.h b/include/util-vdso.h
deleted file mode 100644
index c8dfa054f825..000000000000
--- a/include/util-vdso.h
+++ /dev/null
@@ -1,65 +0,0 @@
-#ifndef __CR_UTIL_VDSO_H__
-#define __CR_UTIL_VDSO_H__
-
-/*
- * VDSO management common definitions.
- *
- * This header file is included by the criu main code and the parasite code.
- * It contains definitions shared by these 2 parts.
- *
- * This file should not be included except in pie/util-vdso.c, include/vdso.h
- * and include/parasite-vdso.h
- */
-
-#include <sys/types.h>
-
-/*
- * Each architecture must export:
- * VDSO_SYMBOL_MAX, the number of vDSO symbols to manage
- * ARCH_VDSO_SYMBOLS, a table of string containing the vDSO symbol names
- * vdso_redirect_calls, a service called to redirect the vDSO symbols in
- * the parasite code.
- */
-#include "asm/vdso.h"
-
-struct vdso_symbol {
- char name[32];
- unsigned long offset;
-};
-
-struct vdso_symtable {
- unsigned long vma_start;
- unsigned long vma_end;
- unsigned long vvar_start;
- unsigned long vvar_end;
- struct vdso_symbol symbols[VDSO_SYMBOL_MAX];
-};
-
-#define VDSO_SYMBOL_INIT { .offset = VDSO_BAD_ADDR, }
-
-#define VDSO_SYMTABLE_INIT \
- { \
- .vma_start = VDSO_BAD_ADDR, \
- .vma_end = VDSO_BAD_ADDR, \
- .vvar_start = VVAR_BAD_ADDR, \
- .vvar_end = VVAR_BAD_ADDR, \
- .symbols = { \
- [0 ... VDSO_SYMBOL_MAX - 1] = \
- (struct vdso_symbol)VDSO_SYMBOL_INIT, \
- }, \
- }
-
-/* Size of VMA associated with vdso */
-static inline unsigned long vdso_vma_size(struct vdso_symtable *t)
-{
- return t->vma_end - t->vma_start;
-}
-
-static inline unsigned long vvar_vma_size(struct vdso_symtable *t)
-{
- return t->vvar_end - t->vvar_start;
-}
-
-extern int vdso_fill_symtable(char *mem, size_t size, struct vdso_symtable *t);
-
-#endif /* __CR_UTIL_VDSO_H__ */
diff --git a/include/util.h b/include/util.h
deleted file mode 100644
index a64782783615..000000000000
--- a/include/util.h
+++ /dev/null
@@ -1,284 +0,0 @@
-#ifndef __CR_UTIL_H__
-#define __CR_UTIL_H__
-
-/*
- * Some bits are stolen from perf and kvm tools
- */
-#include <signal.h>
-#include <stdio.h>
-#include <errno.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/statfs.h>
-#include <dirent.h>
-
-#include "compiler.h"
-#include "asm/types.h"
-#include "xmalloc.h"
-#include "bug.h"
-#include "log.h"
-#include "err.h"
-
-#include "protobuf/vma.pb-c.h"
-
-#define PREF_SHIFT_OP(pref, op, size) ((size) op (pref ##BYTES_SHIFT))
-#define KBYTES_SHIFT 10
-#define MBYTES_SHIFT 20
-#define GBYTES_SHIFT 30
-
-#define KBYTES(size) PREF_SHIFT_OP(K, >>, size)
-#define MBYTES(size) PREF_SHIFT_OP(M, >>, size)
-#define GBYTES(size) PREF_SHIFT_OP(G, >>, size)
-
-#define KILO(size) PREF_SHIFT_OP(K, <<, size)
-#define MEGA(size) PREF_SHIFT_OP(M, <<, size)
-#define GIGA(size) PREF_SHIFT_OP(G, <<, size)
-
-struct vma_area;
-struct list_head;
-
-extern void pr_vma(unsigned int loglevel, const struct vma_area *vma_area);
-
-#define pr_info_vma(vma_area) pr_vma(LOG_INFO, vma_area)
-#define pr_msg_vma(vma_area) pr_vma(LOG_MSG, vma_area)
-
-#define pr_vma_list(level, head) \
- do { \
- struct vma_area *vma; \
- list_for_each_entry(vma, head, list) \
- pr_vma(level, vma); \
- } while (0)
-#define pr_info_vma_list(head) pr_vma_list(LOG_INFO, head)
-
-extern int move_img_fd(int *img_fd, int want_fd);
-extern int close_safe(int *fd);
-
-extern int reopen_fd_as_safe(char *file, int line, int new_fd, int old_fd, bool allow_reuse_fd);
-#define reopen_fd_as(new_fd, old_fd) reopen_fd_as_safe(__FILE__, __LINE__, new_fd, old_fd, false)
-#define reopen_fd_as_nocheck(new_fd, old_fd) reopen_fd_as_safe(__FILE__, __LINE__, new_fd, old_fd, true)
-
-extern void close_proc(void);
-extern int open_pid_proc(pid_t pid);
-extern int close_pid_proc(void);
-extern int set_proc_fd(int fd);
-
-/*
- * Values for pid argument of the proc opening routines below.
- * SELF would open file under /proc/self
- * GEN would open a file under /proc itself
- * NONE is internal, don't use it ;)
- */
-
-#define PROC_SELF 0
-#define PROC_GEN -1
-#define PROC_NONE -2
-
-extern int do_open_proc(pid_t pid, int flags, const char *fmt, ...);
-
-#define __open_proc(pid, ier, flags, fmt, ...) \
- ({ \
- int __fd = do_open_proc(pid, flags, \
- fmt, ##__VA_ARGS__); \
- if (__fd < 0 && (errno != ier)) \
- pr_perror("Can't open %d/" fmt " on procfs", \
- pid, ##__VA_ARGS__); \
- \
- __fd; \
- })
-
-/* int open_proc(pid_t pid, const char *fmt, ...); */
-#define open_proc(pid, fmt, ...) \
- __open_proc(pid, 0, O_RDONLY, fmt, ##__VA_ARGS__)
-
-/* int open_proc_rw(pid_t pid, const char *fmt, ...); */
-#define open_proc_rw(pid, fmt, ...) \
- __open_proc(pid, 0, O_RDWR, fmt, ##__VA_ARGS__)
-
-#define open_proc_path(pid, fmt, ...) \
- __open_proc(pid, 0, O_PATH, fmt, ##__VA_ARGS__)
-
-/* DIR *opendir_proc(pid_t pid, const char *fmt, ...); */
-#define opendir_proc(pid, fmt, ...) \
- ({ \
- int __fd = open_proc(pid, fmt, ##__VA_ARGS__); \
- DIR *__d = NULL; \
- \
- if (__fd >= 0) { \
- __d = fdopendir(__fd); \
- if (__d == NULL) \
- pr_perror("Can't fdopendir %d " \
- "(%d/" fmt " on procfs)", \
- __fd, pid, ##__VA_ARGS__); \
- } \
- __d; \
- })
-
-/* FILE *fopen_proc(pid_t pid, const char *fmt, ...); */
-#define fopen_proc(pid, fmt, ...) \
- ({ \
- int __fd = open_proc(pid, fmt, ##__VA_ARGS__); \
- FILE *__f = NULL; \
- \
- if (__fd >= 0) { \
- __f = fdopen(__fd, "r"); \
- if (__f == NULL) \
- pr_perror("Can't fdopen %d " \
- "(%d/" fmt " on procfs)", \
- __fd, pid, ##__VA_ARGS__); \
- } \
- __f; \
- })
-
-#define pr_img_head(type, ...) pr_msg("\n"#type __VA_ARGS__ "\n----------------\n")
-#define pr_img_tail(type) pr_msg("----------------\n")
-
-#define DEVZERO (makedev(1, 5))
-
-#define KDEV_MINORBITS 20
-#define KDEV_MINORMASK ((1UL << KDEV_MINORBITS) - 1)
-#define MKKDEV(ma, mi) (((ma) << KDEV_MINORBITS) | (mi))
-
-static inline u32 kdev_major(u32 kdev)
-{
- return kdev >> KDEV_MINORBITS;
-}
-
-static inline u32 kdev_minor(u32 kdev)
-{
- return kdev & KDEV_MINORMASK;
-}
-
-static inline dev_t kdev_to_odev(u32 kdev)
-{
- /*
- * New kernels encode devices in a new form.
- * See kernel's fs/stat.c for details, there
- * choose_32_64 helpers which are the key.
- */
- unsigned major = kdev_major(kdev);
- unsigned minor = kdev_minor(kdev);
-
- return makedev(major, minor);
-}
-
-extern int copy_file(int fd_in, int fd_out, size_t bytes);
-extern int is_anon_link_type(char *link, char *type);
-
-#define is_hex_digit(c) \
- (((c) >= '0' && (c) <= '9') || \
- ((c) >= 'a' && (c) <= 'f') || \
- ((c) >= 'A' && (c) <= 'F'))
-
-extern void *shmalloc(size_t bytes);
-extern void shfree_last(void *ptr);
-
-#define CRS_CAN_FAIL 0x1 /* cmd can validly exit with non zero code */
-
-extern int cr_system(int in, int out, int err, char *cmd, char *const argv[], unsigned flags);
-extern int cr_system_userns(int in, int out, int err, char *cmd,
- char *const argv[], unsigned flags, int userns_pid);
-extern int cr_daemon(int nochdir, int noclose, int *keep_fd, int close_fd);
-extern int is_root_user(void);
-
-static inline bool dir_dots(struct dirent *de)
-{
- return !strcmp(de->d_name, ".") || !strcmp(de->d_name, "..");
-}
-
-extern int is_empty_dir(int dirfd);
-
-/*
- * Size of buffer to carry the worst case or /proc/self/fd/N
- * path. Since fd is an integer, we can easily estimate one :)
- */
-#define PSFDS (sizeof("/proc/self/fd/2147483647"))
-
-extern int read_fd_link(int lfd, char *buf, size_t size);
-
-#define USEC_PER_SEC 1000000L
-#define NSEC_PER_SEC 1000000000L
-
-int vaddr_to_pfn(unsigned long vaddr, u64 *pfn);
-
-/*
- * Check whether @str starts with @sub and report the
- * next character of @str in @end
- */
-static inline bool strstartswith2(const char *str, const char *sub, char *end)
-{
- const char *osub = sub;
-
- while (1) {
- if (*sub == '\0') /* end of sub -- match */ {
- if (end) {
- if (sub == osub + 1) /* pure root */
- *end = '/';
- else
- *end = *str;
- }
-
- return true;
- }
- if (*str == '\0') /* end of str, sub is NOT ended -- miss */
- return false;
- if (*str != *sub)
- return false;
-
- str++;
- sub++;
- }
-}
-
-static inline bool strstartswith(const char *str, const char *sub)
-{
- return strstartswith2(str, sub, NULL);
-}
-
-/*
- * Checks whether the @path has @sub_path as a sub path, i.e.
- * sub_path is the beginning of path and the last component
- * match is full (next character terminates path component).
- *
- * Paths shouldn't contain excessive /-s, i.e. only one slash
- * between path components and no slash at the end (except for
- * the "/" path. This is pretty good assumption to what paths
- * are used by criu.
- */
-
-static inline bool issubpath(const char *path, const char *sub_path)
-{
- char end;
- return strstartswith2(path, sub_path, &end) &&
- (end == '/' || end == '\0');
-}
-
-/*
- * mkdir -p
- */
-int mkdirpat(int fd, const char *path);
-
-/*
- * Tests whether a path is a prefix of another path. This is different than
- * strstartswith because "/foo" is _not_ a path prefix of "/foobar", since they
- * refer to different directories.
- */
-bool is_path_prefix(const char *path, const char *prefix);
-FILE *fopenat(int dirfd, char *path, char *cflags);
-void split(char *str, char token, char ***out, int *n);
-
-int fd_has_data(int lfd);
-size_t read_into_buffer(int fd, char *buff, size_t size);
-
-int make_yard(char *path);
-
-void tcp_nodelay(int sk, bool on);
-void tcp_cork(int sk, bool on);
-
-const char *ns_to_string(unsigned int ns);
-
-char *xstrcat(char *str, const char *fmt, ...)
- __attribute__ ((__format__ (__printf__, 2, 3)));
-char *xsprintf(const char *fmt, ...)
- __attribute__ ((__format__ (__printf__, 1, 2)));
-
-#endif /* __CR_UTIL_H__ */
diff --git a/include/uts_ns.h b/include/uts_ns.h
deleted file mode 100644
index ab054ffe87d1..000000000000
--- a/include/uts_ns.h
+++ /dev/null
@@ -1,9 +0,0 @@
-#ifndef __CR_UTS_NS_H__
-#define __CR_UTS_NS_H__
-
-extern int dump_uts_ns(int ns_id);
-extern int prepare_utsns(int pid);
-
-extern struct ns_desc uts_ns_desc;
-
-#endif /* __CR_UTS_NS_H__ */
diff --git a/include/vdso.h b/include/vdso.h
deleted file mode 100644
index ea6bfabbf3ec..000000000000
--- a/include/vdso.h
+++ /dev/null
@@ -1,27 +0,0 @@
-#ifndef __CR_VDSO_H__
-#define __CR_VDSO_H__
-
-#include <sys/mman.h>
-#include <stdbool.h>
-
-#include "config.h"
-
-#ifdef CONFIG_VDSO
-
-#include "util-vdso.h"
-
-extern struct vdso_symtable vdso_sym_rt;
-
-extern int vdso_init(void);
-
-extern int parasite_fixup_vdso(struct parasite_ctl *ctl, pid_t pid,
- struct vm_area_list *vma_area_list);
-
-#else /* CONFIG_VDSO */
-
-#define vdso_init() (0)
-#define parasite_fixup_vdso(ctl, pid, vma_area_list) (0)
-
-#endif /* CONFIG_VDSO */
-
-#endif /* __CR_VDSO_H__ */
diff --git a/include/vma.h b/include/vma.h
deleted file mode 100644
index 6c28136612e9..000000000000
--- a/include/vma.h
+++ /dev/null
@@ -1,110 +0,0 @@
-#ifndef __CR_VMA_H__
-#define __CR_VMA_H__
-
-#include "asm/types.h"
-#include "image.h"
-#include "list.h"
-
-#include "protobuf/vma.pb-c.h"
-
-struct vm_area_list {
- struct list_head h;
- unsigned nr;
- unsigned int nr_aios;
- unsigned long priv_size; /* nr of pages in private VMAs */
- unsigned long longest; /* nr of pages in longest VMA */
-};
-
-#define VM_AREA_LIST(name) struct vm_area_list name = { .h = LIST_HEAD_INIT(name.h), .nr = 0, }
-
-static inline void vm_area_list_init(struct vm_area_list *vml)
-{
- INIT_LIST_HEAD(&vml->h);
- vml->nr = 0;
- vml->priv_size = 0;
- vml->longest = 0;
-}
-
-struct file_desc;
-
-struct vma_area {
- struct list_head list;
- VmaEntry *e;
-
- union {
- struct /* for dump */ {
- union {
- /*
- * These two cannot be assigned at once.
- * The file_fd is an fd for a regular file and
- * the socket_id is the inode number of the
- * mapped (PF_PACKET) socket.
- *
- * The aio_nr_req is only for aio rings.
- */
- int vm_file_fd;
- int vm_socket_id;
- unsigned int aio_nr_req;
- };
-
- char *aufs_rpath; /* path from aufs root */
- char *aufs_fpath; /* full path from global root */
-
- /*
- * When several subsequent vmas have the same
- * dev:ino pair all 'tail' ones set this to true
- * and the vmst points to the head's stat buf.
- */
- bool file_borrowed;
- struct stat *vmst;
- int mnt_id;
- };
-
- struct /* for restore */ {
- struct file_desc *vmfd;
- unsigned long *page_bitmap; /* existent pages */
- unsigned long *ppage_bitmap; /* parent's existent pages */
- unsigned long premmaped_addr; /* restore only */
- };
- };
-};
-
-extern struct vma_area *alloc_vma_area(void);
-extern int collect_mappings(pid_t pid, struct vm_area_list *vma_area_list);
-extern void free_mappings(struct vm_area_list *vma_area_list);
-
-#define vma_area_is(vma_area, s) vma_entry_is((vma_area)->e, s)
-#define vma_area_len(vma_area) vma_entry_len((vma_area)->e)
-#define vma_entry_is(vma, s) (((vma)->status & (s)) == (s))
-#define vma_entry_len(vma) ((vma)->end - (vma)->start)
-
-/*
- * vma_premmaped_start() can be used only in restorer.
- * In other cases vma_area->premmaped_addr must be used.
- * This hack is required, because vma_area isn't tranfered in restorer and
- * shmid is used to determing which vma-s are cowed.
- */
-#define vma_premmaped_start(vma) ((vma)->shmid)
-
-static inline int in_vma_area(struct vma_area *vma, unsigned long addr)
-{
- return addr >= (unsigned long)vma->e->start &&
- addr < (unsigned long)vma->e->end;
-}
-
-static inline bool vma_entry_is_private(VmaEntry *entry,
- unsigned long task_size)
-{
- return vma_entry_is(entry, VMA_AREA_REGULAR) &&
- (vma_entry_is(entry, VMA_ANON_PRIVATE) ||
- vma_entry_is(entry, VMA_FILE_PRIVATE)) &&
- (entry->end <= task_size);
-}
-
-static inline bool vma_area_is_private(struct vma_area *vma,
- unsigned long task_size)
-{
- return vma_entry_is_private(vma->e, task_size);
-}
-
-#endif /* __CR_VMA_H__ */
diff --git a/include/xmalloc.h b/include/xmalloc.h
deleted file mode 100644
index e5ce279fab1a..000000000000
--- a/include/xmalloc.h
+++ /dev/null
@@ -1,67 +0,0 @@
-#ifndef __CR_XMALLOC_H__
-#define __CR_XMALLOC_H__
-
-#include <stdlib.h>
-#include <string.h>
-
-#include "log.h"
-
-#define __xalloc(op, size, ...) \
- ({ \
- void *___p = op( __VA_ARGS__ ); \
- if (!___p) \
- pr_err("%s: Can't allocate %li bytes\n", \
- __func__, (long)(size)); \
- ___p; \
- })
-
-#define xstrdup(str) __xalloc(strdup, strlen(str) + 1, str)
-#define xmalloc(size) __xalloc(malloc, size, size)
-#define xzalloc(size) __xalloc(calloc, size, 1, size)
-#define xrealloc(p, size) __xalloc(realloc, size, p, size)
-
-#define xfree(p) free(p)
-
-#define xrealloc_safe(pptr, size) \
- ({ \
- int __ret = -1; \
- void *new = xrealloc(*pptr, size); \
- if (new) { \
- *pptr = new; \
- __ret = 0; \
- } \
- __ret; \
- })
-
-#define xmemdup(ptr, size) \
- ({ \
- void *new = xmalloc(size); \
- if (new) \
- memcpy(new, ptr, size); \
- new; \
- })
-
-#define memzero_p(p) memset(p, 0, sizeof(*p))
-#define memzero(p, size) memset(p, 0, size)
-
-/*
- * Helper for allocating trees with single xmalloc.
- * This one advances the void *pointer on s bytes and
- * returns the previous value. Use like this
- *
- * m = xmalloc(total_size);
- * a = xptr_pull(&m, tree_root_t);
- * a->b = xptr_pull(&m, leaf_a_t);
- * a->c = xptr_pull(&m, leaf_c_t);
- * ...
- */
-static inline void *xptr_pull_s(void **m, size_t s)
-{
- void *ret = (*m);
- (*m) += s;
- return ret;
-}
-
-#define xptr_pull(m, type) xptr_pull_s(m, sizeof(type))
-
-#endif /* __CR_XMALLOC_H__ */
diff --git a/ipc_ns.c b/ipc_ns.c
deleted file mode 100644
index 9abb40311a18..000000000000
--- a/ipc_ns.c
+++ /dev/null
@@ -1,936 +0,0 @@
-#include <unistd.h>
-#include <stdlib.h>
-#include <string.h>
-#include <fcntl.h>
-#include <sys/wait.h>
-#include <sys/msg.h>
-#include <sys/sem.h>
-#include <sys/shm.h>
-#include <sched.h>
-
-#include "util.h"
-#include "cr_options.h"
-#include "imgset.h"
-#include "namespaces.h"
-#include "sysctl.h"
-#include "ipc_ns.h"
-
-#include "protobuf.h"
-#include "protobuf/ipc-var.pb-c.h"
-#include "protobuf/ipc-shm.pb-c.h"
-#include "protobuf/ipc-sem.pb-c.h"
-#include "protobuf/ipc-msg.pb-c.h"
-
-#if defined (__GLIBC__) && __GLIBC__ >= 2
-#define KEY __key
-#else
-#define KEY key
-#endif
-
-#ifndef MSGMAX
-#define MSGMAX 8192
-#endif
-
-#ifndef MSG_COPY
-#define MSG_COPY 040000
-#endif
-
-static void pr_ipc_desc_entry(unsigned int loglevel, const IpcDescEntry *desc)
-{
- print_on_level(loglevel, "id: %-10d key: 0x%08x uid: %-10d gid: %-10d "
- "cuid: %-10d cgid: %-10d mode: %-10o ",
- desc->id, desc->key, desc->uid, desc->gid,
- desc->cuid, desc->cgid, desc->mode);
-}
-
-static void fill_ipc_desc(int id, IpcDescEntry *desc, const struct ipc_perm *ipcp)
-{
- desc->id = id;
- desc->key = ipcp->KEY;
- desc->uid = userns_uid(ipcp->uid);
- desc->gid = userns_gid(ipcp->gid);
- desc->cuid = userns_uid(ipcp->cuid);
- desc->cgid = userns_gid(ipcp->cgid);
- desc->mode = ipcp->mode;
-}
-
-static void pr_ipc_sem_array(unsigned int loglevel, int nr, u16 *values)
-{
- while (nr--)
- print_on_level(loglevel, " %-5d", values[nr]);
- print_on_level(loglevel, "\n");
-}
-
-#define pr_info_ipc_sem_array(nr, values) pr_ipc_sem_array(LOG_INFO, nr, values)
-#define pr_msg_ipc_sem_array(nr, values) pr_ipc_sem_array(LOG_MSG, nr, values)
-
-static void pr_info_ipc_sem_entry(const IpcSemEntry *sem)
-{
- pr_ipc_desc_entry(LOG_INFO, sem->desc);
- print_on_level(LOG_INFO, "nsems: %-10d\n", sem->nsems);
-}
-
-static int dump_ipc_sem_set(struct cr_img *img, const IpcSemEntry *sem)
-{
- size_t rounded;
- int ret, size;
- u16 *values;
-
- size = sizeof(u16) * sem->nsems;
- rounded = round_up(size, sizeof(u64));
- values = xmalloc(rounded);
- if (values == NULL) {
- pr_err("Failed to allocate memory for semaphore set values\n");
- ret = -ENOMEM;
- goto out;
- }
- ret = semctl(sem->desc->id, 0, GETALL, values);
- if (ret < 0) {
- pr_perror("Failed to get semaphore set values");
- ret = -errno;
- goto out;
- }
- pr_info_ipc_sem_array(sem->nsems, values);
-
- memzero((void *)values + size, rounded - size);
- ret = write_img_buf(img, values, rounded);
- if (ret < 0) {
- pr_err("Failed to write IPC message data\n");
- goto out;
- }
-out:
- xfree(values);
- return ret;
-}
-
-static int dump_ipc_sem_desc(struct cr_img *img, int id, const struct semid_ds *ds)
-{
- IpcSemEntry sem = IPC_SEM_ENTRY__INIT;
- IpcDescEntry desc = IPC_DESC_ENTRY__INIT;
- int ret;
-
- sem.desc = &desc;
- sem.nsems = ds->sem_nsems;
-
- fill_ipc_desc(id, sem.desc, &ds->sem_perm);
- pr_info_ipc_sem_entry(&sem);
-
- ret = pb_write_one(img, &sem, PB_IPC_SEM);
- if (ret < 0) {
- pr_err("Failed to write IPC semaphores set\n");
- return ret;
- }
- return dump_ipc_sem_set(img, &sem);
-}
-
-static int dump_ipc_sem(struct cr_img *img)
-{
- int i, maxid;
- struct seminfo info;
- int slot;
-
- maxid = semctl(0, 0, SEM_INFO, &info);
- if (maxid < 0) {
- pr_perror("semctl failed");
- return -errno;
- }
-
- pr_info("IPC semaphore sets: %d\n", info.semusz);
- for (i = 0, slot = 0; i <= maxid; i++) {
- struct semid_ds ds;
- int id, ret;
-
- id = semctl(i, 0, SEM_STAT, &ds);
- if (id < 0) {
- if (errno == EINVAL)
- continue;
- pr_perror("Failed to get stats for IPC semaphore set");
- break;
- }
- ret = dump_ipc_sem_desc(img, id, &ds);
- if (!ret)
- slot++;
- }
- if (slot != info.semusz) {
- pr_err("Failed to collect %d (only %d succeeded)\n", info.semusz, slot);
- return -EFAULT;
- }
- return info.semusz;
-}
-
-static void pr_info_ipc_msg(int nr, const IpcMsg *msg)
-{
- print_on_level(LOG_INFO, " %-5d: type: %-20"PRId64" size: %-10d\n",
- nr++, msg->mtype, msg->msize);
-}
-
-static void pr_info_ipc_msg_entry(const IpcMsgEntry *msg)
-{
- pr_ipc_desc_entry(LOG_INFO, msg->desc);
- print_on_level(LOG_INFO, "qbytes: %-10d qnum: %-10d\n",
- msg->qbytes, msg->qnum);
-}
-
-static int dump_ipc_msg_queue_messages(struct cr_img *img, const IpcMsgEntry *msq,
- unsigned int msg_nr)
-{
- struct msgbuf *message = NULL;
- unsigned int msgmax;
- int ret, msg_cnt = 0;
- struct sysctl_req req[] = {
- { "kernel/msgmax", &msgmax, CTL_U32 },
- };
-
- ret = sysctl_op(req, ARRAY_SIZE(req), CTL_READ, CLONE_NEWIPC);
- if (ret < 0) {
- pr_err("Failed to read max IPC message size\n");
- goto err;
- }
-
- msgmax += sizeof(struct msgbuf);
- message = xmalloc(round_up(msgmax, sizeof(u64)));
- if (message == NULL) {
- pr_err("Failed to allocate memory for IPC message\n");
- return -ENOMEM;
- }
-
- for (msg_cnt = 0; msg_cnt < msg_nr; msg_cnt++) {
- IpcMsg msg = IPC_MSG__INIT;
- size_t rounded;
-
- ret = msgrcv(msq->desc->id, message, msgmax, msg_cnt, IPC_NOWAIT | MSG_COPY);
- if (ret < 0) {
- pr_perror("Failed to copy IPC message");
- goto err;
- }
-
- msg.msize = ret;
- msg.mtype = message->mtype;
-
- pr_info_ipc_msg(msg_cnt, &msg);
-
- ret = pb_write_one(img, &msg, PB_IPCNS_MSG);
- if (ret < 0) {
- pr_err("Failed to write IPC message header\n");
- break;
- }
-
- rounded = round_up(msg.msize, sizeof(u64));
- memzero(((void *)message->mtext + msg.msize), rounded - msg.msize);
- ret = write_img_buf(img, message->mtext, rounded);
- if (ret < 0) {
- pr_err("Failed to write IPC message data\n");
- break;
- }
- }
- ret = 0;
-err:
- xfree(message);
- return ret;
-}
-
-static int dump_ipc_msg_queue(struct cr_img *img, int id, const struct msqid_ds *ds)
-{
- IpcMsgEntry msg = IPC_MSG_ENTRY__INIT;
- IpcDescEntry desc = IPC_DESC_ENTRY__INIT;
- int ret;
-
- msg.desc = &desc;
- fill_ipc_desc(id, msg.desc, &ds->msg_perm);
- msg.qbytes = ds->msg_qbytes;
- msg.qnum = ds->msg_qnum;
-
- pr_info_ipc_msg_entry(&msg);
-
- ret = pb_write_one(img, &msg, PB_IPCNS_MSG_ENT);
- if (ret < 0) {
- pr_err("Failed to write IPC message queue\n");
- return ret;
- }
- return dump_ipc_msg_queue_messages(img, &msg, ds->msg_qnum);
-}
-
-static int dump_ipc_msg(struct cr_img *img)
-{
- int i, maxid;
- struct msginfo info;
- int slot;
-
- maxid = msgctl(0, MSG_INFO, (struct msqid_ds *)&info);
- if (maxid < 0) {
- pr_perror("msgctl failed");
- return -errno;
- }
-
- pr_info("IPC message queues: %d\n", info.msgpool);
- for (i = 0, slot = 0; i <= maxid; i++) {
- struct msqid_ds ds;
- int id, ret;
-
- id = msgctl(i, MSG_STAT, &ds);
- if (id < 0) {
- if (errno == EINVAL)
- continue;
- pr_perror("Failed to get stats for IPC message queue");
- break;
- }
- ret = dump_ipc_msg_queue(img, id, &ds);
- if (!ret)
- slot++;
- }
- if (slot != info.msgpool) {
- pr_err("Failed to collect %d message queues (only %d succeeded)\n", info.msgpool, slot);
- return -EFAULT;
- }
- return info.msgpool;
-}
-
-static void pr_info_ipc_shm(const IpcShmEntry *shm)
-{
- pr_ipc_desc_entry(LOG_INFO, shm->desc);
- print_on_level(LOG_INFO, "size: %-10"PRIu64"\n", shm->size);
-}
-
-static int ipc_sysctl_req(IpcVarEntry *e, int op)
-{
- struct sysctl_req req[] = {
- { "kernel/sem", e->sem_ctls, CTL_U32A(e->n_sem_ctls) },
- { "kernel/msgmax", &e->msg_ctlmax, CTL_U32 },
- { "kernel/msgmnb", &e->msg_ctlmnb, CTL_U32 },
- { "kernel/auto_msgmni", &e->auto_msgmni, CTL_U32 },
- { "kernel/msgmni", &e->msg_ctlmni, CTL_U32 },
- { "kernel/shmmax", &e->shm_ctlmax, CTL_U64 },
- { "kernel/shmall", &e->shm_ctlall, CTL_U64 },
- { "kernel/shmmni", &e->shm_ctlmni, CTL_U32 },
- { "kernel/shm_rmid_forced", &e->shm_rmid_forced, CTL_U32 },
- };
-
- struct sysctl_req req_mq[] = {
- { "fs/mqueue/queues_max", &e->mq_queues_max, CTL_U32 },
- { "fs/mqueue/msg_max", &e->mq_msg_max, CTL_U32 },
- { "fs/mqueue/msgsize_max", &e->mq_msgsize_max, CTL_U32 },
- };
-
- int ret;
-
- ret = sysctl_op(req, ARRAY_SIZE(req), op, CLONE_NEWIPC);
- if (ret)
- return ret;
-
- if (access("/proc/sys/fs/mqueue", X_OK)) {
- pr_info("Mqueue sysctls are missing\n");
- return 0;
- }
-
- return sysctl_op(req_mq, ARRAY_SIZE(req_mq), op, CLONE_NEWIPC);
-}
-
-/*
- * TODO: Function below should be later improved to locate and dump only dirty
- * pages via updated sys_mincore().
- */
-static int dump_ipc_shm_pages(struct cr_img *img, const IpcShmEntry *shm)
-{
- void *data;
- int ret;
-
- data = shmat(shm->desc->id, NULL, SHM_RDONLY);
- if (data == (void *)-1) {
- pr_perror("Failed to attach IPC shared memory");
- return -errno;
- }
- ret = write_img_buf(img, data, round_up(shm->size, sizeof(u32)));
- if (ret < 0) {
- pr_err("Failed to write IPC shared memory data\n");
- return ret;
- }
- if (shmdt(data)) {
- pr_perror("Failed to detach IPC shared memory");
- return -errno;
- }
- return 0;
-}
-
-static int dump_ipc_shm_seg(struct cr_img *img, int id, const struct shmid_ds *ds)
-{
- IpcShmEntry shm = IPC_SHM_ENTRY__INIT;
- IpcDescEntry desc = IPC_DESC_ENTRY__INIT;
- int ret;
-
- shm.desc = &desc;
- shm.size = ds->shm_segsz;
- fill_ipc_desc(id, shm.desc, &ds->shm_perm);
- pr_info_ipc_shm(&shm);
-
- ret = pb_write_one(img, &shm, PB_IPC_SHM);
- if (ret < 0) {
- pr_err("Failed to write IPC shared memory segment\n");
- return ret;
- }
- return dump_ipc_shm_pages(img, &shm);
-}
-
-static int dump_ipc_shm(struct cr_img *img)
-{
- int i, maxid, slot;
- struct shm_info info;
-
- maxid = shmctl(0, SHM_INFO, (void *)&info);
- if (maxid < 0) {
- pr_perror("shmctl(SHM_INFO) failed");
- return -errno;
- }
-
- pr_info("IPC shared memory segments: %d\n", info.used_ids);
- for (i = 0, slot = 0; i <= maxid; i++) {
- struct shmid_ds ds;
- int id, ret;
-
- id = shmctl(i, SHM_STAT, &ds);
- if (id < 0) {
- if (errno == EINVAL)
- continue;
- pr_perror("Failed to get stats for IPC shared memory");
- break;
- }
-
- ret = dump_ipc_shm_seg(img, id, &ds);
- if (ret < 0)
- return ret;
- slot++;
- }
- if (slot != info.used_ids) {
- pr_err("Failed to collect %d (only %d succeeded)\n",
- info.used_ids, slot);
- return -EFAULT;
- }
- return 0;
-}
-
-static int dump_ipc_var(struct cr_img *img)
-{
- IpcVarEntry var = IPC_VAR_ENTRY__INIT;
- int ret = -1;
-
- var.n_sem_ctls = 4;
- var.sem_ctls = xmalloc(pb_repeated_size(&var, sem_ctls));
- if (!var.sem_ctls)
- goto err;
-
- ret = ipc_sysctl_req(&var, CTL_READ);
- if (ret < 0) {
- pr_err("Failed to read IPC variables\n");
- goto err;
- }
-
- ret = pb_write_one(img, &var, PB_IPC_VAR);
- if (ret < 0) {
- pr_err("Failed to write IPC variables\n");
- goto err;
- }
-
-err:
- xfree(var.sem_ctls);
- return ret;
-}
-
-static int dump_ipc_data(const struct cr_imgset *imgset)
-{
- int ret;
-
- ret = dump_ipc_var(img_from_set(imgset, CR_FD_IPC_VAR));
- if (ret < 0)
- return ret;
- ret = dump_ipc_shm(img_from_set(imgset, CR_FD_IPCNS_SHM));
- if (ret < 0)
- return ret;
- ret = dump_ipc_msg(img_from_set(imgset, CR_FD_IPCNS_MSG));
- if (ret < 0)
- return ret;
- ret = dump_ipc_sem(img_from_set(imgset, CR_FD_IPCNS_SEM));
- if (ret < 0)
- return ret;
- return 0;
-}
-
-int dump_ipc_ns(int ns_id)
-{
- int ret;
- struct cr_imgset *imgset;
-
- imgset = cr_imgset_open(ns_id, IPCNS, O_DUMP);
- if (imgset == NULL)
- return -1;
-
- ret = dump_ipc_data(imgset);
- if (ret < 0) {
- pr_err("Failed to write IPC namespace data\n");
- goto err;
- }
-
-err:
- close_cr_imgset(&imgset);
- return ret < 0 ? -1 : 0;
-}
-
-void ipc_sem_handler(struct cr_img *img, void *obj)
-{
- IpcSemEntry *e = obj;
- u16 *values;
- int size;
-
- pr_msg("\n");
- size = round_up(sizeof(u16) * e->nsems, sizeof(u64));
- values = xmalloc(size);
- if (values == NULL)
- return;
- if (read_img_buf(img, values, size) <= 0) {
- xfree(values);
- return;
- }
- pr_msg_ipc_sem_array(e->nsems, values);
- xfree(values);
-}
-
-static void ipc_msg_data_handler(struct cr_img *img, void *obj)
-{
- IpcMsg *e = obj;
- print_image_data(img, round_up(e->msize, sizeof(u64)), opts.show_pages_content);
-}
-
-void ipc_msg_handler(struct cr_img *img, void *obj)
-{
- IpcMsgEntry *e = obj;
- int msg_nr = 0;
-
- pr_msg("\n");
- while (msg_nr++ < e->qnum)
- pb_show_plain_payload(img, PB_IPCNS_MSG, ipc_msg_data_handler);
-
-}
-
-void ipc_shm_handler(struct cr_img *img, void *obj)
-{
- IpcShmEntry *e = obj;
- print_image_data(img, round_up(e->size, sizeof(u32)), opts.show_pages_content);
-}
-
-static int prepare_ipc_sem_values(struct cr_img *img, const IpcSemEntry *sem)
-{
- int ret, size;
- u16 *values;
-
- size = round_up(sizeof(u16) * sem->nsems, sizeof(u64));
- values = xmalloc(size);
- if (values == NULL) {
- pr_err("Failed to allocate memory for semaphores set values\n");
- ret = -ENOMEM;
- goto out;
- }
-
- ret = read_img_buf(img, values, size);
- if (ret < 0) {
- pr_err("Failed to allocate memory for semaphores set values\n");
- ret = -ENOMEM;
- goto out;
- }
-
- pr_info_ipc_sem_array(sem->nsems, values);
-
- ret = semctl(sem->desc->id, 0, SETALL, values);
- if (ret < 0) {
- pr_perror("Failed to set semaphores set values");
- ret = -errno;
- }
-out:
- xfree(values);
- return ret;
-}
-
-static int prepare_ipc_sem_desc(struct cr_img *img, const IpcSemEntry *sem)
-{
- int ret, id;
- struct sysctl_req req[] = {
- { "kernel/sem_next_id", &sem->desc->id, CTL_U32 },
- };
- struct semid_ds semid;
-
- ret = sysctl_op(req, ARRAY_SIZE(req), CTL_WRITE, CLONE_NEWIPC);
- if (ret < 0) {
- pr_err("Failed to set desired IPC sem ID\n");
- return ret;
- }
-
- id = semget(sem->desc->key, sem->nsems,
- sem->desc->mode | IPC_CREAT | IPC_EXCL);
- if (id == -1) {
- pr_perror("Failed to create sem set");
- return -errno;
- }
-
- if (id != sem->desc->id) {
- pr_err("Failed to restore sem id (%d instead of %d)\n",
- id, sem->desc->id);
- return -EFAULT;
- }
-
- ret = semctl(id, sem->nsems, IPC_STAT, &semid);
- if (ret == -1) {
- pr_err("Failed to get sem stat structure\n");
- return -EFAULT;
- }
-
- semid.sem_perm.uid = sem->desc->uid;
- semid.sem_perm.gid = sem->desc->gid;
-
- ret = semctl(id, sem->nsems, IPC_SET, &semid);
- if (ret == -1) {
- pr_err("Failed to set sem uid and gid\n");
- return -EFAULT;
- }
-
- ret = prepare_ipc_sem_values(img, sem);
- if (ret < 0) {
- pr_err("Failed to update sem pages\n");
- return ret;
- }
- return 0;
-}
-
-static int prepare_ipc_sem(int pid)
-{
- int ret;
- struct cr_img *img;
-
- pr_info("Restoring IPC semaphores sets\n");
- img = open_image(CR_FD_IPCNS_SEM, O_RSTR, pid);
- if (!img)
- return -1;
-
- while (1) {
- IpcSemEntry *sem;
-
- ret = pb_read_one_eof(img, &sem, PB_IPC_SEM);
- if (ret < 0) {
- ret = -EIO;
- goto err;
- }
- if (ret == 0)
- break;
-
- pr_info_ipc_sem_entry(sem);
-
- ret = prepare_ipc_sem_desc(img, sem);
- ipc_sem_entry__free_unpacked(sem, NULL);
-
- if (ret < 0) {
- pr_err("Failed to prepare semaphores set\n");
- goto err;
- }
- }
-
- close_image(img);
- return 0;
-
-err:
- close_image(img);
- return ret;
-}
-
-static int prepare_ipc_msg_queue_messages(struct cr_img *img, const IpcMsgEntry *msq)
-{
- IpcMsg *msg = NULL;
- int msg_nr = 0;
- int ret = 0;
-
- while (msg_nr < msq->qnum) {
- struct msgbuf {
- long mtype;
- char mtext[MSGMAX];
- } data;
-
- ret = pb_read_one(img, &msg, PB_IPCNS_MSG);
- if (ret <= 0)
- return -EIO;
-
- pr_info_ipc_msg(msg_nr, msg);
-
- if (msg->msize > MSGMAX) {
- ret = -1;
- pr_err("Unsupported message size: %d (MAX: %d)\n",
- msg->msize, MSGMAX);
- break;
- }
-
- ret = read_img_buf(img, data.mtext, round_up(msg->msize, sizeof(u64)));
- if (ret < 0) {
- pr_err("Failed to read IPC message data\n");
- break;
- }
-
- data.mtype = msg->mtype;
- ret = msgsnd(msq->desc->id, &data, msg->msize, IPC_NOWAIT);
- if (ret < 0) {
- pr_perror("Failed to send IPC message");
- ret = -errno;
- break;
- }
- msg_nr++;
- }
-
- if (msg)
- ipc_msg__free_unpacked(msg, NULL);
- return ret;
-}
-
-static int prepare_ipc_msg_queue(struct cr_img *img, const IpcMsgEntry *msq)
-{
- int ret, id;
- struct sysctl_req req[] = {
- { "kernel/msg_next_id", &msq->desc->id, CTL_U32 },
- };
- struct msqid_ds msqid;
-
- ret = sysctl_op(req, ARRAY_SIZE(req), CTL_WRITE, CLONE_NEWIPC);
- if (ret < 0) {
- pr_err("Failed to set desired IPC msg ID\n");
- return ret;
- }
-
- id = msgget(msq->desc->key, msq->desc->mode | IPC_CREAT | IPC_EXCL);
- if (id == -1) {
- pr_perror("Failed to create msg set");
- return -errno;
- }
-
- if (id != msq->desc->id) {
- pr_err("Failed to restore msg id (%d instead of %d)\n",
- id, msq->desc->id);
- return -EFAULT;
- }
-
- ret = msgctl(id, IPC_STAT, &msqid);
- if (ret == -1) {
- pr_err("Failed to get msq stat structure\n");
- return -EFAULT;
- }
-
- msqid.msg_perm.uid = msq->desc->uid;
- msqid.msg_perm.gid = msq->desc->gid;
-
- ret = msgctl(id, IPC_SET, &msqid);
- if (ret == -1) {
- pr_err("Failed to set msq queue uid and gid\n");
- return -EFAULT;
- }
-
- ret = prepare_ipc_msg_queue_messages(img, msq);
- if (ret < 0) {
- pr_err("Failed to update message queue messages\n");
- return ret;
- }
- return 0;
-}
-
-static int prepare_ipc_msg(int pid)
-{
- int ret;
- struct cr_img *img;
-
- pr_info("Restoring IPC message queues\n");
- img = open_image(CR_FD_IPCNS_MSG, O_RSTR, pid);
- if (!img)
- return -1;
-
- while (1) {
- IpcMsgEntry *msq;
-
- ret = pb_read_one_eof(img, &msq, PB_IPCNS_MSG_ENT);
- if (ret < 0) {
- pr_err("Failed to read IPC messages queue\n");
- ret = -EIO;
- goto err;
- }
- if (ret == 0)
- break;
-
- pr_info_ipc_msg_entry(msq);
-
- ret = prepare_ipc_msg_queue(img, msq);
- ipc_msg_entry__free_unpacked(msq, NULL);
-
- if (ret < 0) {
- pr_err("Failed to prepare messages queue\n");
- goto err;
- }
- }
-
- close_image(img);
- return 0;
-err:
- close_image(img);
- return ret;
-}
-
-static int prepare_ipc_shm_pages(struct cr_img *img, const IpcShmEntry *shm)
-{
- int ret;
- void *data;
-
- data = shmat(shm->desc->id, NULL, 0);
- if (data == (void *)-1) {
- pr_perror("Failed to attach IPC shared memory");
- return -errno;
- }
- ret = read_img_buf(img, data, round_up(shm->size, sizeof(u32)));
- if (ret < 0) {
- pr_err("Failed to read IPC shared memory data\n");
- return ret;
- }
- if (shmdt(data)) {
- pr_perror("Failed to detach IPC shared memory");
- return -errno;
- }
- return 0;
-}
-
-static int prepare_ipc_shm_seg(struct cr_img *img, const IpcShmEntry *shm)
-{
- int ret, id;
- struct sysctl_req req[] = {
- { "kernel/shm_next_id", &shm->desc->id, CTL_U32 },
- };
- struct shmid_ds shmid;
-
- ret = sysctl_op(req, ARRAY_SIZE(req), CTL_WRITE, CLONE_NEWIPC);
- if (ret < 0) {
- pr_err("Failed to set desired IPC shm ID\n");
- return ret;
- }
-
- id = shmget(shm->desc->key, shm->size,
- shm->desc->mode | IPC_CREAT | IPC_EXCL);
- if (id == -1) {
- pr_perror("Failed to create shm set");
- return -errno;
- }
-
- if (id != shm->desc->id) {
- pr_err("Failed to restore shm id (%d instead of %d)\n",
- id, shm->desc->id);
- return -EFAULT;
- }
-
- ret = shmctl(id, IPC_STAT, &shmid);
- if (ret == -1) {
- pr_err("Failed to get shm stat structure\n");
- return -EFAULT;
- }
-
- shmid.shm_perm.uid = shm->desc->uid;
- shmid.shm_perm.gid = shm->desc->gid;
-
- ret = shmctl(id, IPC_SET, &shmid);
- if (ret == -1) {
- pr_err("Failed to set shm uid and gid\n");
- return -EFAULT;
- }
-
- ret = prepare_ipc_shm_pages(img, shm);
- if (ret < 0) {
- pr_err("Failed to update shm pages\n");
- return ret;
- }
- return 0;
-}
-
-static int prepare_ipc_shm(int pid)
-{
- int ret;
- struct cr_img *img;
-
- pr_info("Restoring IPC shared memory\n");
- img = open_image(CR_FD_IPCNS_SHM, O_RSTR, pid);
- if (!img)
- return -1;
-
- while (1) {
- IpcShmEntry *shm;
-
- ret = pb_read_one_eof(img, &shm, PB_IPC_SHM);
- if (ret < 0) {
- pr_err("Failed to read IPC shared memory segment\n");
- ret = -EIO;
- goto err;
- }
- if (ret == 0)
- break;
-
- pr_info_ipc_shm(shm);
-
- ret = prepare_ipc_shm_seg(img, shm);
- ipc_shm_entry__free_unpacked(shm, NULL);
-
- if (ret < 0) {
- pr_err("Failed to prepare shm segment\n");
- goto err;
- }
- }
-
- close_image(img);
- return 0;
-err:
- close_image(img);
- return ret;
-}
-
-static int prepare_ipc_var(int pid)
-{
- int ret;
- struct cr_img *img;
- IpcVarEntry *var;
-
- pr_info("Restoring IPC variables\n");
- img = open_image(CR_FD_IPC_VAR, O_RSTR, pid);
- if (!img)
- return -1;
-
- ret = pb_read_one(img, &var, PB_IPC_VAR);
- close_image(img);
- if (ret <= 0) {
- pr_err("Failed to read IPC namespace variables\n");
- return -EFAULT;
- }
-
- ret = ipc_sysctl_req(var, CTL_WRITE);
- ipc_var_entry__free_unpacked(var, NULL);
-
- if (ret < 0) {
- pr_err("Failed to prepare IPC namespace variables\n");
- return -EFAULT;
- }
-
- return 0;
-}
-
-int prepare_ipc_ns(int pid)
-{
- int ret;
-
- pr_info("Restoring IPC namespace\n");
- ret = prepare_ipc_var(pid);
- if (ret < 0)
- return ret;
- ret = prepare_ipc_shm(pid);
- if (ret < 0)
- return ret;
- ret = prepare_ipc_msg(pid);
- if (ret < 0)
- return ret;
- ret = prepare_ipc_sem(pid);
- if (ret < 0)
- return ret;
- return 0;
-}
-
-struct ns_desc ipc_ns_desc = NS_DESC_ENTRY(CLONE_NEWIPC, "ipc");
diff --git a/irmap.c b/irmap.c
deleted file mode 100644
index 81d245c3e851..000000000000
--- a/irmap.c
+++ /dev/null
@@ -1,489 +0,0 @@
-/*
- * IRMAP -- inode reverse mapping.
- *
- * Helps us to map inode number (and device) back to path
- * so that we can restore inotify/fanotify-s.
- *
- * Scanning _is_ slow, so we limit it with hints, which are
- * heurisitical known places where notifies are typically put.
- */
-
-#include <stdbool.h>
-#include <fcntl.h>
-#include <dirent.h>
-#include <string.h>
-#include <stdio.h>
-#include <sys/stat.h>
-#include <unistd.h>
-
-#include "xmalloc.h"
-#include "irmap.h"
-#include "mount.h"
-#include "log.h"
-#include "util.h"
-#include "image.h"
-#include "stats.h"
-#include "pstree.h"
-#include "cr_options.h"
-
-#include "protobuf.h"
-#include "protobuf/fsnotify.pb-c.h"
-#include "protobuf/fh.pb-c.h"
-
-#undef LOG_PREFIX
-#define LOG_PREFIX "irmap: "
-
-#define IRMAP_CACHE_BITS 5
-#define IRMAP_CACHE_SIZE (1 << IRMAP_CACHE_BITS)
-#define IRMAP_CACHE_MASK (IRMAP_CACHE_SIZE - 1)
-
-static inline int irmap_hashfn(unsigned int s_dev, unsigned long i_ino)
-{
- return (s_dev + i_ino) & IRMAP_CACHE_MASK;
-}
-
-struct irmap {
- unsigned int dev;
- unsigned long ino;
- char *path;
- struct irmap *next;
- bool revalidate;
- int nr_kids;
- struct irmap *kids;
-};
-
-static struct irmap *cache[IRMAP_CACHE_SIZE];
-
-static struct irmap hints[] = {
- { .path = "/etc", .nr_kids = -1, },
- { .path = "/var/spool", .nr_kids = -1, },
- { .path = "/var/log", .nr_kids = -1, },
- { .path = "/usr/share/dbus-1/system-services", .nr_kids = -1 },
- { .path = "/var/lib/polkit-1/localauthority", .nr_kids = -1 },
- { .path = "/usr/share/polkit-1/actions", .nr_kids = -1 },
- { .path = "/lib/udev", .nr_kids = -1, },
- { .path = "/.", .nr_kids = 0, },
- { .path = "/no-such-path", .nr_kids = -1, },
- { },
-};
-
-/*
- * Update inode (and device) number and cache the entry
- */
-static int irmap_update_stat(struct irmap *i)
-{
- struct stat st;
- int mntns_root;
- unsigned hv;
-
- if (i->ino)
- return 0;
-
- mntns_root = get_service_fd(ROOT_FD_OFF);
-
- pr_debug("Refresh stat for %s\n", i->path);
- if (fstatat(mntns_root, i->path + 1, &st, AT_SYMLINK_NOFOLLOW)) {
- pr_perror("Can't stat %s", i->path);
- return -1;
- }
-
- i->revalidate = false;
- i->dev = st.st_dev;
- i->ino = st.st_ino;
- if (!S_ISDIR(st.st_mode))
- i->nr_kids = 0; /* don't irmap_update_dir */
-
- hv = irmap_hashfn(i->dev, i->ino);
- i->next = cache[hv];
- cache[hv] = i;
-
- return 0;
-}
-
-/*
- * Update list of children, but don't cache any. Later
- * we'll scan them one-by-one and cache.
- */
-static int irmap_update_dir(struct irmap *t)
-{
- int fd, nr = 0, mntns_root;
- DIR *dfd;
- struct dirent *de;
-
- if (t->nr_kids >= 0)
- return 0;
-
- mntns_root = get_service_fd(ROOT_FD_OFF);
-
- pr_debug("Refilling %s dir\n", t->path);
- fd = openat(mntns_root, t->path + 1, O_RDONLY);
- if (fd < 0) {
- pr_perror("Can't open %s", t->path);
- return -1;
- }
-
- dfd = fdopendir(fd);
- if (!dfd) {
- pr_perror("Can't opendir %s", t->path);
- return -1;
- }
-
- errno = 0;
- while ((de = readdir(dfd)) != NULL) {
- struct irmap *k;
-
- if (dir_dots(de))
- continue;
-
- nr++;
- if (xrealloc_safe(&t->kids, nr * sizeof(struct irmap)))
- goto out_err;
-
- k = &t->kids[nr - 1];
-
- k->kids = NULL; /* for xrealloc above */
- k->ino = 0; /* for irmap_update_stat */
- k->nr_kids = -1; /* for irmap_update_dir */
- k->path = xsprintf("%s/%s", t->path, de->d_name);
- if (!k->path)
- goto out_err;
- }
-
- if (errno) {
- pr_perror("Readdir failed");
- goto out_err;
- }
-
- closedir(dfd);
- close(fd);
- t->nr_kids = nr;
- return 0;
-
-out_err:
- xfree(t->kids);
- closedir(dfd);
- close(fd);
- return -1;
-}
-
-static struct irmap *irmap_scan(struct irmap *t, unsigned int dev, unsigned long ino)
-{
- struct irmap *c;
- int i;
-
- if (irmap_update_stat(t))
- return NULL;
-
- if (t->dev == dev && t->ino == ino)
- return t;
-
- if (irmap_update_dir(t))
- return NULL;
-
- for (i = 0; i < t->nr_kids; i++) {
- c = irmap_scan(&t->kids[i], dev, ino);
- if (c)
- return c;
- }
-
- return NULL;
-}
-
-static int irmap_revalidate(struct irmap *c, struct irmap **p)
-{
- struct stat st;
- int mntns_root;
-
- mntns_root = get_service_fd(ROOT_FD_OFF);
-
- pr_debug("Revalidate stat for %s\n", c->path);
- if (fstatat(mntns_root, c->path + 1, &st, AT_SYMLINK_NOFOLLOW)) {
- /* File can be (re)moved, so just treat it as invalid */
- pr_perror("Can't stat %s", c->path);
- goto invalid;
- }
-
- if (c->dev != st.st_dev)
- goto invalid;
- if (c->ino != st.st_ino)
- goto invalid;
-
- c->revalidate = false;
- return 0;
-
-invalid:
- pr_debug("\t%x:%lx is invalid\n", c->dev, c->ino);
- *p = c->next;
- xfree(c->path);
- xfree(c);
- return 1;
-}
-
-static bool doing_predump = false;
-
-char *irmap_lookup(unsigned int s_dev, unsigned long i_ino)
-{
- struct irmap *c, *h, **p;
- char *path = NULL;
- int hv;
- struct irmap_path_opt *o;
-
- s_dev = kdev_to_odev(s_dev);
-
- pr_debug("Resolving %x:%lx path\n", s_dev, i_ino);
-
- /*
- * If we're in predump, then processes already run
- * and the root_item is already freed by that time.
- * But the root service fd is already set by the
- * irmap_predump_prep, so we just go ahead and scan.
- */
- if (!doing_predump &&
- __mntns_get_root_fd(root_item->pid.real) < 0)
- goto out;
-
- timing_start(TIME_IRMAP_RESOLVE);
-
- hv = irmap_hashfn(s_dev, i_ino);
- for (p = &cache[hv]; *p; p = &(*p)->next) {
- c = *p;
- if (!(c->dev == s_dev && c->ino == i_ino))
- continue;
-
- if (c->revalidate && irmap_revalidate(c, p))
- continue;
-
- pr_debug("\tFound %s in cache\n", c->path);
- path = c->path;
- goto out;
- }
-
- /* Let's scan any user provided paths first; since the user told us
- * about them, hopefully they're more interesting than our hints.
- */
- list_for_each_entry(o, &opts.irmap_scan_paths, node) {
- c = irmap_scan(o->ir, s_dev, i_ino);
- if (c) {
- pr_debug("\tScanned %s\n", c->path);
- path = c->path;
- goto out;
- }
- }
-
- for (h = hints; h->path; h++) {
- pr_debug("Scanning %s hint\n", h->path);
- c = irmap_scan(h, s_dev, i_ino);
- if (c) {
- pr_debug("\tScanned %s\n", c->path);
- path = c->path;
- goto out;
- }
- }
-
-out:
- timing_stop(TIME_IRMAP_RESOLVE);
- return path;
-}
-
-/*
- * IRMAP pre-cache -- do early irmap scan on pre-dump to reduce
- * the freeze time on dump
- */
-
-struct irmap_predump {
- unsigned int dev;
- unsigned long ino;
- FhEntry fh;
- struct irmap_predump *next;
-};
-
-static struct irmap_predump *predump_queue;
-
-int irmap_queue_cache(unsigned int dev, unsigned long ino,
- FhEntry *fh)
-{
- struct irmap_predump *ip;
-
- ip = xmalloc(sizeof(*ip));
- if (!ip)
- return -1;
-
- ip->dev = dev;
- ip->ino = ino;
- ip->fh = *fh;
- fh->handle = NULL; /* don't free in free_fhandle */
-
- pr_debug("Queue %x:%lx for pre-dump\n", dev, ino);
-
- ip->next = predump_queue;
- predump_queue = ip;
- return 0;
-}
-
-int irmap_predump_prep(void)
-{
- /*
- * Tasks are about to get released soon, but
- * we'll need to do FS scan for irmaps. In this
- * scan we will need to know the root dir tasks
- * live in. Need to make sure the respective fd
- * (service) is set to that root, so that the
- * scan works and doesn't race with the tasks
- * dying or changind root.
- */
-
- doing_predump = true;
- return __mntns_get_root_fd(root_item->pid.real) < 0 ? -1 : 0;
-}
-
-int irmap_predump_run(void)
-{
- int ret = 0;
- struct cr_img *img;
- struct irmap_predump *ip;
-
- img = open_image_at(AT_FDCWD, CR_FD_IRMAP_CACHE, O_DUMP);
- if (!img)
- return -1;
-
- pr_info("Running irmap pre-dump\n");
-
- for (ip = predump_queue; ip; ip = ip->next) {
- pr_debug("\tchecking %x:%lx\n", ip->dev, ip->ino);
- ret = check_open_handle(ip->dev, ip->ino, &ip->fh);
- if (ret) {
- pr_err("Failed to resolve %x:%lx\n", ip->dev, ip->ino);
- break;
- }
-
- if (ip->fh.path) {
- IrmapCacheEntry ic = IRMAP_CACHE_ENTRY__INIT;
-
- pr_info("Irmap cache %x:%lx -> %s\n", ip->dev, ip->ino, ip->fh.path);
- ic.dev = ip->dev;
- ic.inode = ip->ino;
- ic.path = ip->fh.path;
-
- ret = pb_write_one(img, &ic, PB_IRMAP_CACHE);
- if (ret)
- break;
- }
- }
-
- close_image(img);
- return ret;
-}
-
-static int irmap_cache_one(IrmapCacheEntry *ie)
-{
- struct irmap *ic;
- unsigned hv;
-
- ic = xmalloc(sizeof(*ic));
- if (!ic)
- return -1;
-
- ic->dev = ie->dev;
- ic->ino = ie->inode;
- ic->path = xstrdup(ie->path);
- if (!ie->path) {
- xfree(ic);
- return -1;
- }
-
- ic->nr_kids = 0;
- /*
- * We've loaded entry from cache, thus we'll need to check
- * whether it's still valid when find it in cache.
- */
- ic->revalidate = true;
-
- pr_debug("Pre-cache %x:%lx -> %s\n", ic->dev, ic->ino, ic->path);
-
- hv = irmap_hashfn(ic->dev, ic->ino);
- ic->next = cache[hv];
- cache[hv] = ic;
-
- return 0;
-}
-
-static int open_irmap_cache(struct cr_img **img)
-{
- int dir = AT_FDCWD;
-
- pr_info("Searching irmap cache in work dir\n");
-in:
- *img = open_image_at(dir, CR_FD_IRMAP_CACHE, O_RSTR);
- if (dir != AT_FDCWD)
- close(dir);
-
- if (empty_image(*img)) {
- close_image(*img);
- if (dir == AT_FDCWD) {
- pr_info("Searching irmap cache in parent\n");
- dir = openat(get_service_fd(IMG_FD_OFF),
- CR_PARENT_LINK, O_RDONLY);
- if (dir >= 0)
- goto in;
- if (errno != ENOENT)
- return -1;
- }
-
- pr_info("No irmap cache\n");
- return 0;
- }
-
- if (!*img)
- return -1;
-
- pr_info("... done\n");
- return 1;
-}
-
-int irmap_load_cache(void)
-{
- int ret;
- struct cr_img *img;
-
- ret = open_irmap_cache(&img);
- if (ret <= 0)
- return ret;
-
- pr_info("Loading irmap cache\n");
- while (1) {
- IrmapCacheEntry *ic;
-
- ret = pb_read_one_eof(img, &ic, PB_IRMAP_CACHE);
- if (ret <= 0)
- break;
-
- ret = irmap_cache_one(ic);
- if (ret < 0)
- break;
-
- irmap_cache_entry__free_unpacked(ic, NULL);
- }
-
- close_image(img);
- return ret;
-}
-
-int irmap_scan_path_add(char *path)
-{
- struct irmap_path_opt *o;
-
- o = xzalloc(sizeof(*o));
- if (!o)
- return -1;
-
- o->ir = xzalloc(sizeof(*o->ir));
- if (!o->ir) {
- xfree(o);
- return -1;
- }
-
- o->ir->path = path;
- o->ir->nr_kids = -1;
- list_add(&o->node, &opts.irmap_scan_paths);
- return 0;
-}
diff --git a/kcmp-ids.c b/kcmp-ids.c
deleted file mode 100644
index 853879fe071b..000000000000
--- a/kcmp-ids.c
+++ /dev/null
@@ -1,153 +0,0 @@
-#include <unistd.h>
-#include <stdlib.h>
-#include <sys/syscall.h>
-
-#include "asm/types.h"
-#include "rbtree.h"
-#include "util.h"
-#include "kcmp-ids.h"
-
-/*
- * We track shared files by global rbtree, where each node might
- * be a root for subtree. The reason for that is the nature of data
- * we obtain from operating system.
- *
- * Basically OS provides us two ways to distinguish files
- *
- * - information obtained from fstat call
- * - shiny new sys_kcmp system call (which may compare the file descriptor
- * pointers inside the kernel and provide us order info)
- *
- * So, to speedup procedure of searching for shared file descriptors
- * we use both techniques. From fstat call we get that named general file
- * IDs (genid) which are carried in the main rbtree.
- *
- * In case if two genid are the same -- we need to use a second way and
- * call for sys_kcmp. Thus, if kernel tells us that files have identical
- * genid but in real they are different from kernel point of view -- we assign
- * a second unique key (subid) to such file descriptor and put it into a subtree.
- *
- * So the tree will look like
- *
- * (root)
- * genid-1
- * / \
- * genid-2 genid-3
- * / \ / \
- *
- * Where each genid node might be a sub-rbtree as well
- *
- * (genid-N)
- * / \
- * subid-1 subid-2
- * / \ / \
- *
- * Carrying two rbtree at once allow us to minimize the number
- * of sys_kcmp syscalls, also to collect and dump file descriptors
- * in one pass.
- */
-
-struct kid_entry {
- struct rb_node node;
-
- struct rb_root subtree_root;
- struct rb_node subtree_node;
-
- u32 subid; /* subid is always unique */
- struct kid_elem elem;
-} __aligned(sizeof(long));
-
-static struct kid_entry *alloc_kid_entry(struct kid_tree *tree, struct kid_elem *elem)
-{
- struct kid_entry *e;
-
- e = xmalloc(sizeof(*e));
- if (!e)
- goto err;
-
- e->subid = tree->subid++;
- e->elem = *elem;
-
- /* Make sure no overflow here */
- BUG_ON(!e->subid);
-
- rb_init_node(&e->node);
- rb_init_node(&e->subtree_node);
- e->subtree_root = RB_ROOT;
- rb_link_and_balance(&e->subtree_root, &e->subtree_node,
- NULL, &e->subtree_root.rb_node);
-err:
- return e;
-}
-
-static u32 kid_generate_sub(struct kid_tree *tree, struct kid_entry *e,
- struct kid_elem *elem, int *new_id)
-{
- struct rb_node *node = e->subtree_root.rb_node;
- struct kid_entry *sub = NULL;
-
- struct rb_node **new = &e->subtree_root.rb_node;
- struct rb_node *parent = NULL;
-
- BUG_ON(!node);
-
- while (node) {
- struct kid_entry *this = rb_entry(node, struct kid_entry, subtree_node);
- int ret = syscall(SYS_kcmp, this->elem.pid, elem->pid, tree->kcmp_type,
- this->elem.idx, elem->idx);
-
- parent = *new;
- if (ret == 1)
- node = node->rb_left, new = &((*new)->rb_left);
- else if (ret == 2)
- node = node->rb_right, new = &((*new)->rb_right);
- else if (ret == 0)
- return this->subid;
- else {
- pr_perror("kcmp failed: pid (%d %d) type %u idx (%u %u)",
- this->elem.pid, elem->pid, tree->kcmp_type,
- this->elem.idx, elem->idx);
- return 0;
- }
- }
-
- sub = alloc_kid_entry(tree, elem);
- if (!sub)
- return 0;
-
- rb_link_and_balance(&e->subtree_root, &sub->subtree_node, parent, new);
- *new_id = 1;
- return sub->subid;
-}
-
-u32 kid_generate_gen(struct kid_tree *tree,
- struct kid_elem *elem, int *new_id)
-{
- struct rb_node *node = tree->root.rb_node;
- struct kid_entry *e = NULL;
-
- struct rb_node **new = &tree->root.rb_node;
- struct rb_node *parent = NULL;
-
- while (node) {
- struct kid_entry *this = rb_entry(node, struct kid_entry, node);
-
- parent = *new;
- if (elem->genid < this->elem.genid)
- node = node->rb_left, new = &((*new)->rb_left);
- else if (elem->genid > this->elem.genid)
- node = node->rb_right, new = &((*new)->rb_right);
- else
- return kid_generate_sub(tree, this, elem, new_id);
- }
-
- e = alloc_kid_entry(tree, elem);
- if (!e)
- return 0;
-
- rb_link_and_balance(&tree->root, &e->node, parent, new);
- *new_id = 1;
- return e->subid;
-
-}
-
diff --git a/kerndat.c b/kerndat.c
deleted file mode 100644
index eb296033e5d9..000000000000
--- a/kerndat.c
+++ /dev/null
@@ -1,556 +0,0 @@
-#include <unistd.h>
-#include <fcntl.h>
-#include <stdio.h>
-#include <sys/file.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <sys/mman.h>
-#include <errno.h>
-#include <sys/syscall.h>
-
-#include "log.h"
-#include "bug.h"
-#include "kerndat.h"
-#include "fs-magic.h"
-#include "mem.h"
-#include "compiler.h"
-#include "sysctl.h"
-#include "asm/types.h"
-#include "cr_options.h"
-#include "util.h"
-#include "lsm.h"
-#include "proc_parse.h"
-#include "config.h"
-
-struct kerndat_s kdat = {
- /*
- * TCP send receive buffers are calculated
- * dynamically by the kernel taking into account
- * the size of memory present on the machine.
- *
- * On machines with huge amount of memory it grants
- * up to 4M for sendding buffer and 6M for receiving.
- * But in turn for low mem machines these limits
- * are quite small down to 16K for sending and
- * 87380 for receiving.
- *
- * We will find out precise limits in tcp_read_sysctl_limits
- * but by default lets stick for small data to not fail
- * on restore: better to slowdown restore procedure than
- * failing completely.
- */
- .tcp_max_rshare = 87380,
-};
-
-static int check_pagemap(void)
-{
- int ret, fd;
- u64 pfn = 0;
-
- fd = __open_proc(PROC_SELF, EPERM, O_RDONLY, "pagemap");
- if (fd < 0) {
- if (errno == EPERM) {
- pr_info("Pagemap disabled");
- kdat.pmap = PM_DISABLED;
- return 0;
- }
-
- return -1;
- }
-
- /* Get the PFN of some present page. Stack is here, so try it :) */
- ret = pread(fd, &pfn, sizeof(pfn), (((unsigned long)&ret) / page_size()) * sizeof(pfn));
- if (ret != sizeof(pfn)) {
- pr_perror("Can't read pagemap");
- return -1;
- }
-
- close(fd);
-
- if ((pfn & PME_PFRAME_MASK) == 0) {
- pr_info("Pagemap provides flags only\n");
- kdat.pmap = PM_FLAGS_ONLY;
- } else {
- pr_info("Pagemap is fully functional\n");
- kdat.pmap = PM_FULL;
- }
-
- return 0;
-}
-
-/*
- * Anonymous shared mappings are backed by hidden tmpfs
- * mount. Find out its dev to distinguish such mappings
- * from real tmpfs files maps.
- */
-
-static int parse_self_maps(unsigned long vm_start, dev_t *device)
-{
- FILE *maps;
- char buf[1024];
-
- maps = fopen_proc(PROC_SELF, "maps");
- if (maps == NULL) {
- pr_perror("Can't open self maps");
- return -1;
- }
-
- while (fgets(buf, sizeof(buf), maps) != NULL) {
- char *end, *aux;
- unsigned long start;
- int maj, min;
-
- start = strtoul(buf, &end, 16);
- if (vm_start > start)
- continue;
- if (vm_start < start)
- break;
-
- /* It's ours */
- aux = strchr(end + 1, ' '); /* end prot */
- aux = strchr(aux + 1, ' '); /* prot pgoff */
- aux = strchr(aux + 1, ' '); /* pgoff dev */
-
- maj = strtoul(aux + 1, &end, 16);
- min = strtoul(end + 1, NULL, 16);
-
- *device = makedev(maj, min);
- fclose(maps);
- return 0;
- }
-
- fclose(maps);
- return -1;
-}
-
-static int kerndat_get_shmemdev(void)
-{
- void *map;
- char maps[128];
- struct stat buf;
- dev_t dev;
-
- map = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE,
- MAP_SHARED | MAP_ANONYMOUS, 0, 0);
- if (map == MAP_FAILED) {
- pr_perror("Can't mmap memory for shmemdev test");
- return -1;
- }
-
- sprintf(maps, "/proc/self/map_files/%lx-%lx",
- (unsigned long)map, (unsigned long)map + page_size());
- if (stat(maps, &buf) < 0) {
- int e = errno;
- if (errno == EPERM) {
- /*
- * Kernel disables messing with map_files.
- * OK, let's go the slower route.
- */
-
- if (parse_self_maps((unsigned long)map, &dev) < 0) {
- pr_err("Can't read self maps\n");
- goto err;
- }
- } else {
- pr_perror("Can't stat self map_files %d", e);
- goto err;
- }
- } else
- dev = buf.st_dev;
-
- munmap(map, PAGE_SIZE);
- kdat.shmem_dev = dev;
- pr_info("Found anon-shmem device at %"PRIx64"\n", kdat.shmem_dev);
- return 0;
-
-err:
- munmap(map, PAGE_SIZE);
- return -1;
-}
-
-static dev_t get_host_dev(unsigned int which)
-{
- static struct kst {
- const char *name;
- const char *path;
- unsigned int magic;
- dev_t fs_dev;
- } kstat[KERNDAT_FS_STAT_MAX] = {
- [KERNDAT_FS_STAT_DEVPTS] = {
- .name = "devpts",
- .path = "/dev/pts",
- .magic = DEVPTS_SUPER_MAGIC,
- },
- [KERNDAT_FS_STAT_DEVTMPFS] = {
- .name = "devtmpfs",
- .path = "/dev",
- .magic = TMPFS_MAGIC,
- },
- [KERNDAT_FS_STAT_BINFMT_MISC] = {
- .name = "binfmt_misc",
- .path = "/proc/sys/fs/binfmt_misc",
- .magic = BINFMTFS_MAGIC,
- },
- };
-
- if (which >= KERNDAT_FS_STAT_MAX) {
- pr_err("Wrong fs type %u passed\n", which);
- return 0;
- }
-
- if (kstat[which].fs_dev == 0) {
- struct statfs fst;
- struct stat st;
-
- if (statfs(kstat[which].path, &fst)) {
- pr_perror("Unable to statefs %s", kstat[which].path);
- return 0;
- }
-
- /*
- * XXX: If the fs we need is not there, it still
- * may mean that it's virtualized, but just not
- * mounted on the host.
- */
-
- if (fst.f_type != kstat[which].magic) {
- pr_err("%s isn't mount on the host\n", kstat[which].name);
- return 0;
- }
-
- if (stat(kstat[which].path, &st)) {
- pr_perror("Unable to stat %s", kstat[which].path);
- return 0;
- }
-
- BUG_ON(st.st_dev == 0);
- kstat[which].fs_dev = st.st_dev;
- }
-
- return kstat[which].fs_dev;
-}
-
-int kerndat_fs_virtualized(unsigned int which, u32 kdev)
-{
- dev_t host_fs_dev;
-
- host_fs_dev = get_host_dev(which);
- if (host_fs_dev == 0)
- return -1;
-
- return (kdev_to_odev(kdev) == host_fs_dev) ? 0 : 1;
-}
-
-/*
- * Check whether pagemap reports soft dirty bit. Kernel has
- * this functionality under CONFIG_MEM_SOFT_DIRTY option.
- */
-
-int kerndat_get_dirty_track(void)
-{
- char *map;
- int pm2;
- u64 pmap = 0;
- int ret = -1;
-
- map = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE,
- MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
- if (map == MAP_FAILED) {
- pr_perror("Can't mmap memory for pagemap test");
- return ret;
- }
-
- /*
- * Kernel shows soft-dirty bits only if this soft-dirty
- * was at least once re-set. (this is to be removed in
- * a couple of kernel releases)
- */
- ret = do_task_reset_dirty_track(getpid());
- if (ret < 0)
- return ret;
- if (ret == 1)
- goto no_dt;
-
- ret = -1;
- pm2 = open("/proc/self/pagemap", O_RDONLY);
- if (pm2 < 0) {
- pr_perror("Can't open pagemap file");
- munmap(map, PAGE_SIZE);
- return ret;
- }
-
- map[0] = '\0';
-
- lseek(pm2, (unsigned long)map / PAGE_SIZE * sizeof(u64), SEEK_SET);
- ret = read(pm2, &pmap, sizeof(pmap));
- if (ret < 0)
- pr_perror("Read pmap err!");
-
- close(pm2);
- munmap(map, PAGE_SIZE);
-
- if (pmap & PME_SOFT_DIRTY) {
- pr_info("Dirty track supported on kernel\n");
- kdat.has_dirty_track = true;
- } else {
-no_dt:
- pr_info("Dirty tracking support is OFF\n");
- if (opts.track_mem) {
- pr_err("Tracking memory is not available\n");
- return -1;
- }
- }
-
- return 0;
-}
-
-/*
- * Strictly speaking, if there is a machine with huge amount
- * of memory, we're allowed to send up to 4M and read up to
- * 6M of tcp data at once. But we will figure out precise size
- * of a limit a bit later when restore starts.
- *
- * Meanwhile set it up to 2M and 3M, which is safe enough to
- * proceed without errors.
- */
-
-static int tcp_read_sysctl_limits(void)
-{
- u32 vect[3] = { };
- int ret;
-
- struct sysctl_req req[] = {
- { "net/ipv4/tcp_rmem", &vect, CTL_U32A(ARRAY_SIZE(vect)), CTL_FLAGS_OPTIONAL },
- };
-
- /*
- * Lets figure out which exactly amount of memory is
- * availabe for send/read queues on restore.
- */
- ret = sysctl_op(req, ARRAY_SIZE(req), CTL_READ, 0);
- if (ret || vect[0] == 0) {
- pr_warn("TCP mem sysctls are not available. Using defaults.\n");
- goto out;
- }
-
- kdat.tcp_max_rshare = min(kdat.tcp_max_rshare, (int)vect[2]);
-
- if (kdat.tcp_max_rshare < 128)
- pr_warn("The memory limits for TCP queues are suspiciously small\n");
-out:
- pr_debug("TCP recv queue memory limit is %d\n", kdat.tcp_max_rshare);
- return 0;
-}
-
-/* The page frame number (PFN) is constant for the zero page */
-static int init_zero_page_pfn()
-{
- void *addr;
- int ret;
-
- addr = mmap(NULL, PAGE_SIZE, PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
- if (addr == MAP_FAILED) {
- pr_perror("Unable to map zero page");
- return 0;
- }
-
- if (*((int *) addr) != 0) {
- BUG();
- return -1;
- }
-
- if (kdat.pmap != PM_FULL) {
- pr_info("Zero page detection failed, optimization turns off.\n");
- return 0;
- }
-
- ret = vaddr_to_pfn((unsigned long)addr, &kdat.zero_page_pfn);
- munmap(addr, PAGE_SIZE);
-
- if (kdat.zero_page_pfn == 0)
- ret = -1;
-
- return ret;
-}
-
-static int get_last_cap(void)
-{
- struct sysctl_req req[] = {
- { "kernel/cap_last_cap", &kdat.last_cap, CTL_U32 },
- };
-
- return sysctl_op(req, ARRAY_SIZE(req), CTL_READ, 0);
-}
-
-#ifdef CONFIG_HAS_MEMFD
-static bool kerndat_has_memfd_create(void)
-{
- int ret;
-
- ret = syscall(SYS_memfd_create, NULL, 0);
-
- if (ret == -1 && errno == ENOSYS)
- kdat.has_memfd = false;
- else if (ret == -1 && errno == EFAULT)
- kdat.has_memfd = true;
- else {
- pr_err("Unexpected error %d from memfd_create(NULL, 0)\n", ret);
- return -1;
- }
-
- return 0;
-}
-#else
-static bool kerndat_has_memfd_create(void)
-{
- kdat.has_memfd = false;
- return 0;
-}
-#endif
-
-static int get_task_size(void)
-{
- kdat.task_size = task_size();
- pr_debug("Found task size of %lx\n", kdat.task_size);
- return 0;
-}
-
-int kerndat_fdinfo_has_lock()
-{
- int fd, pfd = -1, exit_code = -1, len;
- char buf[PAGE_SIZE];
-
- fd = open("/proc/locks", O_RDONLY);
- if (fd < 0) {
- pr_perror("Unable to open /proc/locks");
- return -1;
- }
-
- if (flock(fd, LOCK_SH)) {
- pr_perror("Can't take a lock");
- goto out;
- }
-
- pfd = open_proc(PROC_SELF, "fdinfo/%d", fd);
- if (pfd < 0)
- goto out;
-
- len = read(pfd, buf, sizeof(buf) - 1);
- if (len < 0) {
- pr_perror("Unable to read");
- goto out;
- }
- buf[len] = 0;
-
- kdat.has_fdinfo_lock = (strstr(buf, "lock:") != NULL);
-
- exit_code = 0;
-out:
- close(pfd);
- close(fd);
-
- return exit_code;
-}
-
-static int get_ipv6()
-{
- if (access("/proc/sys/net/ipv6", F_OK) < 0) {
- if (errno == ENOENT) {
- pr_debug("ipv6 is disabled\n");
- kdat.ipv6 = false;
- return 0;
- }
- pr_perror("Unable to access /proc/sys/net/ipv6");
- return -1;
- }
- kdat.ipv6 = true;
- return 0;
-}
-
-int kerndat_loginuid(bool only_dump)
-{
- unsigned int saved_loginuid;
- int ret;
-
- kdat.has_loginuid = false;
-
- /* No such file: CONFIG_AUDITSYSCALL disabled */
- saved_loginuid = parse_pid_loginuid(getpid(), &ret, true);
- if (ret < 0)
- return 0;
-
- if (only_dump) {
- kdat.has_loginuid = true;
- return 0;
- }
-
- /*
- * From kernel v3.13-rc2 it's possible to unset loginuid value,
- * on that rely dump/restore code.
- * See also: marc.info/?l=git-commits-head&m=138509506407067
- */
- if (prepare_loginuid(INVALID_UID, LOG_WARN) < 0)
- return 0;
- /* Cleaning value back as it was */
- if (prepare_loginuid(saved_loginuid, LOG_WARN) < 0)
- return 0;
-
- kdat.has_loginuid = true;
- return 0;
-}
-
-int kerndat_init(void)
-{
- int ret;
-
- ret = check_pagemap();
- if (!ret)
- ret = kerndat_get_shmemdev();
- if (!ret)
- ret = kerndat_get_dirty_track();
- if (!ret)
- ret = init_zero_page_pfn();
- if (!ret)
- ret = get_last_cap();
- if (!ret)
- ret = kerndat_fdinfo_has_lock();
- if (!ret)
- ret = get_task_size();
- if (!ret)
- ret = get_ipv6();
- if (!ret)
- ret = kerndat_loginuid(true);
-
- kerndat_lsm();
-
- return ret;
-}
-
-int kerndat_init_rst(void)
-{
- int ret;
-
- /*
- * Read TCP sysctls before anything else,
- * since the limits we're interested in are
- * not available inside namespaces.
- */
-
- ret = check_pagemap();
- if (!ret)
- ret = tcp_read_sysctl_limits();
- if (!ret)
- ret = get_last_cap();
- if (!ret)
- ret = kerndat_has_memfd_create();
- if (!ret)
- ret = get_task_size();
- if (!ret)
- ret = get_ipv6();
- if (!ret)
- ret = kerndat_loginuid(false);
-
- kerndat_lsm();
-
- return ret;
-}
diff --git a/lib/Makefile b/lib/Makefile
index f07e07b9051e..d1b0450599f8 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -15,11 +15,11 @@ c/built-in.o:
$(call msg-gen, $@)
$(Q) $(MAKE) $(build)=c all
-ccflags-so += $(CFLAGS) -rdynamic -Wl,-soname,$(lib-so).so.$(VERSION_SO_MAJOR)
+cflags-so += $(CFLAGS) -rdynamic -Wl,-soname,$(lib-so).so.$(VERSION_SO_MAJOR)
ldflags-so += -lprotobuf-c
c/$(CRIU_SO): c/built-in.o
$(call msg-link, $@)
- $(Q) $(CC) -shared $(ccflags-so) -o $@ $^ $(ldflags-so) $(LDFLAGS)
+ $(Q) $(CC) -shared $(cflags-so) -o $@ $^ $(ldflags-so) $(LDFLAGS)
lib-c: c/$(CRIU_SO)
PHONY += lib-c
@@ -40,3 +40,5 @@ clean:
all: $(PHONY)
@true
PHONY += all
+
+.PHONY: $(PHONY) clean
diff --git a/lib/c/Makefile b/lib/c/Makefile
index 16b4ef194004..21bf856142cd 100644
--- a/lib/c/Makefile
+++ b/lib/c/Makefile
@@ -1,13 +1,7 @@
obj-y += criu.o
obj-y += $(SRC_DIR)/images/rpc.pb-c.o
-ccflags-y += -iquote $(SRC_DIR)/crtools/$(ARCH_DIR)/include
-ccflags-y += -iquote $(SRC_DIR)/crtools/include -iquote $(obj)/..
+ccflags-y += -iquote $(SRC_DIR)/criu/$(ARCH_DIR)/include
+ccflags-y += -iquote $(SRC_DIR)/criu/include -iquote $(obj)/..
ccflags-y += -iquote $(SRC_DIR)/images
ccflags-y += -fPIC -Wa,--noexecstack -fno-stack-protector
-
-#
-# Remove once criu moved into proper place.
-ccflags-y += -iquote $(SRC_DIR)/$(ARCH_DIR)/include
-ccflags-y += -iquote $(SRC_DIR)/include -iquote $(obj)/..
-ccflags-y += -iquote $(SRC_DIR)/images
diff --git a/lib/py/Makefile b/lib/py/Makefile
index 135fe1c01600..582cc93dea84 100644
--- a/lib/py/Makefile
+++ b/lib/py/Makefile
@@ -8,7 +8,7 @@ images:
# rpc_pb2.py doesn't depend on any other file, so
# it is safe to rename it, dropping ugly _pb2 suffix.
rpc.py:
- $(Q) protoc -I=$(SRC_DIR)/protobuf/ --python_out=./ $(SRC_DIR)/protobuf/$(@:.py=.proto)
+ $(Q) protoc -I=$(SRC_DIR)/images/ --python_out=./ $(SRC_DIR)/images/$(@:.py=.proto)
$(Q) mv $(@:.py=_pb2.py) $@
clean:
diff --git a/lib/py/images/Makefile b/lib/py/images/Makefile
index 98a450065465..c8a748e5c02c 100644
--- a/lib/py/images/Makefile
+++ b/lib/py/images/Makefile
@@ -1,22 +1,21 @@
-all: pb.py protobuf magic.py
+all: pb.py images magic.py
-.PHONY: all protobuf clean pb.py
+.PHONY: all images clean pb.py
-proto := $(filter-out $(SRC_DIR)/protobuf/rpc.proto, $(sort $(wildcard $(SRC_DIR)/protobuf/*.proto)))
+proto := $(filter-out $(SRC_DIR)/images/rpc.proto, $(sort $(wildcard $(SRC_DIR)/images/*.proto)))
proto-py-modules := $(foreach m,$(proto),$(subst -,_,$(notdir $(m:.proto=_pb2))))
-# We don't need rpc_pb2.py here, as it is not related to the
-# images.
+# We don't need rpc_pb2.py here, as it is not related to the images.
# Unfortunately, we can't drop ugly _pb2 suffixes here, because
# some _pb2 files depend on others _pb2 files.
-protobuf:
- $(Q) protoc -I=$(SRC_DIR)/protobuf -I=/usr/include/ --python_out=./ $(proto)
+images:
+ $(Q) protoc -I=$(SRC_DIR)/images -I=/usr/include/ --python_out=./ $(proto)
-magic.py: $(SRC_DIR)/scripts/magic-gen.py $(SRC_DIR)/include/magic.h
- $(E) " GEN " $@
+magic.py: $(SRC_DIR)/scripts/magic-gen.py $(SRC_DIR)/criu/include/magic.h
+ $(call msg-gen, $@)
$(Q) python $^ $@
-pb.py: protobuf
+pb.py: images
$(Q) echo "# Autogenerated. Do not edit!" > $@
$(Q) for m in $(proto-py-modules); do \
echo "from $$m import *" >> $@ ;\
diff --git a/libnetlink.c b/libnetlink.c
deleted file mode 100644
index 49c804fd7053..000000000000
--- a/libnetlink.c
+++ /dev/null
@@ -1,160 +0,0 @@
-#include <linux/types.h>
-#include <sys/socket.h>
-#include <linux/netlink.h>
-#include <linux/rtnetlink.h>
-#include <string.h>
-#include <unistd.h>
-
-#include "libnetlink.h"
-#include "util.h"
-
-int parse_rtattr(struct rtattr *tb[], int max, struct rtattr *rta, int len)
-{
- memset(tb, 0, sizeof(struct rtattr *) * (max + 1));
- while (RTA_OK(rta, len)) {
- if ((rta->rta_type <= max) && (!tb[rta->rta_type]))
- tb[rta->rta_type] = rta;
- rta = RTA_NEXT(rta, len);
- }
- if (len)
- pr_warn("Trimmed RTA: len %d, rta_len %d\n", len, rta->rta_len);
- return 0;
-}
-
-static int nlmsg_receive(char *buf, int len, int (*cb)(struct nlmsghdr *, void *),
- int (*err_cb)(int, void *), void *arg)
-{
- struct nlmsghdr *hdr;
-
- for (hdr = (struct nlmsghdr *)buf; NLMSG_OK(hdr, len); hdr = NLMSG_NEXT(hdr, len)) {
- if (hdr->nlmsg_seq != CR_NLMSG_SEQ)
- continue;
- if (hdr->nlmsg_type == NLMSG_DONE) {
- int *len = (int *)NLMSG_DATA(hdr);
-
- if (*len < 0) {
- pr_err("ERROR %d reported by netlink (%s)\n",
- *len, strerror(-*len));
- return *len;
- }
-
- return 0;
- }
- if (hdr->nlmsg_type == NLMSG_ERROR) {
- struct nlmsgerr *err = (struct nlmsgerr *)NLMSG_DATA(hdr);
-
- if (hdr->nlmsg_len - sizeof(*hdr) < sizeof(struct nlmsgerr)) {
- pr_err("ERROR truncated\n");
- return -1;
- }
-
- if (err->error == 0)
- return 0;
-
- return err_cb(err->error, arg);
- }
- if (cb(hdr, arg))
- return -1;
- }
-
- return 1;
-}
-
-static int rtnl_return_err(int err, void *arg)
-{
- pr_warn("ERROR %d reported by netlink\n", err);
- return err;
-}
-
-int do_rtnl_req(int nl, void *req, int size,
- int (*receive_callback)(struct nlmsghdr *h, void *),
- int (*error_callback)(int err, void *), void *arg)
-{
- struct msghdr msg;
- struct sockaddr_nl nladdr;
- struct iovec iov;
- static char buf[16384];
- int err;
-
- if (!error_callback)
- error_callback = rtnl_return_err;
-
- memset(&msg, 0, sizeof(msg));
- msg.msg_name = &nladdr;
- msg.msg_namelen = sizeof(nladdr);
- msg.msg_iov = &iov;
- msg.msg_iovlen = 1;
-
- memset(&nladdr, 0, sizeof(nladdr));
- nladdr.nl_family = AF_NETLINK;
-
- iov.iov_base = req;
- iov.iov_len = size;
-
- if (sendmsg(nl, &msg, 0) < 0) {
- err = -errno;
- pr_perror("Can't send request message");
- goto err;
- }
-
- iov.iov_base = buf;
- iov.iov_len = sizeof(buf);
-
- while (1) {
-
- memset(&msg, 0, sizeof(msg));
- msg.msg_name = &nladdr;
- msg.msg_namelen = sizeof(nladdr);
- msg.msg_iov = &iov;
- msg.msg_iovlen = 1;
-
- err = recvmsg(nl, &msg, 0);
- if (err < 0) {
- if (errno == EINTR)
- continue;
- else {
- err = -errno;
- pr_perror("Error receiving nl report");
- goto err;
- }
- }
- if (err == 0)
- break;
-
- if (msg.msg_flags & MSG_TRUNC) {
- pr_err("Message truncated\n");
- err = -EMSGSIZE;
- goto err;
- }
-
- err = nlmsg_receive(buf, err, receive_callback, error_callback, arg);
- if (err < 0)
- goto err;
- if (err == 0)
- break;
- }
-
- return 0;
-
-err:
- return err;
-}
-
-int addattr_l(struct nlmsghdr *n, int maxlen, int type, const void *data,
- int alen)
-{
- int len = RTA_LENGTH(alen);
- struct rtattr *rta;
-
- if (NLMSG_ALIGN(n->nlmsg_len) + RTA_ALIGN(len) > maxlen) {
- pr_err("addattr_l ERROR: message exceeded bound of %d\n", maxlen);
- return -1;
- }
-
- rta = NLMSG_TAIL(n);
- rta->rta_type = type;
- rta->rta_len = len;
- memcpy(RTA_DATA(rta), data, alen);
- n->nlmsg_len = NLMSG_ALIGN(n->nlmsg_len) + RTA_ALIGN(len);
- return 0;
-}
diff --git a/log.c b/log.c
deleted file mode 100644
index 1435401abac5..000000000000
--- a/log.c
+++ /dev/null
@@ -1,199 +0,0 @@
-#include <stdlib.h>
-#include <stdio.h>
-#include <stdarg.h>
-#include <errno.h>
-#include <unistd.h>
-#include <stdbool.h>
-#include <limits.h>
-
-#include <sys/types.h>
-#include <sys/time.h>
-#include <sys/resource.h>
-
-#include <fcntl.h>
-
-#include "compiler.h"
-#include "asm/types.h"
-#include "util.h"
-#include "cr_options.h"
-#include "servicefd.h"
-
-#define DEFAULT_LOGFD STDERR_FILENO
-/* Enable timestamps if verbosity is increased from default */
-#define LOG_TIMESTAMP (DEFAULT_LOGLEVEL + 1)
-
-static unsigned int current_loglevel = DEFAULT_LOGLEVEL;
-
-static char buffer[PAGE_SIZE * 2];
-static char buf_off = 0;
-
-static struct timeval start;
-/*
- * Manual buf len as sprintf will _always_ put '\0' at the end,
- * but we want a "constant" pid to be there on restore
- */
-#define TS_BUF_OFF 12
-
-static void timediff(struct timeval *from, struct timeval *to)
-{
- to->tv_sec -= from->tv_sec;
- if (to->tv_usec >= from->tv_usec)
- to->tv_usec -= from->tv_usec;
- else {
- to->tv_sec--;
- to->tv_usec += 1000000 - from->tv_usec;
- }
-}
-
-static void print_ts(void)
-{
- struct timeval t;
-
- gettimeofday(&t, NULL);
- timediff(&start, &t);
- snprintf(buffer, TS_BUF_OFF,
- "(%02u.%06u)", (unsigned)t.tv_sec, (unsigned)t.tv_usec);
- buffer[TS_BUF_OFF - 1] = ' '; /* kill the '\0' produced by snprintf */
-}
-
-int log_get_fd(void)
-{
- int fd = get_service_fd(LOG_FD_OFF);
-
- return fd < 0 ? DEFAULT_LOGFD : fd;
-}
-
-static void reset_buf_off(void)
-{
- if (current_loglevel >= LOG_TIMESTAMP)
- /* reserve space for a timestamp */
- buf_off = TS_BUF_OFF;
- else
- buf_off = 0;
-}
-
-int log_init(const char *output)
-{
- int new_logfd, fd;
-
- gettimeofday(&start, NULL);
- reset_buf_off();
-
- if (output) {
- new_logfd = open(output, O_CREAT|O_TRUNC|O_WRONLY|O_APPEND, 0600);
- if (new_logfd < 0) {
- pr_perror("Can't create log file %s", output);
- return -1;
- }
- } else {
- new_logfd = dup(DEFAULT_LOGFD);
- if (new_logfd < 0) {
- pr_perror("Can't dup log file");
- return -1;
- }
- }
-
- fd = install_service_fd(LOG_FD_OFF, new_logfd);
- close(new_logfd);
- if (fd < 0)
- goto err;
-
- return 0;
-
-err:
- pr_perror("Log engine failure, can't duplicate descriptor");
- return -1;
-}
-
-int log_init_by_pid(void)
-{
- char path[PATH_MAX];
-
- /*
- * reset buf_off as this fn is called on each fork while
- * restoring process tree
- */
- reset_buf_off();
-
- if (!opts.log_file_per_pid) {
- buf_off += snprintf(buffer + buf_off, sizeof buffer - buf_off, "%6d: ", getpid());
- return 0;
- }
-
- if (!opts.output)
- return 0;
-
- snprintf(path, PATH_MAX, "%s.%d", opts.output, getpid());
-
- return log_init(path);
-}
-
-void log_fini(void)
-{
- close_service_fd(LOG_FD_OFF);
-}
-
-void log_set_loglevel(unsigned int level)
-{
- if (level == LOG_UNSET)
- current_loglevel = DEFAULT_LOGLEVEL;
- else
- current_loglevel = level;
-}
-
-unsigned int log_get_loglevel(void)
-{
- return current_loglevel;
-}
-
-static void __print_on_level(unsigned int loglevel, const char *format, va_list params)
-{
- int fd, size, ret, off = 0;
- int __errno = errno;
-
- if (unlikely(loglevel == LOG_MSG)) {
- fd = STDOUT_FILENO;
- off = buf_off; /* skip dangling timestamp */
- } else {
- if (loglevel > current_loglevel)
- return;
- fd = log_get_fd();
- if (current_loglevel >= LOG_TIMESTAMP)
- print_ts();
- }
-
- size = vsnprintf(buffer + buf_off, sizeof buffer - buf_off, format, params);
- size += buf_off;
-
- while (off < size) {
- ret = write(fd, buffer + off, size - off);
- if (ret <= 0)
- break;
- off += ret;
- }
- errno = __errno;
-}
-
-void print_on_level(unsigned int loglevel, const char *format, ...)
-{
- va_list params;
-
- va_start(params, format);
- __print_on_level(loglevel, format, params);
- va_end(params);
-}
-
-int write_pidfile(int pid)
-{
- int fd;
-
- fd = open(opts.pidfile, O_WRONLY | O_EXCL | O_CREAT, 0600);
- if (fd == -1) {
- pr_perror("Can't open %s", opts.pidfile);
- return -1;
- }
-
- dprintf(fd, "%d", pid);
- close(fd);
- return 0;
-}
diff --git a/lsm.c b/lsm.c
deleted file mode 100644
index 158caf0733d5..000000000000
--- a/lsm.c
+++ /dev/null
@@ -1,251 +0,0 @@
-#include <errno.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <sys/types.h>
-#include <unistd.h>
-
-#include "config.h"
-#include "pstree.h"
-#include "util.h"
-#include "cr_options.h"
-
-#include "protobuf.h"
-#include "protobuf/inventory.pb-c.h"
-#include "protobuf/creds.pb-c.h"
-
-#undef CONFIG_HAS_SELINUX
-
-#ifdef CONFIG_HAS_SELINUX
-#include <selinux/selinux.h>
-#endif
-
-static Lsmtype lsmtype;
-static int (*get_label)(pid_t, char **) = NULL;
-static char *name = NULL;
-
-static int apparmor_get_label(pid_t pid, char **profile_name)
-{
- FILE *f;
- char *space;
-
- f = fopen_proc(pid, "attr/current");
- if (!f)
- return -1;
-
- if (fscanf(f, "%ms", profile_name) != 1) {
- fclose(f);
- pr_perror("err scanfing");
- return -1;
- }
-
- fclose(f);
-
- /*
- * A profile name can be followed by an enforcement mode, e.g.
- * lxc-default-with-nesting (enforced)
- * but the profile name is just the part before the space.
- */
- space = strstr(*profile_name, " ");
- if (space)
- *space = 0;
-
- /*
- * An "unconfined" value means there is no profile, so we don't need to
- * worry about trying to restore one.
- */
- if (strcmp(*profile_name, "unconfined") == 0) {
- free(*profile_name);
- *profile_name = NULL;
- }
-
- return 0;
-}
-
-#ifdef CONFIG_HAS_SELINUX
-static int selinux_get_label(pid_t pid, char **output)
-{
- security_context_t ctx;
- char *pos, *last;
- int i;
-
- if (getpidcon_raw(pid, &ctx) < 0) {
- pr_perror("getting selinux profile failed");
- return -1;
- }
-
- *output = NULL;
-
- /*
- * Since SELinux attributes can be finer grained than at the task
- * level, and we currently don't try to dump any of these other bits,
- * let's only allow unconfined profiles, which look something like:
- *
- * unconfined_u:unconfined_r:unconfined_t:s0-s0:c0.c1023
- */
- pos = (char*)ctx;
- for (i = 0; i < 3; i++) {
- last = pos;
- pos = strstr(pos, ":");
- if (!pos) {
- pr_err("Invalid selinux context %s\n", (char *)ctx);
- freecon(ctx);
- return -1;
- }
-
- *pos = 0;
- if (!strstartswith(last, "unconfined_")) {
- pr_err("Non unconfined selinux contexts not supported %s\n", last);
- freecon(ctx);
- return -1;
- }
-
- pos++;
- }
- freecon(ctx);
-
- return 0;
-}
-#endif
-
-void kerndat_lsm(void)
-{
- /* On restore, if someone passes --lsm-profile, we might end up doing
- * detection twice, once during flag parsing and once for
- * kerndat_init_rst(). Let's detect when we've already done detection
- * and not do it again.
- */
- if (name)
- return;
-
- if (access("/sys/kernel/security/apparmor", F_OK) == 0) {
- get_label = apparmor_get_label;
- lsmtype = LSMTYPE__APPARMOR;
- name = "apparmor";
- return;
- }
-
-#ifdef CONFIG_HAS_SELINUX
- /*
- * This seems to be the canonical place to mount this fs if it is
- * enabled, although we may (?) want to check /selinux for posterity as
- * well.
- */
- if (access("/sys/fs/selinux", F_OK) == 0) {
- get_label = selinux_get_label;
- lsmtype = LSMTYPE__SELINUX;
- name = "selinux";
- return;
- }
-#endif
-
- get_label = NULL;
- lsmtype = LSMTYPE__NO_LSM;
- name = "none";
-}
-
-Lsmtype host_lsm_type(void)
-{
- return lsmtype;
-}
-
-int collect_lsm_profile(pid_t pid, CredsEntry *ce)
-{
- ce->lsm_profile = NULL;
-
- if (lsmtype == LSMTYPE__NO_LSM)
- return 0;
-
- if (get_label(pid, &ce->lsm_profile) < 0)
- return -1;
-
- if (ce->lsm_profile)
- pr_info("%d has lsm profile %s\n", pid, ce->lsm_profile);
-
- return 0;
-}
-
-// in inventory.c
-extern Lsmtype image_lsm;
-
-int validate_lsm(char *lsm_profile)
-{
- if (image_lsm == LSMTYPE__NO_LSM || image_lsm == lsmtype)
- return 0;
-
- /*
- * This is really only a problem if the processes have actually
- * specified an LSM profile. If not, we won't restore anything anyway,
- * so it's fine.
- */
- if (lsm_profile) {
- pr_err("mismatched lsm types and lsm profile specified\n");
- return -1;
- }
-
- return 0;
-}
-
-int render_lsm_profile(char *profile, char **val)
-{
- *val = NULL;
-
- switch (lsmtype) {
- case LSMTYPE__APPARMOR:
- if (strcmp(profile, "unconfined") != 0 && asprintf(val, "changeprofile %s", profile) < 0) {
- *val = NULL;
- return -1;
- }
- break;
- case LSMTYPE__SELINUX:
- if (asprintf(val, "%s", profile) < 0) {
- *val = NULL;
- return -1;
- }
- break;
- default:
- return -1;
- }
-
- return 0;
-}
-
-int parse_lsm_arg(char *arg)
-{
- char *aux;
-
- kerndat_lsm();
-
- aux = strchr(arg, ':');
- if (aux == NULL) {
- pr_err("invalid argument %s for --lsm-profile", arg);
- return -1;
- }
-
- *aux = '\0';
- aux++;
-
- if (strcmp(arg, "apparmor") == 0) {
- if (lsmtype != LSMTYPE__APPARMOR) {
- pr_err("apparmor LSM specified but apparmor not supported by kernel\n");
- return -1;
- }
-
- opts.lsm_profile = aux;
- } else if (strcmp(arg, "selinux") == 0) {
- if (lsmtype != LSMTYPE__SELINUX) {
- pr_err("selinux LSM specified but selinux not supported by kernel\n");
- return -1;
- }
-
- opts.lsm_profile = aux;
- } else if (strcmp(arg, "none") == 0) {
- opts.lsm_profile = NULL;
- } else {
- pr_err("unknown lsm %s\n", arg);
- return -1;
- }
-
- opts.lsm_supplied = true;
-
- return 0;
-}
diff --git a/mem.c b/mem.c
deleted file mode 100644
index 332f1928bb2d..000000000000
--- a/mem.c
+++ /dev/null
@@ -1,473 +0,0 @@
-#include <unistd.h>
-#include <stdio.h>
-#include <sys/mman.h>
-#include <errno.h>
-#include <fcntl.h>
-
-#include "cr_options.h"
-#include "servicefd.h"
-#include "mem.h"
-#include "parasite-syscall.h"
-#include "parasite.h"
-#include "page-pipe.h"
-#include "page-xfer.h"
-#include "log.h"
-#include "kerndat.h"
-#include "stats.h"
-#include "vma.h"
-#include "shmem.h"
-#include "pstree.h"
-#include "restorer.h"
-#include "files-reg.h"
-#include "pagemap-cache.h"
-
-#include "protobuf.h"
-#include "protobuf/pagemap.pb-c.h"
-
-static int task_reset_dirty_track(int pid)
-{
- int ret;
-
- if (!opts.track_mem)
- return 0;
-
- BUG_ON(!kdat.has_dirty_track);
-
- ret = do_task_reset_dirty_track(pid);
- BUG_ON(ret == 1);
- return ret;
-}
-
-int do_task_reset_dirty_track(int pid)
-{
- int fd, ret;
- char cmd[] = "4";
-
- pr_info("Reset %d's dirty tracking\n", pid);
-
- fd = __open_proc(pid, EACCES, O_RDWR, "clear_refs");
- if (fd < 0)
- return errno == EACCES ? 1 : -1;
-
- ret = write(fd, cmd, sizeof(cmd));
- if (ret < 0) {
- if (errno == EINVAL) /* No clear-soft-dirty in kernel */
- ret = 1;
- else {
- pr_perror("Can't reset %d's dirty memory tracker (%d)\n", pid, errno);
- ret = -1;
- }
- } else {
- pr_info(" ... done\n");
- ret = 0;
- }
-
- close(fd);
- return ret;
-}
-
-unsigned int dump_pages_args_size(struct vm_area_list *vmas)
-{
- /* In the worst case I need one iovec for each page */
- return sizeof(struct parasite_dump_pages_args) +
- vmas->nr * sizeof(struct parasite_vma_entry) +
- (vmas->priv_size + 1) * sizeof(struct iovec);
-}
-
-static inline bool should_dump_page(VmaEntry *vmae, u64 pme)
-{
-#ifdef CONFIG_VDSO
- /*
- * vDSO area must be always dumped because on restore
- * we might need to generate a proxy.
- */
- if (vma_entry_is(vmae, VMA_AREA_VDSO))
- return true;
- /*
- * In turn VVAR area is special and referenced from
- * vDSO area by IP addressing (at least on x86) thus
- * never ever dump its content but always use one provided
- * by the kernel on restore, ie runtime VVAR area must
- * be remapped into proper place..
- */
- if (vma_entry_is(vmae, VMA_AREA_VVAR))
- return false;
-#endif
- /*
- * Optimisation for private mapping pages, that haven't
- * yet being COW-ed
- */
- if (vma_entry_is(vmae, VMA_FILE_PRIVATE) && (pme & PME_FILE))
- return false;
- if (pme & PME_SWAP)
- return true;
- if ((pme & PME_PRESENT) && ((pme & PME_PFRAME_MASK) != kdat.zero_page_pfn))
- return true;
-
- return false;
-}
-
-static inline bool page_in_parent(u64 pme)
-{
- /*
- * If we do memory tracking, but w/o parent images,
- * then we have to dump all memory
- */
-
- return opts.track_mem && opts.img_parent && !(pme & PME_SOFT_DIRTY);
-}
-
-/*
- * This routine finds out what memory regions to grab from the
- * dumpee. The iovs generated are then fed into vmsplice to
- * put the memory into the page-pipe's pipe.
- *
- * "Holes" in page-pipe are regions, that should be dumped, but
- * the memory contents is present in the pagent image set.
- */
-
-static int generate_iovs(struct vma_area *vma, struct page_pipe *pp, u64 *map, u64 *off, bool has_parent)
-{
- u64 *at = &map[PAGE_PFN(*off)];
- unsigned long pfn, nr_to_scan;
- unsigned long pages[2] = {};
-
- nr_to_scan = (vma_area_len(vma) - *off) / PAGE_SIZE;
-
- for (pfn = 0; pfn < nr_to_scan; pfn++) {
- unsigned long vaddr;
- int ret;
-
- if (!should_dump_page(vma->e, at[pfn]))
- continue;
-
- vaddr = vma->e->start + *off + pfn * PAGE_SIZE;
-
- /*
- * If we're doing incremental dump (parent images
- * specified) and page is not soft-dirty -- we dump
- * hole and expect the parent images to contain this
- * page. The latter would be checked in page-xfer.
- */
-
- if (has_parent && page_in_parent(at[pfn])) {
- ret = page_pipe_add_hole(pp, vaddr);
- pages[0]++;
- } else {
- ret = page_pipe_add_page(pp, vaddr);
- pages[1]++;
- }
-
- if (ret) {
- *off += pfn * PAGE_SIZE;
- return ret;
- }
- }
-
- *off += pfn * PAGE_SIZE;
-
- cnt_add(CNT_PAGES_SCANNED, nr_to_scan);
- cnt_add(CNT_PAGES_SKIPPED_PARENT, pages[0]);
- cnt_add(CNT_PAGES_WRITTEN, pages[1]);
-
- pr_info("Pagemap generated: %lu pages %lu holes\n", pages[1], pages[0]);
- return 0;
-}
-
-static struct parasite_dump_pages_args *prep_dump_pages_args(struct parasite_ctl *ctl,
- struct vm_area_list *vma_area_list)
-{
- struct parasite_dump_pages_args *args;
- struct parasite_vma_entry *p_vma;
- struct vma_area *vma;
-
- args = parasite_args_s(ctl, dump_pages_args_size(vma_area_list));
-
- p_vma = pargs_vmas(args);
- args->nr_vmas = 0;
-
- list_for_each_entry(vma, &vma_area_list->h, list) {
- if (!vma_area_is_private(vma, kdat.task_size))
- continue;
- if (vma->e->prot & PROT_READ)
- continue;
-
- p_vma->start = vma->e->start;
- p_vma->len = vma_area_len(vma);
- p_vma->prot = vma->e->prot;
-
- args->nr_vmas++;
- p_vma++;
- }
-
- return args;
-}
-
-static int dump_pages(struct page_pipe *pp, struct parasite_ctl *ctl,
- struct parasite_dump_pages_args *args, struct page_xfer *xfer)
-{
- struct page_pipe_buf *ppb;
- int ret = 0;
-
- debug_show_page_pipe(pp);
-
- /* Step 2 -- grab pages into page-pipe */
- list_for_each_entry(ppb, &pp->bufs, l) {
- args->nr_segs = ppb->nr_segs;
- args->nr_pages = ppb->pages_in;
- pr_debug("PPB: %d pages %d segs %u pipe %d off\n",
- args->nr_pages, args->nr_segs, ppb->pipe_size, args->off);
-
- ret = __parasite_execute_daemon(PARASITE_CMD_DUMPPAGES, ctl);
- if (ret < 0)
- return -1;
- ret = parasite_send_fd(ctl, ppb->p[1]);
- if (ret)
- return -1;
-
- ret = __parasite_wait_daemon_ack(PARASITE_CMD_DUMPPAGES, ctl);
- if (ret < 0)
- return -1;
-
- args->off += args->nr_segs;
- }
-
- /*
- * Step 3 -- write pages into image (or delay writing for
- * pre-dump action (see pre_dump_one_task)
- */
- if (xfer) {
- timing_start(TIME_MEMWRITE);
- ret = page_xfer_dump_pages(xfer, pp, 0);
- timing_stop(TIME_MEMWRITE);
- }
-
- return ret;
-}
-
-static int __parasite_dump_pages_seized(struct parasite_ctl *ctl,
- struct parasite_dump_pages_args *args,
- struct vm_area_list *vma_area_list,
- struct page_pipe **pp_ret)
-{
- pmc_t pmc = PMC_INIT;
- struct page_pipe *pp;
- struct vma_area *vma_area;
- struct page_xfer xfer = { .parent = NULL };
- int ret = -1;
-
- pr_info("\n");
- pr_info("Dumping pages (type: %d pid: %d)\n", CR_FD_PAGES, ctl->pid.real);
- pr_info("----------------------------------------\n");
-
- timing_start(TIME_MEMDUMP);
-
- pr_debug(" Private vmas %lu/%lu pages\n",
- vma_area_list->longest, vma_area_list->priv_size);
-
- /*
- * Step 0 -- prepare
- */
-
- if (pmc_init(&pmc, ctl->pid.real, &vma_area_list->h,
- vma_area_list->longest * PAGE_SIZE))
- return -1;
-
- ret = -1;
- pp = create_page_pipe(vma_area_list->priv_size,
- pargs_iovs(args), pp_ret == NULL);
- if (!pp)
- goto out;
-
- if (pp_ret == NULL) {
- ret = open_page_xfer(&xfer, CR_FD_PAGEMAP, ctl->pid.virt);
- if (ret < 0)
- goto out_pp;
- } else {
- ret = check_parent_page_xfer(CR_FD_PAGEMAP, ctl->pid.virt);
- if (ret < 0)
- goto out_pp;
-
- if (ret)
- xfer.parent = NULL + 1;
- }
-
- /*
- * Step 1 -- generate the pagemap
- */
- args->off = 0;
- list_for_each_entry(vma_area, &vma_area_list->h, list) {
- u64 off = 0;
- u64 *map;
-
- if (!vma_area_is_private(vma_area, kdat.task_size))
- continue;
-
- map = pmc_get_map(&pmc, vma_area);
- if (!map)
- goto out_xfer;
-again:
- ret = generate_iovs(vma_area, pp, map, &off, xfer.parent);
- if (ret == -EAGAIN) {
- BUG_ON(pp_ret);
-
- ret = dump_pages(pp, ctl, args, &xfer);
- if (ret)
- goto out_xfer;
- page_pipe_reinit(pp);
- goto again;
- }
- if (ret < 0)
- goto out_xfer;
- }
-
- ret = dump_pages(pp, ctl, args, pp_ret ? NULL : &xfer);
- if (ret)
- goto out_xfer;
-
- timing_stop(TIME_MEMDUMP);
-
- if (pp_ret)
- *pp_ret = pp;
-
- /*
- * Step 4 -- clean up
- */
-
- ret = task_reset_dirty_track(ctl->pid.real);
-out_xfer:
- if (pp_ret == NULL)
- xfer.close(&xfer);
-out_pp:
- if (ret || !pp_ret)
- destroy_page_pipe(pp);
-out:
- pmc_fini(&pmc);
- pr_info("----------------------------------------\n");
- return ret;
-}
-
-int parasite_dump_pages_seized(struct parasite_ctl *ctl,
- struct vm_area_list *vma_area_list, struct page_pipe **pp)
-{
- int ret;
- struct parasite_dump_pages_args *pargs;
-
- pargs = prep_dump_pages_args(ctl, vma_area_list);
-
- /*
- * Add PROT_READ protection for all VMAs we're about to
- * dump if they don't have one. Otherwise we'll not be
- * able to read the memory contents.
- *
- * Afterwards -- reprotect memory back.
- */
-
- pargs->add_prot = PROT_READ;
- ret = parasite_execute_daemon(PARASITE_CMD_MPROTECT_VMAS, ctl);
- if (ret) {
- pr_err("Can't dump unprotect vmas with parasite\n");
- return ret;
- }
-
- ret = __parasite_dump_pages_seized(ctl, pargs, vma_area_list, pp);
- if (ret)
- pr_err("Can't dump page with parasite\n");
-
- pargs->add_prot = 0;
- if (parasite_execute_daemon(PARASITE_CMD_MPROTECT_VMAS, ctl)) {
- pr_err("Can't rollback unprotected vmas with parasite\n");
- ret = -1;
- }
-
- return ret;
-}
-
-static inline int collect_filemap(struct vma_area *vma)
-{
- struct file_desc *fd;
-
- fd = collect_special_file(vma->e->shmid);
- if (!fd)
- return -1;
-
- vma->vmfd = fd;
- return 0;
-}
-
-int prepare_mm_pid(struct pstree_item *i)
-{
- pid_t pid = i->pid.virt;
- int ret = -1, vn = 0;
- struct cr_img *img;
- struct rst_info *ri = rsti(i);
-
- img = open_image(CR_FD_MM, O_RSTR, pid);
- if (!img)
- return -1;
-
- ret = pb_read_one_eof(img, &ri->mm, PB_MM);
- close_image(img);
- if (ret <= 0)
- return ret;
-
- if (collect_special_file(ri->mm->exe_file_id) == NULL)
- return -1;
-
- pr_debug("Found %zd VMAs in image\n", ri->mm->n_vmas);
- img = NULL;
- if (ri->mm->n_vmas == 0) {
- /*
- * Old image. Read VMAs from vma-.img
- */
- img = open_image(CR_FD_VMAS, O_RSTR, pid);
- if (!img)
- return -1;
- }
-
-
- while (vn < ri->mm->n_vmas || img != NULL) {
- struct vma_area *vma;
-
- ret = -1;
- vma = alloc_vma_area();
- if (!vma)
- break;
-
- ret = 0;
- ri->vmas.nr++;
- if (!img)
- vma->e = ri->mm->vmas[vn++];
- else {
- ret = pb_read_one_eof(img, &vma->e, PB_VMA);
- if (ret <= 0) {
- xfree(vma);
- close_image(img);
- break;
- }
- }
- list_add_tail(&vma->list, &ri->vmas.h);
-
- if (vma_area_is_private(vma, kdat.task_size)) {
- ri->vmas.priv_size += vma_area_len(vma);
- if (vma->e->flags & MAP_GROWSDOWN)
- ri->vmas.priv_size += PAGE_SIZE;
- }
-
- pr_info("vma 0x%"PRIx64" 0x%"PRIx64"\n", vma->e->start, vma->e->end);
-
- if (vma_area_is(vma, VMA_ANON_SHARED) &&
- !vma_area_is(vma, VMA_AREA_SYSVIPC))
- ret = collect_shmem(pid, vma->e);
- else if (vma_area_is(vma, VMA_FILE_PRIVATE) ||
- vma_area_is(vma, VMA_FILE_SHARED))
- ret = collect_filemap(vma);
- else
- ret = 0;
- if (ret)
- break;
- }
-
- return ret;
-}
-
diff --git a/mount.c b/mount.c
deleted file mode 100644
index 05cf6cf0a0f9..000000000000
--- a/mount.c
+++ /dev/null
@@ -1,3455 +0,0 @@
-#include <stdio.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/mman.h>
-#include <sys/types.h>
-#include <dirent.h>
-#include <errno.h>
-#include <sys/stat.h>
-#include <string.h>
-#include <stdlib.h>
-#include <sys/mount.h>
-#include <sys/types.h>
-#include <sys/wait.h>
-#include <sched.h>
-
-#include "cr_options.h"
-#include "asm/types.h"
-#include "util.h"
-#include "util-pie.h"
-#include "log.h"
-#include "plugin.h"
-#include "mount.h"
-#include "pstree.h"
-#include "proc_parse.h"
-#include "image.h"
-#include "namespaces.h"
-#include "protobuf.h"
-#include "kerndat.h"
-#include "fs-magic.h"
-#include "sysfs_parse.h"
-
-#include "protobuf/mnt.pb-c.h"
-#include "protobuf/binfmt-misc.pb-c.h"
-
-#define AUTODETECTED_MOUNT "CRIU:AUTOGENERATED"
-#define MS_PROPAGATE (MS_SHARED | MS_PRIVATE | MS_UNBINDABLE | MS_SLAVE)
-
-#undef LOG_PREFIX
-#define LOG_PREFIX "mnt: "
-
-int ext_mount_add(char *key, char *val)
-{
- struct ext_mount *em;
-
- em = xmalloc(sizeof(*em));
- if (!em)
- return -1;
-
- em->key = key;
- em->val = val;
- list_add_tail(&em->list, &opts.ext_mounts);
- pr_info("Added %s:%s ext mount mapping\n", key, val);
- return 0;
-}
-
-/* Lookup ext_mount by key field */
-static struct ext_mount *ext_mount_lookup(char *key)
-{
- struct ext_mount *em;
-
- list_for_each_entry(em, &opts.ext_mounts, list)
- if (!strcmp(em->key, key))
- return em;
-
- return NULL;
-}
-
-/*
- * Single linked list of mount points get from proc/images
- */
-struct mount_info *mntinfo;
-
-static void mntinfo_add_list(struct mount_info *new)
-{
- if (!mntinfo)
- mntinfo = new;
- else {
- struct mount_info *pm;
-
- /* Add to the tail. (FIXME -- make O(1) ) */
- for (pm = mntinfo; pm->next != NULL; pm = pm->next)
- ;
- pm->next = new;
- }
-}
-
-static int open_mountpoint(struct mount_info *pm);
-
-static struct mount_info *mnt_build_tree(struct mount_info *list, struct mount_info *roots_mp);
-static int validate_mounts(struct mount_info *info, bool for_dump);
-
-/* Asolute paths are used on dump and relative paths are used on restore */
-static inline int is_root(char *p)
-{
- return (!strcmp(p, "/"));
-}
-
-/* True for the root mount (the topmost one) */
-static inline int is_root_mount(struct mount_info *mi)
-{
- return is_root(mi->mountpoint + 1);
-}
-
-/*
- * True if the mountpoint target is root on its FS.
- *
- * This is used to determine whether we need to postpone
- * mounting. E.g. one can bind mount some subdir from a
- * disk, and in this case we'll have to get the root disk
- * mount first, then bind-mount it. See do_mount_one().
- */
-static inline int fsroot_mounted(struct mount_info *mi)
-{
- return is_root(mi->root);
-}
-
-static struct mount_info *__lookup_overlayfs(struct mount_info *list, char *rpath,
- unsigned int st_dev, unsigned int st_ino,
- unsigned int mnt_id)
-{
- /*
- * Goes through all entries in the mountinfo table
- * looking for a mount point that contains the file specified
- * in rpath. Uses the device number st_dev and the inode number st_ino
- * to make sure the file is correct.
- */
- struct mount_info *mi_ret = NULL;
- struct mount_info *m;
- int mntns_root = -1;
-
- for (m = list; m != NULL; m = m->next) {
- struct stat f_stat;
- int ret_stat;
-
- if (m->fstype->code != FSTYPE__OVERLAYFS)
- continue;
-
- /*
- * We need the mntns root fd of the process to be dumped,
- * to make sure we stat the correct file
- */
- if (mntns_root == -1) {
- mntns_root = __mntns_get_root_fd(root_item->pid.real);
- if (mntns_root < 0) {
- pr_err("Unable to get the root file descriptor of pid %d\n", root_item->pid.real);
- return ERR_PTR(-ENOENT);
- }
- }
-
- /* Concatenates m->mountpoint with rpath and attempts to stat the resulting path */
- if (is_root_mount(m)) {
- ret_stat = fstatat(mntns_root, rpath, &f_stat, 0);
- } else {
- char _full_path[PATH_MAX];
- int n = snprintf(_full_path, PATH_MAX, "%s/%s", m->mountpoint, rpath);
-
- if (n >= PATH_MAX) {
- pr_err("Not enough space to concatenate %s and %s\n", m->mountpoint, rpath);
- return ERR_PTR(-ENOSPC);
- }
- ret_stat = fstatat(mntns_root, _full_path, &f_stat, 0);
- }
-
- if (ret_stat == 0 && st_dev == f_stat.st_dev && st_ino == f_stat.st_ino)
- mi_ret = m;
- }
-
- return mi_ret;
-}
-
-/*
- * Looks up the mnt_id and path of a file in an overlayFS directory.
- *
- * This is useful in order to fix the OverlayFS bug present in the
- * Linux Kernel before version 4.2. See fixup_overlayfs for details.
- *
- * We first check to see if the mnt_id and st_dev numbers currently match
- * some entry in the mountinfo table. If so, we already have the correct mnt_id
- * and no fixup is needed.
- *
- * Then we proceed to see if there are any overlayFS mounted directories
- * in the mountinfo table. If so, we concatenate the mountpoint with the
- * name of the file, and stat the resulting path to check if we found the
- * correct device id and node number. If that is the case, we update the
- * mount id and link variables with the correct values.
- */
-struct mount_info *lookup_overlayfs(char *rpath, unsigned int st_dev,
- unsigned int st_ino, unsigned int mnt_id)
-{
- struct mount_info *m;
-
- /* If the mnt_id and device number match for some entry, no fixup is needed */
- for (m = mntinfo; m != NULL; m = m->next)
- if (st_dev == m->s_dev && mnt_id == m->mnt_id)
- return NULL;
-
- return __lookup_overlayfs(mntinfo, rpath, st_dev, st_ino, mnt_id);
-}
-
-static struct mount_info *__lookup_mnt_id(struct mount_info *list, int id)
-{
- struct mount_info *m;
-
- for (m = list; m != NULL; m = m->next)
- if (m->mnt_id == id)
- return m;
-
- return NULL;
-}
-
-struct mount_info *lookup_mnt_id(unsigned int id)
-{
- return __lookup_mnt_id(mntinfo, id);
-}
-
-struct mount_info *lookup_mnt_sdev(unsigned int s_dev)
-{
- struct mount_info *m;
-
- for (m = mntinfo; m != NULL; m = m->next)
- if (m->s_dev == s_dev)
- return m;
-
- return NULL;
-}
-
-static struct mount_info *mount_resolve_path(struct mount_info *mntinfo_tree, const char *path)
-{
- size_t pathlen = strlen(path);
- struct mount_info *m = mntinfo_tree, *c;
-
- while (1) {
- list_for_each_entry(c, &m->children, siblings) {
- size_t n;
-
- n = strlen(c->mountpoint + 1);
- if (n > pathlen)
- continue;
-
- if (strncmp(c->mountpoint + 1, path, min(n, pathlen)))
- continue;
- if (n < pathlen && path[n] != '/')
- continue;
-
- m = c;
- break;
- }
- if (&c->siblings == &m->children)
- break;
- }
-
- pr_debug("Path `%s' resolved to `%s' mountpoint\n", path, m->mountpoint);
- return m;
-}
-
-dev_t phys_stat_resolve_dev(struct ns_id *ns, dev_t st_dev, const char *path)
-{
- struct mount_info *m;
-
- m = mount_resolve_path(ns->mnt.mntinfo_tree, path);
- /*
- * BTRFS returns subvolume dev-id instead of
- * superblock dev-id, in such case return device
- * obtained from mountinfo (ie subvolume0).
- */
- return strcmp(m->fstype->name, "btrfs") ?
- MKKDEV(major(st_dev), minor(st_dev)) : m->s_dev;
-}
-
-bool phys_stat_dev_match(dev_t st_dev, dev_t phys_dev,
- struct ns_id *ns, const char *path)
-{
- if (st_dev == kdev_to_odev(phys_dev))
- return true;
-
- return phys_dev == phys_stat_resolve_dev(ns, st_dev, path);
-}
-
-/*
- * Compare super-blocks mounted at two places
- */
-static bool mounts_sb_equal(struct mount_info *a, struct mount_info *b)
-{
- return a->s_dev == b->s_dev && a->fstype == b->fstype &&
- !strcmp(a->source, b->source) && !strcmp(a->options, b->options);
-}
-
-/*
- * Compare superblocks AND the way they are mounted
- */
-static bool mounts_equal(struct mount_info *a, struct mount_info *b)
-{
- if (!mounts_sb_equal(a, b))
- return false;
- if (strcmp(a->root, b->root))
- return false;
- if (strcmp(basename(a->mountpoint), basename(b->mountpoint)))
- return false;
-
- return true;
-}
-
-/*
- * mnt_roots is a temporary directory for restoring sub-trees of
- * non-root namespaces.
- */
-static char *mnt_roots;
-
-static struct mount_info *mnt_build_ids_tree(struct mount_info *list, struct mount_info *tmp_root_mount)
-{
- struct mount_info *m, *root = NULL;
-
- /*
- * Just resolve the mnt_id:parent_mnt_id relations
- */
-
- pr_debug("\tBuilding plain mount tree\n");
- for (m = list; m != NULL; m = m->next) {
- struct mount_info *parent;
-
- pr_debug("\t\tWorking on %d->%d\n", m->mnt_id, m->parent_mnt_id);
-
- if (m->mnt_id != m->parent_mnt_id)
- parent = __lookup_mnt_id(list, m->parent_mnt_id);
- else /* a circular mount reference. It's rootfs or smth like it. */
- parent = NULL;
-
- if (!parent) {
- /* This should be / */
- if (root == NULL && is_root_mount(m)) {
- root = m;
- continue;
- }
-
- pr_debug("Mountpoint %d (@%s) w/o parent %d\n",
- m->mnt_id, m->mountpoint, m->parent_mnt_id);
-
- if (root && m->is_ns_root) {
- if (!mounts_sb_equal(root, m) ||
- strcmp(root->root, m->root)) {
- pr_err("Nested mount namespaces with different "
- "roots %d (@%s %s) %d (@%s %s) are not supported yet\n",
- root->mnt_id, root->mountpoint, root->root,
- m->mnt_id, m->mountpoint, m->root);
- return NULL;
- }
-
- /*
- * A root of a sub mount namespace is
- * mounted in a temporary directory in the
- * root mount namespace, so its parent is
- * the main root.
- */
- parent = tmp_root_mount;
- if (unlikely(!tmp_root_mount)) {
- pr_err("Nested mount %d (@%s %s) w/o root insertion detected\n",
- m->mnt_id, m->mountpoint, m->root);
- return NULL;
- }
-
- pr_debug("Mountpoint %d (@%s) get parent %d (@%s)\n",
- m->mnt_id, m->mountpoint,
- parent->mnt_id, parent->mountpoint);
- } else {
- pr_err("No root found for mountpoint %d (@%s)\n",
- m->mnt_id, m->mountpoint);
- return NULL;
- }
- }
-
- m->parent = parent;
- list_add_tail(&m->siblings, &parent->children);
- }
-
- if (!root) {
- pr_err("No root found for tree\n");
- return NULL;
- }
-
- if (tmp_root_mount) {
- tmp_root_mount->parent = root;
- list_add_tail(&tmp_root_mount->siblings, &root->children);
- }
-
- return root;
-}
-
-static unsigned int mnt_depth(struct mount_info *m)
-{
- unsigned int depth = 0;
- char *c;
-
- for (c = m->mountpoint; *c != '\0'; c++)
- if (*c == '/')
- depth++;
-
- return depth;
-}
-
-static void mnt_resort_siblings(struct mount_info *tree)
-{
- struct mount_info *m, *p;
- LIST_HEAD(list);
-
- /*
- * Put siblings of each node in an order they can be (u)mounted
- * I.e. if we have mounts on foo/bar/, foo/bar/foobar/ and foo/
- * we should put them in the foo/bar/foobar/, foo/bar/, foo/ order.
- * Otherwise we will not be able to (u)mount them in a sequence.
- *
- * Funny, but all we need for this is to sort them in the descending
- * order of the amount of /-s in a path =)
- *
- * Use stupid insertion sort here, we're not expecting mount trees
- * to contain hundreds (or more) elements.
- */
-
- pr_info("\tResorting siblings on %d\n", tree->mnt_id);
- while (!list_empty(&tree->children)) {
- unsigned int depth;
-
- m = list_first_entry(&tree->children, struct mount_info, siblings);
- list_del(&m->siblings);
-
- depth = mnt_depth(m);
- list_for_each_entry(p, &list, siblings)
- if (mnt_depth(p) <= depth)
- break;
-
- list_add(&m->siblings, &p->siblings);
- mnt_resort_siblings(m);
- }
-
- list_splice(&list, &tree->children);
-}
-
-static void mnt_tree_show(struct mount_info *tree, int off)
-{
- struct mount_info *m;
-
- pr_info("%*s[%s](%d->%d)\n", off, "",
- tree->mountpoint, tree->mnt_id, tree->parent_mnt_id);
-
- list_for_each_entry(m, &tree->children, siblings)
- mnt_tree_show(m, off + 1);
-
- pr_info("%*s<--\n", off, "");
-}
-
-static int try_resolve_ext_mount(struct mount_info *info)
-{
- struct ext_mount *em;
-
- em = ext_mount_lookup(info->mountpoint + 1 /* trim the . */);
- if (em == NULL)
- return -ENOTSUP;
-
- pr_info("Found %s mapping for %s mountpoint\n",
- em->val, info->mountpoint);
- info->external = em;
- return 0;
-}
-
-static struct mount_info *find_widest_shared(struct mount_info *m)
-{
- struct mount_info *p;
-
- /*
- * Try to find a mount, which is wider or equal.
- * A is wider than B, if A->root is a subpath of B->root.
- */
- list_for_each_entry(p, &m->mnt_share, mnt_share)
- if (issubpath(m->root, p->root))
- return p;
-
- return NULL;
-}
-
-static struct mount_info *find_shared_peer(struct mount_info *m,
- struct mount_info *ct, char *ct_mountpoint, int m_mpnt_l)
-{
- struct mount_info *cm;
-
- list_for_each_entry(cm, &m->children, siblings) {
- if (strcmp(ct_mountpoint, cm->mountpoint + m_mpnt_l))
- continue;
-
- if (!mounts_equal(cm, ct))
- break;
-
- return cm;
- }
-
- return NULL;
-}
-
-static inline int path_length(char *path)
-{
- int off;
-
- off = strlen(path);
- /*
- * If we're pure / then set lenght to zero so that adding this
- * value as sub-path offset would produce the correct result.
- * E.g. the tail path of the "/foo/bar" relative to the "/foo"
- * will be the "/foo/bar" + len("/foo") == "/bar", while the
- * same relative to the "/" should be +0 to be the "/foo/bar",
- * not +1 and the "foo/bar".
- */
- if (path[off - 1] == '/')
- off--;
-
- return off;
-}
-
-static int validate_shared(struct mount_info *m)
-{
- struct mount_info *t, *ct;
- int t_root_l, m_root_l, t_mpnt_l, m_mpnt_l;
- char *m_root_rpath;
- LIST_HEAD(children);
-
- /*
- * Check that all mounts in one shared group has the same set of
- * children. Only visible children are accounted. A non-root bind-mount
- * doesn't see children out of its root and it's excpected case.
- *
- * Here is a few conditions:
- * 1. t is wider than m
- * 2. We search a wider mount in the same direction, so when we
- * enumirate all mounts, we can't be sure that all of them
- * has the same set of children.
- */
-
- t = find_widest_shared(m);
- if (!t)
- /*
- * The current mount is the widest one in its shared group,
- * all others will be compared to it or with some other,
- * which will be compared to it.
- */
- return 0;
-
- /* A set of childrent which ar visiable for both should be the same */
-
- t_root_l = path_length(t->root);
- m_root_l = path_length(m->root);
- t_mpnt_l = path_length(t->mountpoint);
- m_mpnt_l = path_length(m->mountpoint);
-
- /* For example:
- * t->root = / t->mp = ./zdtm/live/static/mntns_root_bind.test
- * m->root = /test m->mp = ./zdtm/live/static/mntns_root_bind.test/test.bind
- * t_root_l = 0 t_mpnt_l = 39
- * m_root_l = 5 m_mpnt_l = 49
- * ct->root = / ct->mp = ./zdtm/live/static/mntns_root_bind.test/test/sub
- * tp = /test/sub mp = /test len=5
- */
-
- /*
- * ct: | t->root | child mount point |
- * cm: | m->root | child mount point |
- * ct: | | /test/sub |
- * cm: | /test | /sub |
- * | A | B |
- * | ct->mountpoint + t_mpnt_l
- * | m->root + strlen(t->root)
- */
-
- m_root_rpath = m->root + t_root_l; /* path from t->root to m->root */
-
- /* Search a child, which is visiable in both mounts. */
- list_for_each_entry(ct, &t->children, siblings) {
- char *ct_mpnt_rpath;
- struct mount_info *cm;
-
- if (ct->is_ns_root)
- continue;
-
- ct_mpnt_rpath = ct->mountpoint + t_mpnt_l; /* path from t->mountpoint to ct->mountpoint */
-
- /*
- * Check whether ct can be is visible at m, i.e. the
- * ct's rpath starts (as path) with m's rpath.
- */
-
- if (!issubpath(ct_mpnt_rpath, m_root_rpath))
- continue;
-
- /*
- * The ct has peer in m but with the mount path deeper according
- * to m's depth relavie to t. Thus -- trim this difference (the
- * lenght of m_root_rpath) from ct's mountpoint path.
- */
-
- ct_mpnt_rpath += m_root_l - t_root_l;
-
- /*
- * Find in m the mountpoint that fully matches with ct (with the
- * described above path corrections).
- */
-
- cm = find_shared_peer(m, ct, ct_mpnt_rpath, m_mpnt_l);
- if (!cm)
- goto err;
-
- /*
- * Keep this one aside. At the end of t's children scan we should
- * move _all_ m's children here (the list_empty check below).
- */
- list_move(&cm->siblings, &children);
- }
-
- if (!list_empty(&m->children))
- goto err;
-
- list_splice(&children, &m->children);
- return 0;
-
-err:
- list_splice(&children, &m->children);
- pr_err("%d:%s and %d:%s have different set of mounts\n",
- m->mnt_id, m->mountpoint, t->mnt_id, t->mountpoint);
- return -1;
-}
-
-/*
- * Walk the bind list of @bm looking for a mount the bind-mount can be
- * created from. A candidate qualifies when it is an FS-root mount, or
- * when it is the root of the tree (no parent) and its root path is a
- * prefix of @bm's root. Returns the first match, NULL if there is none.
- */
-
-static struct mount_info *find_fsroot_mount_for(struct mount_info *bm)
-{
-	struct mount_info *cand;
-
-	list_for_each_entry(cand, &bm->mnt_bind, mnt_bind) {
-		if (fsroot_mounted(cand))
-			return cand;
-
-		if (!cand->parent && strstartswith(bm->root, cand->root))
-			return cand;
-	}
-
-	return NULL;
-}
-
-/*
- * Check every mount in the @info list and make sure it is something we
- * know how to handle. @for_dump selects whether plugins are asked about
- * unresolved bind-mounts (dump) or whether the need_plugin flag is
- * consulted instead (restore).
- *
- * Returns 0 if all mounts pass, -1 on the first one that does not.
- */
-static int validate_mounts(struct mount_info *info, bool for_dump)
-{
- struct mount_info *m, *t;
-
- for (m = info; m; m = m->next) {
- if (m->parent == NULL || m->is_ns_root)
- /* root mount can be any */
- continue;
-
- /* A mount with a non-zero shared_id has to pass validate_shared() */
- if (m->shared_id && validate_shared(m))
- return -1;
-
- /* External mounts skip the fstype and bind-source checks */
- if (m->external)
- goto skip_fstype;
-
- /*
- * Mountpoint can point to / of an FS. In that case this FS
- * should be of some known type so that we can just mount one.
- *
- * Otherwise it's a bindmount mountpoint and we try to find
- * what fsroot mountpoint it's bound to. If this point is the
- * root mount, the path to bindmount root should be accessible
- * from the rootmount path (the strstartswith check in
- * find_fsroot_mount_for()).
- */
-
- if (fsroot_mounted(m)) {
- if (m->fstype->code == FSTYPE__UNSUPPORTED) {
- pr_err("FS mnt %s dev %#x root %s unsupported id %d\n",
- m->mountpoint, m->s_dev, m->root, m->mnt_id);
- return -1;
- }
- } else {
- t = find_fsroot_mount_for(m);
- if (!t) {
- int ret;
-
- /*
- * No root-mount found for this bind and it's neither
- * marked nor auto-resolved as external one. So last
- * chance not to fail is to talk to plugins.
- */
-
- if (for_dump) {
- ret = run_plugins(DUMP_EXT_MOUNT, m->mountpoint, m->mnt_id);
- if (ret == 0)
- m->need_plugin = true;
- } else
- /*
- * Plugin should take care of this one
- * in restore_ext_mount, or do_bind_mount
- * will mount it as external
- */
- ret = m->need_plugin ? 0 : -ENOTSUP;
-
- if (ret < 0) {
- /* Only -ENOTSUP gets a message here; other errors fail silently */
- if (ret == -ENOTSUP)
- pr_err("%d:%s doesn't have a proper root mount\n",
- m->mnt_id, m->mountpoint);
- return -1;
- }
- }
- }
-skip_fstype:
- /*
- * Compare m against each of its siblings: any sibling t for which
- * issubpath(m->mountpoint, t->mountpoint) holds makes m overmounted.
- */
- list_for_each_entry(t, &m->parent->children, siblings) {
- if (m == t)
- continue;
- if (!issubpath(m->mountpoint, t->mountpoint))
- continue;
-
- pr_err("%d:%s is overmounted\n", m->mnt_id, m->mountpoint);
- return -1;
- }
- }
-
- return 0;
-}
-
-/*
- * Strip from @target_root the leading part it shares with @source_root
- * and return a pointer to what is left. For non-root binds the source
- * is always "/" (checked), so only that slash is removed. Hitting the
- * end of @target_root before the end of @source_root is a BUG.
- */
-static char *cut_root_for_bind(char *target_root, char *source_root)
-{
-	int pos = 0;
-
-	for (;;) {
-		if (target_root[pos] != source_root[pos])
-			break;
-
-		pos++;
-		if (source_root[pos] == '\0')
-			break;
-		BUG_ON(target_root[pos] == '\0');
-	}
-
-	return &target_root[pos];
-}
-
-/*
- * Scan @list for the mount that @info is best bind-mounted from.
- *
- * A mount qualifies when it sits on the same superblock as @info and
- * issubpath(info->root, it->root) holds. The latter matters for setups
- * like
- *
- *   root@criu:~# mount --bind bind1/subdir/ bind2
- *   root@criu:~# mount --bind bind1/ bind3
- *
- * done outside the container with bind1 bind-mounted directly inside
- * it: these mounts are equal for bind purposes but have different
- * roots, and we want the one with the right root.
- *
- * Among the qualifying mounts one with matching sharing wins at once.
- * Consider
- *
- *   mount /xxx
- *   mount --bind /xxx /yyy
- *   mount --make-shared /yyy
- *   mount --bind /xxx /zzz
- *   mount --make-shared /zzz
- *   bind mount a shared mount into the namespace
- *
- * where we want the /right/ mount and not just an equal one. On the
- * other hand, with
- *
- *   bind mount a shared mount into the namespace
- *   inside the namespace, remount MS_PRIVATE
- *   inside the namespace, remount MS_SHARED
- *
- * no external mount has matching sharing since the sharing is purely
- * internal; we still want to bind from that mountinfo, so the last
- * qualifying mount is returned as a fallback (the caller then has to
- * deal with the sharing itself).
- *
- * Returns NULL when nothing qualifies.
- */
-static struct mount_info *find_best_external_match(struct mount_info *list, struct mount_info *info)
-{
-	struct mount_info *fallback = NULL;
-	struct mount_info *it;
-
-	for (it = list; it; it = it->next) {
-		if (!mounts_sb_equal(info, it) || !issubpath(info->root, it->root))
-			continue;
-
-		fallback = it;
-
-		/* Exact matches: same shared group, or it is our master */
-		if ((info->flags & MS_SHARED) && info->shared_id == it->shared_id)
-			return it;
-
-		if ((info->flags & MS_SLAVE) && info->master_id == it->shared_id)
-			return it;
-	}
-
-	return fallback;
-}
-
-/*
- * Find the ns_id describing criu's own mount namespace (type NS_CRIU
- * with the mnt_ns_desc descriptor) and make sure its mountinfo list is
- * collected.
- *
- * Returns the ns_id, or NULL (with an error logged) if no such entry
- * exists or its mountinfo can't be collected.
- */
-static struct ns_id *find_ext_ns_id(void)
-{
-	struct ns_id *ns;
-
-	/*
-	 * Iterate while ns itself is valid: testing ns->next instead would
-	 * never look at the last entry of the list and would dereference
-	 * NULL if the list is empty.
-	 */
-	for (ns = ns_ids; ns; ns = ns->next) {
-		if (ns->type != NS_CRIU || ns->nd != &mnt_ns_desc)
-			continue;
-
-		/* Collect the mountinfo lazily, on first use */
-		if (!ns->mnt.mntinfo_list && !collect_mntinfo(ns, true))
-			break;
-
-		return ns;
-	}
-
-	pr_err("Failed to find criu pid's mount ns\n");
-	return NULL;
-}
-
-/*
- * Go over the @info list and mark the mounts that are external.
- *
- * Each non-root mount is first offered to try_resolve_ext_mount(). When
- * that reports -ENOTSUP and opts.autodetect_ext_mounts is set, the mount
- * is matched against the mounts of criu's own mount namespace; on a match
- * the mount gets an AUTODETECTED_MOUNT ext_mount and its source is
- * replaced with the path of the match.
- *
- * Returns 0 on success, -1 on error.
- */
-static int resolve_external_mounts(struct mount_info *info)
-{
- struct ns_id *ext_ns = NULL;
- struct mount_info *m;
-
- if (opts.autodetect_ext_mounts) {
- ext_ns = find_ext_ns_id();
- if (!ext_ns)
- return -1;
- }
-
- for (m = info; m; m = m->next) {
- int ret;
- char *p, *cut_root;
- struct ext_mount *em;
- struct mount_info *match;
-
- if (m->parent == NULL || m->is_ns_root)
- continue;
-
- /*
- * Hard errors abort; 0 and -ENOTSUP without autodetection both
- * mean there is nothing more to do for this mount.
- */
- ret = try_resolve_ext_mount(m);
- if (ret < 0 && ret != -ENOTSUP) {
- return -1;
- } else if (ret == -ENOTSUP && !ext_ns) {
- continue;
- } else if (ret == 0) {
- continue;
- }
-
- /* NOTE(review): ext_ns is dereferenced here; this relies on ret never being positive when ext_ns is NULL -- confirm */
- match = find_best_external_match(ext_ns->mnt.mntinfo_list, m);
- if (!match)
- continue;
-
- if (m->flags & MS_SHARED) {
- if (!opts.enable_external_sharing)
- continue;
-
- /* Shared, but not with the match: the sharing is internal */
- if (m->shared_id != match->shared_id)
- m->internal_sharing = true;
- }
-
- if (m->flags & MS_SLAVE) {
- if (!opts.enable_external_masters)
- continue;
-
- /*
- * In order to support something like internal slavery,
- * we need to teach can_mount_now and do_mount_one
- * about slavery relationships in external mounts. This
- * seems like an uncommon case, so we punt for now.
- */
- if (m->master_id != match->shared_id)
- continue;
- }
-
- cut_root = cut_root_for_bind(m->root, match->root);
-
- /* The +1 skips the first character of the match's mountpoint */
- p = xsprintf("%s/%s", match->mountpoint + 1, cut_root);
- if (!p)
- return -1;
-
- em = xmalloc(sizeof(struct ext_mount));
- if (!em) {
- free(p);
- return -1;
- }
-
- em->val = AUTODETECTED_MOUNT;
- em->key = p;
-
- m->external = em;
-
- /* Note: em->key and m->source point to the same string from here on */
- xfree(m->source);
- m->source = p;
-
- pr_info("autodetected external mount %s for %s\n", p, m->mountpoint);
- }
-
- return 0;
-}
-
-/*
- * Link the mounts of the @info list together by sharing relations:
- *  - a slave is put on its master's mnt_slave_list and gets mnt_master set;
- *  - members of one shared group are linked via mnt_share;
- *  - mounts for which mounts_sb_equal() holds are linked via mnt_bind.
- *
- * @root_master_id, when non-zero, names a master_id that is reset to -1 on
- * every mount carrying it, so no master is searched for such mounts.
- *
- * Returns 0 on success, -1 if some non-external mount with a parent has
- * a master that is not in the list.
- */
-static int resolve_shared_mounts(struct mount_info *info, int root_master_id)
-{
- struct mount_info *m, *t;
-
- /*
- * If we have a shared mounts, both master
- * slave targets are to be present in mount
- * list, otherwise we can't be sure if we can
- * recreate the scheme later on restore.
- */
- for (m = info; m; m = m->next) {
- bool need_share, need_master;
-
- /* the root master_id can be ignored, because it's already created */
- if (root_master_id && root_master_id == m->master_id)
- m->master_id = -1;
-
- /* Only collect the group if m isn't already linked into one */
- need_share = m->shared_id && list_empty(&m->mnt_share);
- need_master = m->master_id > 0;
-
- pr_debug("Inspecting sharing on %2d shared_id %d master_id %d (@%s)\n",
- m->mnt_id, m->shared_id, m->master_id, m->mountpoint);
-
- for (t = info; t && (need_share || need_master); t = t->next) {
- if (t == m)
- continue;
- /* The first mount found in the master's group becomes the master */
- if (need_master && t->shared_id == m->master_id) {
- pr_debug("\tThe mount %3d is slave for %3d (@%s -> @%s)\n",
- m->mnt_id, t->mnt_id,
- m->mountpoint, t->mountpoint);
- list_add(&m->mnt_slave, &t->mnt_slave_list);
- m->mnt_master = t;
- need_master = false;
- }
-
- /* Collect all mounts from this group */
- if (need_share && t->shared_id == m->shared_id) {
- pr_debug("\tMount %3d is shared with %3d group %3d (@%s -> @%s)\n",
- m->mnt_id, t->mnt_id, m->shared_id,
- t->mountpoint, m->mountpoint);
- list_add(&t->mnt_share, &m->mnt_share);
- }
- }
-
- /*
- * If we haven't already determined this mount is external,
- * then we don't know where it came from.
- */
- if (need_master && m->parent && !m->external) {
- pr_err("Mount %d %s (master_id: %d shared_id: %d) "
- "has unreachable sharing. Try --enable-external-masters.\n", m->mnt_id,
- m->mountpoint, m->master_id, m->shared_id);
- return -1;
- }
-
- /* Search bind-mounts */
- if (list_empty(&m->mnt_bind)) {
- /*
- * A first mounted point will be set up as a source point
- * for others. Look at propagate_mount()
- */
- for (t = m->next; t; t = t->next) {
- if (mounts_sb_equal(m, t)) {
- list_add(&t->mnt_bind, &m->mnt_bind);
- pr_debug("\tThe mount %3d is bind for %3d (@%s -> @%s)\n",
- t->mnt_id, m->mnt_id,
- t->mountpoint, m->mountpoint);
- }
- }
- }
- }
-
- return 0;
-}
-
-/*
- * Build the mount tree out of the flat @list (see mnt_build_ids_tree()
- * for the meaning of @roots_mp), re-sort the siblings and print the
- * result. Returns the root of the tree or NULL on failure.
- */
-static struct mount_info *mnt_build_tree(struct mount_info *list, struct mount_info *roots_mp)
-{
-	struct mount_info *root;
-
-	/* Organize them in a sequence in which they can be mounted/umounted. */
-	pr_info("Building mountpoints tree\n");
-
-	root = mnt_build_ids_tree(list, roots_mp);
-	if (root) {
-		mnt_resort_siblings(root);
-		pr_info("Done:\n");
-		mnt_tree_show(root, 0);
-	}
-
-	return root;
-}
-
-/*
- * Return a file descriptor for the mountpoint of @pm after verifying
- * that it really refers to the expected device.
- *
- * @mnt_fd is an already opened descriptor of the mountpoint, or -1 to
- * have this function open it relative to the mount namespace root.
- * The descriptor is closed on the error paths that follow a successful
- * open. Returns the descriptor or -1.
- */
-int __open_mountpoint(struct mount_info *pm, int mnt_fd)
-{
-	struct stat st;
-	dev_t dev;
-
-	if (mnt_fd == -1) {
-		int root_fd = mntns_get_root_fd(pm->nsid);
-
-		if (root_fd < 0)
-			return -1;
-
-		mnt_fd = openat(root_fd, pm->ns_mountpoint, O_RDONLY);
-		if (mnt_fd < 0) {
-			pr_perror("Can't open %s", pm->ns_mountpoint);
-			return -1;
-		}
-	}
-
-	if (fstat(mnt_fd, &st) < 0) {
-		pr_perror("fstat(%s) failed", pm->ns_mountpoint);
-		goto err;
-	}
-
-	if (pm->s_dev_rt == MOUNT_INVALID_DEV) {
-		pr_err("Resolving over unvalid device for %#x %s %s\n",
-		       pm->s_dev, pm->fstype->name, pm->ns_mountpoint);
-		goto err;
-	}
-
-	/*
-	 * The comparison is always done against @s_dev_rt: the @s_dev
-	 * taken from the image (on restore) is allowed to differ from
-	 * the real device, e.g. after a migration the kernel allocates
-	 * a new device ID.
-	 */
-	dev = phys_stat_resolve_dev(pm->nsid, st.st_dev, pm->ns_mountpoint + 1);
-	if (dev == pm->s_dev_rt)
-		return mnt_fd;
-
-	pr_err("The file system %#x %#x (%#x) %s %s is inaccessible\n",
-	       pm->s_dev, pm->s_dev_rt, (int)dev,
-	       pm->fstype->name, pm->ns_mountpoint);
-err:
-	close(mnt_fd);
-	return -1;
-}
-
-/*
- * Open the mountpoint of the mount found by lookup_mnt_sdev(@s_dev).
- * Returns the descriptor, -ENOENT if no such mount is known, or
- * whatever __open_mountpoint() returns on failure.
- */
-int open_mount(unsigned int s_dev)
-{
-	struct mount_info *mi = lookup_mnt_sdev(s_dev);
-
-	return mi ? __open_mountpoint(mi, -1) : -ENOENT;
-}
-
-/*
- * Bind-mount the mount point of @mi (without children) onto a freshly
- * created temporary directory. The directory is made from the
- * @mnt_path_tmp template, falling back to @mnt_path_root when the
- * first attempt fails with ENOENT. Returns the directory path, or NULL
- * on failure.
- */
-static char *get_clean_mnt(struct mount_info *mi, char *mnt_path_tmp, char *mnt_path_root)
-{
-	char *dir;
-
-	dir = mkdtemp(mnt_path_tmp);
-	if (!dir && errno == ENOENT)
-		dir = mkdtemp(mnt_path_root);
-	if (!dir) {
-		pr_perror("Can't create a temporary directory");
-		return NULL;
-	}
-
-	if (mount(mi->mountpoint, dir, NULL, MS_BIND, NULL) == 0)
-		return dir;
-
-	pr_perror("Can't bind-mount %d:%s to %s",
-		  mi->mnt_id, mi->mountpoint, dir);
-	rmdir(dir);
-	return NULL;
-}
-
-/*
- * Open the mountpoint of @pm in a way that its own content is visible
- * even if something is mounted on top of it. Returns a file descriptor
- * (as validated by __open_mountpoint()) or -1 on error.
- */
-static int open_mountpoint(struct mount_info *pm)
-{
- int fd = -1, ns_old = -1;
- char mnt_path_tmp[] = "/tmp/cr-tmpfs.XXXXXX";
- char mnt_path_root[] = "/cr-tmpfs.XXXXXX";
- char *mnt_path = mnt_path_tmp;
- int cwd_fd;
-
- /*
- * If a mount doesn't have children, we can open a mount point,
- * otherwise we need to create a "private" copy.
- */
- if (list_empty(&pm->children))
- return __open_mountpoint(pm, -1);
-
- pr_info("Something is mounted on top of %s\n", pm->mountpoint);
-
- /*
- * To create a "private" copy, the target mount is bind-mounted
- * in a temporary place w/o MS_REC (non-recursively).
- * A mount point can't be bind-mounted in criu's namespace, it will be
- * mounted in a target namespace. The sequence of actions is
- * mkdtemp, setns(tgt), mount, open, detach, setns(old).
- */
-
- /* Remember the cwd so that it can be restored with fchdir() afterwards */
- cwd_fd = open(".", O_DIRECTORY);
- if (cwd_fd < 0) {
- pr_perror("Unable to open cwd");
- return -1;
- }
-
- if (switch_ns(pm->nsid->ns_pid, &mnt_ns_desc, &ns_old) < 0)
- goto out;
-
- mnt_path = get_clean_mnt(pm, mnt_path_tmp, mnt_path_root);
- if (mnt_path == NULL)
- goto out;
-
- fd = open_detach_mount(mnt_path);
- if (fd < 0)
- goto out;
-
- if (restore_ns(ns_old, &mnt_ns_desc)) {
- /* Reset ns_old so that the out path doesn't call restore_ns() again */
- ns_old = -1;
- goto out;
- }
- if (fchdir(cwd_fd)) {
- pr_perror("Unable to restore cwd");
- close(cwd_fd);
- close(fd);
- return -1;
- }
- close(cwd_fd);
-
- return __open_mountpoint(pm, fd);
-out:
- /* Error path: get back to the old namespace and cwd, drop the fd */
- if (ns_old >= 0)
- restore_ns(ns_old, &mnt_ns_desc);
- close_safe(&fd);
- if (fchdir(cwd_fd))
- pr_perror("Unable to restore cwd");
- close(cwd_fd);
- return -1;
-}
-
-/*
- * Append @opt to the option string of @pm, comma-separated unless the
- * string is empty so far. Returns 0 on success, -1 if the resulting
- * pm->options is NULL.
- */
-static int attach_option(struct mount_info *pm, char *opt)
-{
-	if (pm->options[0] != '\0')
-		pm->options = xstrcat(pm->options, ",%s", opt);
-	else
-		pm->options = xstrcat(pm->options, "%s", opt);
-
-	if (!pm->options)
-		return -1;
-
-	return 0;
-}
-
-/*
- * Find out whether the devpts of @pm was mounted with the newinstance
- * option and, if so, add it to the mount's options.
- */
-static int devpts_parse(struct mount_info *pm)
-{
-	int virt;
-
-	virt = kerndat_fs_virtualized(KERNDAT_FS_STAT_DEVPTS, pm->s_dev);
-	if (virt > 0)
-		/*
-		 * The kernel doesn't show this option, but a new
-		 * (virtualized) fs instance can only have been
-		 * created with -o newinstance.
-		 */
-		return attach_option(pm, "newinstance");
-
-	return virt;
-}
-
-/*
- * Dump the content of the tmpfs mount @pm by running tar over its
- * mountpoint and writing the gzipped archive into the CR_FD_TMPFS_DEV
- * image keyed by pm->s_dev. Returns 0 on success, non-zero on failure.
- */
-static int tmpfs_dump(struct mount_info *pm)
-{
-	int ret = -1, fd = -1, userns_pid = -1;
-	char tmpfs_path[PSFDS];
-	struct cr_img *img;
-	int fdflags;
-
-	fd = open_mountpoint(pm);
-	if (fd < 0)
-		return -1;
-
-	/*
-	 * cr_system_userns closes STDIN_FILENO (we don't want to pass
-	 * stdin to tar), so if fd happens to be 0 it has to be moved
-	 * to something non-zero first.
-	 */
-	if (move_img_fd(&fd, STDIN_FILENO) < 0)
-		goto out;
-
-	fdflags = fcntl(fd, F_GETFD);
-	if (fcntl(fd, F_SETFD, fdflags & ~FD_CLOEXEC) == -1) {
-		pr_perror("Can not drop FD_CLOEXEC");
-		goto out;
-	}
-
-	img = open_image(CR_FD_TMPFS_DEV, O_DUMP, pm->s_dev);
-	if (!img)
-		goto out;
-
-	/* tar reaches the mount via our (now inheritable) descriptor */
-	sprintf(tmpfs_path, "/proc/self/fd/%d", fd);
-
-	if (root_ns_mask & CLONE_NEWUSER)
-		userns_pid = root_item->pid.real;
-
-	ret = cr_system_userns(-1, img_raw_fd(img), -1, "tar", (char *[])
-			{ "tar", "--create",
-			"--gzip",
-			"--no-unquote",
-			"--no-wildcards",
-			"--one-file-system",
-			"--check-links",
-			"--preserve-permissions",
-			"--sparse",
-			"--numeric-owner",
-			"--directory", tmpfs_path, ".", NULL }, 0, userns_pid);
-	if (ret != 0)
-		pr_err("Can't dump tmpfs content\n");
-
-	close_image(img);
-out:
-	close_safe(&fd);
-	return ret;
-}
-
-/*
- * A devtmpfs that is virtualized on either side (dump or restore) is
- * to be handled like a plain tmpfs.
- *
- * An interesting case is shared on dump and virtual on restore: it
- * will fail, as no tarball with the fs contents will be found.
- */
-
-static int devtmpfs_virtual(struct mount_info *pm)
-{
-	int virt;
-
-	virt = kerndat_fs_virtualized(KERNDAT_FS_STAT_DEVTMPFS, pm->s_dev);
-	return virt;
-}
-
-/*
- * Dump a devtmpfs mount: when devtmpfs_virtual() reports 1 the mount
- * is dumped like a tmpfs, any other value is returned as is.
- */
-static int devtmpfs_dump(struct mount_info *pm)
-{
-	int virt = devtmpfs_virtual(pm);
-
-	if (virt != 1)
-		return virt;
-
-	return tmpfs_dump(pm);
-}
-
-/*
- * Unpack the dumped tmpfs content into the mountpoint of @pm. The
- * archive is looked up in the CR_FD_TMPFS_DEV image (by s_dev) first
- * and, if that one is empty, in the CR_FD_TMPFS_IMG image (by mnt_id).
- * Returns 0 on success, -1 on failure.
- */
-static int tmpfs_restore(struct mount_info *pm)
-{
-	struct cr_img *img;
-	int err;
-
-	img = open_image(CR_FD_TMPFS_DEV, O_RSTR, pm->s_dev);
-	if (empty_image(img)) {
-		close_image(img);
-		img = open_image(CR_FD_TMPFS_IMG, O_RSTR, pm->mnt_id);
-	}
-
-	if (!img)
-		return -1;
-
-	if (empty_image(img)) {
-		close_image(img);
-		return -1;
-	}
-
-	err = cr_system(img_raw_fd(img), -1, -1, "tar",
-			(char *[]) {"tar", "--extract", "--gzip",
-				"--no-unquote", "--no-wildcards",
-				"--directory", pm->mountpoint, NULL}, 0);
-	close_image(img);
-
-	if (!err)
-		return 0;
-
-	pr_err("Can't restore tmpfs content\n");
-	return -1;
-}
-
-/*
- * Restore a devtmpfs mount: when devtmpfs_virtual() reports 1 the
- * content is restored like a tmpfs, any other value is returned as is.
- */
-static int devtmpfs_restore(struct mount_info *pm)
-{
-	int virt = devtmpfs_virtual(pm);
-
-	if (virt != 1)
-		return virt;
-
-	return tmpfs_restore(pm);
-}
-
-/* Ask kerndat whether the binfmt_misc instance of @pm is virtualized. */
-static int binfmt_misc_virtual(struct mount_info *pm)
-{
-	int virt;
-
-	virt = kerndat_fs_virtualized(KERNDAT_FS_STAT_BINFMT_MISC, pm->s_dev);
-	return virt;
-}
-
-/*
- * Parse the description of one binfmt_misc entry read line by line
- * from @f and fill @bme with it.
- *
- * Recognized lines are "enabled", "disabled", "offset N" and the
- * "interpreter ", "flags: ", "extension .", "magic " and "mask " keys,
- * whose values are duplicated into @bme (the caller frees them).
- *
- * Returns 0 when the end of the file is reached, -1 on a read or
- * allocation error or on a line that is not recognized.
- */
-static int parse_binfmt_misc_entry(struct bfd *f, BinfmtMiscEntry *bme)
-{
-	while (1) {
-		char *str;
-
-		str = breadline(f);
-		if (IS_ERR(str))
-			return -1;
-		if (!str)
-			break;
-
-		if (!strncmp(str, "enabled", 7)) {
-			bme->enabled = true;
-			continue;
-		}
-
-		/* Nothing to record: a missing "enabled" line leaves bme->enabled unset */
-		if (!strncmp(str, "disabled", 8))
-			continue;
-
-		if (!strncmp(str, "offset ", 7)) {
-			if (sscanf(str + 7, "%i", &bme->offset) != 1)
-				return -1;
-			bme->has_offset = true;
-			continue;
-		}
-
-/* If the line starts with @key, store a copy of the rest in bme->member */
-#define DUP_EQUAL_AS(key, member)					\
-		if (!strncmp(str, key, strlen(key))) {			\
-			bme->member = xstrdup(str + strlen(key));	\
-			if (!bme->member)				\
-				return -1;				\
-			continue;					\
-		}
-		DUP_EQUAL_AS("interpreter ", interpreter)
-		DUP_EQUAL_AS("flags: ", flags)
-		DUP_EQUAL_AS("extension .", extension)
-		DUP_EQUAL_AS("magic ", magic)
-		DUP_EQUAL_AS("mask ", mask)
-#undef DUP_EQUAL_AS
-
-		/*
-		 * This is a parse error, not a failed syscall, so there is
-		 * no errno to report: use pr_err rather than pr_perror.
-		 */
-		pr_err("binfmt_misc: unsupported feature %s\n", str);
-		return -1;
-	}
-
-	return 0;
-}
-
-/*
- * Dump the binfmt_misc entry @name found in the directory @dfd: parse
- * its description and write it into @img. Returns 0 on success and -1
- * on failure.
- */
-static int dump_binfmt_misc_entry(int dfd, char *name, struct cr_img *img)
-{
-	BinfmtMiscEntry bme = BINFMT_MISC_ENTRY__INIT;
-	int ret = -1;
-	struct bfd f;
-
-	f.fd = openat(dfd, name, O_RDONLY);
-	if (f.fd < 0) {
-		pr_perror("binfmt_misc: can't open %s", name);
-		return -1;
-	}
-
-	if (bfdopenr(&f))
-		return -1;
-
-	if (!parse_binfmt_misc_entry(&f, &bme)) {
-		bme.name = name;
-
-		if (!pb_write_one(img, &bme, PB_BINFMT_MISC))
-			ret = 0;
-	}
-
-	/* The string fields were duplicated by the parser; name was not */
-	free(bme.interpreter);
-	free(bme.flags);
-	free(bme.extension);
-	free(bme.magic);
-	free(bme.mask);
-	bclose(&f);
-	return ret;
-}
-
-/*
- * Dump all entries registered in the binfmt_misc mount @pm into the
- * CR_FD_BINFMT_MISC image. Nothing is dumped unless
- * binfmt_misc_virtual() reports a positive value; a zero or negative
- * value is returned as is. Returns 0 on success, -1 on failure.
- */
-static int binfmt_misc_dump(struct mount_info *pm)
-{
-	struct cr_img *img;
-	struct dirent *ent;
-	DIR *dir = NULL;
-	int fd, ret;
-
-	ret = binfmt_misc_virtual(pm);
-	if (ret <= 0)
-		return ret;
-
-	fd = open_mountpoint(pm);
-	if (fd < 0)
-		return -1;
-
-	dir = fdopendir(fd);
-	if (!dir) {
-		close(fd);
-		return -1;
-	}
-
-	ret = -1;
-	img = open_image(CR_FD_BINFMT_MISC, O_DUMP, pm->s_dev);
-	if (!img)
-		goto out;
-
-	for (ent = readdir(dir); ent; ent = readdir(dir)) {
-		/* "register" and "status" are control files, not entries */
-		if (dir_dots(ent) ||
-		    !strcmp(ent->d_name, "register") ||
-		    !strcmp(ent->d_name, "status"))
-			continue;
-
-		if (dump_binfmt_misc_entry(fd, ent->d_name, img))
-			goto out;
-	}
-
-	ret = 0;
-out:
-	if (img)
-		close_image(img);
-	closedir(dir);
-	return ret;
-}
-
-/*
- * Register one binfmt_misc entry on restore.
- *
- * @mp:  mountpoint of the binfmt_misc filesystem
- * @buf: registration string, written to <mp>/register
- * @bme: the dumped entry; if it is not enabled, "0" is additionally
- *       written to the entry's own file <mp>/<name>
- *
- * Returns 0 on success, -1 on failure.
- */
-static int restore_binfmt_misc_entry(char *mp, char *buf, BinfmtMiscEntry *bme)
-{
-	int fd, len, ret = -1;
-	char path[PATH_MAX+1];
-
-	snprintf(path, PATH_MAX, "%s/register", mp);
-
-	fd = open(path, O_WRONLY);
-	if (fd < 0) {
-		pr_perror("binfmt_misc: can't open %s", path);
-		return -1;
-	}
-
-	len = strlen(buf);
-
-	if (write(fd, buf, len) != len) {
-		pr_perror("binfmt_misc: can't write to %s", path);
-		goto close;
-	}
-
-	if (!bme->enabled) {
-		close(fd);
-		snprintf(path, PATH_MAX, "%s/%s", mp, bme->name);
-
-		/*
-		 * open() reports failure with -1, not 0: testing "!fd"
-		 * would miss a failed open (and then write to and close
-		 * fd -1) and would reject a valid descriptor 0.
-		 */
-		fd = open(path, O_WRONLY);
-		if (fd < 0) {
-			pr_perror("binfmt_misc: can't open %s", path);
-			goto out;
-		}
-		if (write(fd, "0", 1) != 1) {
-			pr_perror("binfmt_misc: can't write to %s", path);
-			goto close;
-		}
-	}
-
-	ret = 0;
-close:
-	close(fd);
-out:
-	return ret;
-}
-
-/* Size of the buffer for a binfmt_misc registration string, including the trailing NUL */
-#define BINFMT_MISC_STR (1920 + 1)
-/*
- * Compose into @buf the registration string of a magic-type entry @bme.
- * @buf has to be at least BINFMT_MISC_STR bytes long.
- *
- * Returns 1 on success, -1 if the string would not fit or the offset
- * is above 128.
- */
-static int make_bfmtm_magic_str(char *buf, BinfmtMiscEntry *bme)
-{
- int i, len;
-
- /*
- * Format is ":name:type(M):offset:magic:mask:interpreter:flags".
- * Magic and mask are special fields. Kernel outputs them as
- * a sequence of hexadecimal numbers (abc -> 616263), and we
- * dump them without changes. But for registering a new entry
- * it expects every byte is prepended with \x, i.e. \x61\x62\x63.
- */
- /* Every two hex digits of magic/mask become four characters, hence the 2 * strlen() */
- len = strlen(bme->name) + 3 /* offset < 128 */ + 2 * strlen(bme->magic)
- + (bme->mask ? 2 * strlen(bme->mask) : 0) + strlen(bme->interpreter)
- + (bme->flags ? strlen(bme->flags) : 0) + strlen(":::::::");
-
- /* NOTE(review): the 3 characters reserved for the offset assume 0..128; a negative offset is not rejected here -- confirm */
- if ((len > BINFMT_MISC_STR - 1) || bme->offset > 128)
- return -1;
-
- buf += sprintf(buf, ":%s:M:%d:", bme->name, bme->offset);
-
- /* Emit the magic two hex digits at a time, each pair prefixed with \x */
- len = strlen(bme->magic);
- for (i = 0; i < len; i += 2)
- buf += sprintf(buf, "\\x%c%c", bme->magic[i], bme->magic[i + 1]);
-
- buf += sprintf(buf, ":");
-
- /* The mask is optional and encoded the same way */
- if (bme->mask) {
- len = strlen(bme->mask);
- for (i = 0; i < len; i += 2)
- buf += sprintf(buf, "\\x%c%c", bme->mask[i], bme->mask[i + 1]);
- }
-
- sprintf(buf, ":%s:%s", bme->interpreter, bme->flags ? : "\0");
-
- return 1;
-}
-
-/*
- * Re-register in the binfmt_misc mount @mi all the entries found in
- * the CR_FD_BINFMT_MISC image keyed by mi->s_dev.
- *
- * Returns 0 once all entries are restored, a negative value on error.
- */
-static int binfmt_misc_restore(struct mount_info *mi)
-{
-	struct cr_img *img;
-	char *buf;
-	int ret = -1;
-
-	buf = xmalloc(BINFMT_MISC_STR);
-	if (!buf)
-		return -1;
-
-	img = open_image(CR_FD_BINFMT_MISC, O_RSTR, mi->s_dev);
-	if (!img)
-		goto free_buf;
-
-	ret = 0;
-	while (ret == 0) {
-		BinfmtMiscEntry *bme;
-
-		/* 0 means the end of the image, a negative value an error */
-		ret = pb_read_one_eof(img, &bme, PB_BINFMT_MISC);
-		if (ret <= 0)
-			break;
-
-		/* :name:type:offset:magic/extension:mask:interpreter:flags */
-		if ((!bme->magic && !bme->extension) || !bme->interpreter) {
-			/* Not a syscall failure, so no errno to print */
-			pr_err("binfmt_misc: bad dump\n");
-			ret = -1;
-		} else if (bme->magic) {
-			ret = make_bfmtm_magic_str(buf, bme);
-		} else if (bme->extension) {
-			/* :name:E::extension::interpreter:flags */
-			ret = snprintf(buf, BINFMT_MISC_STR, ":%s:E::%s::%s:%s",
-				       bme->name, bme->extension, bme->interpreter,
-				       bme->flags ? : "\0");
-			/* Never register a truncated string */
-			if (ret >= BINFMT_MISC_STR) {
-				pr_err("binfmt_misc: entry %s is too long\n", bme->name);
-				ret = -1;
-			}
-		}
-
-		/* A positive ret here means buf holds a complete string */
-		if (ret > 0) {
-			pr_debug("binfmt_misc_pattern=%s\n", buf);
-			ret = restore_binfmt_misc_entry(mi->mountpoint, buf, bme);
-		}
-
-		binfmt_misc_entry__free_unpacked(bme, NULL);
-	}
-
-	close_image(img);
-free_buf:
-	free(buf);
-	return ret;
-}
-
-/*
- * "Dump" a fusectl mount: nothing is saved, the function only checks
- * that every entry found in it is a number and that no FUSE mount with
- * that minor device number is left that is not external.
- * Returns 0 if the check passes, -1 otherwise.
- */
-static int fusectl_dump(struct mount_info *pm)
-{
-	struct dirent *ent;
-	DIR *dir = NULL;
-	int fd, ret = -1;
-
-	fd = open_mountpoint(pm);
-	if (fd < 0)
-		return -1;
-
-	dir = fdopendir(fd);
-	if (!dir) {
-		close(fd);
-		return -1;
-	}
-
-	for (ent = readdir(dir); ent; ent = readdir(dir)) {
-		struct mount_info *mi;
-		int id;
-
-		if (dir_dots(ent))
-			continue;
-
-		if (sscanf(ent->d_name, "%d", &id) != 1) {
-			pr_err("wrong number of items scanned in fusectl dump\n");
-			goto out;
-		}
-
-		for (mi = mntinfo; mi; mi = mi->next) {
-			if (mi->fstype->code != FSTYPE__FUSE)
-				continue;
-			if (id != minor(mi->s_dev) || mi->external)
-				continue;
-
-			pr_err("%s is a fuse mount but not external\n", mi->mountpoint);
-			goto out;
-		}
-	}
-
-	ret = 0;
-out:
-	closedir(dir);
-	return ret;
-}
-
-/*
- * "Dump" a filesystem whose content is not saved: the only thing done
- * is making sure the mount @pm is empty.
- *
- * Returns 0 if the mount is empty, -1 if it is not or if the check
- * itself failed.
- */
-static int dump_empty_fs(struct mount_info *pm)
-{
-	int fd, ret;
-
-	fd = open_mountpoint(pm);
-	if (fd < 0)
-		return -1;
-
-	/*
-	 * As used here is_empty_dir() yields a negative value on error,
-	 * zero for a non-empty directory and a positive value for an
-	 * empty one (this function has always mapped zero to failure
-	 * and positive to success). Report the two failures separately
-	 * so that "isn't empty" is printed for the case it describes.
-	 */
-	ret = is_empty_dir(fd);
-	close(fd);
-	if (ret < 0) {
-		pr_err("Can't check whether %s is empty\n", pm->fstype->name);
-		return -1;
-	}
-
-	if (ret == 0) {
-		pr_err("%s isn't empty\n", pm->fstype->name);
-		return -1;
-	}
-
-	return 0;
-}
-
-/*
- * Callback for filesystems (fuse) that can't be dumped at all: dump
- * and restore of such a filesystem always fails.
- */
-static int always_fail(struct mount_info *pm)
-{
-	pr_err("failed to dump fs %s (%s): always fail\n",
-	       pm->mountpoint, pm->fstype->name);
-
-	return -1;
-}
-
-/*
- * Table of the known filesystem types.
- *
- * Slot 0 ("unsupported") is the fallback that the lookup functions return
- * for anything they can't resolve. The slots after the last initialized
- * one have a NULL name; __find_fstype_by_name() claims them one by one
- * for FSTYPE__AUTO filesystems.
- *
- * The optional callbacks are .parse, .dump and .restore.
- */
-static struct fstype fstypes[32] = {
- {
- .name = "unsupported",
- .code = FSTYPE__UNSUPPORTED,
- }, {
- .name = "proc",
- .code = FSTYPE__PROC,
- }, {
- .name = "sysfs",
- .code = FSTYPE__SYSFS,
- }, {
- .name = "devtmpfs",
- .code = FSTYPE__DEVTMPFS,
- .dump = devtmpfs_dump,
- .restore = devtmpfs_restore,
- }, {
- .name = "binfmt_misc",
- .code = FSTYPE__BINFMT_MISC,
- .dump = binfmt_misc_dump,
- .restore = binfmt_misc_restore,
- }, {
- .name = "tmpfs",
- .code = FSTYPE__TMPFS,
- .dump = tmpfs_dump,
- .restore = tmpfs_restore,
- }, {
- .name = "devpts",
- .parse = devpts_parse,
- .code = FSTYPE__DEVPTS,
- }, {
- .name = "simfs",
- .code = FSTYPE__SIMFS,
- }, {
- /* btrfs is known by name but explicitly mapped to the unsupported code */
- .name = "btrfs",
- .code = FSTYPE__UNSUPPORTED,
- }, {
- .name = "pstore",
- .dump = dump_empty_fs,
- .code = FSTYPE__PSTORE,
- }, {
- .name = "mqueue",
- .dump = dump_empty_fs,
- .code = FSTYPE__MQUEUE,
- }, {
- .name = "securityfs",
- .code = FSTYPE__SECURITYFS,
- }, {
- .name = "fusectl",
- .dump = fusectl_dump,
- .code = FSTYPE__FUSECTL,
- }, {
- .name = "debugfs",
- .code = FSTYPE__DEBUGFS,
- }, {
- .name = "cgroup",
- .code = FSTYPE__CGROUP,
- }, {
- .name = "aufs",
- .code = FSTYPE__AUFS,
- .parse = aufs_parse,
- }, {
- .name = "fuse",
- .code = FSTYPE__FUSE,
- .dump = always_fail,
- .restore = always_fail,
- }, {
- .name = "overlay",
- .code = FSTYPE__OVERLAYFS,
- .parse = overlayfs_parse,
- },
-};
-
-/*
- * Filesystems to be handled as FSTYPE__AUTO: fsauto_names is NULL (none),
- * a comma-separated list of names, or points to fsauto_all meaning any.
- */
-static char fsauto_all[] = "all";
-static char *fsauto_names;
-
-/*
- * Check whether the comma-separated string @css has an item that is
- * exactly @str. An empty @str is never found.
- */
-static bool css_contains(const char *css, const char *str)
-{
-	const char *hit = css;
-	int n = strlen(str);
-
-	if (n == 0)
-		return false;
-
-	while ((hit = strstr(hit, str)) != NULL) {
-		/* The match has to be delimited by commas or the string ends */
-		bool starts_item = hit == css || hit[-1] == ',';
-		bool ends_item = hit[n] == '\0' || hit[n] == ',';
-
-		if (starts_item && ends_item)
-			return true;
-
-		hit += n;
-	}
-
-	return false;
-}
-
-/*
- * Tell whether the filesystem @name was requested to be handled as an
- * "auto" one: either everything was ("all") or @name is on the list.
- */
-static bool fsname_is_auto(const char *name)
-{
-	if (fsauto_names == NULL)
-		return false;
-
-	return fsauto_names == fsauto_all || css_contains(fsauto_names, name);
-}
-
-/*
- * Add the comma-separated @names to the list of "auto" filesystems.
- * Once "all" has been seen the list is pinned to it and further calls
- * change nothing. Returns false if the new list can't be allocated.
- */
-bool add_fsname_auto(const char *names)
-{
-	char *prev = fsauto_names;
-
-	if (prev == fsauto_all)
-		return true;
-
-	if (css_contains(names, fsauto_all)) {
-		fsauto_names = fsauto_all;
-	} else if (prev) {
-		/* Extend the existing list */
-		if (asprintf(&fsauto_names, "%s,%s", prev, names) < 0)
-			fsauto_names = NULL;
-	} else {
-		fsauto_names = xstrdup(names);
-	}
-
-	xfree(prev);
-	return fsauto_names != NULL;
-}
-
-/*
- * Look the filesystem named @fst up in the fstypes table.
- *
- * A name that is not in the table is given the first free slot as an
- * FSTYPE__AUTO type if @force_auto is set or the name was requested to
- * be "auto". Otherwise, and also when the table is full, the
- * "unsupported" entry is returned.
- */
-static struct fstype *__find_fstype_by_name(char *fst, bool force_auto)
-{
-	int i;
-
-	/*
-	 * The table serves two purposes:
-	 * 1st -- checking for supported filesystems (just mounting
-	 * anything is wrong, almost every fs has its own features)
-	 * 2nd -- saving some space in the image (since we scan all
-	 * names anyway)
-	 */
-	for (i = 1; i < ARRAY_SIZE(fstypes); i++) {
-		struct fstype *slot = &fstypes[i];
-
-		if (slot->name) {
-			if (strcmp(slot->name, fst) == 0)
-				return slot;
-			continue;
-		}
-
-		/* First free slot: the name is not known */
-		if (!force_auto && !fsname_is_auto(fst))
-			break;
-
-		slot->name = xstrdup(fst);
-		slot->code = FSTYPE__AUTO;
-		return slot;
-	}
-
-	if (i == ARRAY_SIZE(fstypes)) /* ensure we have a room for auto */
-		pr_err_once("fstypes[] overflow!\n");
-
-	return &fstypes[0];
-}
-
-/*
- * Look the filesystem named @fst up; unknown names become "auto" types
- * only if they were requested to be, see __find_fstype_by_name().
- */
-struct fstype *find_fstype_by_name(char *fst)
-{
-	struct fstype *found = __find_fstype_by_name(fst, false);
-
-	return found;
-}
-
-/*
- * Map the filesystem code @fst to its fstypes entry. FSTYPE__AUTO is
- * resolved by @fsname instead; an unknown or unsupported code yields
- * the "unsupported" entry.
- */
-static struct fstype *decode_fstype(u32 fst, char *fsname)
-{
-	int i;
-
-	if (fst == FSTYPE__AUTO)
-		return __find_fstype_by_name(fsname, true);
-
-	if (fst != FSTYPE__UNSUPPORTED) {
-		/* Only the named (used) part of the table is scanned */
-		for (i = 1; i < ARRAY_SIZE(fstypes) && fstypes[i].name; i++) {
-			if (fstypes[i].code == fst)
-				return &fstypes[i];
-		}
-	}
-
-	return &fstypes[0];
-}
-
-/*
- * Write the mount @pm into @img as a MntEntry, dumping the content of
- * the filesystem via its ->dump callback first when needed.
- * Returns 0 on success, -1 on failure.
- */
-static int dump_one_mountpoint(struct mount_info *pm, struct cr_img *img)
-{
-	MntEntry me = MNT_ENTRY__INIT;
-	bool need_fs_dump;
-
-	pr_info("\t%d: %x:%s @ %s\n", pm->mnt_id, pm->s_dev,
-		pm->root, pm->mountpoint);
-
-	me.fstype = pm->fstype->code;
-	if (me.fstype == FSTYPE__AUTO)
-		me.fsname = pm->fstype->name;
-
-	/*
-	 * The content is dumped once per filesystem: the bind-mounts of
-	 * a dumped mount are marked as dumped as well.
-	 */
-	need_fs_dump = pm->parent && !pm->dumped && !pm->need_plugin &&
-		       !pm->external && pm->fstype->dump && fsroot_mounted(pm);
-	if (need_fs_dump) {
-		struct mount_info *bind;
-
-		if (pm->fstype->dump(pm))
-			return -1;
-
-		list_for_each_entry(bind, &pm->mnt_bind, mnt_bind)
-			bind->dumped = true;
-	}
-
-	me.mnt_id = pm->mnt_id;
-	me.parent_mnt_id = pm->parent_mnt_id;
-	me.root_dev = pm->s_dev;
-	me.flags = pm->flags;
-	me.has_sb_flags = true;
-	me.sb_flags = pm->sb_flags;
-	/* The first character of the mountpoint is not stored */
-	me.mountpoint = pm->mountpoint + 1;
-	me.source = pm->source;
-	me.options = pm->options;
-	me.has_shared_id = true;
-	me.shared_id = pm->shared_id;
-	me.has_master_id = true;
-	me.master_id = pm->master_id;
-
-	/* Optional flags are only emitted when set */
-	if (pm->need_plugin) {
-		me.has_with_plugin = true;
-		me.with_plugin = true;
-	}
-
-	if (pm->deleted) {
-		me.has_deleted = true;
-		me.deleted = true;
-	}
-
-	if (pm->internal_sharing) {
-		me.has_internal_sharing = true;
-		me.internal_sharing = true;
-	}
-
-	if (!pm->external) {
-		me.root = pm->root;
-	} else {
-		/*
-		 * An external mount point gets the value of its mapping
-		 * dumped in place of the root. See collect_mnt_from_image
-		 * for the reverse mapping details.
-		 */
-		me.root = pm->external->val;
-		me.has_ext_mount = true;
-		me.ext_mount = true;
-	}
-
-	return pb_write_one(img, &me, PB_MNT) ? -1 : 0;
-}
-
-/* Free every entry of the ->next linked list of mounts starting at @pms. */
-static void free_mntinfo(struct mount_info *pms)
-{
-	struct mount_info *next;
-
-	for (; pms; pms = next) {
-		/* Fetch the link before the entry goes away */
-		next = pms->next;
-		mnt_entry_free(pms);
-	}
-}
-
-/*
- * Parse the mountinfo of the namespace @ns, build the mount tree out
- * of it and store both the list and the tree in @ns. Returns the list
- * of mounts, or NULL on failure (the parsed list is freed then).
- */
-struct mount_info *collect_mntinfo(struct ns_id *ns, bool for_dump)
-{
-	struct mount_info *list;
-
-	list = parse_mountinfo(ns->ns_pid, ns, for_dump);
-	if (list == NULL) {
-		pr_err("Can't parse %d's mountinfo\n", ns->ns_pid);
-		return NULL;
-	}
-
-	ns->mnt.mntinfo_tree = mnt_build_tree(list, NULL);
-	if (ns->mnt.mntinfo_tree != NULL) {
-		ns->mnt.mntinfo_list = list;
-		return list;
-	}
-
-	free_mntinfo(list);
-	return NULL;
-}
-
-/*
- * Dump the mounts of the namespace @ns into its CR_FD_MNTS image. Of
- * the @pms list only the leading run of entries that belong to @ns is
- * written. Returns 0 on success, -1 on failure.
- */
-static int dump_mnt_ns(struct ns_id *ns, struct mount_info *pms)
-{
-	int ns_id = ns->id;
-	struct mount_info *pm;
-	struct cr_img *img;
-	int ret = 0;
-
-	pr_info("Dumping mountpoints\n");
-
-	img = open_image(CR_FD_MNTS, O_DUMP, ns_id);
-	if (!img)
-		return -1;
-
-	for (pm = pms; pm && pm->nsid == ns; pm = pm->next) {
-		if (dump_one_mountpoint(pm, img)) {
-			ret = -1;
-			break;
-		}
-	}
-
-	close_image(img);
-	return ret;
-}
-
-/*
- * Walk the mount tree rooted at _r without recursion.
- *
- * _r - the root of the (sub)tree to walk
- * _el - the list_head member to follow (next or prev), i.e. the
- * direction in which children and siblings are visited
- * _fn_f - pre-order traversal function
- * _fn_r - post-order traversal function
- * _plist - a postpone list. The current element is added to this list
- * if _fn_f returns a positive value, and the elements below it
- * are not enumerated.
- * _prgs - a counter incremented for every element _fn_f returned 0 for
- *
- * The macro makes the calling function return -1 if _fn_f returns a
- * negative value or _fn_r a non-zero one.
- */
-#define MNT_TREE_WALK(_r, _el, _fn_f, _fn_r, _plist, _prgs) do { \
- struct mount_info *_mi = _r; \
- \
- while (1) { \
- int ret; \
- \
- list_del_init(&_mi->postpone); \
- \
- ret = _fn_f(_mi); \
- if (ret < 0) \
- return -1; \
- else if (ret > 0) { \
- list_add_tail(&_mi->postpone, _plist); \
- goto up; \
- } \
- \
- _prgs++; \
- \
- if (!list_empty(&_mi->children)) { \
- _mi = list_entry(_mi->children._el, \
- struct mount_info, siblings); \
- continue; \
- } \
- up: \
- if (_fn_r(_mi)) \
- return -1; \
- if (_mi == _r) \
- break; \
- if (_mi->siblings._el == &_mi->parent->children) { \
- _mi = _mi->parent; \
- goto up; \
- } \
- _mi = list_entry(_mi->siblings._el, \
- struct mount_info, siblings); \
- } \
- } while (0)
-
-/*
- * A stand-in for a traversal function that is not needed: used as
- * MNT_WALK_NONE(_mi) it expands to "0 && (_mi)", i.e. to 0.
- */
-#define MNT_WALK_NONE 0 &&
-
-
-/*
- * Call @fn for each mount of the tree rooted at @start, parents first.
- *
- * A mount for which @fn returns a positive value is postponed together
- * with everything below it and retried on the next pass. Passes are
- * repeated until nothing is postponed any more.
- *
- * Returns 0 on success, -1 if @fn fails (negative return) or if a whole
- * pass goes by without @fn returning 0 for a single mount.
- */
-static int mnt_tree_for_each(struct mount_info *start,
- int (*fn)(struct mount_info *))
-{
- struct mount_info *tmp;
- LIST_HEAD(postpone);
- LIST_HEAD(postpone2);
- int progress;
-
- pr_debug("Start with %d:%s\n", start->mnt_id, start->mountpoint);
- list_add(&start->postpone, &postpone);
-
-again:
- progress = 0;
-
- /*
- * Walk every postponed subtree; whatever can't be handled yet is
- * collected on postpone2. Note that MNT_TREE_WALK may return -1
- * from this function.
- */
- list_for_each_entry_safe(start, tmp, &postpone, postpone)
- MNT_TREE_WALK(start, next, fn, MNT_WALK_NONE, &postpone2, progress);
-
- /* Nothing was handled in this pass, so retrying is pointless */
- if (!progress) {
- struct mount_info *m;
-
- pr_err("A few mount points can't be mounted\n");
- list_for_each_entry(m, &postpone2, postpone) {
- pr_err("%d:%d %s %s %s\n", m->mnt_id,
- m->parent_mnt_id, m->root,
- m->mountpoint, m->source);
- }
- return -1;
- }
-
- /* What was postponed in this pass is the input of the next one */
- list_splice_init(&postpone2, &postpone);
-
- if (!list_empty(&postpone))
- goto again;
-
- return 0;
-
-}
-
-/*
- * Call @fn for each mount of the tree rooted at @m in post-order (children
- * before their parent), following the lists backwards (prev).
- *
- * Nothing can be postponed here (the pre-order function is MNT_WALK_NONE),
- * hence the NULL postpone list. Returns 0 on success, -1 if @fn returns
- * non-zero (the return is inside MNT_TREE_WALK).
- */
-static int mnt_tree_for_each_reverse(struct mount_info *m,
- int (*fn)(struct mount_info *))
-{
- int progress = 0;
-
- MNT_TREE_WALK(m, prev, MNT_WALK_NONE, fn, (struct list_head *) NULL, progress);
-
- return 0;
-}
-
-/*
- * Work out what to mount @mi from. Returns mi->source when it can be
- * used as is, NULL (with an error logged) otherwise.
- */
-static char *resolve_source(struct mount_info *mi)
-{
-	struct stat st;
-
-	/*
-	 * Major 0 is an anonymous block device. The kernel creates
-	 * those for diskless mounts.
-	 */
-	if (kdev_major(mi->s_dev) == 0)
-		return mi->source;
-
-	/*
-	 * For an "auto" filesystem the source is fine if it is a block
-	 * device whose numbers match the ones of the dumped mount.
-	 */
-	if (mi->fstype->code == FSTYPE__AUTO &&
-	    !stat(mi->source, &st) && S_ISBLK(st.st_mode) &&
-	    major(st.st_rdev) == kdev_major(mi->s_dev) &&
-	    minor(st.st_rdev) == kdev_minor(mi->s_dev))
-		return mi->source;
-
-	pr_err("No device for %s mount\n", mi->mountpoint);
-	return NULL;
-}
-
-/*
- * Apply the propagation type to the mountpoint of @mi. The requested
- * types are applied in the order private, slave, shared.
- *
- * An unbindable mount that is neither shared nor slave is only made
- * unbindable and the result of that mount() call is returned directly.
- * Otherwise returns 0 on success, -1 on failure.
- */
-static int restore_shared_options(struct mount_info *mi, bool private, bool shared, bool slave)
-{
-	pr_debug("%d:%s private %d shared %d slave %d\n",
-		 mi->mnt_id, mi->mountpoint, private, shared, slave);
-
-	if (mi->flags & MS_UNBINDABLE) {
-		if (!shared && !slave)
-			return mount(NULL, mi->mountpoint, NULL, MS_UNBINDABLE, NULL);
-
-		pr_warn("%s has both unbindable and sharing, ignoring unbindable\n", mi->mountpoint);
-	}
-
-	if (private) {
-		if (mount(NULL, mi->mountpoint, NULL, MS_PRIVATE, NULL)) {
-			pr_perror("Unable to make %s private", mi->mountpoint);
-			return -1;
-		}
-	}
-
-	if (slave) {
-		if (mount(NULL, mi->mountpoint, NULL, MS_SLAVE, NULL)) {
-			pr_perror("Unable to make %s slave", mi->mountpoint);
-			return -1;
-		}
-	}
-
-	if (shared) {
-		if (mount(NULL, mi->mountpoint, NULL, MS_SHARED, NULL)) {
-			pr_perror("Unable to make %s shared", mi->mountpoint);
-			return -1;
-		}
-	}
-
-	return 0;
-}
-
-/*
- * A new mount gets propagated into the slaves of its parent, but we
- * cannot tell whether those copies existed at dump time, so drop them:
- * for every already-mounted slave of mi->parent, umount the entry that
- * carries the basename of mi->mountpoint.
- *
- * Returns 0 on success, -1 as soon as one umount fails.
- */
-static int umount_from_slaves(struct mount_info *mi)
-{
-	struct mount_info *slave;
-	char target[PATH_MAX];
-
-	list_for_each_entry(slave, &mi->parent->mnt_slave_list, mnt_slave) {
-		if (slave->mounted) {
-			snprintf(target, sizeof(target), "%s/%s",
-				 slave->mountpoint, basename(mi->mountpoint));
-			pr_debug("\t\tUmount slave %s\n", target);
-
-			if (umount(target) != -1)
-				continue;
-
-			pr_perror("Can't umount slave %s", target);
-			return -1;
-		}
-	}
-
-	return 0;
-}
-
-/*
- * Whatever gets mounted in one member of a shared group shows up in all
- * other members (see Documentation/filesystems/sharedsubtree.txt).
- *
- * Mark every not-yet-mounted peer and slave of @mi as "to be bind-mounted
- * from @mi", so that it inherits the shared group or master id, and copy
- * the runtime device number over.  Always returns 0.
- */
-static int propagate_siblings(struct mount_info *mi)
-{
-	struct mount_info *peer;
-
-	/* Members of the same shared group */
-	list_for_each_entry(peer, &mi->mnt_share, mnt_share) {
-		if (!peer->mounted) {
-			pr_debug("\t\tBind share %s\n", peer->mountpoint);
-			peer->bind = mi;
-			peer->s_dev_rt = mi->s_dev_rt;
-		}
-	}
-
-	/* Slaves of this mount */
-	list_for_each_entry(peer, &mi->mnt_slave_list, mnt_slave) {
-		if (!peer->mounted) {
-			pr_debug("\t\tBind slave %s\n", peer->mountpoint);
-			peer->bind = mi;
-			peer->s_dev_rt = mi->s_dev_rt;
-		}
-	}
-
-	return 0;
-}
-
-/*
- * Book-keeping done after @mi has been mounted: record which other
- * mount_info entries are now covered by propagation or should be
- * bind-mounted from @mi.  Always returns 0.
- *
- * NOTE(review): the return values of propagate_siblings() and
- * umount_from_slaves() are ignored here, so a failed slave umount does
- * not fail the restore -- confirm this best-effort behaviour is intended.
- */
-static int propagate_mount(struct mount_info *mi)
-{
- struct mount_info *t;
-
- propagate_siblings(mi);
-
- /* The root mount has no parent group to propagate into */
- if (!mi->parent)
- goto skip_parent;
-
- umount_from_slaves(mi);
-
- /* Propagate this mount to everyone from a parent group */
-
- list_for_each_entry(t, &mi->parent->mnt_share, mnt_share) {
- struct mount_info *c;
-
- /*
- * A child of a peer of our parent that equals @mi is treated as
- * already mounted, and its own siblings/slaves are processed too.
- */
- list_for_each_entry(c, &t->children, siblings) {
- if (mounts_equal(mi, c)) {
- pr_debug("\t\tPropagate %s\n", c->mountpoint);
- c->mounted = true;
- propagate_siblings(c);
- umount_from_slaves(c);
- }
- }
- }
-
-skip_parent:
- /*
- * FIXME Currently non-root mounts can be restored
- * only if a proper root mount exists
- */
- if (fsroot_mounted(mi) || mi->parent == NULL) {
- /*
- * Entries on the mnt_bind list that are not mounted, have no bind
- * source yet and are not slaves get @mi as their bind source.
- */
- list_for_each_entry(t, &mi->mnt_bind, mnt_bind) {
- if (t->mounted)
- continue;
- if (t->bind)
- continue;
- if (t->master_id > 0)
- continue;
- t->bind = mi;
- t->s_dev_rt = mi->s_dev_rt;
- }
- }
-
- return 0;
-}
-
-/*
- * Store in m->s_dev_rt the device number (as seen at restore time) of
- * the filesystem that @where resides on.
- *
- * Returns 0 on success, -1 if stat() on @where fails.
- */
-static int fetch_rt_stat(struct mount_info *m, const char *where)
-{
-	struct stat st;
-
-	if (stat(where, &st)) {
-		/*
-		 * No trailing newline in the format: every other pr_perror()
-		 * call in this file omits it.
-		 */
-		pr_perror("Can't stat on %s", where);
-		return -1;
-	}
-
-	m->s_dev_rt = MKKDEV(major(st.st_dev), minor(st.st_dev));
-	return 0;
-}
-
-/*
- * The set of flags which we know how to handle within a single mount call.
- * All of them except MS_RDONLY are set only as mnt flags.
- * MS_RDONLY is set for both mnt and sb flags, so we can restore it with a
- * single mount call only if it is set in both masks.
- */
-#define MS_MNT_KNOWN_FLAGS (MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_NOATIME | \
- MS_NODIRATIME | MS_RELATIME | MS_RDONLY)
-
-/*
- * Default mount callback: a plain mount() of @src onto mi->mountpoint
- * with mi->options as the filesystem data.  Returns mount()'s result.
- */
-static int do_simple_mount(struct mount_info *mi, const char *src, const
-			   char *fstype, unsigned long mountflags)
-{
-	const char *target = mi->mountpoint;
-	const void *data = mi->options;
-
-	return mount(src, target, fstype, mountflags, data);
-}
-
-/*
- * Mount a brand new filesystem instance for @mi (as opposed to a bind
- * mount), restore its content through the fstype's ->restore hook if one
- * is set, then apply mount flags and the propagation type.
- *
- * When the superblock is read-only and a ->restore hook exists, the
- * filesystem is first mounted writable and switched to read-only only
- * after the hook has run.
- *
- * Returns 0 on success (mi->mounted is set), -1 on failure.
- */
-static int do_new_mount(struct mount_info *mi)
-{
-	unsigned long sflags = mi->sb_flags;
-	unsigned long mflags = mi->flags & (~MS_PROPAGATE);
-	char *src;
-	struct fstype *tp = mi->fstype;
-	bool remount_ro = (tp->restore && mi->sb_flags & MS_RDONLY);
-	mount_fn_t do_mount = (tp->mount) ? tp->mount : do_simple_mount;
-
-	src = resolve_source(mi);
-	if (!src)
-		return -1;
-
-	/* Merge superblock and mount flags if it's possible */
-	if (!(mflags & ~MS_MNT_KNOWN_FLAGS) && !((sflags ^ mflags) & MS_RDONLY)) {
-		sflags |= mflags;
-		mflags = 0;
-	}
-
-	/* Keep the fs writable until ->restore has filled it */
-	if (remount_ro)
-		sflags &= ~MS_RDONLY;
-
-	if (do_mount(mi, src, tp->name, sflags) < 0) {
-		pr_perror("Can't mount at %s", mi->mountpoint);
-		return -1;
-	}
-
-	if (tp->restore && tp->restore(mi))
-		return -1;
-
-	/*
-	 * Do not return right after the read-only remount: the mount flags
-	 * and propagation type still have to be applied and the entry has
-	 * to be marked as mounted, exactly as for writable filesystems.
-	 */
-	if (remount_ro && mount(NULL, mi->mountpoint, tp->name,
-				MS_REMOUNT | MS_RDONLY, NULL)) {
-		pr_perror("Unable to remount %s read-only", mi->mountpoint);
-		return -1;
-	}
-
-	if (mflags && mount(NULL, mi->mountpoint, NULL,
-				MS_REMOUNT | MS_BIND | mflags, NULL)) {
-		pr_perror("Unable to apply bind-mount options");
-		return -1;
-	}
-
-	if (restore_shared_options(mi, !mi->shared_id && !mi->master_id,
-					mi->shared_id,
-					mi->master_id))
-		return -1;
-
-	mi->mounted = true;
-
-	return 0;
-}
-
-/*
- * Ask the plugins (RESTORE_EXT_MOUNT hook) to bring back the external
- * mount @mi.  Returns the hook's result; non-zero values are logged.
- */
-static int restore_ext_mount(struct mount_info *mi)
-{
-	int err;
-
-	pr_debug("Restoring external bind mount %s\n", mi->mountpoint);
-
-	err = run_plugins(RESTORE_EXT_MOUNT, mi->mnt_id, mi->mountpoint, "/", NULL);
-	if (!err)
-		return 0;
-
-	pr_err("Can't restore ext mount (%d)\n", err);
-	return err;
-}
-
-/*
- * Restore @mi as a bind mount.  Three sources are handled:
- *  - mi->need_plugin: the mount is delegated to restore_ext_mount();
- *  - mi->external:    the source path comes from the external mapping;
- *  - otherwise:       the source is a path inside mi->bind, an already
- *                     restored mount of the same filesystem.
- *
- * For mounts whose root was deleted at dump time (mi->deleted) the source
- * directory or file is re-created for the duration of the mount() call
- * and removed again afterwards.
- *
- * Returns 0 on success (mi->mounted is set), -1 on failure.
- */
-static int do_bind_mount(struct mount_info *mi)
-{
- /* Templates handed to get_clean_mnt() for a temporary mountpoint */
- char mnt_path_tmp[] = "/tmp/cr-tmpfs.XXXXXX";
- char mnt_path_root[] = "/cr-tmpfs.XXXXXX";
- char *root, *cut_root, rpath[PATH_MAX];
- unsigned long mflags;
- int exit_code = -1;
- bool shared = false;
- bool master = false;
- bool private = false;
- char *mnt_path = NULL;
- struct stat st;
- bool umount_mnt_path = false;
-
- if (mi->need_plugin) {
- if (restore_ext_mount(mi))
- return -1;
- goto out;
- }
-
- if (mi->external) {
- /*
- * We have / pointing to criu's ns root still,
- * so just use the mapping's path. The mountpoint
- * is tuned in collect_mnt_from_image to refer
- * to proper location in the namespace we restore.
- */
- root = mi->external->val;
- private = !mi->master_id && (mi->internal_sharing || !mi->shared_id);
- goto do_bind;
- }
-
- /* Same shared group / same master as the bind source? */
- shared = mi->shared_id && mi->shared_id == mi->bind->shared_id;
- master = mi->master_id && mi->master_id == mi->bind->master_id;
- private = !mi->master_id && !shared;
- cut_root = cut_root_for_bind(mi->root, mi->bind->root);
-
- /*
- * If the bind source has children, a clean copy of it is obtained
- * via get_clean_mnt() and has to be torn down at the end.
- */
- if (list_empty(&mi->bind->children))
- mnt_path = mi->bind->mountpoint;
- else {
- mnt_path = get_clean_mnt(mi->bind, mnt_path_tmp, mnt_path_root);
- umount_mnt_path = true;
- }
- if (mnt_path == NULL)
- return -1;
-
- /* NOTE(review): truncation of rpath is not checked */
- snprintf(rpath, sizeof(rpath), "%s/%s",
- mnt_path, cut_root);
- root = rpath;
-do_bind:
- pr_info("\tBind %s to %s\n", root, mi->mountpoint);
-
- if (unlikely(mi->deleted)) {
- /* Re-create the deleted source with the type/mode of the mountpoint */
- if (stat(mi->mountpoint, &st)) {
- pr_perror("Can't fetch stat on %s", mi->mountpoint);
- goto err;
- }
-
- if (S_ISDIR(st.st_mode)) {
- if (mkdir(root, (st.st_mode & ~S_IFMT))) {
- pr_perror("Can't re-create deleted directory %s", root);
- goto err;
- }
- } else if (S_ISREG(st.st_mode)) {
- int fd = open(root, O_WRONLY | O_CREAT | O_EXCL,
- st.st_mode & ~S_IFMT);
- if (fd < 0) {
- pr_perror("Can't re-create deleted file %s", root);
- goto err;
- }
- close(fd);
- } else {
- pr_err("Unsupported st_mode 0%o deleted root %s\n",
- (int)st.st_mode, root);
- goto err;
- }
- }
-
- if (mount(root, mi->mountpoint, NULL, MS_BIND, NULL) < 0) {
- pr_perror("Can't mount at %s", mi->mountpoint);
- goto err;
- }
-
- /*
- * A bind mount inherits the flags of its source; remount only when
- * there is no bind source or the non-propagation flags differ.
- */
- mflags = mi->flags & (~MS_PROPAGATE);
- if (!mi->bind || mflags != (mi->bind->flags & (~MS_PROPAGATE)))
- if (mount(NULL, mi->mountpoint, NULL, MS_BIND | MS_REMOUNT | mflags, NULL)) {
- pr_perror("Can't mount at %s", mi->mountpoint);
- goto err;
- }
-
- if (unlikely(mi->deleted)) {
- /* Remove the temporary source again so it is "deleted" as before */
- if (S_ISDIR(st.st_mode)) {
- if (rmdir(root)) {
- pr_perror("Can't remove deleted directory %s", root);
- goto err;
- }
- } else if (S_ISREG(st.st_mode)) {
- if (unlink(root)) {
- pr_perror("Can't unlink deleted file %s", root);
- goto err;
- }
- }
- }
-out:
- /*
- * shared - the mount is in the same shared group with mi->bind
- * mi->shared_id && !shared - create a new shared group
- */
- if (restore_shared_options(mi, private,
- mi->shared_id && !shared,
- mi->master_id && !master))
- return -1;
-
- mi->mounted = true;
- exit_code = 0;
-err:
- /* Reached on success too: tear down the temporary clean mount, if any */
- if (umount_mnt_path) {
- /*
- * If mnt_path was shared, a new mount may be propagated
- * into it.
- */
- if (mount(NULL, mnt_path, NULL, MS_PRIVATE, NULL)) {
- pr_perror("Unable to make %s private", mnt_path);
- return -1;
- }
- if (umount2(mnt_path, MNT_DETACH)) {
- pr_perror("Unable to umount %s", mnt_path);
- return -1;
- }
- if (rmdir(mnt_path)) {
- pr_perror("Unable to remove %s", mnt_path);
- return -1;
- }
- }
- return exit_code;
-}
-
-/*
- * Tell whether @mi can be mounted right away or has to be postponed
- * until other mounts it depends on are in place.
- */
-static bool can_mount_now(struct mount_info *mi)
-{
- /* The root mount */
- if (!mi->parent)
- return true;
-
- if (mi->external)
- return true;
-
- /*
- * We're the slave peer:
- * - Make sure the master peer is already mounted
- * - Make sure all children are mounted as well to
- * eliminate mount duplications
- */
- if (mi->master_id > 0) {
- struct mount_info *c;
-
- if (mi->bind == NULL)
- return false;
-
- list_for_each_entry(c, &mi->bind->children, siblings) {
- if (!c->mounted)
- return false;
- }
- }
-
- /* A mount that is not a filesystem root needs a bind source, a plugin or an external mapping */
- if (!fsroot_mounted(mi) && (mi->bind == NULL && !mi->need_plugin && !mi->external))
- return false;
-
- if (mi->parent->shared_id) {
- struct mount_info *p = mi->parent, *n;
-
- if (mi->parent->shared_id == mi->shared_id) {
- /* Same group as the parent: wait only for unmounted peers with a shorter root */
- int rlen = strlen(mi->root);
- list_for_each_entry(n, &p->mnt_share, mnt_share)
- if (strlen(n->root) < rlen && !n->mounted)
- return false;
- } else {
- /* Different group: wait until every peer of the parent is mounted */
- list_for_each_entry(n, &p->mnt_share, mnt_share)
- if (!n->mounted)
- return false;
- }
- }
-
- return true;
-}
-
-/*
- * Prepare the root mount: apply its propagation type and record the
- * runtime device number.  Returns 0 on success, -1 on failure.
- */
-static int do_mount_root(struct mount_info *mi)
-{
-	bool is_private = !mi->shared_id && !mi->master_id;
-
-	if (restore_shared_options(mi, is_private, mi->shared_id, mi->master_id) != 0)
-		return -1;
-
-	return fetch_rt_stat(mi, mi->mountpoint);
-}
-
-/*
- * Restore a single mount point.
- *
- * Returns 0 when @mi is mounted (or was mounted already), 1 when it has
- * to be postponed because can_mount_now() refused it, and a negative
- * value on error.
- */
-static int do_mount_one(struct mount_info *mi)
-{
- int ret;
-
- if (mi->mounted)
- return 0;
-
- if (!can_mount_now(mi)) {
- pr_debug("Postpone slave %s\n", mi->mountpoint);
- return 1;
- }
-
- pr_debug("\tMounting %s @%s (%d)\n", mi->fstype->name, mi->mountpoint, mi->need_plugin);
-
- if (!mi->parent) {
- /* do_mount_root() is called from populate_mnt_ns() */
- mi->mounted = true;
- ret = 0;
- } else if (!mi->bind && !mi->need_plugin && !mi->external)
- ret = do_new_mount(mi);
- else
- ret = do_bind_mount(mi);
-
- if (ret == 0 && fetch_rt_stat(mi, mi->mountpoint))
- return -1;
-
- if (ret == 0 && propagate_mount(mi))
- return -1;
-
- /*
- * The image carried an fstype we do not know; if the mounted fs turns
- * out to be btrfs, switch to the btrfs fstype entry.
- * NOTE(review): this runs even when ret != 0.
- */
- if (mi->fstype->code == FSTYPE__UNSUPPORTED) {
- struct statfs st;
-
- if (statfs(mi->mountpoint, &st)) {
- pr_perror("Unable to statfs %s", mi->mountpoint);
- return -1;
- }
- if (st.f_type == BTRFS_SUPER_MAGIC)
- mi->fstype = find_fstype_by_name("btrfs");
- }
-
- return ret;
-}
-
-/*
- * Umount @mi.  The root mount (no parent) is left alone.  The parent's
- * mountpoint is first made recursively private.
- *
- * Returns 0 on success, -1 on failure.
- */
-static int do_umount_one(struct mount_info *mi)
-{
-	struct mount_info *parent = mi->parent;
-
-	if (parent == NULL)
-		return 0;
-
-	if (mount("none", parent->mountpoint, "none", MS_REC|MS_PRIVATE, NULL) != 0) {
-		pr_perror("Can't mark %s as private", parent->mountpoint);
-		return -1;
-	}
-
-	if (umount(mi->mountpoint) != 0) {
-		pr_perror("Can't umount at %s", mi->mountpoint);
-		return -1;
-	}
-
-	pr_info("Umounted at %s\n", mi->mountpoint);
-	return 0;
-}
-
-/*
- * Make @root (or the current directory when @root is NULL) the new root
- * of the mount namespace via pivot_root(), parking the old root in a
- * temporary directory which is detached and removed afterwards.
- *
- * Returns 0 on success, -1 on failure.
- */
-static int cr_pivot_root(char *root)
-{
- /* Relative template: the directory is created inside the new root */
- char put_root[] = "crtools-put-root.XXXXXX";
- int exit_code = -1;
-
- pr_info("Move the root to %s\n", root ? : ".");
-
- if (root) {
- if (chdir(root)) {
- pr_perror("chdir(%s) failed", root);
- return -1;
- }
- }
-
- if (mkdtemp(put_root) == NULL) {
- pr_perror("Can't create a temporary directory");
- return -1;
- }
-
- /* Bind put_root onto itself so that it is a mount point */
- if (mount(put_root, put_root, NULL, MS_BIND, NULL)) {
- pr_perror("Unable to mount tmpfs in %s", put_root);
- goto err_root;
- }
-
- if (mount(NULL, put_root, NULL, MS_PRIVATE, NULL)) {
- pr_perror("Can't remount %s with MS_PRIVATE", put_root);
- goto err_tmpfs;
- }
-
- if (pivot_root(".", put_root)) {
- pr_perror("pivot_root(., %s) failed", put_root);
- goto err_tmpfs;
- }
-
- /* Make the old root tree private before detaching it */
- if (mount("none", put_root, "none", MS_REC|MS_PRIVATE, NULL)) {
- pr_perror("Can't remount root with MS_PRIVATE");
- return -1;
- }
-
- exit_code = 0;
-
- /*
- * NOTE(review): on success umount2() runs twice on put_root, here and
- * again under err_tmpfs.  Presumably the first call detaches the old
- * root that pivot_root() placed there and the second the bind mount
- * created above -- confirm.
- */
- if (umount2(put_root, MNT_DETACH)) {
- pr_perror("Can't umount %s", put_root);
- return -1;
- }
-
-err_tmpfs:
- if (umount2(put_root, MNT_DETACH)) {
- pr_perror("Can't umount %s", put_root);
- return -1;
- }
-
-err_root:
- if (rmdir(put_root)) {
- pr_perror("Can't remove the directory %s", put_root);
- return -1;
- }
-
- return exit_code;
-}
-
-/*
- * Allocate a zero-filled mount_info with all of its list heads
- * initialised.  Returns NULL if the allocation fails.
- */
-struct mount_info *mnt_entry_alloc()
-{
-	struct mount_info *mi;
-
-	/*
-	 * The entry comes zeroed from xzalloc(), which only yields a valid
-	 * "invalid device" marker if MOUNT_INVALID_DEV is 0.
-	 */
-	BUILD_BUG_ON(MOUNT_INVALID_DEV);
-
-	mi = xzalloc(sizeof(struct mount_info));
-	if (!mi)
-		return mi;
-
-	INIT_LIST_HEAD(&mi->children);
-	INIT_LIST_HEAD(&mi->siblings);
-	INIT_LIST_HEAD(&mi->mnt_slave_list);
-	INIT_LIST_HEAD(&mi->mnt_share);
-	INIT_LIST_HEAD(&mi->mnt_bind);
-	INIT_LIST_HEAD(&mi->postpone);
-
-	return mi;
-}
-
-/*
- * Release @mi together with the strings it owns (root, mountpoint,
- * source, options).  A NULL @mi is accepted and ignored.
- */
-void mnt_entry_free(struct mount_info *mi)
-{
-	if (!mi)
-		return;
-
-	xfree(mi->root);
-	xfree(mi->mountpoint);
-	xfree(mi->source);
-	xfree(mi->options);
-	xfree(mi);
-}
-
-/*
- * Format into @buf (of size @bs) the path under which the root of
- * namespace @ns is re-constructed: "<mnt_roots>/<ns id>".
- * Returns what snprintf() returns.
- */
-static inline int print_ns_root(struct ns_id *ns, char *buf, int bs)
-{
-	int written;
-
-	written = snprintf(buf, bs, "%s/%d", mnt_roots, ns->id);
-
-	return written;
-}
-
-/*
- * Create the "roots yard": a temporary directory, named after the
- * ".criu.mntns.XXXXXX" template, inside opts.root (or "/" when opts.root
- * is not set).  Its name is stored in mnt_roots.  Nothing is done if
- * mnt_roots is set already.
- *
- * The current working directory is restored before returning.
- * Returns 0 on success, -1 on failure (mnt_roots is left NULL then).
- */
-static int create_mnt_roots(void)
-{
-	/* Resolved once, so the error message never gets a NULL for %s */
-	const char *root = opts.root ? : "/";
-	int exit_code = -1, cwd_fd;
-
-	if (mnt_roots)
-		return 0;
-
-	cwd_fd = open(".", O_DIRECTORY);
-	if (cwd_fd < 0) {
-		pr_perror("Unable to open cwd");
-		return -1;
-	}
-
-	if (chdir(root)) {
-		pr_perror("Unable to change working directory on %s", root);
-		goto out;
-	}
-
-	mnt_roots = strdup(".criu.mntns.XXXXXX");
-	if (mnt_roots == NULL) {
-		pr_perror("Can't allocate memory");
-		goto out;
-	}
-
-	if (mkdtemp(mnt_roots) == NULL) {
-		pr_perror("Unable to create a temporary directory");
-		/* Do not leak the template buffer when dropping the pointer */
-		xfree(mnt_roots);
-		mnt_roots = NULL;
-		goto out;
-	}
-
-	exit_code = 0;
-out:
-	if (fchdir(cwd_fd)) {
-		pr_perror("Unable to restore cwd");
-		exit_code = -1;
-	}
-	close(cwd_fd);
-
-	return exit_code;
-}
-
-/*
- * Register the mount namespace of the current process as an ns_id of
- * type @typ, collect its mounts into the global mntinfo and flag the
- * namespace as populated.  Returns 0 on success, -1 on failure.
- */
-static int rst_collect_local_mntns(enum ns_type typ)
-{
-	struct ns_id *self_ns = rst_new_ns_id(0, getpid(), &mnt_ns_desc, typ);
-
-	if (self_ns == NULL)
-		return -1;
-
-	mntinfo = collect_mntinfo(self_ns, false);
-	if (mntinfo == NULL)
-		return -1;
-
-	futex_set(&self_ns->ns_populated, 1);
-	return 0;
-}
-
-/*
- * Fill mi->root from the image entry @me.  For entries flagged as
- * external mounts, also attach the matching ext_mount mapping to
- * mi->external.
- *
- * Returns 0 on success, -1 on allocation failure or when an external
- * mount has no mapping and autodetection is off.
- */
-static int get_mp_root(MntEntry *me, struct mount_info *mi)
-{
-	struct ext_mount *map = NULL;
-
-	mi->root = xstrdup(me->root);
-	if (mi->root == NULL)
-		return -1;
-
-	if (me->ext_mount) {
-		/*
-		 * External mount point: look up the reverse mapping given
-		 * by the user; it takes the place of the root.
-		 */
-		map = ext_mount_lookup(me->root);
-		if (map == NULL) {
-			if (!opts.autodetect_ext_mounts) {
-				pr_err("No mapping for %s mountpoint\n", me->mountpoint);
-				return -1;
-			}
-
-			/*
-			 * No user supplied mapping, so synthesize one.  Its
-			 * key is the AUTODETECTED_MOUNT marker, which makes
-			 * such entries recognizable when debugging.
-			 */
-			map = xmalloc(sizeof(struct ext_mount));
-			if (map == NULL)
-				return -1;
-
-			map->key = AUTODETECTED_MOUNT;
-			map->val = mi->source;
-		}
-
-		mi->external = map;
-	}
-
-	pr_debug("\t\tWill mount %d from %s%s\n",
-			mi->mnt_id, map ? map->val : mi->root, map ? " (E)" : "");
-	return 0;
-}
-
-/*
- * Build mi->mountpoint as @root followed by the mountpoint stored in
- * the image entry @me.  mi->ns_mountpoint is pointed at the part that
- * starts @root_len bytes into the buffer.
- *
- * Returns 0 on success, -1 if the buffer cannot be allocated.
- */
-static int get_mp_mountpoint(MntEntry *me, struct mount_info *mi, char *root, int root_len)
-{
-	int size = strlen(me->mountpoint) + root_len + 1;
-	char *full;
-
-	full = xmalloc(size);
-	mi->mountpoint = full;
-	if (full == NULL)
-		return -1;
-
-	/*
-	 * The root of a bind mount is not adjusted here: restoring a bind
-	 * mount joins the mountpoint and root paths anyway.
-	 */
-	strcpy(full, root);
-	strcpy(full + root_len, me->mountpoint);
-
-	mi->ns_mountpoint = full + root_len;
-
-	pr_debug("\t\tWill mount %d @ %s\n", mi->mnt_id, mi->mountpoint);
-	return 0;
-}
-
-/*
- * Read all mount entries of namespace @nsid from its CR_FD_MNTS image
- * and push a mount_info for each of them onto the list headed by *@pms.
- *
- * Mountpoints are prefixed with "." for most namespaces and with the
- * path produced by print_ns_root() for NS_OTHER ones.
- *
- * Returns 0 on success, -1 on failure -- including a failure to read
- * an entry from the image.  Entries linked into *@pms before the failure
- * stay there.
- */
-static int collect_mnt_from_image(struct mount_info **pms, struct ns_id *nsid)
-{
-	MntEntry *me = NULL;
-	int ret, root_len = 1, exit_code = -1;
-	struct cr_img *img;
-	char root[PATH_MAX] = ".";
-
-	img = open_image(CR_FD_MNTS, O_RSTR, nsid->id);
-	if (!img)
-		return -1;
-
-	if (nsid->type == NS_OTHER)
-		root_len = print_ns_root(nsid, root, sizeof(root));
-
-	pr_debug("Reading mountpoint images (id %d pid %d)\n",
-		 nsid->id, (int)nsid->ns_pid);
-
-	while (1) {
-		struct mount_info *pm;
-
-		ret = pb_read_one_eof(img, &me, PB_MNT);
-		/* A read error must not be mistaken for the end of the image */
-		if (ret < 0)
-			goto out;
-		if (ret == 0)
-			break;
-
-		pm = mnt_entry_alloc();
-		if (!pm)
-			goto out;
-
-		pm->nsid = nsid;
-		pm->next = *pms;
-		*pms = pm;
-
-		pm->mnt_id		= me->mnt_id;
-		pm->parent_mnt_id	= me->parent_mnt_id;
-		pm->s_dev		= me->root_dev;
-		pm->flags		= me->flags;
-		pm->sb_flags		= me->sb_flags;
-		if (!me->has_sb_flags) {
-			const unsigned int mflags = MS_SHARED | MS_PRIVATE |
-						MS_SLAVE | MS_UNBINDABLE |
-						MS_NOSUID | MS_NODEV | MS_NOEXEC |
-						MS_NOATIME | MS_NODIRATIME | MS_RELATIME;
-
-			/*
-			 * In old images mnt and sb flags are saved together.
-			 * Here we separate them and save the old logic about MS_RDONLY.
-			 */
-
-			pm->sb_flags = pm->flags & ~mflags;
-			pm->flags = pm->flags & mflags;
-		}
-		pm->shared_id		= me->shared_id;
-		pm->master_id		= me->master_id;
-		pm->need_plugin		= me->with_plugin;
-		pm->deleted		= me->deleted;
-		pm->is_ns_root		= is_root(me->mountpoint);
-		if (me->has_internal_sharing)
-			pm->internal_sharing = me->internal_sharing;
-
-		pm->source = xstrdup(me->source);
-		if (!pm->source)
-			goto out;
-
-		pm->options = xstrdup(me->options);
-		if (!pm->options)
-			goto out;
-
-		/* FIXME: abort unsupported early */
-		pm->fstype = decode_fstype(me->fstype, me->fsname);
-
-		if (get_mp_root(me, pm))
-			goto out;
-
-		if (get_mp_mountpoint(me, pm, root, root_len))
-			goto out;
-
-		pr_debug("\tRead %d mp @ %s\n", pm->mnt_id, pm->mountpoint);
-	}
-
-	exit_code = 0;
-out:
-	/*
-	 * Shared by the success and the error paths, so the entry read last
-	 * is released in both.
-	 * NOTE(review): entries read in earlier iterations are released only
-	 * if pb_read_one_eof() does so itself -- confirm.
-	 */
-	if (me)
-		mnt_entry__free_unpacked(me, NULL);
-
-	close_image(img);
-
-	return exit_code;
-}
-
-/*
- * Load the mount images of every mount namespace listed in ns_ids and
- * publish the resulting list as the global mntinfo.
- * Returns 0 on success, -1 if any image fails to load (mntinfo is left
- * untouched then).
- */
-int read_mnt_ns_img(void)
-{
-	struct mount_info *head = NULL;
-	struct ns_id *ns;
-
-	for (ns = ns_ids; ns; ns = ns->next) {
-		if (ns->nd == &mnt_ns_desc && collect_mnt_from_image(&head, ns))
-			return -1;
-	}
-
-	mntinfo = head;
-	return 0;
-}
-
-/*
- * Write into @path the root directory to use on restore for the mount
- * identified by @mnt_id.
- *
- * For a mount living in an NS_OTHER namespace this is the namespace's
- * directory in the roots yard; otherwise (also when mount namespaces
- * are not restored or @mnt_id is -1) it is "/".
- *
- * Returns the length written ("/" gives 1), or -1 if @mnt_id is unknown.
- */
-int rst_get_mnt_root(int mnt_id, char *path, int plen)
-{
-	if ((root_ns_mask & CLONE_NEWNS) && mnt_id != -1) {
-		struct mount_info *mi = lookup_mnt_id(mnt_id);
-
-		if (!mi)
-			return -1;
-
-		if (mi->nsid->type == NS_OTHER)
-			return print_ns_root(mi->nsid, path, plen);
-	}
-
-	path[0] = '/';
-	path[1] = '\0';
-	return 1;
-}
-
-/*
- * Create the roots yard if it is going to be needed, i.e. if mount
- * namespaces are restored and at least one of them is not the root one.
- * Returns 0 when nothing has to be done, else create_mnt_roots()'s result.
- */
-int mntns_maybe_create_roots(void)
-{
-	struct ns_id *ns;
-
-	if (!(root_ns_mask & CLONE_NEWNS))
-		return 0;
-
-	for (ns = ns_ids; ns != NULL; ns = ns->next) {
-		if (ns->nd != &mnt_ns_desc || ns->type == NS_ROOT)
-			continue;
-
-		BUG_ON(ns->type == NS_CRIU);
-
-		/* A second (non-root) mount namespace exists: the yard is required */
-		return create_mnt_roots();
-	}
-
-	/* Only the root mntns is present, no yard needed */
-	return 0;
-}
-
-/*
- * Enter the mount namespace @nsid through the descriptor the root task
- * holds for it (opened via the root task's /proc fd/ directory).  If
- * @current is the task the namespace is registered for (ns_pid), mark
- * the namespace populated and wake up the waiters.
- *
- * Returns 0 on success, -1 on failure.
- */
-static int do_restore_task_mnt_ns(struct ns_id *nsid, struct pstree_item *current)
-{
-	int fd, err;
-
-	fd = open_proc(root_item->pid.virt, "fd/%d", nsid->mnt.ns_fd);
-	if (fd < 0)
-		return -1;
-
-	err = setns(fd, CLONE_NEWNS);
-	if (err)
-		pr_perror("Can't restore mntns");
-	close(fd);
-	if (err)
-		return -1;
-
-	if (nsid->ns_pid == current->pid.virt)
-		futex_set_and_wake(&nsid->ns_populated, 1);
-
-	return 0;
-}
-
-/*
- * Move the task @current into its target mount namespace, if needed.
- * Returns 0 on success (or when nothing had to be done), -1 on failure.
- */
-int restore_task_mnt_ns(struct pstree_item *current)
-{
-	struct ns_id *nsid;
-	unsigned int id;
-
-	if (!current->ids || !current->ids->has_mnt_ns_id)
-		return 0;
-
-	id = current->ids->mnt_ns_id;
-
-	/*
-	 * Whatever namespace a task is meant to live in, at this point it
-	 * still sits in the one it inherited (see prepare_pstree_kobj_ids()
-	 * and get_clone_mask()).  Tasks without a parent, or whose target is
-	 * the parent's namespace, are already in place; everybody else has
-	 * to setns().
-	 */
-	if (!current->parent || id == current->parent->ids->mnt_ns_id)
-		return 0;
-
-	nsid = lookup_ns_by_id(id, &mnt_ns_desc);
-	if (nsid == NULL) {
-		pr_err("Can't find mount namespace %d\n", id);
-		return -1;
-	}
-
-	BUG_ON(nsid->type == NS_CRIU);
-
-	return do_restore_task_mnt_ns(nsid, current) ? -1 : 0;
-}
-
-/*
- * Close the descriptors kept for the restored mount namespaces: ns_fd
- * for each of them, and root_fd for every non-root one.
- * Does nothing when mount namespaces are not restored.
- */
-void fini_restore_mntns(void)
-{
-	struct ns_id *ns;
-
-	if (!(root_ns_mask & CLONE_NEWNS))
-		return;
-
-	ns = ns_ids;
-	while (ns != NULL) {
-		if (ns->nd == &mnt_ns_desc) {
-			close(ns->mnt.ns_fd);
-			if (ns->type != NS_ROOT)
-				close(ns->mnt.root_fd);
-		}
-		ns = ns->next;
-	}
-}
-
-/*
- * Nested mount namespaces are restored as sub-trees of the root one.
- * Set up the yard at mnt_roots and create one directory in it per mount
- * namespace.  Nothing is done when no yard is in use.
- *
- * Returns 0 on success, -1 on failure.
- */
-static int populate_roots_yard(void)
-{
-	struct ns_id *ns;
-	char ns_dir[PATH_MAX];
-
-	if (!mnt_roots)
-		return 0;
-
-	if (make_yard(mnt_roots))
-		return -1;
-
-	for (ns = ns_ids; ns; ns = ns->next) {
-		if (ns->nd == &mnt_ns_desc) {
-			print_ns_root(ns, ns_dir, sizeof(ns_dir));
-			if (mkdir(ns_dir, 0600) != 0) {
-				pr_perror("Unable to create %s", ns_dir);
-				return -1;
-			}
-		}
-	}
-
-	return 0;
-}
-
-/*
- * Build the mount tree from the global mntinfo list and mount
- * everything in it.  Returns 0 on success, non-zero on failure.
- */
-static int populate_mnt_ns(void)
-{
- struct mount_info *pms;
- struct ns_id *nsid;
- struct mount_info *roots_mp = NULL;
-
- if (mnt_roots) {
- /* mnt_roots is a tmpfs mount and it's private */
- roots_mp = mnt_entry_alloc();
- if (!roots_mp)
- return -1;
-
- /* Entry for the roots yard itself; flagged mounted so the walk skips it */
- roots_mp->mountpoint = mnt_roots;
- roots_mp->mounted = true;
- }
-
- /* NOTE(review): roots_mp is not freed on the error returns below */
- pms = mnt_build_tree(mntinfo, roots_mp);
- if (!pms)
- return -1;
-
- if (resolve_shared_mounts(mntinfo, pms->master_id))
- return -1;
-
- for (nsid = ns_ids; nsid; nsid = nsid->next) {
- if (nsid->nd != &mnt_ns_desc)
- continue;
-
- /*
- * Make trees of all namespaces look the
- * same, so that manual paths resolution
- * works on them.
- */
- nsid->mnt.mntinfo_tree = pms;
- }
-
- if (validate_mounts(mntinfo, false))
- return -1;
-
- /*
- * Set properties for the root before mounting a root yard,
- * otherwise the root yard can be propagated into the host
- * mntns and remain there.
- */
- if (do_mount_root(pms))
- return -1;
-
- if (populate_roots_yard())
- return -1;
-
- /* Mount each entry; do_mount_one() may ask for an entry to be postponed */
- return mnt_tree_for_each(pms, do_mount_one);
-}
-
-/*
- * Tear down the roots yard: make it recursively private and lazily
- * umount it.  Returns 0 on success (or when there is no yard), 1 if any
- * step failed.
- */
-int depopulate_roots_yard(void)
-{
-	int failed = 0;
-
-	if (!mnt_roots)
-		return 0;
-
-	if (mount("none", mnt_roots, "none", MS_REC|MS_PRIVATE, NULL)) {
-		pr_perror("Can't remount root with MS_PRIVATE");
-		failed = 1;
-	}
-
-	/*
-	 * Keep going after the first error: this function also serves as a
-	 * rollback on failure paths.  MNT_DETACH is fine here, because files
-	 * are restored only after this point, so nothing can get restored
-	 * from a wrong mount namespace.
-	 */
-	if (umount2(mnt_roots, MNT_DETACH)) {
-		pr_perror("Can't unmount %s", mnt_roots);
-		failed = 1;
-	}
-
-	return failed;
-}
-
-/*
- * Remove the roots yard directory, which lives under opts.root (or "/"
- * when opts.root is not set).  Does nothing when no yard was created.
- * A failure is only logged.
- */
-void cleanup_mnt_ns(void)
-{
-	char path[PATH_MAX], *root = opts.root ? : "/";
-
-	if (mnt_roots == NULL)
-		return;
-
-	snprintf(path, sizeof(path), "%s/%s", root, mnt_roots);
-	if (rmdir(path))
-		/* Report the path rmdir() was actually called with */
-		pr_perror("Can't remove the directory %s", path);
-}
-
-/*
- * Restore the mount namespaces.
- *
- * Without CLONE_NEWNS in root_ns_mask only the local mounts are
- * collected.  Otherwise the current namespace is cleaned (or the tree
- * under opts.root is prepared for pivot_root), the mounts from the images
- * are populated, and for every non-root mount namespace a new namespace
- * is unshared, pivoted into its sub-tree and pinned with descriptors.
- *
- * Returns 0 on success, -1 on failure.
- */
-int prepare_mnt_ns(void)
-{
- int ret = -1, rst = -1;
- struct mount_info *old;
- struct ns_id ns = { .type = NS_CRIU, .ns_pid = PROC_SELF, .nd = &mnt_ns_desc };
- struct ns_id *nsid;
-
- if (!(root_ns_mask & CLONE_NEWNS))
- return rst_collect_local_mntns(NS_CRIU);
-
- pr_info("Restoring mount namespace\n");
-
- /* Mounts currently present in our own namespace */
- old = collect_mntinfo(&ns, false);
- if (old == NULL)
- return -1;
-
- /* NOTE(review): 'old' is not freed on the error returns below */
- if (!opts.root) {
- if (chdir("/")) {
- pr_perror("chdir(\"/\") failed");
- return -1;
- }
-
- /*
- * The new mount namespace is filled with the mountpoint
- * clones from the original one. We have to umount them
- * prior to recreating new ones.
- */
- pr_info("Cleaning mount namespace\n");
- if (mnt_tree_for_each_reverse(ns.mnt.mntinfo_tree, do_umount_one))
- return -1;
- } else {
- struct mount_info *mi;
-
- /*
- * The whole tree of mountpoints is to be moved into one
- * place with the pivot_root() call. Don't do manual
- * umount (as we do above), all this stuff will go away
- * with a single umount call later.
- */
-
- /* moving a mount residing under a shared mount is invalid. */
- mi = mount_resolve_path(ns.mnt.mntinfo_tree, opts.root);
- if (mi == NULL) {
- pr_err("Unable to find mount point for %s\n", opts.root);
- return -1;
- }
- if (mi->parent == NULL) {
- pr_err("New root and old root are the same\n");
- return -1;
- }
-
- /* Our root is mounted over the parent (in the same directory) */
- if (!strcmp(mi->parent->mountpoint, mi->mountpoint)) {
- pr_err("The parent of the new root is unreachable\n");
- return -1;
- }
-
- /* "+ 1" skips the first character of the stored mountpoint string */
- if (mount("none", mi->parent->mountpoint + 1, "none", MS_SLAVE, NULL)) {
- pr_perror("Can't remount the parent of the new root with MS_SLAVE");
- return -1;
- }
-
- /* Unprivileged users can't reveal what is under a mount */
- if (root_ns_mask & CLONE_NEWUSER) {
- if (mount(opts.root, opts.root, NULL, MS_BIND | MS_REC, NULL)) {
- pr_perror("Can't remount bind-mount %s into itself", opts.root);
- return -1;
- }
- }
- if (chdir(opts.root)) {
- pr_perror("chdir(%s) failed", opts.root ? : "/");
- return -1;
- }
- }
-
- free_mntinfo(old);
-
- ret = populate_mnt_ns();
- if (!ret && opts.root)
- ret = cr_pivot_root(NULL);
- if (ret)
- return -1;
-
- /* Keep a handle on the root mntns so we can come back to it below */
- rst = open_proc(PROC_SELF, "ns/mnt");
- if (rst < 0)
- return -1;
-
- /* restore non-root namespaces */
- for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) {
- char path[PATH_MAX];
-
- if (nsid->nd != &mnt_ns_desc)
- continue;
- if (nsid->type == NS_ROOT) {
- /* Pin one with a file descriptor */
- nsid->mnt.ns_fd = open_proc(PROC_SELF, "ns/mnt");
- if (nsid->mnt.ns_fd < 0)
- goto err;
- /* we set ns_populated so we don't need to open root_fd */
- futex_set(&nsid->ns_populated, 1);
- continue;
- }
-
- /* Create the new mount namespace */
- if (unshare(CLONE_NEWNS)) {
- pr_perror("Unable to create a new mntns");
- goto err;
- }
-
- /* Set its root */
- path[0] = '/';
- print_ns_root(nsid, path + 1, sizeof(path) - 1);
- if (cr_pivot_root(path))
- goto err;
-
- /* Pin one with a file descriptor */
- nsid->mnt.ns_fd = open_proc(PROC_SELF, "ns/mnt");
- if (nsid->mnt.ns_fd < 0)
- goto err;
-
- /* root_fd is used to restore file mappings */
- nsid->mnt.root_fd = open_proc(PROC_SELF, "root");
- if (nsid->mnt.root_fd < 0)
- goto err;
-
- /* And return back to regain the access to the roots yard */
- if (setns(rst, CLONE_NEWNS)) {
- pr_perror("Can't restore mntns back");
- goto err;
- }
- }
- close(rst);
-
- return ret;
-err:
- /* NOTE(review): presumably restore_ns() switches back and closes rst -- confirm */
- if (rst >= 0)
- restore_ns(rst, &mnt_ns_desc);
- return -1;
-}
-
-/* Pid whose root is currently installed as the ROOT_FD_OFF service fd (-1: none) */
-static int mntns_root_pid = -1;
-
-/*
- * Install @fd as the ROOT_FD_OFF service descriptor and remember that it
- * belongs to @pid.  @fd itself is closed in any case.
- * Returns what install_service_fd() returns.
- */
-static int mntns_set_root_fd(pid_t pid, int fd)
-{
-	int sfd = install_service_fd(ROOT_FD_OFF, fd);
-
-	if (sfd >= 0)
-		mntns_root_pid = pid;
-
-	close(fd);
-	return sfd;
-}
-
-/*
- * Get a service descriptor (ROOT_FD_OFF) referring to the root of the
- * mount namespace of task @pid, opening it if the cached one belongs to
- * another pid.
- *
- * Returns the descriptor on success, a negative value on failure.
- */
-int __mntns_get_root_fd(pid_t pid)
-{
-
-	int fd, pfd;
-	int ret;
-	char path[PATH_MAX + 1];
-
-	if (mntns_root_pid == pid) /* The required root is already opened */
-		return get_service_fd(ROOT_FD_OFF);
-
-	close_service_fd(ROOT_FD_OFF);
-
-	if (!(root_ns_mask & CLONE_NEWNS)) {
-		/*
-		 * If criu and tasks we dump live in the same mount
-		 * namespace, we can just open the root directory.
-		 * All paths resolution would occur relative to criu's
-		 * root. Even if it is not namespace's root, provided
-		 * file paths are resolved, we'd get consistent dump.
-		 */
-		fd = open("/", O_RDONLY | O_DIRECTORY);
-		if (fd < 0) {
-			pr_perror("Can't open root");
-			return -1;
-		}
-
-		goto set_root;
-	}
-
-	/*
-	 * If /proc/pid/root links on '/', it signs that a root of the task
-	 * and a root of mntns is the same.
-	 */
-
-	pfd = open_pid_proc(pid);
-	if (pfd < 0) {
-		/* Do not feed an invalid descriptor to readlinkat() below */
-		close_pid_proc();
-		return -1;
-	}
-
-	ret = readlinkat(pfd, "root", path, sizeof(path) - 1);
-	if (ret < 0) {
-		pr_perror("Can't read the root link of task %d", (int)pid);
-		close_pid_proc();
-		return ret;
-	}
-
-	path[ret] = '\0';
-
-	if (ret != 1 || path[0] != '/') {
-		pr_err("The root task has another root than mntns: %s\n", path);
-		close_pid_proc();
-		return -1;
-	}
-
-	fd = openat(pfd, "root", O_RDONLY | O_DIRECTORY, 0);
-	close_pid_proc();
-	if (fd < 0) {
-		pr_perror("Can't open the task root");
-		return -1;
-	}
-
-set_root:
-	return mntns_set_root_fd(pid, fd);
-}
-
-/*
- * Get a descriptor for the root of mount namespace @mntns.
- *
- * Every namespace is restored from the root task, which during the
- * CR_STATE_FORKING stage holds two descriptors per mntns: one for the
- * namespace itself and one for its root.  A forked non-root task enters
- * its mount namespace, restores private mappings and forks its children;
- * some mappings may refer to files from other namespaces.
- *
- * Once CR_STATE_FORKING is over, the root task closes those descriptors
- * to restore its own, and by then every task lives in its namespace.
- *
- * So while a namespace is not yet populated its root is taken from the
- * root task; afterwards it is taken from the task owning the namespace.
- */
-int mntns_get_root_fd(struct ns_id *mntns)
-{
-	int fd;
-
-	if (futex_get(&mntns->ns_populated))
-		return __mntns_get_root_fd(mntns->ns_pid);
-
-	fd = open_proc(root_item->pid.virt, "fd/%d", mntns->mnt.root_fd);
-	if (fd < 0)
-		return -1;
-
-	return mntns_set_root_fd(mntns->ns_pid, fd);
-}
-
-/*
- * Find the mount namespace that the mount @mnt_id belongs to.
- *
- * @mnt_id is -1 when the kernel is older than 3.15 (no mnt_id reported
- * for file descriptors) or when the mntns was not dumped.  Either way
- * there is a single root only, so any mount will do and the head of
- * mntinfo is used.
- *
- * Returns NULL if no matching mount is found.
- */
-struct ns_id *lookup_nsid_by_mnt_id(int mnt_id)
-{
-	struct mount_info *mi;
-
-	mi = (mnt_id == -1) ? mntinfo : lookup_mnt_id(mnt_id);
-	if (mi == NULL)
-		return NULL;
-
-	return mi->nsid;
-}
-
-/*
- * Get the root descriptor of the mount namespace that holds the mount
- * @mnt_id.  It is a BUG if no such namespace can be found.
- */
-int mntns_get_root_by_mnt_id(int mnt_id)
-{
-	struct ns_id *ns = lookup_nsid_by_mnt_id(mnt_id);
-
-	BUG_ON(ns == NULL);
-
-	return mntns_get_root_fd(ns);
-}
-
-/* State shared between collect_mnt_namespaces() and its collect_mntns() callback */
-struct collect_mntns_arg {
- /* Set by the callback once a non-CRIU namespace is collected for dump */
- bool need_to_validate;
- /* Input: passed through to collect_mntinfo() */
- bool for_dump;
- /* master_id of the mount tree root of the root task's mount namespace */
- int root_master_id;
-};
-
-/*
- * walk_namespaces() callback: collect the mounts of namespace @ns and
- * add them to the global list.  @__arg is a struct collect_mntns_arg
- * that is updated along the way.
- *
- * Returns 0 on success, -1 if the mounts cannot be collected.
- */
-static int collect_mntns(struct ns_id *ns, void *__arg)
-{
-	struct collect_mntns_arg *ctx = __arg;
-	struct mount_info *list;
-
-	list = collect_mntinfo(ns, ctx->for_dump);
-	if (list == NULL)
-		return -1;
-
-	/* Validation is wanted as soon as a dumped, non-CRIU namespace shows up */
-	if (ctx->for_dump && ns->type != NS_CRIU)
-		ctx->need_to_validate = true;
-
-	mntinfo_add_list(list);
-
-	if (!ctx->need_to_validate)
-		return 0;
-
-	/* Remember the master_id of the root task's mount tree */
-	if (ns->id == root_item->ids->mnt_ns_id)
-		ctx->root_master_id = ns->mnt.mntinfo_tree->master_id;
-
-	return 0;
-}
-
-/*
- * Collect the mounts of all mount namespaces, resolve external mounts
- * and, if any collected namespace asked for it, resolve shared mounts
- * and validate the result.
- *
- * @for_dump is forwarded to the per-namespace collection.
- * Returns 0 on success, non-zero on failure.
- */
-int collect_mnt_namespaces(bool for_dump)
-{
-	/*
-	 * root_master_id is only filled in by collect_mntns() when it meets
-	 * the root task's namespace, so give it a defined value up front
-	 * rather than handing an indeterminate one to resolve_shared_mounts().
-	 */
-	struct collect_mntns_arg arg = {
-		.need_to_validate	= false,
-		.for_dump		= for_dump,
-		.root_master_id		= 0,
-	};
-	int ret;
-
-	ret = walk_namespaces(&mnt_ns_desc, collect_mntns, &arg);
-	if (ret)
-		goto err;
-
-	ret = resolve_external_mounts(mntinfo);
-	if (ret)
-		goto err;
-
-	if (arg.need_to_validate) {
-		ret = -1;
-
-		if (resolve_shared_mounts(mntinfo, arg.root_master_id))
-			goto err;
-		if (validate_mounts(mntinfo, true))
-			goto err;
-	}
-
-	ret = 0;
-err:
-	return ret;
-}
-
-/*
- * Dump every mount namespace except CRIU's own.  Nested (NS_OTHER)
- * namespaces are refused when check_mnt_id() reports a problem.
- * Does nothing when mount namespaces are not dumped.
- *
- * Returns 0 on success, -1 on failure.
- */
-int dump_mnt_namespaces(void)
-{
-	struct ns_id *ns;
-
-	if (!(root_ns_mask & CLONE_NEWNS))
-		return 0;
-
-	for (ns = ns_ids; ns != NULL; ns = ns->next) {
-		if (ns->nd != &mnt_ns_desc)
-			continue;
-		if (ns->type == NS_CRIU)
-			continue;
-
-		if (ns->type == NS_OTHER && check_mnt_id()) {
-			pr_err("Nested mount namespaces are not supported "
-				"without mnt_id in fdinfo\n");
-			return -1;
-		}
-
-		if (dump_mnt_ns(ns, ns->mnt.mntinfo_list) != 0)
-			return -1;
-	}
-
-	return 0;
-}
-
-/* Descriptor of the mount namespace kind: clone flag CLONE_NEWNS, name "mnt" */
-struct ns_desc mnt_ns_desc = NS_DESC_ENTRY(CLONE_NEWNS, "mnt");
diff --git a/namespaces.c b/namespaces.c
deleted file mode 100644
index 9a7836bcad89..000000000000
--- a/namespaces.c
+++ /dev/null
@@ -1,1403 +0,0 @@
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/wait.h>
-#include <stdlib.h>
-#include <sys/prctl.h>
-#include <grp.h>
-#include <sys/socket.h>
-#include <sys/un.h>
-#include <stdarg.h>
-#include <signal.h>
-#include <sched.h>
-
-#include "cr-show.h"
-#include "util.h"
-#include "imgset.h"
-#include "uts_ns.h"
-#include "ipc_ns.h"
-#include "mount.h"
-#include "pstree.h"
-#include "namespaces.h"
-#include "net.h"
-
-#include "protobuf.h"
-#include "protobuf/ns.pb-c.h"
-#include "protobuf/userns.pb-c.h"
-
-static struct ns_desc *ns_desc_array[] = {
- &net_ns_desc,
- &uts_ns_desc,
- &ipc_ns_desc,
- &pid_ns_desc,
- &user_ns_desc,
- &mnt_ns_desc,
-};
-
-static unsigned int parse_ns_link(char *link, size_t len, struct ns_desc *d)
-{
- unsigned long kid = 0;
- char *end;
-
- if (len >= d->len + 2) {
- if (link[d->len] == ':' && !memcmp(link, d->str, d->len)) {
- kid = strtoul(&link[d->len + 2], &end, 10);
- if (end && *end == ']')
- BUG_ON(kid > UINT_MAX);
- else
- kid = 0;
- }
- }
-
- return (unsigned int)kid;
-}
-
-bool check_ns_proc(struct fd_link *link)
-{
- unsigned int i, kid;
-
- for (i = 0; i < ARRAY_SIZE(ns_desc_array); i++) {
- kid = parse_ns_link(link->name + 1, link->len - 1, ns_desc_array[i]);
- if (!kid)
- continue;
-
- link->ns_d = ns_desc_array[i];
- link->ns_kid = kid;
- return true;
- }
-
- return false;
-}
-
-int switch_ns(int pid, struct ns_desc *nd, int *rst)
-{
- char buf[32];
- int nsfd;
- int ret = -1;
-
- nsfd = open_proc(pid, "ns/%s", nd->str);
- if (nsfd < 0) {
- pr_perror("Can't open ipcns file");
- goto err_ns;
- }
-
- if (rst) {
- snprintf(buf, sizeof(buf), "/proc/self/ns/%s", nd->str);
- *rst = open(buf, O_RDONLY);
- if (*rst < 0) {
- pr_perror("Can't open ns file");
- goto err_rst;
- }
- }
-
- ret = setns(nsfd, nd->cflag);
- if (ret < 0) {
- pr_perror("Can't setns %d/%s", pid, nd->str);
- goto err_set;
- }
-
- close(nsfd);
- return 0;
-
-err_set:
- if (rst)
- close(*rst);
-err_rst:
- close(nsfd);
-err_ns:
- return -1;
-}
-
-int restore_ns(int rst, struct ns_desc *nd)
-{
- int ret;
-
- ret = setns(rst, nd->cflag);
- if (ret < 0)
- pr_perror("Can't restore ns back");
-
- close(rst);
-
- return ret;
-}
-
-struct ns_id *ns_ids = NULL;
-static unsigned int ns_next_id = 1;
-unsigned long root_ns_mask = 0;
-
-static void nsid_add(struct ns_id *ns, struct ns_desc *nd, unsigned int id, pid_t pid)
-{
- ns->nd = nd;
- ns->id = id;
- ns->ns_pid = pid;
- ns->next = ns_ids;
- ns_ids = ns;
-
- pr_info("Add %s ns %d pid %d\n", nd->str, ns->id, ns->ns_pid);
-}
-
-struct ns_id *rst_new_ns_id(unsigned int id, pid_t pid,
- struct ns_desc *nd, enum ns_type type)
-{
- struct ns_id *nsid;
-
- nsid = shmalloc(sizeof(*nsid));
- if (nsid) {
- nsid->type = type;
- nsid_add(nsid, nd, id, pid);
- futex_set(&nsid->ns_populated, 0);
- }
-
- return nsid;
-}
-
-int rst_add_ns_id(unsigned int id, struct pstree_item *i, struct ns_desc *nd)
-{
- pid_t pid = i->pid.virt;
- struct ns_id *nsid;
-
- nsid = lookup_ns_by_id(id, nd);
- if (nsid) {
- if (pid_rst_prio(pid, nsid->ns_pid))
- nsid->ns_pid = pid;
- return 0;
- }
-
- nsid = rst_new_ns_id(id, pid, nd,
- i == root_item ? NS_ROOT : NS_OTHER);
- if (nsid == NULL)
- return -1;
-
- return 0;
-}
-
-static struct ns_id *lookup_ns_by_kid(unsigned int kid, struct ns_desc *nd)
-{
- struct ns_id *nsid;
-
- for (nsid = ns_ids; nsid != NULL; nsid = nsid->next)
- if (nsid->kid == kid && nsid->nd == nd)
- return nsid;
-
- return NULL;
-}
-
-struct ns_id *lookup_ns_by_id(unsigned int id, struct ns_desc *nd)
-{
- struct ns_id *nsid;
-
- for (nsid = ns_ids; nsid != NULL; nsid = nsid->next)
- if (nsid->id == id && nsid->nd == nd)
- return nsid;
-
- return NULL;
-}
-
-/*
- * For all namespaces we support, there are two supported
- * tasks-to-namespaces layout.
- *
- * If root task lives in the same namespace as criu does
- * all other tasks should live in it too and we do NOT dump
- * this namespace. On restore tasks inherit the respective
- * namespace from criu.
- *
- * If root task lives in its own namespace, then all other
- * tasks may live in it. Sometimes (CLONE_SUBNS) there can
- * be more than one namespace of that type. For this case
- * we dump all namespace's info and recreate them on restore.
- */
-
-int walk_namespaces(struct ns_desc *nd, int (*cb)(struct ns_id *, void *), void *oarg)
-{
- int ret = 0;
- struct ns_id *ns;
-
- for (ns = ns_ids; ns != NULL; ns = ns->next) {
- if (ns->nd != nd)
- continue;
-
- if (ns->type == NS_CRIU) {
- if (root_ns_mask & nd->cflag)
- continue;
-
- ret = cb(ns, oarg);
- break;
- }
-
- ret = cb(ns, oarg);
- if (ret)
- break;
- }
-
- return ret;
-}
-
-static unsigned int generate_ns_id(int pid, unsigned int kid, struct ns_desc *nd,
- struct ns_id **ns_ret)
-{
- struct ns_id *nsid;
- enum ns_type type;
-
- nsid = lookup_ns_by_kid(kid, nd);
- if (nsid)
- goto found;
-
- if (pid != getpid()) {
- type = NS_OTHER;
- if (pid == root_item->pid.real) {
- BUG_ON(root_ns_mask & nd->cflag);
- pr_info("Will take %s namespace in the image\n", nd->str);
- root_ns_mask |= nd->cflag;
- type = NS_ROOT;
- } else if (nd->cflag & ~CLONE_SUBNS) {
- pr_err("Can't dump nested %s namespace for %d\n",
- nd->str, pid);
- return 0;
- }
- } else
- type = NS_CRIU;
-
- nsid = xmalloc(sizeof(*nsid));
- if (!nsid)
- return 0;
-
- nsid->type = type;
- nsid->kid = kid;
- futex_set(&nsid->ns_populated, 1);
- nsid_add(nsid, nd, ns_next_id++, pid);
-
-found:
- if (ns_ret)
- *ns_ret = nsid;
- return nsid->id;
-}
-
-static unsigned int __get_ns_id(int pid, struct ns_desc *nd, struct ns_id **ns)
-{
- int proc_dir, ret;
- unsigned int kid;
- char ns_path[10], ns_id[32];
-
- proc_dir = open_pid_proc(pid);
- if (proc_dir < 0)
- return 0;
-
- sprintf(ns_path, "ns/%s", nd->str);
- ret = readlinkat(proc_dir, ns_path, ns_id, sizeof(ns_id) - 1);
- if (ret < 0) {
- if (errno == ENOENT) {
- /* The namespace is unsupported */
- kid = 0;
- goto out;
- }
- pr_perror("Can't readlink ns link");
- return 0;
- }
- ns_id[ret] = '\0';
-
- kid = parse_ns_link(ns_id, ret, nd);
- BUG_ON(!kid);
-
-out:
- return generate_ns_id(pid, kid, nd, ns);
-}
-
-static unsigned int get_ns_id(int pid, struct ns_desc *nd)
-{
- return __get_ns_id(pid, nd, NULL);
-}
-
-int dump_one_ns_file(int lfd, u32 id, const struct fd_parms *p)
-{
- struct cr_img *img = img_from_set(glob_imgset, CR_FD_NS_FILES);
- NsFileEntry nfe = NS_FILE_ENTRY__INIT;
- struct fd_link *link = p->link;
- struct ns_id *nsid;
-
- nsid = lookup_ns_by_kid(link->ns_kid, link->ns_d);
- if (!nsid) {
- pr_err("No NS ID with kid %u\n", link->ns_kid);
- return -1;
- }
-
- nfe.id = id;
- nfe.ns_id = nsid->id;
- nfe.ns_cflag = link->ns_d->cflag;
- nfe.flags = p->flags;
-
- return pb_write_one(img, &nfe, PB_NS_FILE);
-}
-
-const struct fdtype_ops nsfile_dump_ops = {
- .type = FD_TYPES__NS,
- .dump = dump_one_ns_file,
-};
-
-struct ns_file_info {
- struct file_desc d;
- NsFileEntry *nfe;
-};
-
-static int open_ns_fd(struct file_desc *d)
-{
- struct ns_file_info *nfi = container_of(d, struct ns_file_info, d);
- struct pstree_item *item, *t;
- struct ns_desc *nd = NULL;
- char path[64];
- int fd;
-
- /*
- * Find out who can open us.
- *
- * FIXME I need a hash or RBtree here.
- */
- for_each_pstree_item(t) {
- TaskKobjIdsEntry *ids = t->ids;
-
- if (ids->pid_ns_id == nfi->nfe->ns_id) {
- item = t;
- nd = &pid_ns_desc;
- break;
- } else if (ids->net_ns_id == nfi->nfe->ns_id) {
- item = t;
- nd = &net_ns_desc;
- break;
- } else if (ids->ipc_ns_id == nfi->nfe->ns_id) {
- item = t;
- nd = &ipc_ns_desc;
- break;
- } else if (ids->uts_ns_id == nfi->nfe->ns_id) {
- item = t;
- nd = &uts_ns_desc;
- break;
- } else if (ids->mnt_ns_id == nfi->nfe->ns_id) {
- item = t;
- nd = &mnt_ns_desc;
- break;
- }
- }
-
- if (!nd || !item) {
- pr_err("Can't find suitable NS ID for %#x\n", nfi->nfe->ns_id);
- return -1;
- }
-
- if (nd->cflag != nfi->nfe->ns_cflag) {
- pr_err("Clone flag mismatch for %#x\n", nfi->nfe->ns_id);
- return -1;
- }
-
- snprintf(path, sizeof(path) - 1, "/proc/%d/ns/%s", item->pid.virt, nd->str);
- path[sizeof(path) - 1] = '\0';
-
- fd = open(path, nfi->nfe->flags);
- if (fd < 0) {
- pr_perror("Can't open file %s on restore", path);
- return fd;
- }
-
- return fd;
-}
-
-static struct file_desc_ops ns_desc_ops = {
- .type = FD_TYPES__NS,
- .open = open_ns_fd,
-};
-
-static int collect_one_nsfile(void *o, ProtobufCMessage *base)
-{
- struct ns_file_info *nfi = o;
-
- nfi->nfe = pb_msg(base, NsFileEntry);
- pr_info("Collected ns file ID %#x NS-ID %#x\n", nfi->nfe->id, nfi->nfe->ns_id);
- return file_desc_add(&nfi->d, nfi->nfe->id, &ns_desc_ops);
-}
-
-struct collect_image_info nsfile_cinfo = {
- .fd_type = CR_FD_NS_FILES,
- .pb_type = PB_NS_FILE,
- .priv_size = sizeof(struct ns_file_info),
- .collect = collect_one_nsfile,
-};
-
-/*
- * Same as dump_task_ns_ids(), but
- * a) doesn't keep IDs (don't need them)
- * b) generates them for mount and netns only
- * mnt ones are needed for open_mount() in
- * inotify pred-dump
- * net ones are needed for parasite socket
- */
-
-int predump_task_ns_ids(struct pstree_item *item)
-{
- int pid = item->pid.real;
-
- if (!__get_ns_id(pid, &net_ns_desc, &dmpi(item)->netns))
- return -1;
-
- if (!get_ns_id(pid, &mnt_ns_desc))
- return -1;
-
- return 0;
-}
-
-int dump_task_ns_ids(struct pstree_item *item)
-{
- int pid = item->pid.real;
- TaskKobjIdsEntry *ids = item->ids;
-
- ids->has_pid_ns_id = true;
- ids->pid_ns_id = get_ns_id(pid, &pid_ns_desc);
- if (!ids->pid_ns_id) {
- pr_err("Can't make pidns id\n");
- return -1;
- }
-
- ids->has_net_ns_id = true;
- ids->net_ns_id = __get_ns_id(pid, &net_ns_desc, &dmpi(item)->netns);
- if (!ids->net_ns_id) {
- pr_err("Can't make netns id\n");
- return -1;
- }
-
- ids->has_ipc_ns_id = true;
- ids->ipc_ns_id = get_ns_id(pid, &ipc_ns_desc);
- if (!ids->ipc_ns_id) {
- pr_err("Can't make ipcns id\n");
- return -1;
- }
-
- ids->has_uts_ns_id = true;
- ids->uts_ns_id = get_ns_id(pid, &uts_ns_desc);
- if (!ids->uts_ns_id) {
- pr_err("Can't make utsns id\n");
- return -1;
- }
-
- ids->has_mnt_ns_id = true;
- ids->mnt_ns_id = get_ns_id(pid, &mnt_ns_desc);
- if (!ids->mnt_ns_id) {
- pr_err("Can't make mntns id\n");
- return -1;
- }
-
- ids->has_user_ns_id = true;
- ids->user_ns_id = get_ns_id(pid, &user_ns_desc);
- if (!ids->user_ns_id) {
- pr_err("Can't make userns id\n");
- return -1;
- }
-
- return 0;
-}
-
-static UsernsEntry userns_entry = USERNS_ENTRY__INIT;
-
-static int userns_id(int id, UidGidExtent **map, int n)
-{
- int i;
-
- if (!(root_ns_mask & CLONE_NEWUSER))
- return id;
-
- for (i = 0; i < n; i++) {
- if (map[i]->lower_first <= id &&
- map[i]->lower_first + map[i]->count > id)
- return map[i]->first + (id - map[i]->lower_first);
- }
-
- return -1;
-}
-
-int userns_uid(int uid)
-{
- UsernsEntry *e = &userns_entry;
- return userns_id(uid, e->uid_map, e->n_uid_map);
-}
-
-int userns_gid(int gid)
-{
- UsernsEntry *e = &userns_entry;
- return userns_id(gid, e->gid_map, e->n_gid_map);
-}
-
-static int parse_id_map(pid_t pid, char *name, UidGidExtent ***pb_exts)
-{
- UidGidExtent *extents = NULL;
- int len = 0, size = 0, ret, i;
- FILE *f;
-
- f = fopen_proc(pid, "%s", name);
- if (f == NULL)
- return -1;
-
- ret = -1;
- while (1) {
- UidGidExtent *ext;
-
- if (len == size) {
- UidGidExtent *t;
-
- size = size * 2 + 1;
- t = xrealloc(extents, size * sizeof(UidGidExtent));
- if (t == NULL)
- break;
- extents = t;
- }
-
- ext = &extents[len];
-
- uid_gid_extent__init(ext);
- ret = fscanf(f, "%d %d %d", &ext->first,
- &ext->lower_first, &ext->count);
- if (ret != 3) {
- if (errno != 0) {
- pr_perror("Unable to parse extents");
- ret = -1;
- } else
- ret = 0;
- break;
- }
- pr_info("id_map: %d %d %d\n", ext->first, ext->lower_first, ext->count);
- len++;
- }
-
- fclose(f);
-
- if (ret)
- goto err;
-
- if (len) {
- *pb_exts = xmalloc(sizeof(UidGidExtent *) * len);
- if (*pb_exts == NULL)
- goto err;
-
- for (i = 0; i < len; i++)
- (*pb_exts)[i] = &extents[i];
- } else {
- xfree(extents);
- *pb_exts = NULL;
- }
-
- return len;
-err:
- xfree(extents);
- return -1;
-}
-
-int collect_user_ns(struct ns_id *ns, void *oarg)
-{
- /*
- * User namespace is dumped before files to get uid and gid
- * mappings, which are used for convirting local id-s to
- * userns id-s (userns_uid(), userns_gid())
- */
- if (dump_user_ns(root_item->pid.real, root_item->ids->user_ns_id))
- return -1;
-
- return 0;
-}
-
-int collect_user_namespaces(bool for_dump)
-{
- if (!for_dump)
- return 0;
-
- if (!(root_ns_mask & CLONE_NEWUSER))
- return 0;
-
- return walk_namespaces(&net_ns_desc, collect_user_ns, NULL);
-}
-
-static int check_user_ns(int pid)
-{
- int status;
- pid_t chld;
-
- chld = fork();
- if (chld == -1) {
- pr_perror("Unable to fork a process");
- return -1;
- }
-
- if (chld == 0) {
- /*
- * Check that we are able to enter into other namespaces
- * from the target userns namespace. This signs that these
- * namespaces were created from the target userns.
- */
-
- if (switch_ns(pid, &user_ns_desc, NULL))
- exit(-1);
-
- if ((root_ns_mask & CLONE_NEWNET) &&
- switch_ns(pid, &net_ns_desc, NULL))
- exit(-1);
- if ((root_ns_mask & CLONE_NEWUTS) &&
- switch_ns(pid, &uts_ns_desc, NULL))
- exit(-1);
- if ((root_ns_mask & CLONE_NEWIPC) &&
- switch_ns(pid, &ipc_ns_desc, NULL))
- exit(-1);
- if ((root_ns_mask & CLONE_NEWNS) &&
- switch_ns(pid, &mnt_ns_desc, NULL))
- exit(-1);
- exit(0);
- }
-
- if (waitpid(chld, &status, 0) != chld) {
- pr_perror("Unable to wait the %d process", pid);
- return -1;
- }
-
- if (status) {
- pr_err("One or more namespaces doesn't belong to the target user namespace\n");
- return -1;
- }
-
- return 0;
-}
-
-int dump_user_ns(pid_t pid, int ns_id)
-{
- int ret, exit_code = -1;
- UsernsEntry *e = &userns_entry;
- struct cr_img *img;
-
- if (check_user_ns(pid))
- return -1;
-
- ret = parse_id_map(pid, "uid_map", &e->uid_map);
- if (ret < 0)
- goto err;
- e->n_uid_map = ret;
-
- ret = parse_id_map(pid, "gid_map", &e->gid_map);
- if (ret < 0)
- goto err;
- e->n_gid_map = ret;
-
- img = open_image(CR_FD_USERNS, O_DUMP, ns_id);
- if (!img)
- goto err;
- ret = pb_write_one(img, e, PB_USERNS);
- close_image(img);
- if (ret < 0)
- goto err;
-
- return 0;
-err:
- if (e->uid_map) {
- xfree(e->uid_map[0]);
- xfree(e->uid_map);
- }
- if (e->gid_map) {
- xfree(e->gid_map[0]);
- xfree(e->gid_map);
- }
- return exit_code;
-}
-
-void free_userns_maps()
-{
- if (userns_entry.n_uid_map > 0) {
- xfree(userns_entry.uid_map[0]);
- xfree(userns_entry.uid_map);
- }
- if (userns_entry.n_gid_map > 0) {
- xfree(userns_entry.gid_map[0]);
- xfree(userns_entry.gid_map);
- }
-}
-
-static int do_dump_namespaces(struct ns_id *ns)
-{
- int ret;
-
- ret = switch_ns(ns->ns_pid, ns->nd, NULL);
- if (ret)
- return ret;
-
- switch (ns->nd->cflag) {
- case CLONE_NEWUTS:
- pr_info("Dump UTS namespace %d via %d\n",
- ns->id, ns->ns_pid);
- ret = dump_uts_ns(ns->id);
- break;
- case CLONE_NEWIPC:
- pr_info("Dump IPC namespace %d via %d\n",
- ns->id, ns->ns_pid);
- ret = dump_ipc_ns(ns->id);
- break;
- case CLONE_NEWNET:
- pr_info("Dump NET namespace info %d via %d\n",
- ns->id, ns->ns_pid);
- ret = dump_net_ns(ns->id);
- break;
- default:
- pr_err("Unknown namespace flag %x\n", ns->nd->cflag);
- break;
- }
-
- return ret;
-
-}
-
-int dump_namespaces(struct pstree_item *item, unsigned int ns_flags)
-{
- struct pid *ns_pid = &item->pid;
- struct ns_id *ns;
- int pid, nr = 0;
- int ret = 0;
-
- /*
- * The setns syscall is cool, we can switch to the other
- * namespace and then return back to our initial one, but
- * for me it's much easier just to fork another task and
- * let it do the job, all the more so it can be done in
- * parallel with task dumping routine.
- *
- * However, the question how to dump sockets from the target
- * net namespace with this is still open
- */
-
- pr_info("Dumping %d(%d)'s namespaces\n", ns_pid->virt, ns_pid->real);
-
- if ((ns_flags & CLONE_NEWPID) && ns_pid->virt != 1) {
- pr_err("Can't dump a pid namespace without the process init\n");
- return -1;
- }
-
- for (ns = ns_ids; ns; ns = ns->next) {
- /* Skip current namespaces, which are in the list too */
- if (ns->type == NS_CRIU)
- continue;
-
- switch (ns->nd->cflag) {
- /* No data for pid namespaces to dump */
- case CLONE_NEWPID:
- /* Dumped explicitly with dump_mnt_namespaces() */
- case CLONE_NEWNS:
- /* Userns is dumped before dumping tasks */
- case CLONE_NEWUSER:
- continue;
- }
-
- pid = fork();
- if (pid < 0) {
- pr_perror("Can't fork ns dumper");
- return -1;
- }
-
- if (pid == 0) {
- ret = do_dump_namespaces(ns);
- exit(ret);
- }
-
- nr++;
- }
-
- while (nr > 0) {
- int status;
-
- ret = waitpid(-1, &status, 0);
- if (ret < 0) {
- pr_perror("Can't wait ns dumper");
- return -1;
- }
-
- if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
- pr_err("Namespaces dumping finished with error %d\n", status);
- return -1;
- }
-
- nr--;
- }
-
- pr_info("Namespaces dump complete\n");
- return 0;
-}
-
-static int write_id_map(pid_t pid, UidGidExtent **extents, int n, char *id_map)
-{
- char buf[PAGE_SIZE];
- int off = 0, i;
- int fd;
-
- /*
- * We can perform only a single write (that may contain multiple
- * newline-delimited records) to a uid_map and a gid_map files.
- */
- for (i = 0; i < n; i++)
- off += snprintf(buf + off, sizeof(buf) - off,
- "%u %u %u\n", extents[i]->first,
- extents[i]->lower_first,
- extents[i]->count);
-
- fd = open_proc_rw(pid, "%s", id_map);
- if (fd < 0)
- return -1;
- if (write(fd, buf, off) != off) {
- pr_perror("Unable to write into %s", id_map);
- close(fd);
- return -1;
- }
- close(fd);
-
- return 0;
-}
-
-struct unsc_msg {
- struct msghdr h;
- /*
- * 0th is the call address
- * 1st is the flags
- * 2nd is the optional (NULL in responce) arguments
- */
- struct iovec iov[3];
- char c[CMSG_SPACE(sizeof(struct ucred)) + CMSG_SPACE(sizeof(int))];
-};
-
-static int usernsd_pid;
-
-static inline void unsc_msg_init(struct unsc_msg *m, uns_call_t *c,
- int *x, void *arg, size_t asize, int fd)
-{
- struct cmsghdr *ch;
- struct ucred *ucred;
-
- m->h.msg_iov = m->iov;
- m->h.msg_iovlen = 2;
-
- m->iov[0].iov_base = c;
- m->iov[0].iov_len = sizeof(*c);
- m->iov[1].iov_base = x;
- m->iov[1].iov_len = sizeof(*x);
-
- if (arg) {
- m->iov[2].iov_base = arg;
- m->iov[2].iov_len = asize;
- m->h.msg_iovlen++;
- }
-
- m->h.msg_name = NULL;
- m->h.msg_namelen = 0;
- m->h.msg_flags = 0;
-
- m->h.msg_control = &m->c;
-
- /* Need to memzero because of:
- * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=514917
- */
- memzero(&m->c, sizeof(m->c));
-
- m->h.msg_controllen = CMSG_SPACE(sizeof(struct ucred));
-
- ch = CMSG_FIRSTHDR(&m->h);
- ch->cmsg_len = CMSG_LEN(sizeof(struct ucred));
- ch->cmsg_level = SOL_SOCKET;
- ch->cmsg_type = SCM_CREDENTIALS;
-
- ucred = (struct ucred *) CMSG_DATA(ch);
- ucred->pid = getpid();
- ucred->uid = getuid();
- ucred->gid = getgid();
-
- if (fd >= 0) {
- m->h.msg_controllen += CMSG_SPACE(sizeof(int));
- ch = CMSG_NXTHDR(&m->h, ch);
- BUG_ON(!ch);
- ch->cmsg_len = CMSG_LEN(sizeof(int));
- ch->cmsg_level = SOL_SOCKET;
- ch->cmsg_type = SCM_RIGHTS;
- *((int *)CMSG_DATA(ch)) = fd;
- }
-}
-
-static void unsc_msg_pid_fd(struct unsc_msg *um, pid_t *pid, int *fd)
-{
- struct cmsghdr *ch;
- struct ucred *ucred;
-
- ch = CMSG_FIRSTHDR(&um->h);
- BUG_ON(!ch);
- BUG_ON(ch->cmsg_len != CMSG_LEN(sizeof(struct ucred)));
- BUG_ON(ch->cmsg_level != SOL_SOCKET);
- BUG_ON(ch->cmsg_type != SCM_CREDENTIALS);
-
- if (pid) {
- ucred = (struct ucred *) CMSG_DATA(ch);
- *pid = ucred->pid;
- }
-
- ch = CMSG_NXTHDR(&um->h, ch);
-
- if (ch && ch->cmsg_len == CMSG_LEN(sizeof(int))) {
- BUG_ON(ch->cmsg_level != SOL_SOCKET);
- BUG_ON(ch->cmsg_type != SCM_RIGHTS);
- *fd = *((int *)CMSG_DATA(ch));
- } else {
- *fd = -1;
- }
-}
-
-static int usernsd(int sk)
-{
- pr_info("uns: Daemon started\n");
-
- while (1) {
- struct unsc_msg um;
- static char msg[MAX_UNSFD_MSG_SIZE];
- uns_call_t call;
- int flags, fd, ret;
- pid_t pid;
-
- unsc_msg_init(&um, &call, &flags, msg, sizeof(msg), 0);
- if (recvmsg(sk, &um.h, 0) <= 0) {
- pr_perror("uns: recv req error");
- return -1;
- }
-
- unsc_msg_pid_fd(&um, &pid, &fd);
- pr_debug("uns: daemon calls %p (%d, %d, %x)\n", call, pid, fd, flags);
-
- BUG_ON(fd < 0 && flags & UNS_FDOUT);
-
- /*
- * Caller has sent us bare address of the routine it
- * wants to call. Since the caller is fork()-ed from the
- * same process as the daemon is, the latter has exactly
- * the same code at exactly the same address as the
- * former guy has. So go ahead and just call one!
- */
-
- ret = call(msg, fd, pid);
-
- if (fd >= 0)
- close(fd);
-
- if (flags & UNS_ASYNC) {
- /*
- * Async call failed and the called doesn't know
- * about it. Exit now and let the stop_usernsd()
- * check the exit code and abort the restoration.
- *
- * We'd get there either by the end of restore or
- * from the next userns_call() due to failed
- * sendmsg() in there.
- */
- if (ret < 0) {
- pr_err("uns: Async call failed. Exiting\n");
- return -1;
- }
-
- continue;
- }
-
- if (flags & UNS_FDOUT)
- fd = ret;
- else
- fd = -1;
-
- unsc_msg_init(&um, &call, &ret, NULL, 0, fd);
- if (sendmsg(sk, &um.h, 0) <= 0) {
- pr_perror("uns: send resp error");
- return -1;
- }
-
- if (fd >= 0)
- close(fd);
- }
-}
-
-int __userns_call(const char *func_name, uns_call_t call, int flags,
- void *arg, size_t arg_size, int fd)
-{
- int ret, res, sk;
- bool async = flags & UNS_ASYNC;
- struct unsc_msg um;
-
- if (unlikely(arg_size > MAX_UNSFD_MSG_SIZE)) {
- pr_err("uns: message size exceeded\n");
- return -1;
- }
-
- if (!usernsd_pid)
- return call(arg, fd, getpid());
-
- sk = get_service_fd(USERNSD_SK);
- pr_debug("uns: calling %s (%d, %x)\n", func_name, fd, flags);
-
- if (!async)
- /*
- * Why don't we lock for async requests? Because
- * they just put the request in the daemon's
- * queue and do not wait for the responce. Thus
- * when daemon responce there's only one client
- * waiting for it in recvmsg below, so he
- * responces to proper caller.
- */
- mutex_lock(&task_entries->userns_sync_lock);
- else
- /*
- * If we want the callback to give us and FD then
- * we should NOT do the asynchronous call.
- */
- BUG_ON(flags & UNS_FDOUT);
-
- /* Send the request */
-
- unsc_msg_init(&um, &call, &flags, arg, arg_size, fd);
- ret = sendmsg(sk, &um.h, 0);
- if (ret <= 0) {
- pr_perror("uns: send req error");
- ret = -1;
- goto out;
- }
-
- if (async) {
- ret = 0;
- goto out;
- }
-
- /* Get the response back */
-
- unsc_msg_init(&um, &call, &res, NULL, 0, 0);
- ret = recvmsg(sk, &um.h, 0);
- if (ret <= 0) {
- pr_perror("uns: recv resp error");
- ret = -1;
- goto out;
- }
-
- /* Decode the result and return */
-
- if (flags & UNS_FDOUT)
- unsc_msg_pid_fd(&um, NULL, &ret);
- else
- ret = res;
-out:
- if (!async)
- mutex_unlock(&task_entries->userns_sync_lock);
-
- return ret;
-}
-
-static int start_usernsd(void)
-{
- int sk[2];
- int one = 1;
-
- if (!(root_ns_mask & CLONE_NEWUSER))
- return 0;
-
- /*
- * Seqpacket to
- *
- * a) Help daemon distinguish individual requests from
- * each other easily. Stream socket require manual
- * messages boundaries.
- *
- * b) Make callers note the damon death by seeing the
- * disconnected socket. In case of dgram socket
- * callers would just get stuck in receiving the
- * responce.
- */
-
- if (socketpair(PF_UNIX, SOCK_SEQPACKET, 0, sk)) {
- pr_perror("Can't make usernsd socket");
- return -1;
- }
-
- if (setsockopt(sk[0], SOL_SOCKET, SO_PASSCRED, &one, sizeof(one)) < 0) {
- pr_perror("failed to setsockopt");
- return -1;
- }
-
- if (setsockopt(sk[1], SOL_SOCKET, SO_PASSCRED, &one, sizeof(1)) < 0) {
- pr_perror("failed to setsockopt");
- return -1;
- }
-
- usernsd_pid = fork();
- if (usernsd_pid < 0) {
- pr_perror("Can't fork usernsd");
- close(sk[0]);
- close(sk[1]);
- return -1;
- }
-
- if (usernsd_pid == 0) {
- int ret;
-
- close(sk[0]);
- ret = usernsd(sk[1]);
- exit(ret);
- }
-
- close(sk[1]);
- if (install_service_fd(USERNSD_SK, sk[0]) < 0) {
- kill(usernsd_pid, SIGKILL);
- waitpid(usernsd_pid, NULL, 0);
- close(sk[0]);
- return -1;
- }
-
- close(sk[0]);
- return 0;
-}
-
-static int exit_usernsd(void *arg, int fd, pid_t pid)
-{
- int code = *(int *)arg;
- pr_info("uns: `- daemon exits w/ %d\n", code);
- exit(code);
-}
-
-int stop_usernsd(void)
-{
- int ret = 0;
-
- if (usernsd_pid) {
- int status = -1;
- sigset_t blockmask, oldmask;
-
- /*
- * Don't let the sigchld_handler() mess with us
- * calling waitpid() on the exited daemon. The
- * same is done in cr_system().
- */
-
- sigemptyset(&blockmask);
- sigaddset(&blockmask, SIGCHLD);
- sigprocmask(SIG_BLOCK, &blockmask, &oldmask);
-
- /*
- * Send a message to make sure the daemon _has_
- * proceeded all its queue of asynchronous requests.
- *
- * All the restoring processes might have already
- * closed their USERNSD_SK descriptors, but daemon
- * still has its in connected state -- this is us
- * who hold the last reference on the peer.
- *
- * If daemon has exited "in advance" due to async
- * call or socket error, the userns_call() and the
- * waitpid() below would both fail and we'll see
- * bad exit status.
- */
-
- userns_call(exit_usernsd, UNS_ASYNC, &ret, sizeof(ret), -1);
- waitpid(usernsd_pid, &status, 0);
-
- if (WIFEXITED(status))
- ret = WEXITSTATUS(status);
- else
- ret = -1;
-
- usernsd_pid = 0;
- sigprocmask(SIG_SETMASK, &oldmask, NULL);
-
- if (ret != 0)
- pr_err("uns: daemon exited abnormally\n");
- else
- pr_info("uns: daemon stopped\n");
- }
-
- return ret;
-}
-
-int prepare_userns(struct pstree_item *item)
-{
- struct cr_img *img;
- UsernsEntry *e;
- int ret;
-
- img = open_image(CR_FD_USERNS, O_RSTR, item->ids->user_ns_id);
- if (!img)
- return -1;
- ret = pb_read_one(img, &e, PB_USERNS);
- close_image(img);
- if (ret < 0)
- return -1;
-
- if (write_id_map(item->pid.real, e->uid_map, e->n_uid_map, "uid_map"))
- return -1;
-
- if (write_id_map(item->pid.real, e->gid_map, e->n_gid_map, "gid_map"))
- return -1;
-
- return 0;
-}
-
-int collect_namespaces(bool for_dump)
-{
- int ret;
-
- ret = collect_user_namespaces(for_dump);
- if (ret < 0)
- return ret;
-
- ret = collect_mnt_namespaces(for_dump);
- if (ret < 0)
- return ret;
-
- ret = collect_net_namespaces(for_dump);
- if (ret < 0)
- return ret;
-
- return 0;
-}
-
-static int prepare_userns_creds()
-{
- /* UID and GID must be set after restoring /proc/PID/{uid,gid}_maps */
- if (setuid(0) || setgid(0) || setgroups(0, NULL)) {
- pr_perror("Unable to initialize id-s");
- return -1;
- }
-
- /*
- * This flag is dropped after entering userns, but is
- * required to access files in /proc, so put one here
- * temoprarily. It will be set to proper value at the
- * very end.
- */
- if (prctl(PR_SET_DUMPABLE, 1, 0)) {
- pr_perror("Unable to set PR_SET_DUMPABLE");
- exit(1);
- }
-
- return 0;
-}
-
-int prepare_namespace(struct pstree_item *item, unsigned long clone_flags)
-{
- pid_t pid = item->pid.virt;
- int id;
-
- pr_info("Restoring namespaces %d flags 0x%lx\n",
- item->pid.virt, clone_flags);
-
- if ((clone_flags & CLONE_NEWUSER) && prepare_userns_creds())
- return -1;
-
- /*
- * On netns restore we launch an IP tool, thus we
- * have to restore it _before_ altering the mount
- * tree (i.e. -- mnt_ns restoring)
- */
-
- id = ns_per_id ? item->ids->net_ns_id : pid;
- if ((clone_flags & CLONE_NEWNET) && prepare_net_ns(id))
- return -1;
- id = ns_per_id ? item->ids->uts_ns_id : pid;
- if ((clone_flags & CLONE_NEWUTS) && prepare_utsns(id))
- return -1;
- id = ns_per_id ? item->ids->ipc_ns_id : pid;
- if ((clone_flags & CLONE_NEWIPC) && prepare_ipc_ns(id))
- return -1;
-
- /*
- * This one is special -- there can be several mount
- * namespaces and prepare_mnt_ns handles them itself.
- */
- if (prepare_mnt_ns())
- return -1;
-
- return 0;
-}
-
-int prepare_namespace_before_tasks(void)
-{
- if (start_usernsd())
- goto err_unds;
-
- if (netns_keep_nsfd())
- goto err_netns;
-
- if (mntns_maybe_create_roots())
- goto err_mnt;
-
- if (read_mnt_ns_img())
- goto err_img;
-
- return 0;
-
-err_img:
- cleanup_mnt_ns();
-err_mnt:
- /*
- * Nothing, netns' descriptor will be closed
- * on criu exit
- */
-err_netns:
- stop_usernsd();
-err_unds:
- return -1;
-}
-
-int try_show_namespaces(int ns_pid)
-{
- struct cr_imgset *imgset;
- int i, ret;
- struct cr_img *img;
- TaskKobjIdsEntry *ids;
-
- pr_msg("Namespaces for %d:\n", ns_pid);
-
- img = open_image(CR_FD_IDS, O_RSTR, ns_pid);
- if (!img)
- return -1;
- ret = pb_read_one(img, &ids, PB_IDS);
- close_image(img);
- if (ret < 0)
- return -1;
-
- imgset = cr_imgset_open(ids->net_ns_id, NETNS, O_SHOW);
- if (imgset) {
- pr_msg("-------------------NETNS---------------------\n");
- for (i = _CR_FD_NETNS_FROM + 1; i < _CR_FD_NETNS_TO; i++) {
- img = img_from_set(imgset, i);
- if (!img)
- continue;
-
- cr_parse_fd(img, imgset_template[i].magic);
- }
- close_cr_imgset(&imgset);
- }
-
- imgset = cr_imgset_open(ids->ipc_ns_id, IPCNS, O_SHOW);
- if (imgset) {
- pr_msg("-------------------IPCNS---------------------\n");
- for (i = _CR_FD_IPCNS_FROM + 1; i < _CR_FD_IPCNS_TO; i++) {
- img = img_from_set(imgset, i);
- if (!img)
- continue;
-
- cr_parse_fd(img, imgset_template[i].magic);
- }
- close_cr_imgset(&imgset);
- }
-
- img = open_image(CR_FD_UTSNS, O_SHOW, ids->uts_ns_id);
- if (img) {
- pr_msg("-------------------UTSNS---------------------\n");
- cr_parse_fd(img, imgset_template[CR_FD_UTSNS].magic);
- close_image(img);
- }
-
- img = open_image(CR_FD_MNTS, O_SHOW, ids->mnt_ns_id);
- if (img) {
- pr_msg("-------------------MNTNS---------------------\n");
- cr_parse_fd(img, imgset_template[CR_FD_MNTS].magic);
- close_image(img);
- }
-
- pr_msg("---[ end of %d namespaces ]---\n", ns_pid);
- return 0;
-}
-
-struct ns_desc pid_ns_desc = NS_DESC_ENTRY(CLONE_NEWPID, "pid");
-struct ns_desc user_ns_desc = NS_DESC_ENTRY(CLONE_NEWUSER, "user");
diff --git a/net.c b/net.c
deleted file mode 100644
index 9f62cd0c6ba2..000000000000
--- a/net.c
+++ /dev/null
@@ -1,1429 +0,0 @@
-#include <unistd.h>
-#include <sys/socket.h>
-#include <linux/netlink.h>
-#include <linux/rtnetlink.h>
-#include <linux/netfilter/nfnetlink.h>
-#include <linux/netfilter/nfnetlink_conntrack.h>
-#include <linux/netfilter/nf_conntrack_tcp.h>
-#include <string.h>
-#include <net/if_arp.h>
-#include <sys/wait.h>
-#include <sched.h>
-#include <sys/mount.h>
-#include <net/if.h>
-#include <linux/sockios.h>
-#include <libnl3/netlink/msg.h>
-
-#include "imgset.h"
-#include "namespaces.h"
-#include "net.h"
-#include "libnetlink.h"
-#include "cr_options.h"
-#include "sk-inet.h"
-#include "tun.h"
-#include "util-pie.h"
-#include "plugin.h"
-#include "action-scripts.h"
-#include "sockets.h"
-#include "pstree.h"
-#include "string.h"
-#include "sysctl.h"
-#include "kerndat.h"
-
-#include "protobuf.h"
-#include "protobuf/netdev.pb-c.h"
-
-static int ns_sysfs_fd = -1;
-
-int read_ns_sys_file(char *path, char *buf, int len)
-{
- int fd, rlen;
-
- BUG_ON(ns_sysfs_fd == -1);
-
- fd = openat(ns_sysfs_fd, path, O_RDONLY, 0);
- if (fd < 0) {
- pr_perror("Can't open ns' %s", path);
- return -1;
- }
-
- rlen = read(fd, buf, len);
- close(fd);
-
- if (rlen == len) {
- pr_err("Too small buffer to read ns sys file %s\n", path);
- return -1;
- }
-
- if (rlen > 0)
- buf[rlen - 1] = '\0';
-
- return rlen;
-}
-
-static char *devconfs[] = {
- "accept_local",
- "accept_redirects",
- "accept_source_route",
- "arp_accept",
- "arp_announce",
- "arp_filter",
- "arp_ignore",
- "arp_notify",
- "bootp_relay",
- "disable_policy",
- "disable_xfrm",
- "force_igmp_version",
- "forwarding",
- "igmpv2_unsolicited_report_interval",
- "igmpv3_unsolicited_report_interval",
- "log_martians",
- "medium_id",
- "promote_secondaries",
- "proxy_arp",
- "proxy_arp_pvlan",
- "route_localnet",
- "rp_filter",
- "secure_redirects",
- "send_redirects",
- "shared_media",
- "src_valid_mark",
- "tag",
- "ignore_routes_with_linkdown",
-};
-
-/*
- * I case if some entry is missing in
- * the kernel, simply write DEVCONFS_UNUSED
- * into the image so we would skip it.
- */
-#define DEVCONFS_UNUSED (-1u)
-
-#define NET_CONF_PATH "net/ipv4/conf"
-#define MAX_CONF_OPT_PATH IFNAMSIZ+50
-
-static int ipv4_conf_op(char *tgt, int *conf, int n, int op, NetnsEntry **netns)
-{
- int i, ri;
- int ret, flags = op == CTL_READ ? CTL_FLAGS_OPTIONAL : 0;
- struct sysctl_req req[ARRAY_SIZE(devconfs)];
- char path[ARRAY_SIZE(devconfs)][MAX_CONF_OPT_PATH];
-
- if (n > ARRAY_SIZE(devconfs))
- pr_warn("The image contains unknown sysctl-s\n");
-
- for (i = 0, ri = 0; i < ARRAY_SIZE(devconfs); i++) {
- if (i >= n) {
- pr_warn("Skip %s/%s\n", tgt, devconfs[i]);
- continue;
- }
- /*
- * If dev conf value is the same as default skip restoring it
- */
- if (netns && conf[i] == (*netns)->def_conf[i]) {
- pr_debug("DEBUG Skip %s/%s, val =%d\n", tgt, devconfs[i], conf[i]);
- continue;
- }
-
- if (op == CTL_WRITE && conf[i] == DEVCONFS_UNUSED)
- continue;
- else if (op == CTL_READ)
- conf[i] = DEVCONFS_UNUSED;
-
- snprintf(path[i], MAX_CONF_OPT_PATH, "%s/%s/%s", NET_CONF_PATH, tgt, devconfs[i]);
- req[ri].name = path[i];
- req[ri].arg = &conf[i];
- req[ri].type = CTL_32;
- req[ri].flags = flags;
- ri++;
- }
-
- ret = sysctl_op(req, ri, op, CLONE_NEWNET);
- if (ret < 0) {
- pr_err("Failed to %s %s/<confs>\n", (op == CTL_READ)?"read":"write", tgt);
- return -1;
- }
- return 0;
-}
-
-int write_netdev_img(NetDeviceEntry *nde, struct cr_imgset *fds)
-{
- return pb_write_one(img_from_set(fds, CR_FD_NETDEV), nde, PB_NETDEV);
-}
-
-static int dump_one_netdev(int type, struct ifinfomsg *ifi,
- struct rtattr **tb, struct cr_imgset *fds,
- int (*dump)(NetDeviceEntry *, struct cr_imgset *))
-{
- int ret;
- NetDeviceEntry netdev = NET_DEVICE_ENTRY__INIT;
-
- if (!tb[IFLA_IFNAME]) {
- pr_err("No name for link %d\n", ifi->ifi_index);
- return -1;
- }
-
- netdev.type = type;
- netdev.ifindex = ifi->ifi_index;
- netdev.mtu = *(int *)RTA_DATA(tb[IFLA_MTU]);
- netdev.flags = ifi->ifi_flags;
- netdev.name = RTA_DATA(tb[IFLA_IFNAME]);
-
- if (tb[IFLA_ADDRESS] && (type != ND_TYPE__LOOPBACK)) {
- netdev.has_address = true;
- netdev.address.data = RTA_DATA(tb[IFLA_ADDRESS]);
- netdev.address.len = RTA_PAYLOAD(tb[IFLA_ADDRESS]);
- pr_info("Found ll addr (%02x:../%d) for %s\n",
- (int)netdev.address.data[0],
- (int)netdev.address.len, netdev.name);
- }
-
- netdev.n_conf = ARRAY_SIZE(devconfs);
- netdev.conf = xmalloc(sizeof(int) * netdev.n_conf);
- if (!netdev.conf)
- return -1;
-
- ret = ipv4_conf_op(netdev.name, netdev.conf, netdev.n_conf, CTL_READ, NULL);
- if (ret < 0)
- goto err_free;
-
- if (!dump)
- dump = write_netdev_img;
-
- ret = dump(&netdev, fds);
-err_free:
- xfree(netdev.conf);
- return ret;
-}
-
-static char *link_kind(struct ifinfomsg *ifi, struct rtattr **tb)
-{
- struct rtattr *linkinfo[IFLA_INFO_MAX + 1];
-
- if (!tb[IFLA_LINKINFO]) {
- pr_err("No linkinfo for eth link %d\n", ifi->ifi_index);
- return NULL;
- }
-
- parse_rtattr_nested(linkinfo, IFLA_INFO_MAX, tb[IFLA_LINKINFO]);
- if (!linkinfo[IFLA_INFO_KIND]) {
- pr_err("No kind for eth link %d\n", ifi->ifi_index);
- return NULL;
- }
-
- return RTA_DATA(linkinfo[IFLA_INFO_KIND]);
-}
-
-static int dump_unknown_device(struct ifinfomsg *ifi, char *kind,
- struct rtattr **tb, struct cr_imgset *fds)
-{
- int ret;
-
- ret = run_plugins(DUMP_EXT_LINK, ifi->ifi_index, ifi->ifi_type, kind);
- if (ret == 0)
- return dump_one_netdev(ND_TYPE__EXTLINK, ifi, tb, fds, NULL);
-
- if (ret == -ENOTSUP)
- pr_err("Unsupported link %d (type %d kind %s)\n",
- ifi->ifi_index, ifi->ifi_type, kind);
- return -1;
-}
-
-static int dump_bridge(NetDeviceEntry *nde, struct cr_imgset *imgset)
-{
- char spath[IFNAMSIZ + 16]; /* len("class/net//brif") + 1 for null */
- int ret, fd;
-
- ret = snprintf(spath, sizeof(spath), "class/net/%s/brif", nde->name);
- if (ret < 0 || ret >= sizeof(spath))
- return -1;
-
- /* Let's only allow dumping empty bridges for now. To do a full bridge
- * restore, we need to make sure the bridge and slaves are restored in
- * the right order and attached correctly. It looks like the veth code
- * supports this, but we need some way to do ordering.
- */
- fd = openat(ns_sysfs_fd, spath, O_DIRECTORY, 0);
- if (fd < 0) {
- pr_perror("opening %s failed", spath);
- return -1;
- }
-
- ret = is_empty_dir(fd);
- close(fd);
- if (ret < 0) {
- pr_perror("problem testing %s for emptiness", spath);
- return -1;
- }
-
- if (!ret) {
- pr_err("dumping bridges with attached slaves not supported currently\n");
- return -1;
- }
-
- return write_netdev_img(nde, imgset);
-}
-
-static int dump_one_ethernet(struct ifinfomsg *ifi, char *kind,
- struct rtattr **tb, struct cr_imgset *fds)
-{
- if (!strcmp(kind, "veth"))
- /*
- * This is not correct. The peer of the veth device may
- * be either outside or inside the netns we're working
- * on, but there's currently no way of finding this out.
- *
- * Sigh... we have to assume, that the veth device is a
- * connection to the outer world and just dump this end :(
- */
- return dump_one_netdev(ND_TYPE__VETH, ifi, tb, fds, NULL);
- if (!strcmp(kind, "tun"))
- return dump_one_netdev(ND_TYPE__TUN, ifi, tb, fds, dump_tun_link);
- if (!strcmp(kind, "bridge"))
- return dump_one_netdev(ND_TYPE__BRIDGE, ifi, tb, fds, dump_bridge);
-
- return dump_unknown_device(ifi, kind, tb, fds);
-}
-
-static int dump_one_gendev(struct ifinfomsg *ifi, char *kind,
- struct rtattr **tb, struct cr_imgset *fds)
-{
- if (!strcmp(kind, "tun"))
- return dump_one_netdev(ND_TYPE__TUN, ifi, tb, fds, dump_tun_link);
-
- return dump_unknown_device(ifi, kind, tb, fds);
-}
-
-static int dump_one_voiddev(struct ifinfomsg *ifi, char *kind,
- struct rtattr **tb, struct cr_imgset *fds)
-{
- if (!strcmp(kind, "venet"))
- return dump_one_netdev(ND_TYPE__VENET, ifi, tb, fds, NULL);
-
- return dump_unknown_device(ifi, kind, tb, fds);
-}
-
-static int dump_one_link(struct nlmsghdr *hdr, void *arg)
-{
- struct cr_imgset *fds = arg;
- struct ifinfomsg *ifi;
- int ret = 0, len = hdr->nlmsg_len - NLMSG_LENGTH(sizeof(*ifi));
- struct rtattr *tb[IFLA_MAX + 1];
- char *kind;
-
- ifi = NLMSG_DATA(hdr);
-
- if (len < 0) {
- pr_err("No iflas for link %d\n", ifi->ifi_index);
- return -1;
- }
-
- parse_rtattr(tb, IFLA_MAX, IFLA_RTA(ifi), len);
- pr_info("\tLD: Got link %d, type %d\n", ifi->ifi_index, ifi->ifi_type);
-
- if (ifi->ifi_type == ARPHRD_LOOPBACK)
- return dump_one_netdev(ND_TYPE__LOOPBACK, ifi, tb, fds, NULL);
-
- kind = link_kind(ifi, tb);
- if (!kind)
- goto unk;
-
- switch (ifi->ifi_type) {
- case ARPHRD_ETHER:
- ret = dump_one_ethernet(ifi, kind, tb, fds);
- break;
- case ARPHRD_NONE:
- ret = dump_one_gendev(ifi, kind, tb, fds);
- break;
- case ARPHRD_VOID:
- ret = dump_one_voiddev(ifi, kind, tb, fds);
- break;
- default:
-unk:
- ret = dump_unknown_device(ifi, kind, tb, fds);
- break;
- }
-
- return ret;
-}
-
-static int dump_one_nf(struct nlmsghdr *hdr, void *arg)
-{
- struct cr_img *img = arg;
-
- if (lazy_image(img) && open_image_lazy(img))
- return -1;
-
- if (write_img_buf(img, hdr, hdr->nlmsg_len))
- return -1;
-
- return 0;
-}
-
-static int ct_restore_callback(struct nlmsghdr *nlh)
-{
- struct nfgenmsg *msg;
- struct nlattr *tb[CTA_MAX+1], *tbp[CTA_PROTOINFO_MAX + 1], *tb_tcp[CTA_PROTOINFO_TCP_MAX+1];
- int err;
-
- msg = NLMSG_DATA(nlh);
-
- if (msg->nfgen_family != AF_INET && msg->nfgen_family != AF_INET6)
- return 0;
-
- err = nlmsg_parse(nlh, sizeof(struct nfgenmsg), tb, CTA_MAX, NULL);
- if (err < 0)
- return -1;
-
- if (!tb[CTA_PROTOINFO])
- return 0;
-
- err = nla_parse_nested(tbp, CTA_PROTOINFO_MAX, tb[CTA_PROTOINFO], NULL);
- if (err < 0)
- return -1;
-
- if (!tbp[CTA_PROTOINFO_TCP])
- return 0;
-
- err = nla_parse_nested(tb_tcp, CTA_PROTOINFO_TCP_MAX, tbp[CTA_PROTOINFO_TCP], NULL);
- if (err < 0)
- return -1;
-
- if (tb_tcp[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]) {
- struct nf_ct_tcp_flags *flags;
-
- flags = nla_data(tb_tcp[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]);
- flags->flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
- flags->mask |= IP_CT_TCP_FLAG_BE_LIBERAL;
- }
-
- if (tb_tcp[CTA_PROTOINFO_TCP_FLAGS_REPLY]) {
- struct nf_ct_tcp_flags *flags;
-
- flags = nla_data(tb_tcp[CTA_PROTOINFO_TCP_FLAGS_REPLY]);
- flags->flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
- flags->mask |= IP_CT_TCP_FLAG_BE_LIBERAL;
- }
-
- return 0;
-}
-
-static int restore_nf_ct(int pid, int type)
-{
- struct nlmsghdr *nlh = NULL;
- int exit_code = -1, sk;
- struct cr_img *img;
-
- img = open_image(type, O_RSTR, pid);
- if (empty_image(img)) {
- close_image(img);
- return 0;
- }
-
- sk = socket(AF_NETLINK, SOCK_RAW, NETLINK_NETFILTER);
- if (sk < 0) {
- pr_perror("Can't open rtnl sock for net dump");
- goto out_img;
- }
-
- nlh = xmalloc(sizeof(struct nlmsghdr));
- if (nlh == NULL)
- goto out;
-
- while (1) {
- struct nlmsghdr *p;
- int ret;
-
- ret = read_img_buf_eof(img, nlh, sizeof(struct nlmsghdr));
- if (ret < 0)
- goto out;
- if (ret == 0)
- break;
-
- p = xrealloc(nlh, nlh->nlmsg_len);
- if (p == NULL)
- goto out;
- nlh = p;
-
- ret = read_img_buf_eof(img, nlh + 1, nlh->nlmsg_len - sizeof(struct nlmsghdr));
- if (ret < 0)
- goto out;
- if (ret == 0) {
- pr_err("The image file was truncated\n");
- goto out;
- }
-
- if (type == CR_FD_NETNF_CT)
- if (ct_restore_callback(nlh))
- goto out;
-
- nlh->nlmsg_flags = NLM_F_REQUEST|NLM_F_ACK|NLM_F_CREATE;
- ret = do_rtnl_req(sk, nlh, nlh->nlmsg_len, NULL, NULL, NULL);
- if (ret)
- goto out;
- }
-
- exit_code = 0;
-out:
- xfree(nlh);
- close(sk);
-out_img:
- close_image(img);
- return exit_code;
-}
-
-static int dump_nf_ct(struct cr_imgset *fds, int type)
-{
- struct cr_img *img;
- struct {
- struct nlmsghdr nlh;
- struct nfgenmsg g;
- } req;
- int sk, ret;
-
- pr_info("Dumping netns links\n");
-
- ret = sk = socket(AF_NETLINK, SOCK_RAW, NETLINK_NETFILTER);
- if (sk < 0) {
- pr_perror("Can't open rtnl sock for net dump");
- goto out;
- }
-
- memset(&req, 0, sizeof(req));
- req.nlh.nlmsg_len = sizeof(req);
- req.nlh.nlmsg_type = (NFNL_SUBSYS_CTNETLINK << 8);
-
- if (type == CR_FD_NETNF_CT)
- req.nlh.nlmsg_type |= IPCTNL_MSG_CT_GET;
- else if (type == CR_FD_NETNF_EXP)
- req.nlh.nlmsg_type |= IPCTNL_MSG_EXP_GET;
- else
- BUG();
-
- req.nlh.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST;
- req.nlh.nlmsg_pid = 0;
- req.nlh.nlmsg_seq = CR_NLMSG_SEQ;
- req.g.nfgen_family = AF_UNSPEC;
-
- img = img_from_set(fds, type);
-
- ret = do_rtnl_req(sk, &req, sizeof(req), dump_one_nf, NULL, img);
- close(sk);
-out:
- return ret;
-
-}
-
-static int dump_links(struct cr_imgset *fds)
-{
- int sk, ret;
- struct {
- struct nlmsghdr nlh;
- struct rtgenmsg g;
- } req;
-
- pr_info("Dumping netns links\n");
-
- ret = sk = socket(PF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
- if (sk < 0) {
- pr_perror("Can't open rtnl sock for net dump");
- goto out;
- }
-
- memset(&req, 0, sizeof(req));
- req.nlh.nlmsg_len = sizeof(req);
- req.nlh.nlmsg_type = RTM_GETLINK;
- req.nlh.nlmsg_flags = NLM_F_ROOT|NLM_F_MATCH|NLM_F_REQUEST;
- req.nlh.nlmsg_pid = 0;
- req.nlh.nlmsg_seq = CR_NLMSG_SEQ;
- req.g.rtgen_family = AF_PACKET;
-
- ret = do_rtnl_req(sk, &req, sizeof(req), dump_one_link, NULL, fds);
- close(sk);
-out:
- return ret;
-}
-
-static int restore_link_cb(struct nlmsghdr *hdr, void *arg)
-{
- pr_info("Got response on SETLINK =)\n");
- return 0;
-}
-
-struct newlink_req {
- struct nlmsghdr h;
- struct ifinfomsg i;
- char buf[1024];
-};
-
-static int do_rtm_link_req(int msg_type, NetDeviceEntry *nde, int nlsk,
- int (*link_info)(NetDeviceEntry *, struct newlink_req *))
-{
- struct newlink_req req;
-
- memset(&req, 0, sizeof(req));
-
- req.h.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
- req.h.nlmsg_flags = NLM_F_REQUEST|NLM_F_ACK|NLM_F_CREATE;
- req.h.nlmsg_type = msg_type;
- req.h.nlmsg_seq = CR_NLMSG_SEQ;
- req.i.ifi_family = AF_PACKET;
- /*
- * SETLINK is called for external devices which may
- * have ifindex changed. Thus configure them by their
- * name only.
- */
- if (msg_type == RTM_NEWLINK)
- req.i.ifi_index = nde->ifindex;
- req.i.ifi_flags = nde->flags;
-
- addattr_l(&req.h, sizeof(req), IFLA_IFNAME, nde->name, strlen(nde->name));
- addattr_l(&req.h, sizeof(req), IFLA_MTU, &nde->mtu, sizeof(nde->mtu));
-
- if (nde->has_address) {
- pr_debug("Restore ll addr (%02x:../%d) for device\n",
- (int)nde->address.data[0], (int)nde->address.len);
- addattr_l(&req.h, sizeof(req), IFLA_ADDRESS,
- nde->address.data, nde->address.len);
- }
-
- if (link_info) {
- struct rtattr *linkinfo;
- int ret;
-
- linkinfo = NLMSG_TAIL(&req.h);
- addattr_l(&req.h, sizeof(req), IFLA_LINKINFO, NULL, 0);
-
- ret = link_info(nde, &req);
- if (ret < 0)
- return ret;
-
- linkinfo->rta_len = (void *)NLMSG_TAIL(&req.h) - (void *)linkinfo;
- }
-
- return do_rtnl_req(nlsk, &req, req.h.nlmsg_len, restore_link_cb, NULL, NULL);
-}
-
-int restore_link_parms(NetDeviceEntry *nde, int nlsk)
-{
- return do_rtm_link_req(RTM_SETLINK, nde, nlsk, NULL);
-}
-
-static int restore_one_link(NetDeviceEntry *nde, int nlsk,
- int (*link_info)(NetDeviceEntry *, struct newlink_req *))
-{
- pr_info("Restoring netdev %s idx %d\n", nde->name, nde->ifindex);
- return do_rtm_link_req(RTM_NEWLINK, nde, nlsk, link_info);
-}
-
-#ifndef VETH_INFO_MAX
-enum {
- VETH_INFO_UNSPEC,
- VETH_INFO_PEER,
-
- __VETH_INFO_MAX
-#define VETH_INFO_MAX (__VETH_INFO_MAX - 1)
-};
-#endif
-
-#if IFLA_MAX <= 28
-#define IFLA_NET_NS_FD 28
-#endif
-
-static int veth_link_info(NetDeviceEntry *nde, struct newlink_req *req)
-{
- int ns_fd = get_service_fd(NS_FD_OFF);
- struct rtattr *veth_data, *peer_data;
- struct ifinfomsg ifm;
- struct veth_pair *n;
-
- BUG_ON(ns_fd < 0);
-
- addattr_l(&req->h, sizeof(*req), IFLA_INFO_KIND, "veth", 4);
-
- veth_data = NLMSG_TAIL(&req->h);
- addattr_l(&req->h, sizeof(*req), IFLA_INFO_DATA, NULL, 0);
- peer_data = NLMSG_TAIL(&req->h);
- memset(&ifm, 0, sizeof(ifm));
- addattr_l(&req->h, sizeof(*req), VETH_INFO_PEER, &ifm, sizeof(ifm));
- list_for_each_entry(n, &opts.veth_pairs, node) {
- if (!strcmp(nde->name, n->inside))
- break;
- }
- if (&n->node != &opts.veth_pairs)
- addattr_l(&req->h, sizeof(*req), IFLA_IFNAME, n->outside, strlen(n->outside));
- addattr_l(&req->h, sizeof(*req), IFLA_NET_NS_FD, &ns_fd, sizeof(ns_fd));
- peer_data->rta_len = (void *)NLMSG_TAIL(&req->h) - (void *)peer_data;
- veth_data->rta_len = (void *)NLMSG_TAIL(&req->h) - (void *)veth_data;
-
- return 0;
-}
-
-static int venet_link_info(NetDeviceEntry *nde, struct newlink_req *req)
-{
- int ns_fd = get_service_fd(NS_FD_OFF);
- struct rtattr *venet_data;
-
- BUG_ON(ns_fd < 0);
-
- venet_data = NLMSG_TAIL(&req->h);
- addattr_l(&req->h, sizeof(*req), IFLA_INFO_KIND, "venet", 5);
- addattr_l(&req->h, sizeof(*req), IFLA_INFO_DATA, NULL, 0);
- addattr_l(&req->h, sizeof(*req), IFLA_NET_NS_FD, &ns_fd, sizeof(ns_fd));
- venet_data->rta_len = (void *)NLMSG_TAIL(&req->h) - (void *)venet_data;
-
- return 0;
-}
-
-static int bridge_link_info(NetDeviceEntry *nde, struct newlink_req *req)
-{
- struct rtattr *bridge_data;
-
- bridge_data = NLMSG_TAIL(&req->h);
- addattr_l(&req->h, sizeof(*req), IFLA_INFO_KIND, "bridge", sizeof("bridge"));
- bridge_data->rta_len = (void *)NLMSG_TAIL(&req->h) - (void *)bridge_data;
-
- return 0;
-}
-
-static int restore_link(NetDeviceEntry *nde, int nlsk)
-{
- pr_info("Restoring link %s type %d\n", nde->name, nde->type);
-
- switch (nde->type) {
- case ND_TYPE__LOOPBACK: /* fallthrough */
- case ND_TYPE__EXTLINK: /* see comment in protobuf/netdev.proto */
- return restore_link_parms(nde, nlsk);
- case ND_TYPE__VENET:
- return restore_one_link(nde, nlsk, venet_link_info);
- case ND_TYPE__VETH:
- return restore_one_link(nde, nlsk, veth_link_info);
- case ND_TYPE__TUN:
- return restore_one_tun(nde, nlsk);
- case ND_TYPE__BRIDGE:
- return restore_one_link(nde, nlsk, bridge_link_info);
-
- default:
- pr_err("Unsupported link type %d\n", nde->type);
- break;
- }
-
- return -1;
-}
-
-static int restore_links(int pid, NetnsEntry **netns)
-{
- int nlsk, ret;
- struct cr_img *img;
- NetDeviceEntry *nde;
-
- img = open_image(CR_FD_NETDEV, O_RSTR, pid);
- if (!img)
- return -1;
-
- nlsk = socket(PF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
- if (nlsk < 0) {
- pr_perror("Can't create nlk socket");
- close_image(img);
- return -1;
- }
-
- while (1) {
- ret = pb_read_one_eof(img, &nde, PB_NETDEV);
- if (ret <= 0)
- break;
-
- ret = restore_link(nde, nlsk);
- if (ret) {
- pr_err("Can't restore link\n");
- goto exit;
- }
-
- if (nde->conf) {
- NetnsEntry **def_netns = netns;
- /*
- * optimize restore of devices configuration except lo
- * lo is created with namespace and before default is set
- * so we cant optimize its restore
- */
- if (nde->type == ND_TYPE__LOOPBACK)
- def_netns = NULL;
- ret = ipv4_conf_op(nde->name, nde->conf, nde->n_conf, CTL_WRITE, def_netns);
- }
-exit:
- net_device_entry__free_unpacked(nde, NULL);
- if (ret)
- break;
- }
-
- close(nlsk);
- close_image(img);
- return ret;
-}
-
-static int run_ip_tool(char *arg1, char *arg2, char *arg3, int fdin, int fdout, unsigned flags)
-{
- char *ip_tool_cmd;
- int ret;
-
- pr_debug("\tRunning ip %s %s\n", arg1, arg2);
-
- ip_tool_cmd = getenv("CR_IP_TOOL");
- if (!ip_tool_cmd)
- ip_tool_cmd = "ip";
-
- ret = cr_system(fdin, fdout, -1, ip_tool_cmd,
- (char *[]) { "ip", arg1, arg2, arg3, NULL }, flags);
- if (ret) {
- if (!(flags & CRS_CAN_FAIL))
- pr_err("IP tool failed on %s %s\n", arg1, arg2);
- return -1;
- }
-
- return 0;
-}
-
-static int run_iptables_tool(char *def_cmd, int fdin, int fdout)
-{
- int ret;
- char *cmd;
-
- cmd = getenv("CR_IPTABLES");
- if (!cmd)
- cmd = def_cmd;
- pr_debug("\tRunning %s for %s\n", cmd, def_cmd);
- ret = cr_system(fdin, fdout, -1, "sh", (char *[]) { "sh", "-c", cmd, NULL }, 0);
- if (ret)
- pr_err("%s failed\n", def_cmd);
-
- return ret;
-}
-
-static inline int dump_ifaddr(struct cr_imgset *fds)
-{
- struct cr_img *img = img_from_set(fds, CR_FD_IFADDR);
- return run_ip_tool("addr", "save", NULL, -1, img_raw_fd(img), 0);
-}
-
-static inline int dump_route(struct cr_imgset *fds)
-{
- struct cr_img *img;
-
- img = img_from_set(fds, CR_FD_ROUTE);
- if (run_ip_tool("route", "save", NULL, -1, img_raw_fd(img), 0))
- return -1;
-
- /* If ipv6 is disabled, "ip -6 route dump" dumps all routes */
- if (!kdat.ipv6)
- return 0;
-
- img = img_from_set(fds, CR_FD_ROUTE6);
- if (run_ip_tool("-6", "route", "save", -1, img_raw_fd(img), 0))
- return -1;
-
- return 0;
-}
-
-static inline int dump_rule(struct cr_imgset *fds)
-{
- struct cr_img *img;
- char *path;
-
- img = img_from_set(fds, CR_FD_RULE);
- path = xstrdup(img->path);
-
- if (!path)
- return -1;
-
- if (run_ip_tool("rule", "save", NULL, -1, img_raw_fd(img), CRS_CAN_FAIL)) {
- pr_warn("Check if \"ip rule save\" is supported!\n");
- unlinkat(get_service_fd(IMG_FD_OFF), path, 0);
- }
-
- free(path);
-
- return 0;
-}
-
-static inline int dump_iptables(struct cr_imgset *fds)
-{
- struct cr_img *img;
-
- img = img_from_set(fds, CR_FD_IPTABLES);
- if (run_iptables_tool("iptables-save", -1, img_raw_fd(img)))
- return -1;
-
- if (kdat.ipv6) {
- img = img_from_set(fds, CR_FD_IP6TABLES);
- if (run_iptables_tool("ip6tables-save", -1, img_raw_fd(img)))
- return -1;
- }
-
- return 0;
-}
-
-static int dump_netns_conf(struct cr_imgset *fds)
-{
- int ret, n;
- NetnsEntry netns = NETNS_ENTRY__INIT;
-
- netns.n_def_conf = ARRAY_SIZE(devconfs);
- netns.n_all_conf = ARRAY_SIZE(devconfs);
- netns.def_conf = xmalloc(sizeof(int) * netns.n_def_conf);
- if (!netns.def_conf)
- return -1;
- netns.all_conf = xmalloc(sizeof(int) * netns.n_all_conf);
- if (!netns.all_conf) {
- xfree(netns.def_conf);
- return -1;
- }
-
- n = netns.n_def_conf;
- ret = ipv4_conf_op("default", netns.def_conf, n, CTL_READ, NULL);
- if (ret < 0)
- goto err_free;
- ret = ipv4_conf_op("all", netns.all_conf, n, CTL_READ, NULL);
- if (ret < 0)
- goto err_free;
-
- ret = pb_write_one(img_from_set(fds, CR_FD_NETNS), &netns, PB_NETNS);
-err_free:
- xfree(netns.def_conf);
- xfree(netns.all_conf);
- return ret;
-}
-
-static int restore_ip_dump(int type, int pid, char *cmd)
-{
- int ret = -1;
- struct cr_img *img;
-
- img = open_image(type, O_RSTR, pid);
- if (empty_image(img)) {
- close_image(img);
- return 0;
- }
- if (img) {
- ret = run_ip_tool(cmd, "restore", NULL, img_raw_fd(img), -1, 0);
- close_image(img);
- }
-
- return ret;
-}
-
-static inline int restore_ifaddr(int pid)
-{
- return restore_ip_dump(CR_FD_IFADDR, pid, "addr");
-}
-
-static inline int restore_route(int pid)
-{
- if (restore_ip_dump(CR_FD_ROUTE, pid, "route"))
- return -1;
-
- if (restore_ip_dump(CR_FD_ROUTE6, pid, "route"))
- return -1;
-
- return 0;
-}
-
-static inline int restore_rule(int pid)
-{
- struct cr_img *img;
- int ret = 0;
-
- img = open_image(CR_FD_RULE, O_RSTR, pid);
- if (!img) {
- ret = -1;
- goto out;
- }
-
- if (empty_image(img))
- goto close;
-
- /*
- * Delete 3 default rules to prevent duplicates. See kernel's
- * function fib_default_rules_init() for the details.
- */
- run_ip_tool("rule", "delete", NULL, -1, -1, 0);
- run_ip_tool("rule", "delete", NULL, -1, -1, 0);
- run_ip_tool("rule", "delete", NULL, -1, -1, 0);
-
- if (restore_ip_dump(CR_FD_RULE, pid, "rule"))
- ret = -1;
-close:
- close_image(img);
-out:
- return ret;
-}
-
-static inline int restore_iptables(int pid)
-{
- int ret = -1;
- struct cr_img *img;
-
- img = open_image(CR_FD_IPTABLES, O_RSTR, pid);
- if (img) {
- ret = run_iptables_tool("iptables-restore", img_raw_fd(img), -1);
- close_image(img);
- }
- if (ret)
- return ret;
-
- img = open_image(CR_FD_IP6TABLES, O_RSTR, pid);
- if (img == NULL)
- return -1;
- if (empty_image(img))
- goto out;
-
- ret = run_iptables_tool("ip6tables-restore", img_raw_fd(img), -1);
-out:
- close_image(img);
-
- return ret;
-}
-
-static int restore_netns_conf(int pid, NetnsEntry **netns)
-{
- int ret = 0, n;
- struct cr_img *img;
-
- img = open_image(CR_FD_NETNS, O_RSTR, pid);
- if (!img)
- return -1;
-
- if (empty_image(img))
- /* Backward compatibility */
- goto out;
-
- ret = pb_read_one(img, netns, PB_NETNS);
- if (ret < 0) {
- pr_err("Can not read netns object\n");
- return -1;
- }
-
- n = (*netns)->n_def_conf;
- ret = ipv4_conf_op("default", (*netns)->def_conf, n, CTL_WRITE, NULL);
- if (ret)
- goto out;
- ret = ipv4_conf_op("all", (*netns)->all_conf, n, CTL_WRITE, NULL);
-out:
- close_image(img);
- return ret;
-}
-
-static int mount_ns_sysfs(void)
-{
- char sys_mount[] = "crtools-sys.XXXXXX";
-
- BUG_ON(ns_sysfs_fd != -1);
-
- /*
- * A new mntns is required to avoid the race between
- * open_detach_mount and creating mntns.
- */
- if (unshare(CLONE_NEWNS)) {
- pr_perror("Can't create new mount namespace");
- return -1;
- }
-
- if (mount(NULL, "/", NULL, MS_PRIVATE | MS_REC, NULL)) {
- pr_perror("Can't mark the root mount as private");
- return -1;
- }
-
- if (mkdtemp(sys_mount) == NULL) {
- pr_perror("mkdtemp failed %s", sys_mount);
- return -1;
- }
-
- /*
- * The setns() is called, so we're in proper context,
- * no need in pulling the mountpoint from parasite.
- */
- pr_info("Mount ns' sysfs in %s\n", sys_mount);
- if (mount("sysfs", sys_mount, "sysfs", MS_MGC_VAL, NULL)) {
- pr_perror("mount failed");
- rmdir(sys_mount);
- return -1;
- }
-
- ns_sysfs_fd = open_detach_mount(sys_mount);
- return ns_sysfs_fd >= 0 ? 0 : -1;
-}
-
-int dump_net_ns(int ns_id)
-{
- struct cr_imgset *fds;
- int ret;
-
- fds = cr_imgset_open(ns_id, NETNS, O_DUMP);
- if (fds == NULL)
- return -1;
-
- ret = mount_ns_sysfs();
- if (!ret)
- ret = dump_netns_conf(fds);
- if (!ret)
- ret = dump_links(fds);
- if (!ret)
- ret = dump_ifaddr(fds);
- if (!ret)
- ret = dump_route(fds);
- if (!ret)
- ret = dump_rule(fds);
- if (!ret)
- ret = dump_iptables(fds);
- if (!ret)
- ret = dump_nf_ct(fds, CR_FD_NETNF_CT);
- if (!ret)
- ret = dump_nf_ct(fds, CR_FD_NETNF_EXP);
-
- close(ns_sysfs_fd);
- ns_sysfs_fd = -1;
-
- close_cr_imgset(&fds);
- return ret;
-}
-
-int prepare_net_ns(int pid)
-{
- int ret;
- NetnsEntry *netns = NULL;
-
- ret = restore_netns_conf(pid, &netns);
- if (!ret)
- ret = restore_links(pid, &netns);
- if (netns)
- netns_entry__free_unpacked(netns, NULL);
-
- if (!ret)
- ret = restore_ifaddr(pid);
- if (!ret)
- ret = restore_route(pid);
- if (!ret)
- ret = restore_rule(pid);
- if (!ret)
- ret = restore_iptables(pid);
- if (!ret)
- ret = restore_nf_ct(pid, CR_FD_NETNF_CT);
- if (!ret)
- ret = restore_nf_ct(pid, CR_FD_NETNF_EXP);
-
- close_service_fd(NS_FD_OFF);
-
- return ret;
-}
-
-int netns_keep_nsfd(void)
-{
- int ns_fd, ret;
-
- if (!(root_ns_mask & CLONE_NEWNET))
- return 0;
-
- /*
- * When restoring a net namespace we need to communicate
- * with the original (i.e. -- init) one. Thus, prepare for
- * that before we leave the existing namespaces.
- */
-
- ns_fd = open("/proc/self/ns/net", O_RDONLY | O_CLOEXEC);
- if (ns_fd < 0) {
- pr_perror("Can't cache net fd");
- return -1;
- }
-
- ret = install_service_fd(NS_FD_OFF, ns_fd);
- if (ret < 0)
- pr_err("Can't install ns net reference\n");
- else
- pr_info("Saved netns fd for links restore\n");
- close(ns_fd);
-
- return ret >= 0 ? 0 : -1;
-}
-
-/*
- * If we want to modify iptables, we need to recevied the current
- * configuration, change it and load a new one into the kernel.
- * iptables can change or add only one rule.
- * iptables-restore allows to make a few changes for one iteration,
- * so it works faster.
- */
-static int iptables_restore(bool ipv6, char *buf, int size)
-{
- int pfd[2], ret = -1;
- char *cmd4[] = {"iptables-restore", "--noflush", NULL};
- char *cmd6[] = {"ip6tables-restore", "--noflush", NULL};
- char **cmd = ipv6 ? cmd6 : cmd4;;
-
- if (pipe(pfd) < 0) {
- pr_perror("Unable to create pipe");
- return -1;
- }
-
- if (write(pfd[1], buf, size) < size) {
- pr_perror("Unable to write iptables configugration");
- goto err;
- }
- close_safe(&pfd[1]);
-
- ret = cr_system(pfd[0], -1, -1, cmd[0], cmd, 0);
-err:
- close_safe(&pfd[1]);
- close_safe(&pfd[0]);
- return ret;
-}
-
-static int network_lock_internal()
-{
- char conf[] = "*filter\n"
- ":CRIU - [0:0]\n"
- "-I INPUT -j CRIU\n"
- "-I OUTPUT -j CRIU\n"
- "-A CRIU -j DROP\n"
- "COMMIT\n";
- int ret = 0, nsret;
-
- if (switch_ns(root_item->pid.real, &net_ns_desc, &nsret))
- return -1;
-
-
- ret |= iptables_restore(false, conf, sizeof(conf) - 1);
- if (kdat.ipv6)
- ret |= iptables_restore(true, conf, sizeof(conf) - 1);
-
- if (restore_ns(nsret, &net_ns_desc))
- ret = -1;
-
- return ret;
-}
-
-static int network_unlock_internal()
-{
- char conf[] = "*filter\n"
- ":CRIU - [0:0]\n"
- "-D INPUT -j CRIU\n"
- "-D OUTPUT -j CRIU\n"
- "-X CRIU\n"
- "COMMIT\n";
- int ret = 0, nsret;
-
- if (switch_ns(root_item->pid.real, &net_ns_desc, &nsret))
- return -1;
-
-
- ret |= iptables_restore(false, conf, sizeof(conf) - 1);
- if (kdat.ipv6)
- ret |= iptables_restore(true, conf, sizeof(conf) - 1);
-
- if (restore_ns(nsret, &net_ns_desc))
- ret = -1;
-
- return ret;
-}
-
-int network_lock(void)
-{
- pr_info("Lock network\n");
-
- /* Each connection will be locked on dump */
- if (!(root_ns_mask & CLONE_NEWNET))
- return 0;
-
- if (run_scripts(ACT_NET_LOCK))
- return -1;
-
- return network_lock_internal();
-}
-
-void network_unlock(void)
-{
- pr_info("Unlock network\n");
-
- cpt_unlock_tcp_connections();
- rst_unlock_tcp_connections();
-
- if (root_ns_mask & CLONE_NEWNET) {
- run_scripts(ACT_NET_UNLOCK);
- network_unlock_internal();
- }
-}
-
-int veth_pair_add(char *in, char *out)
-{
- char *aux;
- struct veth_pair *n;
-
- n = xmalloc(sizeof(*n));
- if (n == NULL)
- return -1;
-
- n->inside = in;
- n->outside = out;
- /*
- * Does the out string specify a bridge for
- * moving the outside end of the veth pair to?
- */
- aux = strrchr(out, '@');
- if (aux) {
- *aux++ = '\0';
- n->bridge = aux;
- } else {
- n->bridge = NULL;
- }
-
- list_add(&n->node, &opts.veth_pairs);
- if (n->bridge)
- pr_debug("Added %s:%s@%s veth map\n", in, out, aux);
- else
- pr_debug("Added %s:%s veth map\n", in, out);
- return 0;
-}
-
-/*
- * The setns() syscall (called by switch_ns()) can be extremely
- * slow. If we call it two or more times from the same task the
- * kernel will synchonously go on a very slow routine called
- * synchronize_rcu() trying to put a reference on old namespaces.
- *
- * To avoid doing this more than once we pre-create all the
- * needed other-ns sockets in advance.
- */
-
-static int prep_ns_sockets(struct ns_id *ns, bool for_dump)
-{
- int nsret = -1, ret;
-
- if (ns->type != NS_CRIU) {
- pr_info("Switching to %d's net for collecting sockets\n", ns->ns_pid);
- if (switch_ns(ns->ns_pid, &net_ns_desc, &nsret))
- return -1;
- }
-
- if (for_dump) {
- ret = ns->net.nlsk = socket(PF_NETLINK, SOCK_RAW, NETLINK_SOCK_DIAG);
- if (ret < 0) {
- pr_perror("Can't create sock diag socket");
- goto err_nl;
- }
- } else
- ns->net.nlsk = -1;
-
- ret = ns->net.seqsk = socket(PF_UNIX, SOCK_SEQPACKET, 0);
- if (ret < 0) {
- pr_perror("Can't create seqsk for parasite");
- goto err_sq;
- }
-
- ret = 0;
-out:
- if (nsret >= 0 && restore_ns(nsret, &net_ns_desc) < 0) {
- nsret = -1;
- if (ret == 0)
- goto err_ret;
- }
-
- return ret;
-
-err_ret:
- close(ns->net.seqsk);
-err_sq:
- if (ns->net.nlsk >= 0)
- close(ns->net.nlsk);
-err_nl:
- goto out;
-}
-
-static int collect_net_ns(struct ns_id *ns, void *oarg)
-{
- bool for_dump = (oarg == (void *)1);
- int ret;
-
- pr_info("Collecting netns %d/%d\n", ns->id, ns->ns_pid);
- ret = prep_ns_sockets(ns, for_dump);
- if (ret)
- return ret;
-
- if (!for_dump)
- return 0;
-
- return collect_sockets(ns);
-}
-
-int collect_net_namespaces(bool for_dump)
-{
- return walk_namespaces(&net_ns_desc, collect_net_ns,
- (void *)(for_dump ? 1UL : 0));
-}
-
-struct ns_desc net_ns_desc = NS_DESC_ENTRY(CLONE_NEWNET, "net");
-
-int move_veth_to_bridge(void)
-{
- int s;
- int ret;
- struct veth_pair *n;
- struct ifreq ifr;
-
- s = -1;
- ret = 0;
- list_for_each_entry(n, &opts.veth_pairs, node) {
- if (n->bridge == NULL)
- continue;
-
- pr_debug("\tMoving dev %s to bridge %s\n", n->outside, n->bridge);
-
- if (s == -1) {
- s = socket(AF_LOCAL, SOCK_STREAM|SOCK_CLOEXEC, 0);
- if (s < 0) {
- pr_perror("Can't create control socket");
- return -1;
- }
- }
-
- /*
- * Add the device to the bridge. This is equivalent to:
- * $ brctl addif <bridge> <device>
- */
- ifr.ifr_ifindex = if_nametoindex(n->outside);
- if (ifr.ifr_ifindex == 0) {
- pr_perror("Can't get index of %s", n->outside);
- ret = -1;
- break;
- }
- strlcpy(ifr.ifr_name, n->bridge, IFNAMSIZ);
- ret = ioctl(s, SIOCBRADDIF, &ifr);
- if (ret < 0) {
- pr_perror("Can't add interface %s to bridge %s",
- n->outside, n->bridge);
- break;
- }
-
- /*
- * Make sure the device is up. This is equivalent to:
- * $ ip link set dev <device> up
- */
- ifr.ifr_ifindex = 0;
- strlcpy(ifr.ifr_name, n->outside, IFNAMSIZ);
- ret = ioctl(s, SIOCGIFFLAGS, &ifr);
- if (ret < 0) {
- pr_perror("Can't get flags of interface %s", n->outside);
- break;
- }
- if (ifr.ifr_flags & IFF_UP)
- continue;
- ifr.ifr_flags |= IFF_UP;
- ret = ioctl(s, SIOCSIFFLAGS, &ifr);
- if (ret < 0) {
- pr_perror("Can't set flags of interface %s to 0x%x",
- n->outside, ifr.ifr_flags);
- break;
- }
- }
-
- if (s >= 0)
- close(s);
- return ret;
-}
diff --git a/netfilter.c b/netfilter.c
deleted file mode 100644
index 95e18aa97451..000000000000
--- a/netfilter.c
+++ /dev/null
@@ -1,124 +0,0 @@
-#include <sys/socket.h>
-#include <arpa/inet.h>
-#include <unistd.h>
-#include <string.h>
-#include <wait.h>
-#include <stdlib.h>
-
-#include "asm/types.h"
-#include "util.h"
-#include "list.h"
-#include "files.h"
-#include "netfilter.h"
-#include "sockets.h"
-#include "sk-inet.h"
-
-static char buf[512];
-
-/*
- * Need to configure simple netfilter rules for blocking connections
- * ANy brave soul to write it using xtables-devel?
- */
-
-static const char *nf_conn_cmd = "%s -t filter %s %s --protocol tcp "
- "--source %s --sport %d --destination %s --dport %d -j DROP";
-
-static char iptable_cmd_ipv4[] = "iptables";
-static char iptable_cmd_ipv6[] = "ip6tables";
-
-static int nf_connection_switch_raw(int family, u32 *src_addr, u16 src_port,
- u32 *dst_addr, u16 dst_port,
- bool input, bool lock)
-{
- char sip[INET_ADDR_LEN], dip[INET_ADDR_LEN];
- char *cmd;
- char *argv[4] = { "sh", "-c", buf, NULL };
- int ret;
-
- switch (family) {
- case AF_INET:
- cmd = iptable_cmd_ipv4;
- break;
- case AF_INET6:
- cmd = iptable_cmd_ipv6;
- break;
- default:
- pr_err("Unknown socket family %d\n", family);
- return -1;
- };
-
- if (!inet_ntop(family, (void *)src_addr, sip, INET_ADDR_LEN) ||
- !inet_ntop(family, (void *)dst_addr, dip, INET_ADDR_LEN)) {
- pr_perror("nf: Can't translate ip addr");
- return -1;
- }
-
- snprintf(buf, sizeof(buf), nf_conn_cmd, cmd,
- lock ? "-A" : "-D",
- input ? "INPUT" : "OUTPUT",
- dip, (int)dst_port, sip, (int)src_port);
-
- pr_debug("\tRunning iptables [%s]\n", buf);
-
- /*
- * cr_system is used here, because it blocks SIGCHLD before waiting
- * a child and the child can't be waited from SIGCHLD handler.
- */
- ret = cr_system(-1, -1, -1, "sh", argv, 0);
- if (ret < 0 || !WIFEXITED(ret) || WEXITSTATUS(ret)) {
- pr_perror("Iptables configuration failed");
- return -1;
- }
-
- pr_info("%s %s:%d - %s:%d connection\n", lock ? "Locked" : "Unlocked",
- sip, (int)src_port, dip, (int)dst_port);
- return 0;
-}
-
-static int nf_connection_switch(struct inet_sk_desc *sk, bool lock)
-{
- int ret = 0;
-
- ret = nf_connection_switch_raw(sk->sd.family,
- sk->src_addr, sk->src_port,
- sk->dst_addr, sk->dst_port, true, lock);
- if (ret)
- return -1;
-
- ret = nf_connection_switch_raw(sk->sd.family,
- sk->dst_addr, sk->dst_port,
- sk->src_addr, sk->src_port, false, lock);
- if (ret) /* rollback */
- nf_connection_switch_raw(sk->sd.family,
- sk->src_addr, sk->src_port,
- sk->dst_addr, sk->dst_port, true, !lock);
- return ret;
-}
-
-int nf_lock_connection(struct inet_sk_desc *sk)
-{
- return nf_connection_switch(sk, true);
-}
-
-int nf_unlock_connection(struct inet_sk_desc *sk)
-{
- return nf_connection_switch(sk, false);
-}
-
-int nf_unlock_connection_info(struct inet_sk_info *si)
-{
- int ret = 0;
-
- ret |= nf_connection_switch_raw(si->ie->family,
- si->ie->src_addr, si->ie->src_port,
- si->ie->dst_addr, si->ie->dst_port, true, false);
- ret |= nf_connection_switch_raw(si->ie->family,
- si->ie->dst_addr, si->ie->dst_port,
- si->ie->src_addr, si->ie->src_port, false, false);
- /*
- * rollback nothing in case of any error,
- * because nobody checks errors of this function
- */
-
- return ret;
-}
diff --git a/page-pipe.c b/page-pipe.c
deleted file mode 100644
index db58f6a59c8e..000000000000
--- a/page-pipe.c
+++ /dev/null
@@ -1,238 +0,0 @@
-#include <unistd.h>
-#include <fcntl.h>
-
-#undef LOG_PREFIX
-#define LOG_PREFIX "page-pipe: "
-
-#include "config.h"
-#include "util.h"
-#include "page-pipe.h"
-
-/* can existing iov accumulate the page? */
-static inline bool iov_grow_page(struct iovec *iov, unsigned long addr)
-{
- if ((unsigned long)iov->iov_base + iov->iov_len == addr) {
- iov->iov_len += PAGE_SIZE;
- return true;
- }
-
- return false;
-}
-
-static inline void iov_init(struct iovec *iov, unsigned long addr)
-{
- iov->iov_base = (void *)addr;
- iov->iov_len = PAGE_SIZE;
-}
-
-static int page_pipe_grow(struct page_pipe *pp)
-{
- struct page_pipe_buf *ppb;
-
- pr_debug("Will grow page pipe (iov off is %u)\n", pp->free_iov);
-
- if (!list_empty(&pp->free_bufs)) {
- ppb = list_first_entry(&pp->free_bufs, struct page_pipe_buf, l);
- list_move_tail(&ppb->l, &pp->bufs);
- goto out;
- }
-
- if (pp->chunk_mode && pp->nr_pipes == NR_PIPES_PER_CHUNK)
- return -EAGAIN;
-
- ppb = xmalloc(sizeof(*ppb));
- if (!ppb)
- return -1;
-
- if (pipe(ppb->p)) {
- xfree(ppb);
- pr_perror("Can't make pipe for page-pipe");
- return -1;
- }
-
- ppb->pipe_size = fcntl(ppb->p[0], F_GETPIPE_SZ, 0) / PAGE_SIZE;
- pp->nr_pipes++;
-
- list_add_tail(&ppb->l, &pp->bufs);
-out:
- ppb->pages_in = 0;
- ppb->nr_segs = 0;
- ppb->iov = &pp->iovs[pp->free_iov];
-
- return 0;
-}
-
-struct page_pipe *create_page_pipe(unsigned int nr_segs,
- struct iovec *iovs, bool chunk_mode)
-{
- struct page_pipe *pp;
-
- pr_debug("Create page pipe for %u segs\n", nr_segs);
-
- pp = xmalloc(sizeof(*pp));
- if (pp) {
- pp->nr_pipes = 0;
- INIT_LIST_HEAD(&pp->bufs);
- INIT_LIST_HEAD(&pp->free_bufs);
- pp->nr_iovs = nr_segs;
- pp->iovs = iovs;
- pp->free_iov = 0;
-
- pp->nr_holes = 0;
- pp->free_hole = 0;
- pp->holes = NULL;
-
- pp->chunk_mode = chunk_mode;
-
- if (page_pipe_grow(pp))
- return NULL;
- }
-
- return pp;
-}
-
-void destroy_page_pipe(struct page_pipe *pp)
-{
- struct page_pipe_buf *ppb, *n;
-
- pr_debug("Killing page pipe\n");
-
- list_splice(&pp->free_bufs, &pp->bufs);
- list_for_each_entry_safe(ppb, n, &pp->bufs, l) {
- close(ppb->p[0]);
- close(ppb->p[1]);
- xfree(ppb);
- }
-
- xfree(pp);
-}
-
-void page_pipe_reinit(struct page_pipe *pp)
-{
- struct page_pipe_buf *ppb, *n;
-
- BUG_ON(!pp->chunk_mode);
-
- pr_debug("Clean up page pipe\n");
-
- list_for_each_entry_safe(ppb, n, &pp->bufs, l)
- list_move(&ppb->l, &pp->free_bufs);
-
- pp->free_hole = 0;
-
- if (page_pipe_grow(pp))
- BUG(); /* It can't fail, because ppb is in free_bufs */
-}
-
-static inline int try_add_page_to(struct page_pipe *pp, struct page_pipe_buf *ppb,
- unsigned long addr)
-{
- if (ppb->pages_in == ppb->pipe_size) {
- unsigned long new_size = ppb->pipe_size << 1;
- int ret;
-
- if (new_size > PIPE_MAX_SIZE)
- return 1;
-
- ret = fcntl(ppb->p[0], F_SETPIPE_SZ, new_size * PAGE_SIZE);
- if (ret < 0)
- return 1; /* need to add another buf */
-
- ret /= PAGE_SIZE;
- BUG_ON(ret < ppb->pipe_size);
-
- pr_debug("Grow pipe %x -> %x\n", ppb->pipe_size, ret);
- ppb->pipe_size = ret;
- }
-
- if (ppb->nr_segs) {
- if (iov_grow_page(&ppb->iov[ppb->nr_segs - 1], addr))
- goto out;
-
- if (ppb->nr_segs == UIO_MAXIOV)
- /* XXX -- shrink pipe back? */
- return 1;
- }
-
- pr_debug("Add iov to page pipe (%u iovs, %u/%u total)\n",
- ppb->nr_segs, pp->free_iov, pp->nr_iovs);
- iov_init(&ppb->iov[ppb->nr_segs++], addr);
- pp->free_iov++;
- BUG_ON(pp->free_iov > pp->nr_iovs);
-out:
- ppb->pages_in++;
- return 0;
-}
-
-static inline int try_add_page(struct page_pipe *pp, unsigned long addr)
-{
- BUG_ON(list_empty(&pp->bufs));
- return try_add_page_to(pp, list_entry(pp->bufs.prev, struct page_pipe_buf, l), addr);
-}
-
-int page_pipe_add_page(struct page_pipe *pp, unsigned long addr)
-{
- int ret;
-
- ret = try_add_page(pp, addr);
- if (ret <= 0)
- return ret;
-
- ret = page_pipe_grow(pp);
- if (ret < 0)
- return ret;
-
- ret = try_add_page(pp, addr);
- BUG_ON(ret > 0);
- return ret;
-}
-
-#define PP_HOLES_BATCH 32
-
-int page_pipe_add_hole(struct page_pipe *pp, unsigned long addr)
-{
- if (pp->free_hole >= pp->nr_holes) {
- pp->holes = xrealloc(pp->holes,
- (pp->nr_holes + PP_HOLES_BATCH) * sizeof(struct iovec));
- if (!pp->holes)
- return -1;
-
- pp->nr_holes += PP_HOLES_BATCH;
- }
-
- if (pp->free_hole &&
- iov_grow_page(&pp->holes[pp->free_hole - 1], addr))
- goto out;
-
- iov_init(&pp->holes[pp->free_hole++], addr);
-out:
- return 0;
-}
-
-void debug_show_page_pipe(struct page_pipe *pp)
-{
- struct page_pipe_buf *ppb;
- int i;
- struct iovec *iov;
-
- if (pr_quelled(LOG_DEBUG))
- return;
-
- pr_debug("Page pipe:\n");
- pr_debug("* %u pipes %u/%u iovs:\n",
- pp->nr_pipes, pp->free_iov, pp->nr_iovs);
- list_for_each_entry(ppb, &pp->bufs, l) {
- pr_debug("\tbuf %u pages, %u iovs:\n",
- ppb->pages_in, ppb->nr_segs);
- for (i = 0; i < ppb->nr_segs; i++) {
- iov = &ppb->iov[i];
- pr_debug("\t\t%p %lu\n", iov->iov_base, iov->iov_len / PAGE_SIZE);
- }
- }
-
- pr_debug("* %u holes:\n", pp->free_hole);
- for (i = 0; i < pp->free_hole; i++) {
- iov = &pp->holes[i];
- pr_debug("\t%p %lu\n", iov->iov_base, iov->iov_len / PAGE_SIZE);
- }
-}
diff --git a/page-read.c b/page-read.c
deleted file mode 100644
index 28ecd5bdb2bb..000000000000
--- a/page-read.c
+++ /dev/null
@@ -1,360 +0,0 @@
-#include <fcntl.h>
-#include <stdio.h>
-#include <unistd.h>
-
-#include "image.h"
-#include "cr_options.h"
-#include "servicefd.h"
-#include "page-read.h"
-
-#include "protobuf.h"
-#include "protobuf/pagemap.pb-c.h"
-
-#ifndef SEEK_DATA
-#define SEEK_DATA 3
-#define SEEK_HOLE 4
-#endif
-
-static int get_page_vaddr(struct page_read *pr, struct iovec *iov)
-{
- int ret;
- u64 img_va;
-
- ret = read_img_eof(pr->pmi, &img_va);
- if (ret <= 0)
- return ret;
-
- iov->iov_base = (void *)decode_pointer(img_va);
- iov->iov_len = PAGE_SIZE;
-
- return 1;
-}
-
-static int read_page(struct page_read *pr, unsigned long vaddr, int nr, void *buf)
-{
- int ret;
-
- BUG_ON(nr != 1);
-
- ret = read(img_raw_fd(pr->pmi), buf, PAGE_SIZE);
- if (ret != PAGE_SIZE) {
- pr_err("Can't read mapping page %d\n", ret);
- return -1;
- }
-
- return 1;
-}
-
-void pagemap2iovec(PagemapEntry *pe, struct iovec *iov)
-{
- iov->iov_base = decode_pointer(pe->vaddr);
- iov->iov_len = pe->nr_pages * PAGE_SIZE;
-}
-
-void iovec2pagemap(struct iovec *iov, PagemapEntry *pe)
-{
- pe->vaddr = encode_pointer(iov->iov_base);
- pe->nr_pages = iov->iov_len / PAGE_SIZE;
-}
-
-static int get_pagemap(struct page_read *pr, struct iovec *iov)
-{
- int ret;
- PagemapEntry *pe;
-
- ret = pb_read_one_eof(pr->pmi, &pe, PB_PAGEMAP);
- if (ret <= 0)
- return ret;
-
- pagemap2iovec(pe, iov);
-
- pr->pe = pe;
- pr->cvaddr = (unsigned long)iov->iov_base;
-
- if (pe->in_parent && !pr->parent) {
- pr_err("No parent for snapshot pagemap\n");
- return -1;
- }
-
- return 1;
-}
-
-static void put_pagemap(struct page_read *pr)
-{
- pagemap_entry__free_unpacked(pr->pe, NULL);
-}
-
-static int read_pagemap_page(struct page_read *pr, unsigned long vaddr, int nr, void *buf);
-
-static void skip_pagemap_pages(struct page_read *pr, unsigned long len)
-{
- if (!len)
- return;
-
- pr_debug("\tpr%u Skip %lu bytes from page-dump\n", pr->id, len);
- if (!pr->pe->in_parent)
- lseek(img_raw_fd(pr->pi), len, SEEK_CUR);
- pr->cvaddr += len;
-}
-
-int seek_pagemap_page(struct page_read *pr, unsigned long vaddr, bool warn)
-{
- int ret;
- struct iovec iov;
-
- if (pr->pe)
- pagemap2iovec(pr->pe, &iov);
- else
- goto new_pagemap;
-
- while (1) {
- unsigned long iov_end;
-
- if (vaddr < pr->cvaddr) {
- if (warn)
- pr_err("Missing %lx in parent pagemap, current iov: base=%lx,len=%zu\n",
- vaddr, (unsigned long)iov.iov_base, iov.iov_len);
- return 0;
- }
- iov_end = (unsigned long)iov.iov_base + iov.iov_len;
-
- if (iov_end <= vaddr) {
- skip_pagemap_pages(pr, iov_end - pr->cvaddr);
- put_pagemap(pr);
-new_pagemap:
- ret = get_pagemap(pr, &iov);
- if (ret <= 0)
- return ret;
-
- continue;
- }
-
- skip_pagemap_pages(pr, vaddr - pr->cvaddr);
- return 1;
- }
-}
-
-static inline void pagemap_bound_check(PagemapEntry *pe, unsigned long vaddr, int nr)
-{
- if (vaddr < pe->vaddr || (vaddr - pe->vaddr) / PAGE_SIZE + nr > pe->nr_pages) {
- pr_err("Page read err %"PRIx64":%u vs %lx:%u\n",
- pe->vaddr, pe->nr_pages, vaddr, nr);
- BUG();
- }
-}
-
-static int read_pagemap_page(struct page_read *pr, unsigned long vaddr, int nr, void *buf)
-{
- int ret;
- unsigned long len = nr * PAGE_SIZE;
-
- pr_info("pr%u Read %lx %u pages\n", pr->id, vaddr, nr);
- pagemap_bound_check(pr->pe, vaddr, nr);
-
- if (pr->pe->in_parent) {
- struct page_read *ppr = pr->parent;
-
- /*
- * Parent pagemap at this point entry may be shorter
- * than the current vaddr:nr needs, so we have to
- * carefully 'split' the vaddr:nr into pieces and go
- * to parent page-read with the longest requests it
- * can handle.
- */
-
- do {
- int p_nr;
-
- pr_debug("\tpr%u Read from parent\n", pr->id);
- ret = seek_pagemap_page(ppr, vaddr, true);
- if (ret <= 0)
- return -1;
-
- /*
- * This is how many pages we have in the parent
- * page_read starting from vaddr. Go ahead and
- * read as much as we can.
- */
- p_nr = ppr->pe->nr_pages - (vaddr - ppr->pe->vaddr) / PAGE_SIZE;
- pr_info("\tparent has %u pages in\n", p_nr);
- if (p_nr > nr)
- p_nr = nr;
-
- ret = read_pagemap_page(ppr, vaddr, p_nr, buf);
- if (ret == -1)
- return ret;
-
- /*
- * OK, let's see how much data we have left and go
- * to parent page-read again for the next pagemap
- * entry.
- */
- nr -= p_nr;
- vaddr += p_nr * PAGE_SIZE;
- buf += p_nr * PAGE_SIZE;
- } while (nr);
- } else {
- int fd = img_raw_fd(pr->pi);
- off_t current_vaddr = lseek(fd, 0, SEEK_CUR);
-
- pr_debug("\tpr%u Read page from self %lx/%"PRIx64"\n", pr->id, pr->cvaddr, current_vaddr);
- ret = read(fd, buf, len);
- if (ret != len) {
- pr_perror("Can't read mapping page %d", ret);
- return -1;
- }
-
- if (opts.auto_dedup) {
- ret = punch_hole(pr, current_vaddr, len, false);
- if (ret == -1) {
- return -1;
- }
- }
- }
-
- pr->cvaddr += len;
-
- return 1;
-}
-
-static void close_page_read(struct page_read *pr)
-{
- int ret;
-
- if (pr->bunch.iov_len > 0) {
- ret = punch_hole(pr, 0, 0, true);
- if (ret == -1)
- return;
-
- pr->bunch.iov_len = 0;
- }
-
- if (pr->parent) {
- close_page_read(pr->parent);
- xfree(pr->parent);
- }
-
- close_image(pr->pmi);
- if (pr->pi)
- close_image(pr->pi);
-}
-
-static int try_open_parent(int dfd, int pid, struct page_read *pr, int pr_flags)
-{
- int pfd, ret;
- struct page_read *parent = NULL;
-
- pfd = openat(dfd, CR_PARENT_LINK, O_RDONLY);
- if (pfd < 0 && errno == ENOENT)
- goto out;
-
- parent = xmalloc(sizeof(*parent));
- if (!parent)
- goto err_cl;
-
- ret = open_page_read_at(pfd, pid, parent, pr_flags);
- if (ret < 0)
- goto err_free;
-
- if (!ret) {
- xfree(parent);
- parent = NULL;
- }
-
- close(pfd);
-out:
- pr->parent = parent;
- return 0;
-
-err_free:
- xfree(parent);
-err_cl:
- close(pfd);
- return -1;
-}
-
-int open_page_read_at(int dfd, int pid, struct page_read *pr, int pr_flags)
-{
- int flags, i_typ, i_typ_o;
- static unsigned ids = 1;
-
- if (opts.auto_dedup)
- pr_flags |= PR_MOD;
- if (pr_flags & PR_MOD)
- flags = O_RDWR;
- else
- flags = O_RSTR;
-
- switch (pr_flags & PR_TYPE_MASK) {
- case PR_TASK:
- i_typ = CR_FD_PAGEMAP;
- i_typ_o = CR_FD_PAGES_OLD;
- break;
- case PR_SHMEM:
- i_typ = CR_FD_SHMEM_PAGEMAP;
- i_typ_o = CR_FD_SHM_PAGES_OLD;
- break;
- default:
- BUG();
- return -1;
- }
-
- pr->pe = NULL;
- pr->parent = NULL;
- pr->bunch.iov_len = 0;
- pr->bunch.iov_base = NULL;
-
- pr->pmi = open_image_at(dfd, i_typ, O_RSTR, (long)pid);
- if (!pr->pmi)
- return -1;
-
- if (empty_image(pr->pmi)) {
- close_image(pr->pmi);
- goto open_old;
- }
-
- if ((i_typ != CR_FD_SHMEM_PAGEMAP) && try_open_parent(dfd, pid, pr, pr_flags)) {
- close_image(pr->pmi);
- return -1;
- }
-
- pr->pi = open_pages_image_at(dfd, flags, pr->pmi);
- if (!pr->pi) {
- close_page_read(pr);
- return -1;
- }
-
- pr->get_pagemap = get_pagemap;
- pr->put_pagemap = put_pagemap;
- pr->read_pages = read_pagemap_page;
- pr->close = close_page_read;
- pr->id = ids++;
-
- pr_debug("Opened page read %u (parent %u)\n",
- pr->id, pr->parent ? pr->parent->id : 0);
-
- return 1;
-
-open_old:
- pr->pmi = open_image_at(dfd, i_typ_o, flags, pid);
- if (!pr->pmi)
- return -1;
-
- if (empty_image(pr->pmi)) {
- close_image(pr->pmi);
- return 0;
- }
-
- pr->get_pagemap = get_page_vaddr;
- pr->put_pagemap = NULL;
- pr->read_pages = read_page;
- pr->pi = NULL;
- pr->close = close_page_read;
-
- return 1;
-}
-
-int open_page_read(int pid, struct page_read *pr, int pr_flags)
-{
- return open_page_read_at(get_service_fd(IMG_FD_OFF), pid, pr, pr_flags);
-}
diff --git a/page-xfer.c b/page-xfer.c
deleted file mode 100644
index eee8f5f17992..000000000000
--- a/page-xfer.c
+++ /dev/null
@@ -1,880 +0,0 @@
-#include <sys/socket.h>
-#include <netinet/in.h>
-#include <arpa/inet.h>
-#include <linux/falloc.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/types.h>
-#include <sys/wait.h>
-#include <sys/stat.h>
-
-#include "cr_options.h"
-#include "servicefd.h"
-#include "image.h"
-#include "page-xfer.h"
-#include "page-pipe.h"
-#include "util.h"
-#include "protobuf.h"
-#include "protobuf/pagemap.pb-c.h"
-
-struct page_server_iov {
- u32 cmd;
- u32 nr_pages;
- u64 vaddr;
- u64 dst_id;
-};
-
-static void psi2iovec(struct page_server_iov *ps, struct iovec *iov)
-{
- iov->iov_base = decode_pointer(ps->vaddr);
- iov->iov_len = ps->nr_pages * PAGE_SIZE;
-}
-
-static void iovec2psi(struct iovec *iov, struct page_server_iov *ps)
-{
- ps->vaddr = encode_pointer(iov->iov_base);
- ps->nr_pages = iov->iov_len / PAGE_SIZE;
-}
-
-static int open_page_local_xfer(struct page_xfer *xfer, int fd_type, long id);
-
-#define PS_IOV_ADD 1
-#define PS_IOV_HOLE 2
-#define PS_IOV_OPEN 3
-#define PS_IOV_OPEN2 4
-#define PS_IOV_PARENT 5
-
-#define PS_IOV_FLUSH 0x1023
-#define PS_IOV_FLUSH_N_CLOSE 0x1024
-
-#define PS_TYPE_BITS 8
-#define PS_TYPE_MASK ((1 << PS_TYPE_BITS) - 1)
-
-static inline u64 encode_pm_id(int type, long id)
-{
- return ((u64)id) << PS_TYPE_BITS | type;
-}
-
-static int decode_pm_type(u64 dst_id)
-{
- return dst_id & PS_TYPE_MASK;
-}
-
-static long decode_pm_id(u64 dst_id)
-{
- return (long)(dst_id >> PS_TYPE_BITS);
-}
-
-struct page_xfer_job {
- u64 dst_id;
- int p[2];
- unsigned pipe_size;
- struct page_xfer loc_xfer;
-};
-
-static struct page_xfer_job cxfer = {
- .dst_id = ~0,
-};
-
-static void page_server_close(void)
-{
- if (cxfer.dst_id != ~0)
- cxfer.loc_xfer.close(&cxfer.loc_xfer);
-}
-
-static void close_page_xfer(struct page_xfer *xfer);
-static int page_server_open(int sk, struct page_server_iov *pi)
-{
- int type;
- long id;
-
- type = decode_pm_type(pi->dst_id);
- id = decode_pm_id(pi->dst_id);
- pr_info("Opening %d/%ld\n", type, id);
-
- page_server_close();
-
- if (open_page_local_xfer(&cxfer.loc_xfer, type, id))
- return -1;
-
- cxfer.dst_id = pi->dst_id;
-
- if (sk >= 0) {
- char has_parent = !!cxfer.loc_xfer.parent;
-
- if (write(sk, &has_parent, 1) != 1) {
- pr_perror("Unable to send reponse");
- close_page_xfer(&cxfer.loc_xfer);
- return -1;
- }
- }
-
- return 0;
-}
-
-static int prep_loc_xfer(struct page_server_iov *pi)
-{
- if (cxfer.dst_id != pi->dst_id) {
- pr_warn("Deprecated IO w/o open\n");
- return page_server_open(-1, pi);
- } else
- return 0;
-}
-
-static int page_server_add(int sk, struct page_server_iov *pi)
-{
- size_t len;
- struct page_xfer *lxfer = &cxfer.loc_xfer;
- struct iovec iov;
-
- pr_debug("Adding %"PRIx64"/%u\n", pi->vaddr, pi->nr_pages);
-
- if (prep_loc_xfer(pi))
- return -1;
-
- psi2iovec(pi, &iov);
- if (lxfer->write_pagemap(lxfer, &iov))
- return -1;
-
- len = iov.iov_len;
- while (len > 0) {
- ssize_t chunk;
-
- chunk = len;
- if (chunk > cxfer.pipe_size)
- chunk = cxfer.pipe_size;
-
- chunk = splice(sk, NULL, cxfer.p[1], NULL, chunk, SPLICE_F_MOVE | SPLICE_F_NONBLOCK);
- if (chunk < 0) {
- pr_perror("Can't read from socket");
- return -1;
- }
-
- if (lxfer->write_pages(lxfer, cxfer.p[0], chunk))
- return -1;
-
- len -= chunk;
- }
-
- return 0;
-}
-
-static int page_server_hole(int sk, struct page_server_iov *pi)
-{
- struct page_xfer *lxfer = &cxfer.loc_xfer;
- struct iovec iov;
-
- pr_debug("Adding %"PRIx64"/%u hole\n", pi->vaddr, pi->nr_pages);
-
- if (prep_loc_xfer(pi))
- return -1;
-
- psi2iovec(pi, &iov);
- if (lxfer->write_hole(lxfer, &iov))
- return -1;
-
- return 0;
-}
-
-static int page_server_check_parent(int sk, struct page_server_iov *pi);
-
-static int page_server_serve(int sk)
-{
- int ret = -1;
- bool flushed = false;
-
- /*
- * This socket only accepts data except one thing -- it
- * writes back the has_parent bit from time to time, so
- * make it NODELAY all the time.
- */
- tcp_nodelay(sk, true);
-
- if (pipe(cxfer.p)) {
- pr_perror("Can't make pipe for xfer");
- close(sk);
- return -1;
- }
-
- cxfer.pipe_size = fcntl(cxfer.p[0], F_GETPIPE_SZ, 0);
- pr_debug("Created xfer pipe size %u\n", cxfer.pipe_size);
-
- while (1) {
- struct page_server_iov pi;
-
- ret = recv(sk, &pi, sizeof(pi), MSG_WAITALL);
- if (!ret)
- break;
-
- if (ret != sizeof(pi)) {
- pr_perror("Can't read pagemap from socket");
- ret = -1;
- break;
- }
-
- flushed = false;
-
- switch (pi.cmd) {
- case PS_IOV_OPEN:
- ret = page_server_open(-1, &pi);
- break;
- case PS_IOV_OPEN2:
- ret = page_server_open(sk, &pi);
- break;
- case PS_IOV_PARENT:
- ret = page_server_check_parent(sk, &pi);
- break;
- case PS_IOV_ADD:
- ret = page_server_add(sk, &pi);
- break;
- case PS_IOV_HOLE:
- ret = page_server_hole(sk, &pi);
- break;
- case PS_IOV_FLUSH:
- case PS_IOV_FLUSH_N_CLOSE:
- {
- int32_t status = 0;
-
- ret = 0;
-
- /*
- * An answer must be sent back to inform another side,
- * that all data were received
- */
- if (write(sk, &status, sizeof(status)) != sizeof(status)) {
- pr_perror("Can't send the final package");
- ret = -1;
- }
-
- flushed = true;
- break;
- }
- default:
- pr_err("Unknown command %u\n", pi.cmd);
- ret = -1;
- break;
- }
-
- if (ret || (pi.cmd == PS_IOV_FLUSH_N_CLOSE))
- break;
- }
-
- if (!ret && !flushed) {
- pr_err("The data were not flushed\n");
- ret = -1;
- }
-
- if (ret == 0 && opts.ps_socket == -1) {
- char c;
-
- /*
- * Wait when a remote side closes the connection
- * to avoid TIME_WAIT bucket
- */
-
- if (read(sk, &c, sizeof(c)) != 0) {
- pr_perror("Unexpected data");
- ret = -1;
- }
- }
-
- page_server_close();
- pr_info("Session over\n");
-
- close(sk);
- return ret;
-}
-
-static int get_sockaddr_in(struct sockaddr_in *addr)
-{
- memset(addr, 0, sizeof(*addr));
- addr->sin_family = AF_INET;
-
- if (!opts.addr)
- addr->sin_addr.s_addr = INADDR_ANY;
- else if (!inet_aton(opts.addr, &addr->sin_addr)) {
- pr_perror("Bad page server address");
- return -1;
- }
-
- addr->sin_port = opts.port;
- return 0;
-}
-
-int cr_page_server(bool daemon_mode, int cfd)
-{
- int sk = -1, ask = -1, ret;
- struct sockaddr_in saddr, caddr;
- socklen_t slen = sizeof(saddr);
- socklen_t clen = sizeof(caddr);
-
- up_page_ids_base();
-
- if (opts.ps_socket != -1) {
- ret = 0;
- ask = opts.ps_socket;
- pr_info("Re-using ps socket %d\n", ask);
- goto no_server;
- }
-
- pr_info("Starting page server on port %u\n", (int)ntohs(opts.port));
-
- sk = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP);
- if (sk < 0) {
- pr_perror("Can't init page server");
- return -1;
- }
-
- if (get_sockaddr_in(&saddr))
- goto out;
-
- if (bind(sk, (struct sockaddr *)&saddr, slen)) {
- pr_perror("Can't bind page server");
- goto out;
- }
-
- if (listen(sk, 1)) {
- pr_perror("Can't listen on page server socket");
- goto out;
- }
-
- /* Get socket port in case of autobind */
- if (opts.port == 0) {
- if (getsockname(sk, (struct sockaddr *)&saddr, &slen)) {
- pr_perror("Can't get page server name");
- goto out;
- }
-
- opts.port = ntohs(saddr.sin_port);
- pr_info("Using %u port\n", opts.port);
- }
-
-no_server:
- if (daemon_mode) {
- ret = cr_daemon(1, 0, &ask, cfd);
- if (ret == -1) {
- pr_err("Can't run in the background\n");
- goto out;
- }
- if (ret > 0) { /* parent task, daemon started */
- close_safe(&sk);
- if (opts.pidfile) {
- if (write_pidfile(ret) == -1) {
- pr_perror("Can't write pidfile");
- kill(ret, SIGKILL);
- waitpid(ret, NULL, 0);
- return -1;
- }
- }
-
- return ret;
- }
- }
-
- if (sk >= 0) {
- ret = ask = accept(sk, (struct sockaddr *)&caddr, &clen);
- if (ask < 0)
- pr_perror("Can't accept connection to server");
- else
- pr_info("Accepted connection from %s:%u\n",
- inet_ntoa(caddr.sin_addr),
- (int)ntohs(caddr.sin_port));
- close(sk);
- }
-
- if (ask >= 0)
- ret = page_server_serve(ask);
-
- if (daemon_mode)
- exit(ret);
-
- return ret;
-
-out:
- close(sk);
- return -1;
-}
-
-static int page_server_sk = -1;
-
-int connect_to_page_server(void)
-{
- struct sockaddr_in saddr;
-
- if (!opts.use_page_server)
- return 0;
-
- if (opts.ps_socket != -1) {
- page_server_sk = opts.ps_socket;
- pr_info("Re-using ps socket %d\n", page_server_sk);
- goto out;
- }
-
- pr_info("Connecting to server %s:%u\n",
- opts.addr, (int)ntohs(opts.port));
-
- page_server_sk = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP);
- if (page_server_sk < 0) {
- pr_perror("Can't create socket");
- return -1;
- }
-
- if (get_sockaddr_in(&saddr))
- return -1;
-
- if (connect(page_server_sk, (struct sockaddr *)&saddr, sizeof(saddr)) < 0) {
- pr_perror("Can't connect to server");
- return -1;
- }
-
-out:
- /*
- * CORK the socket at the very beginning. As per ANK
- * the corked by default socket with sporadic NODELAY-s
- * on urgent data is the smartest mode ever.
- */
- tcp_cork(page_server_sk, true);
- return 0;
-}
-
-int disconnect_from_page_server(void)
-{
- struct page_server_iov pi = { };
- int32_t status = -1;
- int ret = -1;
-
- if (!opts.use_page_server)
- return 0;
-
- if (page_server_sk == -1)
- return 0;
-
- pr_info("Disconnect from the page server %s:%u\n",
- opts.addr, (int)ntohs(opts.port));
-
- if (opts.ps_socket != -1)
- /*
- * The socket might not get closed (held by
- * the parent process) so we must order the
- * page-server to terminate itself.
- */
- pi.cmd = PS_IOV_FLUSH_N_CLOSE;
- else
- pi.cmd = PS_IOV_FLUSH;
-
- if (write(page_server_sk, &pi, sizeof(pi)) != sizeof(pi)) {
- pr_perror("Can't write the fini command to server");
- goto out;
- }
-
- if (read(page_server_sk, &status, sizeof(status)) != sizeof(status)) {
- pr_perror("The page server doesn't answer");
- goto out;
- }
-
- ret = 0;
-out:
- close_safe(&page_server_sk);
- return ret ? : status;
-}
-
-static int write_pagemap_to_server(struct page_xfer *xfer,
- struct iovec *iov)
-{
- struct page_server_iov pi;
-
- pi.cmd = PS_IOV_ADD;
- pi.dst_id = xfer->dst_id;
- iovec2psi(iov, &pi);
-
- if (write(xfer->sk, &pi, sizeof(pi)) != sizeof(pi)) {
- pr_perror("Can't write pagemap to server");
- return -1;
- }
-
- return 0;
-}
-
-static int write_pages_to_server(struct page_xfer *xfer,
- int p, unsigned long len)
-{
- pr_debug("Splicing %lu bytes / %lu pages into socket\n", len, len / PAGE_SIZE);
-
- if (splice(p, NULL, xfer->sk, NULL, len, SPLICE_F_MOVE) != len) {
- pr_perror("Can't write pages to socket");
- return -1;
- }
-
- return 0;
-}
-
-static int write_hole_to_server(struct page_xfer *xfer, struct iovec *iov)
-{
- struct page_server_iov pi;
-
- pi.cmd = PS_IOV_HOLE;
- pi.dst_id = xfer->dst_id;
- iovec2psi(iov, &pi);
-
- if (write(xfer->sk, &pi, sizeof(pi)) != sizeof(pi)) {
- pr_perror("Can't write pagehole to server");
- return -1;
- }
-
- return 0;
-}
-
-static void close_server_xfer(struct page_xfer *xfer)
-{
- xfer->sk = -1;
-}
-
-static int open_page_server_xfer(struct page_xfer *xfer, int fd_type, long id)
-{
- struct page_server_iov pi;
- char has_parent;
-
- xfer->sk = page_server_sk;
- xfer->write_pagemap = write_pagemap_to_server;
- xfer->write_pages = write_pages_to_server;
- xfer->write_hole = write_hole_to_server;
- xfer->close = close_server_xfer;
- xfer->dst_id = encode_pm_id(fd_type, id);
- xfer->parent = NULL;
-
- pi.cmd = PS_IOV_OPEN2;
- pi.dst_id = xfer->dst_id;
- pi.vaddr = 0;
- pi.nr_pages = 0;
-
- if (write(xfer->sk, &pi, sizeof(pi)) != sizeof(pi)) {
- pr_perror("Can't write to page server");
- return -1;
- }
-
- /* Push the command NOW */
- tcp_nodelay(xfer->sk, true);
-
- if (read(xfer->sk, &has_parent, 1) != 1) {
- pr_perror("The page server doesn't answer");
- return -1;
- }
-
- if (has_parent)
- xfer->parent = (void *) 1; /* This is required for generate_iovs() */
-
- return 0;
-}
-
-static int write_pagemap_loc(struct page_xfer *xfer,
- struct iovec *iov)
-{
- int ret;
- PagemapEntry pe = PAGEMAP_ENTRY__INIT;
-
- iovec2pagemap(iov, &pe);
- if (opts.auto_dedup && xfer->parent != NULL) {
- ret = dedup_one_iovec(xfer->parent, iov);
- if (ret == -1) {
- pr_perror("Auto-deduplication failed");
- return ret;
- }
- }
- return pb_write_one(xfer->pmi, &pe, PB_PAGEMAP);
-}
-
-static int write_pages_loc(struct page_xfer *xfer,
- int p, unsigned long len)
-{
- ssize_t ret;
-
- ret = splice(p, NULL, img_raw_fd(xfer->pi), NULL, len, SPLICE_F_MOVE);
- if (ret == -1) {
- pr_perror("Unable to spice data");
- return -1;
- }
- if (ret != len) {
- pr_err("Only %zu of %lu bytes have been spliced\n", ret, len);
- return -1;
- }
-
- return 0;
-}
-
-static int check_pagehole_in_parent(struct page_read *p, struct iovec *iov)
-{
- int ret;
- unsigned long off, end;
-
- /*
- * Try to find pagemap entry in parent, from which
- * the data will be read on restore.
- *
- * This is the optimized version of the page-by-page
- * read_pagemap_page routine.
- */
-
- pr_debug("Checking %p/%zu hole\n", iov->iov_base, iov->iov_len);
- off = (unsigned long)iov->iov_base;
- end = off + iov->iov_len;
- while (1) {
- struct iovec piov;
- unsigned long pend;
-
- ret = seek_pagemap_page(p, off, true);
- if (ret <= 0 || !p->pe)
- return -1;
-
- pagemap2iovec(p->pe, &piov);
- pr_debug("\tFound %p/%zu\n", piov.iov_base, piov.iov_len);
-
- /*
- * The pagemap entry in parent may heppen to be
- * shorter, than the hole we write. In this case
- * we should go ahead and check the remainder.
- */
-
- pend = (unsigned long)piov.iov_base + piov.iov_len;
- if (end <= pend)
- return 0;
-
- pr_debug("\t\tcontinue on %lx\n", pend);
- off = pend;
- }
-}
-
-static int write_pagehole_loc(struct page_xfer *xfer, struct iovec *iov)
-{
- PagemapEntry pe = PAGEMAP_ENTRY__INIT;
-
- if (xfer->parent != NULL) {
- int ret;
-
- ret = check_pagehole_in_parent(xfer->parent, iov);
- if (ret) {
- pr_err("Hole %p/%zu not found in parent\n",
- iov->iov_base, iov->iov_len);
- return -1;
- }
- }
-
- iovec2pagemap(iov, &pe);
- pe.has_in_parent = true;
- pe.in_parent = true;
-
- if (pb_write_one(xfer->pmi, &pe, PB_PAGEMAP) < 0)
- return -1;
-
- return 0;
-}
-
-static void close_page_xfer(struct page_xfer *xfer)
-{
- if (xfer->parent != NULL) {
- xfer->parent->close(xfer->parent);
- xfree(xfer->parent);
- xfer->parent = NULL;
- }
- close_image(xfer->pi);
- close_image(xfer->pmi);
-}
-
-int page_xfer_dump_pages(struct page_xfer *xfer, struct page_pipe *pp,
- unsigned long off)
-{
- struct page_pipe_buf *ppb;
- struct iovec *hole = NULL;
-
- pr_debug("Transfering pages:\n");
-
- if (pp->free_hole)
- hole = &pp->holes[0];
-
- list_for_each_entry(ppb, &pp->bufs, l) {
- int i;
-
- pr_debug("\tbuf %d/%d\n", ppb->pages_in, ppb->nr_segs);
-
- for (i = 0; i < ppb->nr_segs; i++) {
- struct iovec *iov = &ppb->iov[i];
-
- while (hole && (hole->iov_base < iov->iov_base)) {
- BUG_ON(hole->iov_base < (void *)off);
- hole->iov_base -= off;
- pr_debug("\th %p [%u]\n", hole->iov_base,
- (unsigned int)(hole->iov_len / PAGE_SIZE));
- if (xfer->write_hole(xfer, hole))
- return -1;
-
- hole++;
- if (hole >= &pp->holes[pp->free_hole])
- hole = NULL;
- }
-
- BUG_ON(iov->iov_base < (void *)off);
- iov->iov_base -= off;
- pr_debug("\tp %p [%u]\n", iov->iov_base,
- (unsigned int)(iov->iov_len / PAGE_SIZE));
-
- if (xfer->write_pagemap(xfer, iov))
- return -1;
- if (xfer->write_pages(xfer, ppb->p[0], iov->iov_len))
- return -1;
- }
- }
-
- while (hole) {
- BUG_ON(hole->iov_base < (void *)off);
- hole->iov_base -= off;
- pr_debug("\th* %p [%u]\n", hole->iov_base,
- (unsigned int)(hole->iov_len / PAGE_SIZE));
- if (xfer->write_hole(xfer, hole))
- return -1;
-
- hole++;
- if (hole >= &pp->holes[pp->free_hole])
- hole = NULL;
- }
-
- return 0;
-}
-
-static int open_page_local_xfer(struct page_xfer *xfer, int fd_type, long id)
-{
- xfer->pmi = open_image(fd_type, O_DUMP, id);
- if (!xfer->pmi)
- return -1;
-
- xfer->pi = open_pages_image(O_DUMP, xfer->pmi);
- if (!xfer->pi) {
- close_image(xfer->pmi);
- return -1;
- }
-
- /*
- * Open page-read for parent images (if it exists). It will
- * be used for two things:
- * 1) when writing a page, those from parent will be dedup-ed
- * 2) when writing a hole, the respective place would be checked
- * to exist in parent (either pagemap or hole)
- */
- xfer->parent = NULL;
- if (fd_type == CR_FD_PAGEMAP) {
- int ret;
- int pfd;
-
- pfd = openat(get_service_fd(IMG_FD_OFF), CR_PARENT_LINK, O_RDONLY);
- if (pfd < 0 && errno == ENOENT)
- goto out;
-
- xfer->parent = xmalloc(sizeof(*xfer->parent));
- if (!xfer->parent) {
- close(pfd);
- return -1;
- }
-
- ret = open_page_read_at(pfd, id, xfer->parent, PR_TASK);
- if (ret <= 0) {
- pr_perror("No parent image found, though parent directory is set");
- xfree(xfer->parent);
- xfer->parent = NULL;
- close(pfd);
- goto out;
- }
- close(pfd);
- }
-
-out:
- xfer->write_pagemap = write_pagemap_loc;
- xfer->write_pages = write_pages_loc;
- xfer->write_hole = write_pagehole_loc;
- xfer->close = close_page_xfer;
- return 0;
-}
-
-int open_page_xfer(struct page_xfer *xfer, int fd_type, long id)
-{
- if (opts.use_page_server)
- return open_page_server_xfer(xfer, fd_type, id);
- else
- return open_page_local_xfer(xfer, fd_type, id);
-}
-
-/*
- * Return:
- * 1 - if a parent image exists
- * 0 - if a parent image doesn't exist
- * -1 - in error cases
- */
-int check_parent_local_xfer(int fd_type, int id)
-{
- char path[PATH_MAX];
- struct stat st;
- int ret, pfd;
-
- pfd = openat(get_service_fd(IMG_FD_OFF), CR_PARENT_LINK, O_RDONLY);
- if (pfd < 0 && errno == ENOENT)
- return 0;
-
- snprintf(path, sizeof(path), imgset_template[fd_type].fmt, id);
- ret = fstatat(pfd, path, &st, 0);
- if (ret == -1 && errno != ENOENT) {
- pr_perror("Unable to stat %s", path);
- close(pfd);
- return -1;
- }
-
- close(pfd);
- return (ret == 0);
-}
-
-static int page_server_check_parent(int sk, struct page_server_iov *pi)
-{
- int type, ret;
- long id;
-
- type = decode_pm_type(pi->dst_id);
- id = decode_pm_id(pi->dst_id);
-
- ret = check_parent_local_xfer(type, id);
- if (ret < 0)
- return -1;
-
- if (write(sk, &ret, sizeof(ret)) != sizeof(ret)) {
- pr_perror("Unable to send reponse");
- return -1;
- }
-
- return 0;
-}
-
-static int check_parent_server_xfer(int fd_type, long id)
-{
- struct page_server_iov pi = {};
- int has_parent;
-
- pi.cmd = PS_IOV_PARENT;
- pi.dst_id = encode_pm_id(fd_type, id);
-
- if (write(page_server_sk, &pi, sizeof(pi)) != sizeof(pi)) {
- pr_perror("Can't write to page server");
- return -1;
- }
-
- tcp_nodelay(page_server_sk, true);
-
- if (read(page_server_sk, &has_parent, sizeof(int)) != sizeof(int)) {
- pr_perror("The page server doesn't answer");
- return -1;
- }
-
- return has_parent;
-}
-
-int check_parent_page_xfer(int fd_type, long id)
-{
- if (opts.use_page_server)
- return check_parent_server_xfer(fd_type, id);
- else
- return check_parent_local_xfer(fd_type, id);
-}
diff --git a/pagemap-cache.c b/pagemap-cache.c
deleted file mode 100644
index c2e467b673be..000000000000
--- a/pagemap-cache.c
+++ /dev/null
@@ -1,173 +0,0 @@
-#include <unistd.h>
-#include <fcntl.h>
-
-#include "pagemap-cache.h"
-#include "compiler.h"
-#include "xmalloc.h"
-#include "util.h"
-#include "log.h"
-#include "vma.h"
-#include "kerndat.h"
-
-#undef LOG_PREFIX
-#define LOG_PREFIX "pagemap-cache: "
-
-/* To carry up to 2M of physical memory */
-#define PMC_SHIFT (21)
-#define PMC_SIZE (1ul << PMC_SHIFT)
-#define PMC_MASK (~(PMC_SIZE - 1))
-#define PMC_SIZE_GAP (PMC_SIZE / 4)
-
-#define PAGEMAP_LEN(addr) (PAGE_PFN(addr) * sizeof(u64))
-
-static inline void pmc_reset(pmc_t *pmc)
-{
- memzero(pmc, sizeof(*pmc));
- pmc->fd = -1;
-}
-
-static inline void pmc_zap(pmc_t *pmc)
-{
- pmc->start = pmc->end = 0;
-}
-
-int pmc_init(pmc_t *pmc, pid_t pid, const struct list_head *vma_head, size_t size)
-{
- size_t map_size = max(size, (size_t)PMC_SIZE);
- pmc_reset(pmc);
-
- BUG_ON(!vma_head);
-
- pmc->pid = pid;
- pmc->map_len = PAGEMAP_LEN(map_size);
- pmc->vma_head = vma_head;
-
- pmc->map = xmalloc(pmc->map_len);
- if (!pmc->map)
- goto err;
-
- if (kdat.pmap == PM_DISABLED) {
- pmc->fd = -1;
- pr_warn("No pagemap for %d available, "
- "switching to greedy mode\n", pid);
- } else {
- pmc->fd = open_proc(pid, "pagemap");
- if (pmc->fd < 0)
- goto err;
- }
-
- pr_debug("created for pid %d (takes %zu bytes)\n", pid, pmc->map_len);
- return 0;
-
-err:
- pr_err("Failed to init pagemap for %d\n", pid);
- pmc_fini(pmc);
- return -1;
-}
-
-static inline u64 *__pmc_get_map(pmc_t *pmc, unsigned long addr)
-{
- return &pmc->map[PAGE_PFN(addr - pmc->start)];
-}
-
-static int pmc_fill_cache(pmc_t *pmc, const struct vma_area *vma)
-{
- unsigned long low = vma->e->start & PMC_MASK;
- unsigned long high = low + PMC_SIZE;
- size_t len = vma_area_len(vma);
- size_t size_map;
-
- if (high > kdat.task_size)
- high = kdat.task_size;
-
- pmc->start = vma->e->start;
- pmc->end = vma->e->end;
-
- pr_debug("filling VMA %lx-%lx (%zuK) [l:%lx h:%lx]\n",
- (long)vma->e->start, (long)vma->e->end, len >> 10, low, high);
-
- /*
- * If we meet a small VMA, lets try to fit 2M cache
- * window at least 75% full, otherwise left as a plain
- * "one vma at a time" read. Note the VMAs in cache must
- * fit in solid manner, iow -- either the whole vma fits
- * the cache window, either plain read is used.
- *
- * The benefit (apart redusing the number of read() calls)
- * is to walk page tables less.
- */
- if (len < PMC_SIZE && (vma->e->start - low) < PMC_SIZE_GAP) {
- size_t size_cov = len;
- size_t nr_vmas = 1;
-
- pr_debug("\t%16lx-%-16lx nr:%-5zu cov:%zu\n",
- (long)vma->e->start, (long)vma->e->end, nr_vmas, size_cov);
-
- list_for_each_entry_continue(vma, pmc->vma_head, list) {
- if (vma->e->start > high || vma->e->end > high)
- break;
-
- BUG_ON(vma->e->start < low);
- size_cov += vma_area_len(vma);
- nr_vmas++;
-
- pr_debug("\t%16lx-%-16lx nr:%-5zu cov:%zu\n",
- (long)vma->e->start, (long)vma->e->end, nr_vmas, size_cov);
- }
-
- if (nr_vmas > 1) {
- /*
- * Note we don't touch low bound since it's set
- * to first VMA start already and not updating it
- * allows us to save a couple of code bytes.
- */
- pmc->end = high;
- pr_debug("\tcache mode [l:%lx h:%lx]\n", pmc->start, pmc->end);
- } else
- pr_debug("\tsimple mode [l:%lx h:%lx]\n", pmc->start, pmc->end);
- }
-
- size_map = PAGEMAP_LEN(pmc->end - pmc->start);
- BUG_ON(pmc->map_len < size_map);
-
- if (unlikely(pmc->fd < 0)) {
- /*
- * We don't have access to the dumpee pagemap so fill
- * everything as present. It's better than refuse
- * to dump because it simply disables optimisation.
- */
- memset(pmc->map, 1, size_map);
- } else {
- if (pread(pmc->fd, pmc->map, size_map, PAGEMAP_PFN_OFF(pmc->start)) != size_map) {
- pmc_zap(pmc);
- pr_perror("Can't read %d's pagemap file", pmc->pid);
- return -1;
- }
- }
-
- return 0;
-}
-
-u64 *pmc_get_map(pmc_t *pmc, const struct vma_area *vma)
-{
- /* Hit */
- if (likely(pmc->start <= vma->e->start && pmc->end >= vma->e->end))
- return __pmc_get_map(pmc, vma->e->start);
-
- /* Miss, refill the cache */
- if (pmc_fill_cache(pmc, vma)) {
- pr_err("Failed to fill cache for %d (%lx-%lx)\n",
- pmc->pid, (long)vma->e->start, (long)vma->e->end);
- return NULL;
- }
-
- /* Hit for sure */
- return __pmc_get_map(pmc, vma->e->start);
-}
-
-void pmc_fini(pmc_t *pmc)
-{
- close_safe(&pmc->fd);
- xfree(pmc->map);
- pmc_reset(pmc);
-}
diff --git a/parasite-syscall.c b/parasite-syscall.c
deleted file mode 100644
index feb77b53b93d..000000000000
--- a/parasite-syscall.c
+++ /dev/null
@@ -1,1408 +0,0 @@
-#include <unistd.h>
-#include <inttypes.h>
-
-#include <sys/stat.h>
-#include <sys/wait.h>
-#include <sys/mman.h>
-
-#include "protobuf.h"
-#include "protobuf/sa.pb-c.h"
-#include "protobuf/timer.pb-c.h"
-#include "protobuf/creds.pb-c.h"
-#include "protobuf/core.pb-c.h"
-#include "protobuf/pagemap.pb-c.h"
-
-#include "imgset.h"
-#include "ptrace.h"
-#include "asm/processor-flags.h"
-#include "parasite-syscall.h"
-#include "parasite-blob.h"
-#include "parasite.h"
-#include "crtools.h"
-#include "namespaces.h"
-#include "kerndat.h"
-#include "config.h"
-#include "pstree.h"
-#include "posix-timer.h"
-#include "net.h"
-#include "mem.h"
-#include "vma.h"
-#include "proc_parse.h"
-#include "aio.h"
-
-#include <string.h>
-#include <stdlib.h>
-#include <elf.h>
-
-#include "asm/parasite-syscall.h"
-#include "asm/dump.h"
-#include "asm/restorer.h"
-#include "pie/pie-relocs.h"
-
-#define MEMFD_FNAME "CRIUMFD"
-#define MEMFD_FNAME_SZ sizeof(MEMFD_FNAME)
-
-static int can_run_syscall(unsigned long ip, unsigned long start,
- unsigned long end, unsigned long pad)
-{
- return ip >= start && ip < (end - code_syscall_size - pad);
-}
-
-static int syscall_fits_vma_area(struct vma_area *vma_area, unsigned long pad)
-{
- return can_run_syscall((unsigned long)vma_area->e->start,
- (unsigned long)vma_area->e->start,
- (unsigned long)vma_area->e->end,
- pad);
-}
-
-static struct vma_area *get_vma_by_ip(struct list_head *vma_area_list,
- unsigned long ip,
- unsigned long pad)
-{
- struct vma_area *vma_area;
-
- list_for_each_entry(vma_area, vma_area_list, list) {
- if (vma_area->e->start >= kdat.task_size)
- continue;
- if (!(vma_area->e->prot & PROT_EXEC))
- continue;
- if (syscall_fits_vma_area(vma_area, pad))
- return vma_area;
- }
-
- return NULL;
-}
-
-static inline int ptrace_get_regs(int pid, user_regs_struct_t *regs)
-{
- struct iovec iov;
-
- iov.iov_base = regs;
- iov.iov_len = sizeof(user_regs_struct_t);
- return ptrace(PTRACE_GETREGSET, pid, NT_PRSTATUS, &iov);
-}
-
-static inline int ptrace_set_regs(int pid, user_regs_struct_t *regs)
-{
- struct iovec iov;
-
- iov.iov_base = regs;
- iov.iov_len = sizeof(user_regs_struct_t);
- return ptrace(PTRACE_SETREGSET, pid, NT_PRSTATUS, &iov);
-}
-
-static int get_thread_ctx(int pid, struct thread_ctx *ctx)
-{
- if (ptrace(PTRACE_GETSIGMASK, pid, sizeof(k_rtsigset_t), &ctx->sigmask)) {
- pr_perror("can't get signal blocking mask for %d", pid);
- return -1;
- }
-
- if (ptrace_get_regs(pid, &ctx->regs)) {
- pr_perror("Can't obtain registers (pid: %d)", pid);
- return -1;
- }
-
- return 0;
-}
-
-static int restore_thread_ctx(int pid, struct thread_ctx *ctx)
-{
- int ret = 0;
-
- if (ptrace_set_regs(pid, &ctx->regs)) {
- pr_perror("Can't restore registers (pid: %d)", pid);
- ret = -1;
- }
- if (ptrace(PTRACE_SETSIGMASK, pid, sizeof(k_rtsigset_t), &ctx->sigmask)) {
- pr_perror("Can't block signals");
- ret = -1;
- }
-
- return ret;
-}
-
-static int parasite_run(pid_t pid, int cmd, unsigned long ip, void *stack,
- user_regs_struct_t *regs, struct thread_ctx *octx)
-{
- k_rtsigset_t block;
-
- ksigfillset(&block);
- if (ptrace(PTRACE_SETSIGMASK, pid, sizeof(k_rtsigset_t), &block)) {
- pr_perror("Can't block signals for %d", pid);
- goto err_sig;
- }
-
- parasite_setup_regs(ip, stack, regs);
- if (ptrace_set_regs(pid, regs)) {
- pr_perror("Can't set registers for %d", pid);
- goto err_regs;
- }
-
- if (ptrace(cmd, pid, NULL, NULL)) {
- pr_perror("Can't run parasite at %d", pid);
- goto err_cont;
- }
-
- return 0;
-
-err_cont:
- if (ptrace_set_regs(pid, &octx->regs))
- pr_perror("Can't restore regs for %d", pid);
-err_regs:
- if (ptrace(PTRACE_SETSIGMASK, pid, sizeof(k_rtsigset_t), &octx->sigmask))
- pr_perror("Can't restore sigmask for %d", pid);
-err_sig:
- return -1;
-}
-
-/* we run at @regs->ip */
-static int parasite_trap(struct parasite_ctl *ctl, pid_t pid,
- user_regs_struct_t *regs,
- struct thread_ctx *octx)
-{
- siginfo_t siginfo;
- int status;
- int ret = -1;
-
- /*
- * Most ideas are taken from Tejun Heo's parasite thread
- * https://code.google.com/p/ptrace-parasite/
- */
-
- if (wait4(pid, &status, __WALL, NULL) != pid) {
- pr_perror("Waited pid mismatch (pid: %d)", pid);
- goto err;
- }
-
- if (!WIFSTOPPED(status)) {
- pr_err("Task is still running (pid: %d)\n", pid);
- goto err;
- }
-
- if (ptrace(PTRACE_GETSIGINFO, pid, NULL, &siginfo)) {
- pr_perror("Can't get siginfo (pid: %d)", pid);
- goto err;
- }
-
- if (ptrace_get_regs(pid, regs)) {
- pr_perror("Can't obtain registers (pid: %d)", pid);
- goto err;
- }
-
- if (WSTOPSIG(status) != SIGTRAP || siginfo.si_code != ARCH_SI_TRAP) {
- pr_debug("** delivering signal %d si_code=%d\n",
- siginfo.si_signo, siginfo.si_code);
-
- pr_err("Unexpected %d task interruption, aborting\n", pid);
- goto err;
- }
-
- /*
- * We've reached this point if int3 is triggered inside our
- * parasite code. So we're done.
- */
- ret = 0;
-err:
- if (restore_thread_ctx(pid, octx))
- ret = -1;
-
- return ret;
-}
-
-int __parasite_execute_syscall(struct parasite_ctl *ctl, user_regs_struct_t *regs)
-{
- pid_t pid = ctl->pid.real;
- int err;
- u8 code_orig[BUILTIN_SYSCALL_SIZE];
-
- /*
- * Inject syscall instruction and remember original code,
- * we will need it to restore original program content.
- */
- memcpy(code_orig, code_syscall, sizeof(code_orig));
- if (ptrace_swap_area(pid, (void *)ctl->syscall_ip,
- (void *)code_orig, sizeof(code_orig))) {
- pr_err("Can't inject syscall blob (pid: %d)\n", pid);
- return -1;
- }
-
- err = parasite_run(pid, PTRACE_CONT, ctl->syscall_ip, 0, regs, &ctl->orig);
- if (!err)
- err = parasite_trap(ctl, pid, regs, &ctl->orig);
-
- if (ptrace_poke_area(pid, (void *)code_orig,
- (void *)ctl->syscall_ip, sizeof(code_orig))) {
- pr_err("Can't restore syscall blob (pid: %d)\n", ctl->pid.real);
- err = -1;
- }
-
- return err;
-}
-
-void *parasite_args_s(struct parasite_ctl *ctl, int args_size)
-{
- BUG_ON(args_size > ctl->args_size);
- return ctl->addr_args;
-}
-
-static int parasite_execute_trap_by_pid(unsigned int cmd,
- struct parasite_ctl *ctl, pid_t pid,
- void *stack,
- struct thread_ctx *octx)
-{
- user_regs_struct_t regs = octx->regs;
- int ret;
-
- *ctl->addr_cmd = cmd;
-
- ret = parasite_run(pid, PTRACE_CONT, ctl->parasite_ip, stack, ®s, octx);
- if (ret == 0)
- ret = parasite_trap(ctl, pid, ®s, octx);
- if (ret == 0)
- ret = (int)REG_RES(regs);
-
- if (ret)
- pr_err("Parasite exited with %d\n", ret);
-
- return ret;
-}
-
-static int __parasite_send_cmd(int sockfd, struct ctl_msg *m)
-{
- int ret;
-
- ret = send(sockfd, m, sizeof(*m), 0);
- if (ret == -1) {
- pr_perror("Failed to send command %d to daemon", m->cmd);
- return -1;
- } else if (ret != sizeof(*m)) {
- pr_err("Message to daemon is trimmed (%d/%d)\n",
- (int)sizeof(*m), ret);
- return -1;
- }
-
- pr_debug("Sent msg to daemon %d %d %d\n", m->cmd, m->ack, m->err);
- return 0;
-}
-
-static int parasite_wait_ack(int sockfd, unsigned int cmd, struct ctl_msg *m)
-{
- int ret;
-
- pr_debug("Wait for ack %d on daemon socket\n", cmd);
-
- while (1) {
- memzero(m, sizeof(*m));
-
- ret = recv(sockfd, m, sizeof(*m), MSG_WAITALL);
- if (ret == -1) {
- pr_perror("Failed to read ack");
- return -1;
- } else if (ret != sizeof(*m)) {
- pr_err("Message reply from daemon is trimmed (%d/%d)\n",
- (int)sizeof(*m), ret);
- return -1;
- }
- pr_debug("Fetched ack: %d %d %d\n",
- m->cmd, m->ack, m->err);
-
- if (m->cmd != cmd || m->ack != cmd) {
- pr_err("Communication error, this is not "
- "the ack we expected\n");
- return -1;
- }
- return 0;
- }
-
- return -1;
-}
-
-int __parasite_wait_daemon_ack(unsigned int cmd,
- struct parasite_ctl *ctl)
-{
- struct ctl_msg m;
-
- if (parasite_wait_ack(ctl->tsock, cmd, &m))
- return -1;
-
- if (m.err != 0) {
- pr_err("Command %d for daemon failed with %d\n",
- cmd, m.err);
- return -1;
- }
-
- return 0;
-}
-
-int __parasite_execute_daemon(unsigned int cmd, struct parasite_ctl *ctl)
-{
- struct ctl_msg m;
-
- m = ctl_msg_cmd(cmd);
- return __parasite_send_cmd(ctl->tsock, &m);
-}
-
-int parasite_execute_daemon(unsigned int cmd, struct parasite_ctl *ctl)
-{
- int ret;
-
- ret = __parasite_execute_daemon(cmd, ctl);
- if (!ret)
- ret = __parasite_wait_daemon_ack(cmd, ctl);
-
- return ret;
-}
-
-static int gen_parasite_saddr(struct sockaddr_un *saddr, int key)
-{
- int sun_len;
-
- saddr->sun_family = AF_UNIX;
- snprintf(saddr->sun_path, UNIX_PATH_MAX,
- "X/crtools-pr-%d", key);
-
- sun_len = SUN_LEN(saddr);
- *saddr->sun_path = '\0';
-
- return sun_len;
-}
-
-int parasite_send_fd(struct parasite_ctl *ctl, int fd)
-{
- if (send_fd(ctl->tsock, NULL, 0, fd) < 0) {
- pr_perror("Can't send file descriptor");
- return -1;
- }
- return 0;
-}
-
-/*
- * We need to detect parasite crashes not to hang on socket operations.
- * Since CRIU holds parasite with ptrace, it will receive SIGCHLD if the
- * latter would crash.
- *
- * This puts a restriction on how to execute a sub-process on dump stage.
- * One should use the cr_system helper, that blocks sigcild and waits
- * for the spawned program to finish.
- */
-static void sigchld_handler(int signal, siginfo_t *siginfo, void *data)
-{
- int pid, status;
-
- pid = waitpid(-1, &status, WNOHANG);
- if (pid <= 0)
- return;
-
- pr_err("si_code=%d si_pid=%d si_status=%d\n",
- siginfo->si_code, siginfo->si_pid, siginfo->si_status);
-
- if (WIFEXITED(status))
- pr_err("%d exited with %d unexpectedly\n", pid, WEXITSTATUS(status));
- else if (WIFSIGNALED(status))
- pr_err("%d was killed by %d unexpectedly\n", pid, WTERMSIG(status));
- else if (WIFSTOPPED(status))
- pr_err("%d was stopped by %d unexpectedly\n", pid, WSTOPSIG(status));
-
- exit(1);
-}
-
-static int setup_child_handler()
-{
- struct sigaction sa = {
- .sa_sigaction = sigchld_handler,
- .sa_flags = SA_SIGINFO | SA_RESTART,
- };
-
- sigemptyset(&sa.sa_mask);
- sigaddset(&sa.sa_mask, SIGCHLD);
- if (sigaction(SIGCHLD, &sa, NULL)) {
- pr_perror("Unable to setup SIGCHLD handler");
- return -1;
- }
-
- return 0;
-}
-
-static int restore_child_handler()
-{
- struct sigaction sa = {
- .sa_handler = SIG_DFL,
- .sa_flags = SA_SIGINFO | SA_RESTART,
- };
-
- sigemptyset(&sa.sa_mask);
- sigaddset(&sa.sa_mask, SIGCHLD);
- if (sigaction(SIGCHLD, &sa, NULL)) {
- pr_perror("Unable to setup SIGCHLD handler");
- return -1;
- }
-
- return 0;
-}
-
-static int prepare_tsock(struct parasite_ctl *ctl, pid_t pid,
- struct parasite_init_args *args, struct ns_id *net)
-{
- static int ssock = -1;
-
- pr_info("Putting tsock into pid %d\n", pid);
- args->h_addr_len = gen_parasite_saddr(&args->h_addr, getpid());
-
- if (ssock == -1) {
- ssock = net->net.seqsk;
- net->net.seqsk = -1;
-
- if (bind(ssock, (struct sockaddr *)&args->h_addr, args->h_addr_len) < 0) {
- pr_perror("Can't bind socket");
- goto err;
- }
-
- if (listen(ssock, 1)) {
- pr_perror("Can't listen on transport socket");
- goto err;
- }
- }
-
- /*
- * Set to -1 to prevent any accidental misuse. The
- * only valid user of it is accept_tsock().
- */
- ctl->tsock = -ssock;
- return 0;
-err:
- close_safe(&ssock);
- return -1;
-}
-
-static int accept_tsock(struct parasite_ctl *ctl)
-{
- int sock;
- int ask = -ctl->tsock; /* this '-' is explained above */
-
- sock = accept(ask, NULL, 0);
- if (sock < 0) {
- pr_perror("Can't accept connection to the transport socket");
- close(ask);
- return -1;
- }
-
- ctl->tsock = sock;
- return 0;
-}
-
-static int parasite_init_daemon(struct parasite_ctl *ctl, struct ns_id *net)
-{
- struct parasite_init_args *args;
- pid_t pid = ctl->pid.real;
- user_regs_struct_t regs;
- struct ctl_msg m = { };
-
- *ctl->addr_cmd = PARASITE_CMD_INIT_DAEMON;
-
- args = parasite_args(ctl, struct parasite_init_args);
-
- args->sigframe = ctl->rsigframe;
- args->log_level = log_get_loglevel();
-
- if (prepare_tsock(ctl, pid, args, net))
- goto err;
-
- /* after this we can catch parasite errors in chld handler */
- if (setup_child_handler())
- goto err;
-
- regs = ctl->orig.regs;
- if (parasite_run(pid, PTRACE_CONT, ctl->parasite_ip, ctl->rstack, ®s, &ctl->orig))
- goto err;
-
- if (accept_tsock(ctl) < 0)
- goto err;
-
- if (parasite_send_fd(ctl, log_get_fd()))
- goto err;
-
- pr_info("Wait for parasite being daemonized...\n");
-
- if (parasite_wait_ack(ctl->tsock, PARASITE_CMD_INIT_DAEMON, &m)) {
- pr_err("Can't switch parasite %d to daemon mode %d\n",
- pid, m.err);
- goto err;
- }
-
- ctl->sigreturn_addr = args->sigreturn_addr;
- ctl->daemonized = true;
- pr_info("Parasite %d has been switched to daemon mode\n", pid);
- return 0;
-err:
- return -1;
-}
-
-static int alloc_groups_copy_creds(CredsEntry *ce, struct parasite_dump_creds *c)
-{
- BUILD_BUG_ON(sizeof(ce->groups[0]) != sizeof(c->groups[0]));
- BUILD_BUG_ON(sizeof(ce->cap_inh[0]) != sizeof(c->cap_inh[0]));
- BUILD_BUG_ON(sizeof(ce->cap_prm[0]) != sizeof(c->cap_prm[0]));
- BUILD_BUG_ON(sizeof(ce->cap_eff[0]) != sizeof(c->cap_eff[0]));
- BUILD_BUG_ON(sizeof(ce->cap_bnd[0]) != sizeof(c->cap_bnd[0]));
-
- BUG_ON(ce->n_cap_inh != CR_CAP_SIZE);
- BUG_ON(ce->n_cap_prm != CR_CAP_SIZE);
- BUG_ON(ce->n_cap_eff != CR_CAP_SIZE);
- BUG_ON(ce->n_cap_bnd != CR_CAP_SIZE);
-
- memcpy(ce->cap_inh, c->cap_inh, sizeof(c->cap_inh[0]) * CR_CAP_SIZE);
- memcpy(ce->cap_prm, c->cap_prm, sizeof(c->cap_prm[0]) * CR_CAP_SIZE);
- memcpy(ce->cap_eff, c->cap_eff, sizeof(c->cap_eff[0]) * CR_CAP_SIZE);
- memcpy(ce->cap_bnd, c->cap_bnd, sizeof(c->cap_bnd[0]) * CR_CAP_SIZE);
-
- ce->secbits = c->secbits;
- ce->n_groups = c->ngroups;
-
- ce->groups = xmemdup(c->groups, sizeof(c->groups[0]) * c->ngroups);
-
- ce->uid = c->uids[0];
- ce->gid = c->gids[0];
- ce->euid = c->uids[1];
- ce->egid = c->gids[1];
- ce->suid = c->uids[2];
- ce->sgid = c->gids[2];
- ce->fsuid = c->uids[3];
- ce->fsgid = c->gids[3];
-
- return ce->groups ? 0 : -ENOMEM;
-}
-
-int parasite_dump_thread_leader_seized(struct parasite_ctl *ctl, int pid, CoreEntry *core)
-{
- ThreadCoreEntry *tc = core->thread_core;
- struct parasite_dump_thread *args;
- struct parasite_dump_creds *pc;
- int ret;
-
- args = parasite_args(ctl, struct parasite_dump_thread);
-
- pc = args->creds;
- pc->cap_last_cap = kdat.last_cap;
-
- ret = parasite_execute_daemon(PARASITE_CMD_DUMP_THREAD, ctl);
- if (ret < 0)
- return ret;
-
- ret = alloc_groups_copy_creds(tc->creds, pc);
- if (ret) {
- pr_err("Can't copy creds for thread leader %d\n", pid);
- return -1;
- }
-
- return dump_thread_core(pid, core, args);
-}
-
-int parasite_dump_thread_seized(struct parasite_ctl *ctl, int id,
- struct pid *tid, CoreEntry *core)
-{
- struct parasite_dump_thread *args;
- pid_t pid = tid->real;
- ThreadCoreEntry *tc = core->thread_core;
- CredsEntry *creds = tc->creds;
- struct parasite_dump_creds *pc;
- int ret;
- struct thread_ctx octx;
-
- BUG_ON(id == 0); /* Leader is dumped in dump_task_core_all */
-
- args = parasite_args(ctl, struct parasite_dump_thread);
-
- pc = args->creds;
- pc->cap_last_cap = kdat.last_cap;
-
- ret = get_thread_ctx(pid, &octx);
- if (ret)
- return -1;
-
- tc->has_blk_sigset = true;
- memcpy(&tc->blk_sigset, &octx.sigmask, sizeof(k_rtsigset_t));
-
- ret = parasite_execute_trap_by_pid(PARASITE_CMD_DUMP_THREAD, ctl,
- pid, ctl->r_thread_stack, &octx);
- if (ret) {
- pr_err("Can't init thread in parasite %d\n", pid);
- return -1;
- }
-
- ret = alloc_groups_copy_creds(creds, pc);
- if (ret) {
- pr_err("Can't copy creds for thread %d\n", pid);
- return -1;
- }
-
- ret = get_task_regs(pid, octx.regs, core);
- if (ret) {
- pr_err("Can't obtain regs for thread %d\n", pid);
- return -1;
- }
-
- tid->virt = args->tid;
- return dump_thread_core(pid, core, args);
-}
-
-int parasite_dump_sigacts_seized(struct parasite_ctl *ctl, struct cr_imgset *cr_imgset)
-{
- struct parasite_dump_sa_args *args;
- int ret, sig;
- struct cr_img *img;
- SaEntry se = SA_ENTRY__INIT;
-
- args = parasite_args(ctl, struct parasite_dump_sa_args);
-
- ret = parasite_execute_daemon(PARASITE_CMD_DUMP_SIGACTS, ctl);
- if (ret < 0)
- return ret;
-
- img = img_from_set(cr_imgset, CR_FD_SIGACT);
-
- for (sig = 1; sig <= SIGMAX; sig++) {
- int i = sig - 1;
-
- if (sig == SIGSTOP || sig == SIGKILL)
- continue;
-
- ASSIGN_TYPED(se.sigaction, encode_pointer(args->sas[i].rt_sa_handler));
- ASSIGN_TYPED(se.flags, args->sas[i].rt_sa_flags);
- ASSIGN_TYPED(se.restorer, encode_pointer(args->sas[i].rt_sa_restorer));
- ASSIGN_TYPED(se.mask, args->sas[i].rt_sa_mask.sig[0]);
-
- if (pb_write_one(img, &se, PB_SIGACT) < 0)
- return -1;
- }
-
- return 0;
-}
-
-static void encode_itimer(struct itimerval *v, ItimerEntry *ie)
-{
- ie->isec = v->it_interval.tv_sec;
- ie->iusec = v->it_interval.tv_usec;
- ie->vsec = v->it_value.tv_sec;
- ie->vusec = v->it_value.tv_usec;
-}
-
-int parasite_dump_itimers_seized(struct parasite_ctl *ctl, struct pstree_item *item)
-{
- CoreEntry *core = item->core[0];
- struct parasite_dump_itimers_args *args;
- int ret;
-
- args = parasite_args(ctl, struct parasite_dump_itimers_args);
-
- ret = parasite_execute_daemon(PARASITE_CMD_DUMP_ITIMERS, ctl);
- if (ret < 0)
- return ret;
-
- encode_itimer(&args->real, core->tc->timers->real);
- encode_itimer(&args->virt, core->tc->timers->virt);
- encode_itimer(&args->prof, core->tc->timers->prof);
-
- return 0;
-}
-
-static void encode_posix_timer(struct posix_timer *v, struct proc_posix_timer *vp, PosixTimerEntry *pte)
-{
- pte->it_id = vp->spt.it_id;
- pte->clock_id = vp->spt.clock_id;
- pte->si_signo = vp->spt.si_signo;
- pte->it_sigev_notify = vp->spt.it_sigev_notify;
- pte->sival_ptr = encode_pointer(vp->spt.sival_ptr);
-
- pte->overrun = v->overrun;
-
- pte->isec = v->val.it_interval.tv_sec;
- pte->insec = v->val.it_interval.tv_nsec;
- pte->vsec = v->val.it_value.tv_sec;
- pte->vnsec = v->val.it_value.tv_nsec;
-}
-
-static int core_alloc_posix_timers(TaskTimersEntry *tte, int n,
- PosixTimerEntry **pte)
-{
- int sz;
-
- /*
- * Will be free()-ed in core_entry_free()
- */
-
- sz = n * (sizeof(PosixTimerEntry *) + sizeof(PosixTimerEntry));
- tte->posix = xmalloc(sz);
- if (!tte->posix)
- return -1;
-
- tte->n_posix = n;
- *pte = (PosixTimerEntry *)(tte->posix + n);
- return 0;
-}
-
-int parasite_dump_posix_timers_seized(struct proc_posix_timers_stat *proc_args,
- struct parasite_ctl *ctl, struct pstree_item *item)
-{
- CoreEntry *core = item->core[0];
- TaskTimersEntry *tte = core->tc->timers;
- PosixTimerEntry *pte;
- struct parasite_dump_posix_timers_args * args;
- struct proc_posix_timer *temp;
- int i;
- int ret = 0;
-
- if (core_alloc_posix_timers(tte, proc_args->timer_n, &pte))
- return -1;
-
- args = parasite_args_s(ctl, posix_timers_dump_size(proc_args->timer_n));
- args->timer_n = proc_args->timer_n;
-
- i = 0;
- list_for_each_entry(temp, &proc_args->timers, list) {
- args->timer[i].it_id = temp->spt.it_id;
- i++;
- }
-
- ret = parasite_execute_daemon(PARASITE_CMD_DUMP_POSIX_TIMERS, ctl);
- if (ret < 0)
- goto end_posix;
-
- i = 0;
- list_for_each_entry(temp, &proc_args->timers, list) {
- posix_timer_entry__init(&pte[i]);
- encode_posix_timer(&args->timer[i], temp, &pte[i]);
- tte->posix[i] = &pte[i];
- i++;
- }
-
-end_posix:
- free_posix_timers(proc_args);
- return ret;
-}
-
-int parasite_dump_misc_seized(struct parasite_ctl *ctl, struct parasite_dump_misc *misc)
-{
- struct parasite_dump_misc *ma;
-
- ma = parasite_args(ctl, struct parasite_dump_misc);
- if (parasite_execute_daemon(PARASITE_CMD_DUMP_MISC, ctl) < 0)
- return -1;
-
- *misc = *ma;
- return 0;
-}
-
-struct parasite_tty_args *parasite_dump_tty(struct parasite_ctl *ctl, int fd, int type)
-{
- struct parasite_tty_args *p;
-
- p = parasite_args(ctl, struct parasite_tty_args);
- p->fd = fd;
- p->type = type;
-
- if (parasite_execute_daemon(PARASITE_CMD_DUMP_TTY, ctl) < 0)
- return NULL;
-
- return p;
-}
-
-int parasite_drain_fds_seized(struct parasite_ctl *ctl,
- struct parasite_drain_fd *dfds, int *lfds, struct fd_opts *opts)
-{
- int ret = -1, size;
- struct parasite_drain_fd *args;
-
- size = drain_fds_size(dfds);
- args = parasite_args_s(ctl, size);
- memcpy(args, dfds, size);
-
- ret = __parasite_execute_daemon(PARASITE_CMD_DRAIN_FDS, ctl);
- if (ret) {
- pr_err("Parasite failed to drain descriptors\n");
- goto err;
- }
-
- ret = recv_fds(ctl->tsock, lfds, dfds->nr_fds, opts);
- if (ret)
- pr_err("Can't retrieve FDs from socket\n");
-
- ret |= __parasite_wait_daemon_ack(PARASITE_CMD_DRAIN_FDS, ctl);
-err:
- return ret;
-}
-
-int parasite_get_proc_fd_seized(struct parasite_ctl *ctl)
-{
- int ret = -1, fd;
-
- ret = __parasite_execute_daemon(PARASITE_CMD_GET_PROC_FD, ctl);
- if (ret) {
- pr_err("Parasite failed to get proc fd\n");
- return ret;
- }
-
- fd = recv_fd(ctl->tsock);
- if (fd < 0)
- pr_err("Can't retrieve FD from socket\n");
- if (__parasite_wait_daemon_ack(PARASITE_CMD_GET_PROC_FD, ctl)) {
- close_safe(&fd);
- return -1;
- }
-
- return fd;
-}
-
-/* This is officially the 50000'th line in the CRIU source code */
-
-static bool task_in_parasite(struct parasite_ctl *ctl, user_regs_struct_t *regs)
-{
- void *addr = (void *) REG_IP(*regs);
- return addr >= ctl->remote_map &&
- addr < ctl->remote_map + ctl->map_length;
-}
-
-static int parasite_fini_seized(struct parasite_ctl *ctl)
-{
- pid_t pid = ctl->pid.real;
- user_regs_struct_t regs;
- int status, ret = 0;
- enum trace_flags flag;
-
- /* stop getting chld from parasite -- we're about to step-by-step it */
- if (restore_child_handler())
- return -1;
-
- /* Start to trace syscalls for each thread */
- if (ptrace(PTRACE_INTERRUPT, pid, NULL, NULL)) {
- pr_perror("Unable to interrupt the process");
- return -1;
- }
-
- pr_debug("Waiting for %d to trap\n", pid);
- if (wait4(pid, &status, __WALL, NULL) != pid) {
- pr_perror("Waited pid mismatch (pid: %d)", pid);
- return -1;
- }
-
- pr_debug("Daemon %d exited trapping\n", pid);
- if (!WIFSTOPPED(status)) {
- pr_err("Task is still running (pid: %d)\n", pid);
- return -1;
- }
-
- ret = ptrace_get_regs(pid, ®s);
- if (ret) {
- pr_perror("Unable to get registers");
- return -1;
- }
-
- if (!task_in_parasite(ctl, ®s)) {
- pr_err("The task is not in parasite code\n");
- return -1;
- }
-
- ret = __parasite_execute_daemon(PARASITE_CMD_FINI, ctl);
- close_safe(&ctl->tsock);
- if (ret)
- return -1;
-
- /* Go to sigreturn as closer as we can */
- ret = ptrace_stop_pie(pid, ctl->sigreturn_addr, &flag);
- if (ret < 0)
- return ret;
-
- if (parasite_stop_on_syscall(1, __NR_rt_sigreturn, flag))
- return -1;
-
- if (ptrace_flush_breakpoints(pid))
- return -1;
-
- /*
- * All signals are unblocked now. The kernel notifies about leaving
- * syscall before starting to deliver signals. All parasite code are
- * executed with blocked signals, so we can sefly unmap a parasite blob.
- */
-
- return 0;
-}
-
-/*
- * Trap tasks on the exit from the specified syscall
- *
- * tasks - number of processes, which should be trapped
- * sys_nr - the required syscall number
- */
-int parasite_stop_on_syscall(int tasks, const int sys_nr, enum trace_flags trace)
-{
- user_regs_struct_t regs;
- int status, ret;
- pid_t pid;
-
- if (tasks > 1)
- trace = TRACE_ALL;
-
- /* Stop all threads on the enter point in sys_rt_sigreturn */
- while (tasks) {
- pid = wait4(-1, &status, __WALL, NULL);
- if (pid == -1) {
- pr_perror("wait4 failed");
- return -1;
- }
-
- if (!WIFSTOPPED(status) || WSTOPSIG(status) != SIGTRAP) {
- pr_err("Task is in unexpected state: %x\n", status);
- return -1;
- }
-
- pr_debug("%d was trapped\n", pid);
-
- if (trace == TRACE_EXIT) {
- trace = TRACE_ENTER;
- pr_debug("`- Expecting exit\n");
- goto goon;
- }
- if (trace == TRACE_ENTER)
- trace = TRACE_EXIT;
-
- ret = ptrace_get_regs(pid, ®s);
- if (ret) {
- pr_perror("ptrace");
- return -1;
- }
-
- pr_debug("%d is going to execute the syscall %lx\n", pid, REG_SYSCALL_NR(regs));
- if (REG_SYSCALL_NR(regs) == sys_nr) {
- /*
- * The process is going to execute the required syscall,
- * the next stop will be on the exit from this syscall
- */
- ret = ptrace(PTRACE_SYSCALL, pid, NULL, NULL);
- if (ret) {
- pr_perror("ptrace");
- return -1;
- }
-
- pid = wait4(pid, &status, __WALL, NULL);
- if (pid == -1) {
- pr_perror("wait4 failed");
- return -1;
- }
-
- if (!WIFSTOPPED(status) || WSTOPSIG(status) != SIGTRAP) {
- pr_err("Task is in unexpected state: %x\n", status);
- return -1;
- }
-
- pr_debug("%d was stopped\n", pid);
- tasks--;
- continue;
- }
-goon:
- ret = ptrace(PTRACE_SYSCALL, pid, NULL, NULL);
- if (ret) {
- pr_perror("ptrace");
- return -1;
- }
- }
-
- return 0;
-}
-
-int parasite_stop_daemon(struct parasite_ctl *ctl)
-{
- if (ctl->daemonized) {
- /*
- * Looks like a previous attempt failed, we should do
- * nothing in this case. parasite will try to cure itself.
- */
- if (ctl->tsock < 0)
- return -1;
-
- if (parasite_fini_seized(ctl)) {
- close_safe(&ctl->tsock);
- return -1;
- }
- }
-
- ctl->daemonized = false;
-
- return 0;
-}
-
-int parasite_cure_remote(struct parasite_ctl *ctl)
-{
- int ret = 0;
-
- if (parasite_stop_daemon(ctl))
- return -1;
-
- if (ctl->remote_map) {
- struct parasite_unmap_args *args;
-
- *ctl->addr_cmd = PARASITE_CMD_UNMAP;
-
- args = parasite_args(ctl, struct parasite_unmap_args);
- args->parasite_start = ctl->remote_map;
- args->parasite_len = ctl->map_length;
- if (parasite_unmap(ctl, ctl->parasite_ip))
- ret = -1;
- }
-
- return ret;
-}
-
-int parasite_cure_local(struct parasite_ctl *ctl)
-{
- int ret = 0;
-
- if (ctl->local_map) {
- if (munmap(ctl->local_map, ctl->map_length)) {
- pr_err("munmap failed (pid: %d)\n", ctl->pid.real);
- ret = -1;
- }
- }
-
- free(ctl);
- return ret;
-}
-
-int parasite_cure_seized(struct parasite_ctl *ctl)
-{
- int ret;
-
- ret = parasite_cure_remote(ctl);
- if (!ret)
- ret = parasite_cure_local(ctl);
-
- return ret;
-}
-
-/*
- * parasite_unmap() is used for unmapping parasite and restorer blobs.
- * A blob can contain code for unmapping itself, so the porcess is
- * trapped on the exit from the munmap syscall.
- */
-int parasite_unmap(struct parasite_ctl *ctl, unsigned long addr)
-{
- user_regs_struct_t regs = ctl->orig.regs;
- pid_t pid = ctl->pid.real;
- int ret = -1;
-
- ret = parasite_run(pid, PTRACE_SYSCALL, addr, NULL, ®s, &ctl->orig);
- if (ret)
- goto err;
-
- ret = parasite_stop_on_syscall(1, __NR_munmap, TRACE_ENTER);
-
- if (restore_thread_ctx(pid, &ctl->orig))
- ret = -1;
-err:
- return ret;
-}
-
-/* If vma_area_list is NULL, a place for injecting syscall will not be set. */
-struct parasite_ctl *parasite_prep_ctl(pid_t pid, struct vm_area_list *vma_area_list)
-{
- struct parasite_ctl *ctl = NULL;
- struct vma_area *vma_area;
-
- if (!arch_can_dump_task(pid))
- goto err;
-
- /*
- * Control block early setup.
- */
- ctl = xzalloc(sizeof(*ctl));
- if (!ctl) {
- pr_err("Parasite control block allocation failed (pid: %d)\n", pid);
- goto err;
- }
-
- ctl->tsock = -1;
-
- if (get_thread_ctx(pid, &ctl->orig))
- goto err;
-
- ctl->pid.real = pid;
- ctl->pid.virt = 0;
-
- if (vma_area_list == NULL)
- return ctl;
-
- /* Search a place for injecting syscall */
- vma_area = get_vma_by_ip(&vma_area_list->h, REG_IP(ctl->orig.regs),
- MEMFD_FNAME_SZ);
- if (!vma_area) {
- pr_err("No suitable VMA found to run parasite "
- "bootstrap code (pid: %d)\n", pid);
- goto err;
- }
-
- ctl->syscall_ip = vma_area->e->start;
- pr_debug("Parasite syscall_ip at %p\n", (void *)ctl->syscall_ip);
-
- return ctl;
-
-err:
- xfree(ctl);
- return NULL;
-}
-
-/*
- * Legacy way of building the exchange area: create a shared anonymous
- * mapping inside the task and map the very same pages locally through
- * the task's map_files/ entry.
- *
- * Returns 0 on success, -1 on failure.
- */
-static int parasite_mmap_exchange(struct parasite_ctl *ctl, unsigned long size)
-{
-	void *local;
-	int map_fd;
-
-	ctl->remote_map = mmap_seized(ctl, NULL, size,
-				      PROT_READ | PROT_WRITE | PROT_EXEC,
-				      MAP_ANONYMOUS | MAP_SHARED, -1, 0);
-	if (ctl->remote_map == NULL) {
-		pr_err("Can't allocate memory for parasite blob (pid: %d)\n", ctl->pid.real);
-		return -1;
-	}
-
-	ctl->map_length = round_up(size, page_size());
-
-	map_fd = open_proc_rw(ctl->pid.real, "map_files/%p-%p",
-			      ctl->remote_map, ctl->remote_map + ctl->map_length);
-	if (map_fd < 0)
-		return -1;
-
-	local = mmap(NULL, size, PROT_READ | PROT_WRITE,
-		     MAP_SHARED | MAP_FILE, map_fd, 0);
-	/* The descriptor is not needed once the mapping attempt is done. */
-	close(map_fd);
-
-	if (local == MAP_FAILED) {
-		ctl->local_map = NULL;
-		pr_perror("Can't map remote parasite map");
-		return -1;
-	}
-
-	ctl->local_map = local;
-	return 0;
-}
-
-#ifdef CONFIG_HAS_MEMFD
-/*
- * Build the parasite exchange area on top of a memfd created inside the
- * task: the memfd is mapped remotely with mmap_seized() and locally via
- * the task's fd/ entry.
- *
- * Returns 0 on success, 1 when memfd_create() reports -ENOSYS (the value
- * parasite_map_exchange() uses to fall back to the mmap scheme) and a
- * negative value on any other failure.
- */
-static int parasite_memfd_exchange(struct parasite_ctl *ctl, unsigned long size)
-{
- void *where = (void *)ctl->syscall_ip + BUILTIN_SYSCALL_SIZE;
- u8 orig_code[MEMFD_FNAME_SZ] = MEMFD_FNAME;
- pid_t pid = ctl->pid.real;
- unsigned long sret = -ENOSYS;
- int ret, fd, lfd;
-
- BUILD_BUG_ON(sizeof(orig_code) < sizeof(long));
-
- /*
-  * Place the memfd name at 'where'; presumably the bytes previously
-  * found there end up in orig_code -- TODO confirm against
-  * ptrace_swap_area().
-  */
- if (ptrace_swap_area(pid, where, (void *)orig_code, sizeof(orig_code))) {
- pr_err("Can't inject memfd args (pid: %d)\n", pid);
- return -1;
- }
-
- ret = syscall_seized(ctl, __NR_memfd_create, &sret,
- (unsigned long)where, 0, 0, 0, 0, 0);
-
- /* Put orig_code back before looking at the syscall result. */
- if (ptrace_poke_area(pid, orig_code, where, sizeof(orig_code))) {
- fd = (int)(long)sret;
- if (fd >= 0)
- syscall_seized(ctl, __NR_close, &sret, fd, 0, 0, 0, 0, 0);
- pr_err("Can't restore memfd args (pid: %d)\n", pid);
- return -1;
- }
-
- if (ret < 0)
- return ret;
-
- fd = (int)(long)sret;
- if (fd == -ENOSYS)
- return 1;
- if (fd < 0)
- return fd;
-
- /* 'fd' lives in the task, 'lfd' is the local handle to the same memfd. */
- ctl->map_length = round_up(size, page_size());
- lfd = open_proc_rw(ctl->pid.real, "fd/%d", fd);
- if (lfd < 0)
- goto err_cure;
-
- if (ftruncate(lfd, ctl->map_length) < 0) {
- pr_perror("Fail to truncate memfd for parasite");
- goto err_cure;
- }
-
- ctl->remote_map = mmap_seized(ctl, NULL, size,
- PROT_READ | PROT_WRITE | PROT_EXEC,
- MAP_FILE | MAP_SHARED, fd, 0);
- if (!ctl->remote_map) {
- pr_err("Can't rmap memfd for parasite blob\n");
- goto err_curef;
- }
-
- ctl->local_map = mmap(NULL, size, PROT_READ | PROT_WRITE,
- MAP_SHARED | MAP_FILE, lfd, 0);
- if (ctl->local_map == MAP_FAILED) {
- ctl->local_map = NULL;
- pr_perror("Can't lmap memfd for parasite blob");
- goto err_curef;
- }
-
- /* Both mappings exist, so neither descriptor is needed any more. */
- syscall_seized(ctl, __NR_close, &sret, fd, 0, 0, 0, 0, 0);
- close(lfd);
-
- pr_info("Set up parasite blob using memfd\n");
- return 0;
-
-err_curef:
- close(lfd);
-err_cure:
- syscall_seized(ctl, __NR_close, &sret, fd, 0, 0, 0, 0, 0);
- return -1;
-}
-#else
-/* Built without memfd support: always ask the caller to fall back. */
-static int parasite_memfd_exchange(struct parasite_ctl *ctl, unsigned long size)
-{
- return 1;
-}
-#endif
-
-/*
- * Set up the local/remote exchange area of @size bytes, preferring the
- * memfd scheme and using the legacy mmap one when memfd is unavailable.
- */
-int parasite_map_exchange(struct parasite_ctl *ctl, unsigned long size)
-{
-	int rc = parasite_memfd_exchange(ctl, size);
-
-	/* Anything but 1 is a final answer (success or hard error). */
-	if (rc != 1)
-		return rc;
-
-	pr_info("MemFD parasite doesn't work, goto legacy mmap\n");
-	return parasite_mmap_exchange(ctl, size);
-}
-
-/* Largest arguments area requested so far for the next infected task. */
-static unsigned long parasite_args_size = PARASITE_ARG_SIZE_MIN;
-
-/* Grow the pending arguments area size to at least @sz; never shrinks it. */
-void parasite_ensure_args_size(unsigned long sz)
-{
-	if (sz > parasite_args_size)
-		parasite_args_size = sz;
-}
-
-/*
- * Collect the leader's registers, build its sigframe and switch the
- * parasite into daemon mode. Returns 0 on success, -1 on failure.
- */
-static int parasite_start_daemon(struct parasite_ctl *ctl, struct pstree_item *item)
-{
-	pid_t pid = ctl->pid.real;
-
-	/*
-	 * Registers have to be fetched before going daemon: get_task_regs
-	 * needs to call ptrace on a _stopped_ task, and a task running
-	 * the daemon is not stopped.
-	 */
-	if (get_task_regs(pid, ctl->orig.regs, item->core[0])) {
-		pr_err("Can't obtain regs for thread %d\n", pid);
-		return -1;
-	}
-
-	if (construct_sigframe(ctl->sigframe, ctl->rsigframe, item->core[0]) ||
-	    parasite_init_daemon(ctl, dmpi(item)->netns))
-		return -1;
-
-	return 0;
-}
-
-/*
- * Inject the parasite blob into the seized task @pid and start it in
- * daemon mode.
- *
- * @pid must be the real pid of the first thread of @item (enforced with
- * BUG_ON). Returns the prepared control block, or NULL on failure; once
- * the control block exists, failures go through parasite_cure_seized().
- */
-struct parasite_ctl *parasite_infect_seized(pid_t pid, struct pstree_item *item,
- struct vm_area_list *vma_area_list)
-{
- int ret;
- struct parasite_ctl *ctl;
- unsigned long p, map_exchange_size;
-
- BUG_ON(item->threads[0].real != pid);
-
- ctl = parasite_prep_ctl(pid, vma_area_list);
- if (!ctl)
- return NULL;
-
- parasite_ensure_args_size(dump_pages_args_size(vma_area_list));
- parasite_ensure_args_size(aio_rings_args_size(vma_area_list));
-
- /*
- * Inject a parasite engine. Ie allocate memory inside alien
- * space and copy engine code there. Then re-map the engine
- * locally, so we will get an easy way to access engine memory
- * without using ptrace at all.
- */
-
- /*
-  * Exchange area layout: blob, arguments, sigframe area, stack and,
-  * for multi-threaded tasks, one more stack.
-  */
- ctl->args_size = round_up(parasite_args_size, PAGE_SIZE);
- parasite_args_size = PARASITE_ARG_SIZE_MIN; /* reset for next task */
- map_exchange_size = pie_size(parasite_blob) + ctl->args_size;
- map_exchange_size += RESTORE_STACK_SIGFRAME + PARASITE_STACK_SIZE;
- if (item->nr_threads > 1)
- map_exchange_size += PARASITE_STACK_SIZE;
-
- memcpy(&item->core[0]->tc->blk_sigset, &ctl->orig.sigmask, sizeof(k_rtsigset_t));
-
- ret = parasite_map_exchange(ctl, map_exchange_size);
- if (ret)
- goto err_restore;
-
- pr_info("Putting parasite blob into %p->%p\n", ctl->local_map, ctl->remote_map);
- memcpy(ctl->local_map, parasite_blob, sizeof(parasite_blob));
-
- ELF_RELOCS_APPLY_PARASITE(ctl->local_map, ctl->remote_map);
-
- /* Setup the rest of a control block */
- ctl->parasite_ip = (unsigned long)parasite_sym(ctl->remote_map, __export_parasite_head_start);
- ctl->addr_cmd = parasite_sym(ctl->local_map, __export_parasite_cmd);
- ctl->addr_args = parasite_sym(ctl->local_map, __export_parasite_args);
-
- /* 'p' walks the exchange area in the same order it was sized above. */
- p = pie_size(parasite_blob) + ctl->args_size;
-
- ctl->rsigframe = ctl->remote_map + p;
- ctl->sigframe = ctl->local_map + p;
-
- /* rstack is set to the end of the stack region, not its start. */
- p += RESTORE_STACK_SIGFRAME;
- p += PARASITE_STACK_SIZE;
- ctl->rstack = ctl->remote_map + p;
-
- if (item->nr_threads > 1) {
- p += PARASITE_STACK_SIZE;
- ctl->r_thread_stack = ctl->remote_map + p;
- }
-
- if (parasite_start_daemon(ctl, item))
- goto err_restore;
-
- return ctl;
-
-err_restore:
- parasite_cure_seized(ctl);
- return NULL;
-}
-
-/*
- * Arrange for the task @pid to stop around the PIE code at @addr.
- *
- * On success returns 0 and stores in @tf which syscall stop to wait for;
- * on failure returns a negative value.
- */
-int ptrace_stop_pie(pid_t pid, void *addr, enum trace_flags *tf)
-{
-	int rc;
-
-	rc = ptrace_set_breakpoint(pid, addr);
-	if (rc < 0)
-		return rc;
-
-	if (rc == 0) {
-		/*
-		 * No breakpoints available -- start tracing it
-		 * in a per-syscall manner.
-		 */
-		rc = ptrace(PTRACE_SYSCALL, pid, NULL, NULL);
-		if (rc) {
-			pr_perror("Unable to restart the %d process", pid);
-			return -1;
-		}
-
-		*tf = TRACE_ENTER;
-		return 0;
-	}
-
-	/*
-	 * PIE will stop on a breakpoint, next
-	 * stop after that will be syscall enter.
-	 */
-	*tf = TRACE_EXIT;
-	return 0;
-}
diff --git a/pie/Makefile b/pie/Makefile
deleted file mode 100644
index fd48da92ea43..000000000000
--- a/pie/Makefile
+++ /dev/null
@@ -1,132 +0,0 @@
-targets += parasite
-targets += restorer
-
-# used by obj-x to identify shared files built for parasite/restorer binaries
-xsuffix := -pie-build
-
-obj-y += log-simple.o
-obj-x += util.o
-obj-x += util-fd.o
-
-ifeq ($(VDSO),y)
-obj-x += util-vdso.o
-obj-y += parasite-vdso.o
-obj-e += $(ARCH_DIR)/vdso-pie.o
-ifeq ($(SRCARCH),aarch64)
-asm-e += $(ARCH_DIR)/intraprocedure.o
-endif
-ifeq ($(SRCARCH), ppc64)
-asm-e += $(ARCH_DIR)/vdso-trampoline.o
-endif
-endif
-
-ifeq ($(SRCARCH), ppc64)
-asm-e += $(ARCH_DIR)/memcpy_power7.o
-asm-e += $(ARCH_DIR)/memcmp_64.o
-asm-e += $(ARCH_DIR)/misc.o
-endif
-
-parasite-obj-y += parasite.o
-parasite-asm-e += $(ARCH_DIR)/parasite-head.o
-parasite-libs-e += $(SYSCALL-LIB)
-
-restorer-obj-y += restorer.o
-restorer-obj-e += $(ARCH_DIR)/restorer.o
-restorer-libs-e += $(SYSCALL-LIB)
-
-#
-# We can't provide proper mount implementation
-# in parasite code -- it requires run-time relocation
-# applications, which is not the target of the
-# project.
-#
-CFLAGS := $(filter-out -pg,$(CFLAGS)) -iquote pie/piegen
-
-ifneq ($(filter-out i386 ia32, $(ARCH)),)
-cflags-y += -DCR_NOGLIBC -fpie -Wa,--noexecstack -fno-stack-protector
-else
-cflags-y += -DCR_NOGLIBC -fno-pic -Wa,--noexecstack -fno-stack-protector
-endif
-
-ifeq ($(SRCARCH), arm)
- cflags-y += -marm
-endif
-ASMFLAGS += -D__ASSEMBLY__
-
-GEN-OFFSETS := ../scripts/gen-offsets.sh
-BLOBS := $(obj)/parasite-blob.h $(obj)/restorer-blob.h
-
-PIELDS := pie.lds.S
-
-.SECONDARY:
-
-ifeq ($(piegen-y),y)
-ldflags-y += -r
-target-name = $(patsubst pie/%-blob.h,%,$(1))
-
-ifeq ($(SRCARCH),ppc64)
-$(obj)/$(PIELDS): $(obj)/pie-reloc.lds.S.in
- $(E) " GEN " $@
- $(Q) echo "OUTPUT_ARCH($(LDARCH))" > $(obj)/$(PIELDS)
- $(Q) cat $< >> $(obj)/$(PIELDS)
-else
-ifeq ($(ARCH),x86)
-$(obj)/$(PIELDS): $(obj)/pie-reloc.lds.S.in
- $(E) " GEN " $@
- $(Q) echo "OUTPUT_ARCH(i386:x86-64)" > $(obj)/$(PIELDS)
- $(Q) echo "TARGET(elf64-x86-64)" >> $(obj)/$(PIELDS)
- $(Q) cat $< >> $(obj)/$(PIELDS)
-else # i386 ia32
-$(obj)/$(PIELDS): $(obj)/pie-reloc.lds.S.in
- $(E) " GEN " $@
- $(Q) echo "OUTPUT_ARCH(i386)" > $(obj)/$(PIELDS)
- $(Q) echo "TARGET(elf32-i386)" >> $(obj)/$(PIELDS)
- $(Q) cat $< >> $(obj)/$(PIELDS)
-endif
-endif
-
-ifeq ($(strip $(V)),)
-piegen_stdout = >/dev/null
-endif
-
-$(obj)/%.built-in.bin.o: $(obj)/%.built-in.o $(obj)/$(PIELDS)
- $(E) " GEN " $@
- $(Q) $(LD) $(ldflags-y) -T $(obj)/$(PIELDS) -o $@ $<
-
-$(obj)/%-blob.h: $(obj)/%.built-in.bin.o $(obj)/$(PIELDS) pie/piegen
- $(E) " GEN " $@
- $(Q) pie/piegen/piegen -f $< -v $(call target-name,$@)_relocs -p $(call target-name,$@)_blob_offset__ -s $(call target-name,$@)_blob -o $@ $(piegen_stdout)
-
-else
-
-$(obj)/$(PIELDS): $(obj)/$(PIELDS).in
- $(E) " GEN " $@
- $(Q) $(SH) -c "echo 'OUTPUT_ARCH($(LDARCH))' > $(obj)/$(PIELDS)"
- $(Q) $(SH) -c "cat $(obj)/$(PIELDS).in >> $(obj)/$(PIELDS)"
-
-$(obj)/%.built-in.bin.o: $(obj)/%.built-in.o $(obj)/$(PIELDS)
- $(E) " GEN " $@
- $(Q) $(LD) $(ldflags-y) -T $(obj)/$(PIELDS) -o $@ $<
-
-$(obj)/%.built-in.bin: $(obj)/%.built-in.bin.o
- $(E) " GEN " $@
- $(Q) $(OBJCOPY) -O binary $^ $@
-
-$(obj)/%-blob.h: $(obj)/%.built-in.bin $(obj)/$(GEN-OFFSETS)
- $(E) " GEN " $@
- $(Q) $(SH) $(obj)/$(GEN-OFFSETS) $(@:-blob.h=) $(notdir $(@:-blob.h=)) $(CROSS_COMPILE) > $@
-
-endif
-
-$(BLOBS): $(obj)/$(PIELDS)
-_all += $(BLOBS)
-
-cleanup-y += $(obj)/$(PIELDS)
-cleanup-y += $(obj)/*.bin
-cleanup-y += $(BLOBS)
-cleanup-y += $(obj)/*.built-in.bin.o
-cleanup-y += $(obj)/*.built-in.bin
-
-ifneq ($(MAKECMDGOALS),clean)
-incdeps := y
-endif
diff --git a/pie/log-simple.c b/pie/log-simple.c
deleted file mode 100644
index 1cc877d2817f..000000000000
--- a/pie/log-simple.c
+++ /dev/null
@@ -1,291 +0,0 @@
-#include <stdarg.h>
-
-#include "asm/bitsperlong.h"
-
-#include "syscall.h"
-#include "log.h"
-
-/* Fixed-size accumulation buffer used by the simple printf implementation. */
-struct simple_buf {
- char buf[LOG_SIMPLE_CHUNK]; /* storage for the formatted text */
- char *bp; /* next free position inside buf */
- void (*flush)(struct simple_buf *b); /* called when buf fills up; may be NULL */
-};
-
-/* Descriptor log chunks are written to; replaced by log_set_fd(). */
-static int logfd = -1;
-/* Messages with a level above this one are dropped by print_on_level(). */
-static int cur_loglevel = DEFAULT_LOGLEVEL;
-
-static void sbuf_log_flush(struct simple_buf *b);
-
-/* Reset @b to hold just the "pie: " prefix and flush to the log fd. */
-static void sbuf_log_init(struct simple_buf *b)
-{
-	char *p = b->buf;
-
-	/* Stored byte by byte, exactly five prefix characters. */
-	*p++ = 'p';
-	*p++ = 'i';
-	*p++ = 'e';
-	*p++ = ':';
-	*p++ = ' ';
-
-	b->bp = p;
-	b->flush = sbuf_log_flush;
-}
-
-/* Write the accumulated text to the log fd unless only the prefix is there. */
-static void sbuf_log_flush(struct simple_buf *b)
-{
-	if (b->bp != b->buf + 5) {
-		sys_write(logfd, b->buf, b->bp - b->buf);
-		sbuf_log_init(b);
-	}
-}
-
-/*
- * Append @c to @b. When only two bytes of room remain, a ">\n"
- * continuation marker is appended and the flush callback, if any, runs.
- * Characters arriving at a full buffer are silently dropped.
- */
-static void sbuf_putc(struct simple_buf *b, char c)
-{
-	/* TODO: maybe some warning or error here? */
-	if (b->bp - b->buf >= LOG_SIMPLE_CHUNK)
-		return;
-
-	*b->bp++ = c;
-
-	if (b->bp - b->buf < LOG_SIMPLE_CHUNK - 2)
-		return;
-
-	*b->bp++ = '>';
-	*b->bp++ = '\n';
-	if (b->flush)
-		b->flush(b);
-}
-
-/* Make @fd the log descriptor; the one used before is closed. */
-void log_set_fd(int fd)
-{
-	int old_fd = logfd;
-
-	sys_close(old_fd);
-	logfd = fd;
-}
-
-/* Set the most verbose level that print_on_level() still emits. */
-void log_set_loglevel(unsigned int level)
-{
-	cur_loglevel = (int)level;
-}
-
-/* Append the NUL-terminated string @msg to @b, one character at a time. */
-static void print_string(const char *msg, struct simple_buf *b)
-{
-	for (; *msg; msg++)
-		sbuf_putc(b, *msg);
-}
-
-/*
- * Render @num in decimal, right-aligned so the last digit lands in
- * buf[blen - 1]. No terminator is written.
- *
- * @buf:  destination, at least @blen bytes (11 suffice for any 32-bit int)
- * @blen: number of usable bytes in @buf
- * @num:  value to print
- * @ps:   receives a pointer to the first character produced
- *
- * Returns the number of characters produced.
- */
-int vprint_num(char *buf, int blen, int num, char **ps)
-{
-	/*
-	 * Work on the unsigned magnitude: negating INT_MIN as a signed
-	 * value overflows.
-	 */
-	unsigned int mag = num;
-	char *s = buf + blen;
-
-	if (num < 0)
-		mag = 0U - mag;
-
-	/* Pre-decrement keeps 's' inside buf even when buf[0] is used. */
-	do {
-		*--s = (mag % 10) + '0';
-		mag /= 10;
-	} while (mag > 0);
-
-	if (num < 0)
-		*--s = '-';
-
-	*ps = s;
-	return blen - (s - buf);
-}
-
-/* Append the decimal form of @num to @b. */
-static void print_num(int num, struct simple_buf *b)
-{
-	char digits[12];
-	char *start;
-
-	digits[sizeof(digits) - 1] = '\0';
-	vprint_num(digits, sizeof(digits) - 1, num, &start);
-	print_string(start, b);
-}
-
-/* Append the decimal form of the long value @num to @b. */
-static void print_num_l(long num, struct simple_buf *b)
-{
-	/*
-	 * Work on the unsigned magnitude: negating LONG_MIN as a signed
-	 * value overflows.
-	 */
-	unsigned long mag = num;
-	char buf[22], *s = &buf[21];
-
-	*s = '\0';
-
-	if (num < 0)
-		mag = 0UL - mag;
-
-	do {
-		*--s = (mag % 10) + '0';
-		mag /= 10;
-	} while (mag > 0);
-
-	if (num < 0)
-		*--s = '-';
-
-	print_string(s, b);
-}
-
-/*
- * Store the hex digit for the low nibble of @v at @to; when that digit
- * is not '0', remember its position in *@z.
- */
-static void hexdigit(unsigned int v, char *to, char **z)
-{
-	char digit = "0123456789abcdef"[v & 0xf];
-
-	*to = digit;
-	if (digit != '0')
-		*z = to;
-}
-
-/* Append @num to @b as "0x" followed by hex digits without leading zeros. */
-static void print_hex(unsigned int num, struct simple_buf *b)
-{
-	char buf[11], *z = &buf[9];
-	unsigned int i;
-
-	buf[10] = '\0';
-
-	/* Lowest nibble first, filling the buffer from right to left. */
-	for (i = 0; i < 8; i++)
-		hexdigit(num >> (4 * i), &buf[9 - i], &z);
-
-	/* z points at the most significant non-zero digit (or the last one). */
-	z -= 2;
-	z[0] = '0';
-	z[1] = 'x';
-
-	print_string(z, b);
-}
-
-/* Long flavour of print_hex(): 8 nibbles, or 16 when BITS_PER_LONG is 64. */
-static void print_hex_l(unsigned long num, struct simple_buf *b)
-{
-	char buf[19], *z = &buf[17];
-	unsigned int nibbles = 8;
-	unsigned int i;
-
-#if BITS_PER_LONG == 64
-	nibbles = 16;
-#endif
-
-	buf[18] = '\0';
-
-	/* Lowest nibble first, filling the buffer from right to left. */
-	for (i = 0; i < nibbles; i++)
-		hexdigit(num >> (4 * i), &buf[17 - i], &z);
-
-	z -= 2;
-	z[0] = '0';
-	z[1] = 'x';
-
-	print_string(z, b);
-}
-
-/*
- * Minimal printf: expands @format into @b.
- *
- * Understood conversions are %s, %d, %x and %p; an 'l' or 'll' prefix
- * selects the long variants of %d and %x. Anything else is reported
- * inline as "UNKNOWN FORMAT " followed by the offending character and
- * does not consume an argument.
- */
-void sbuf_printf(struct simple_buf *b, const char *format, va_list args)
-{
-	const char *s = format;
-
-	while (*s != '\0') {
-		int along = 0;
-
-		if (*s != '%') {
-			sbuf_putc(b, *s);
-			s++;
-			continue;
-		}
-
-		s++;
-		if (*s == 'l') {
-			along = 1;
-			s++;
-			if (*s == 'l')
-				s++;
-		}
-
-		switch (*s) {
-		case '\0':
-			/*
-			 * Format ends in the middle of a conversion; stop
-			 * here instead of stepping past the terminator.
-			 */
-			return;
-		case 's':
-			print_string(va_arg(args, char *), b);
-			break;
-		case 'd':
-			if (along)
-				print_num_l(va_arg(args, long), b);
-			else
-				print_num(va_arg(args, int), b);
-			break;
-		case 'x':
-			if (along)
-				print_hex_l(va_arg(args, long), b);
-			else
-				print_hex(va_arg(args, unsigned int), b);
-			break;
-		case 'p':
-			print_hex_l((unsigned long)va_arg(args, void *), b);
-			break;
-		default:
-			print_string("UNKNOWN FORMAT ", b);
-			sbuf_putc(b, *s);
-			break;
-		}
-		s++;
-	}
-}
-
-/*
- * Format a message and write it to the log fd, provided @loglevel does
- * not exceed the level set with log_set_loglevel().
- */
-void print_on_level(unsigned int loglevel, const char *format, ...)
-{
-	if (loglevel <= cur_loglevel) {
-		struct simple_buf b;
-		va_list args;
-
-		sbuf_log_init(&b);
-
-		va_start(args, format);
-		sbuf_printf(&b, format, args);
-		va_end(args);
-
-		sbuf_log_flush(&b);
-	}
-}
-
-/*
- * Format into @output, which must hold LOG_SIMPLE_CHUNK bytes. The
- * result is always NUL-terminated; text that does not fit is cut.
- */
-void simple_sprintf(char output[LOG_SIMPLE_CHUNK], const char *format, ...)
-{
-	va_list args;
-	struct simple_buf b;
-	char *p;
-
-	b.bp = b.buf;
-	b.flush = NULL;
-
-	va_start(args, format);
-	sbuf_printf(&b, format, args);
-	va_end(args);
-
-	/*
-	 * With no flush callback sbuf_putc() can leave bp one past the
-	 * end of buf. Pull it back so that neither the terminator nor
-	 * the copy below goes out of bounds.
-	 */
-	if (b.bp - b.buf >= LOG_SIMPLE_CHUNK)
-		b.bp = b.buf + LOG_SIMPLE_CHUNK - 1;
-	*b.bp = 0;
-
-	for (p = b.buf; p <= b.bp; p++)
-		output[p - b.buf] = *p;
-}
diff --git a/pie/parasite-vdso.c b/pie/parasite-vdso.c
deleted file mode 100644
index 9ee42e52875a..000000000000
--- a/pie/parasite-vdso.c
+++ /dev/null
@@ -1,218 +0,0 @@
-#include <stdlib.h>
-#include <stdio.h>
-#include <unistd.h>
-#include <elf.h>
-#include <fcntl.h>
-#include <errno.h>
-
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/mman.h>
-
-#include "asm/string.h"
-#include "asm/types.h"
-
-#include "syscall.h"
-#include "image.h"
-#include "parasite-vdso.h"
-#include "vma.h"
-#include "log.h"
-#include "bug.h"
-
-#ifdef LOG_PREFIX
-# undef LOG_PREFIX
-#endif
-#define LOG_PREFIX "vdso: "
-
-
-/*
- * Move the @size bytes mapped at @from to exactly @to. @who only labels
- * the debug message. Returns 0 on success, -1 otherwise.
- */
-static int vdso_remap(char *who, unsigned long from, unsigned long to, size_t size)
-{
-	unsigned long got;
-
-	pr_debug("Remap %s %lx -> %lx\n", who, from, to);
-
-	got = sys_mremap(from, size, size, MREMAP_MAYMOVE | MREMAP_FIXED, to);
-	if (got == to)
-		return 0;
-
-	pr_err("Unable to remap %lx -> %lx %lx\n", from, to, got);
-	return -1;
-}
-
-/*
- * Park runtime vDSO in some safe place where it can be accessible from
- * restorer.
- *
- * The vdso and, when present, the vvar area are moved to @park_at back
- * to back, keeping their relative order. Returns 0 on success and a
- * non-zero value if any remap failed.
- */
-int vdso_do_park(struct vdso_symtable *sym_rt, unsigned long park_at, unsigned long park_size)
-{
-	int ret;
-
-	BUG_ON((vdso_vma_size(sym_rt) + vvar_vma_size(sym_rt)) < park_size);
-
-	/* vvar_start is checked against the vvar marker, as vdso_proxify() does. */
-	if (sym_rt->vvar_start != VVAR_BAD_ADDR) {
-		if (sym_rt->vma_start < sym_rt->vvar_start) {
-			ret = vdso_remap("rt-vdso", sym_rt->vma_start,
-					 park_at, vdso_vma_size(sym_rt));
-			park_at += vdso_vma_size(sym_rt);
-			ret |= vdso_remap("rt-vvar", sym_rt->vvar_start,
-					  park_at, vvar_vma_size(sym_rt));
-		} else {
-			ret = vdso_remap("rt-vvar", sym_rt->vvar_start,
-					 park_at, vvar_vma_size(sym_rt));
-			park_at += vvar_vma_size(sym_rt);
-			ret |= vdso_remap("rt-vdso", sym_rt->vma_start,
-					  park_at, vdso_vma_size(sym_rt));
-		}
-	} else
-		ret = vdso_remap("rt-vdso", sym_rt->vma_start,
-				 park_at, vdso_vma_size(sym_rt));
-	return ret;
-}
-
-/*
- * Make the vDSO described by the image entry vmas[index] (and a directly
- * following vdso/vvar entry, if any) usable in the restored task.
- *
- * If the image vdso matches the parked runtime one, the runtime vdso is
- * remapped in place of it; otherwise calls in the image vdso are
- * redirected to the runtime vdso parked at @vdso_rt_parked_at.
- *
- * Returns 0 on success and a non-zero value on failure.
- */
-int vdso_proxify(char *who, struct vdso_symtable *sym_rt,
- unsigned long vdso_rt_parked_at, size_t index,
- VmaEntry *vmas, size_t nr_vmas)
-{
- VmaEntry *vma_vdso = NULL, *vma_vvar = NULL;
- struct vdso_symtable s = VDSO_SYMTABLE_INIT;
- bool remap_rt = false;
-
- /*
- * Figure out which kind of vdso tuple we get.
- */
- if (vma_entry_is(&vmas[index], VMA_AREA_VDSO))
- vma_vdso = &vmas[index];
- else if (vma_entry_is(&vmas[index], VMA_AREA_VVAR))
- vma_vvar = &vmas[index];
-
- if (index < (nr_vmas - 1)) {
- if (vma_entry_is(&vmas[index + 1], VMA_AREA_VDSO))
- vma_vdso = &vmas[index + 1];
- else if (vma_entry_is(&vmas[index + 1], VMA_AREA_VVAR))
- vma_vvar = &vmas[index + 1];
- }
-
- if (!vma_vdso) {
- pr_err("Can't find vDSO area in image\n");
- return -1;
- }
-
- /*
- * vDSO mark overwrites Elf program header of proxy vDSO thus
- * it must never ever be greater in size.
- */
- BUILD_BUG_ON(sizeof(struct vdso_mark) > sizeof(Elf64_Phdr));
-
- /*
- * Find symbols in vDSO zone read from image.
- */
- if (vdso_fill_symtable((void *)vma_vdso->start, vma_entry_len(vma_vdso), &s))
- return -1;
-
- /*
- * Proxification strategy
- *
- * - There might be two vDSO zones: vdso code and optionally vvar data
- * - To be able to use in-place remapping we need
- *
- * a) Size and order of vDSO zones are to match
- * b) Symbols offsets must match
- * c) Have same number of vDSO zones
- */
- if (vma_entry_len(vma_vdso) == vdso_vma_size(sym_rt)) {
- size_t i;
-
- for (i = 0; i < ARRAY_SIZE(s.symbols); i++) {
- if (s.symbols[i].offset != sym_rt->symbols[i].offset)
- break;
- }
-
- if (i == ARRAY_SIZE(s.symbols)) {
- if (vma_vvar && sym_rt->vvar_start != VVAR_BAD_ADDR) {
- remap_rt = (vvar_vma_size(sym_rt) == vma_entry_len(vma_vvar));
- if (remap_rt) {
- long delta_rt = sym_rt->vvar_start - sym_rt->vma_start;
- long delta_this = vma_vvar->start - vma_vdso->start;
-
- /* Same sign of both deltas means same vdso/vvar order. */
- remap_rt = (delta_rt ^ delta_this) < 0 ? false : true;
- }
- } else
- remap_rt = true;
- }
- }
-
- pr_debug("image [vdso] %lx-%lx [vvar] %lx-%lx\n",
- vma_vdso->start, vma_vdso->end,
- vma_vvar ? vma_vvar->start : VVAR_BAD_ADDR,
- vma_vvar ? vma_vvar->end : VVAR_BAD_ADDR);
-
- /*
- * Easy case -- the vdso from image has same offsets, order and size
- * as runtime, so we simply remap runtime vdso to dumpee position
- * without generating any proxy.
- *
- * Note we may remap VVAR vdso as well which might not yet been mapped
- * by a caller code. So drop VMA_AREA_REGULAR from it and caller would
- * not touch it anymore.
- */
- if (remap_rt) {
- int ret = 0;
-
- pr_info("Runtime vdso/vvar matches dumpee, remap inplace\n");
-
- if (sys_munmap((void *)vma_vdso->start, vma_entry_len(vma_vdso))) {
- pr_err("Failed to unmap %s\n", who);
- return -1;
- }
-
- if (vma_vvar) {
- if (sys_munmap((void *)vma_vvar->start, vma_entry_len(vma_vvar))) {
- pr_err("Failed to unmap %s\n", who);
- return -1;
- }
-
- /* The parked areas sit back to back in the image's order. */
- if (vma_vdso->start < vma_vvar->start) {
- ret = vdso_remap(who, vdso_rt_parked_at, vma_vdso->start, vdso_vma_size(sym_rt));
- vdso_rt_parked_at += vdso_vma_size(sym_rt);
- ret |= vdso_remap(who, vdso_rt_parked_at, vma_vvar->start, vvar_vma_size(sym_rt));
- } else {
- ret = vdso_remap(who, vdso_rt_parked_at, vma_vvar->start, vvar_vma_size(sym_rt));
- vdso_rt_parked_at += vvar_vma_size(sym_rt);
- ret |= vdso_remap(who, vdso_rt_parked_at, vma_vdso->start, vdso_vma_size(sym_rt));
- }
- } else
- ret = vdso_remap(who, vdso_rt_parked_at, vma_vdso->start, vdso_vma_size(sym_rt));
-
- return ret;
- }
-
- /*
- * Now complex case -- we need to proxify calls. We redirect
- * calls from dumpee vdso to runtime vdso, making dumpee
- * to operate as proxy vdso.
- */
- pr_info("Runtime vdso mismatches dumpee, generate proxy\n");
-
- /*
- * Don't forget to shift if vvar is before vdso.
- *
- * NOTE(review): vvar_start is compared with VDSO_BAD_ADDR here but
- * with VVAR_BAD_ADDR earlier in this function -- confirm both
- * constants are meant to be interchangeable.
- */
- if (sym_rt->vvar_start != VDSO_BAD_ADDR &&
- sym_rt->vvar_start < sym_rt->vma_start)
- vdso_rt_parked_at += vvar_vma_size(sym_rt);
-
- if (vdso_redirect_calls(vdso_rt_parked_at,
- vma_vdso->start,
- sym_rt, &s)) {
- pr_err("Failed to proxify dumpee contents\n");
- return -1;
- }
-
- /*
- * Put a special mark into runtime vdso, thus at next checkpoint
- * routine we could detect this vdso and do not dump it, since
- * it's auto-generated every new session if proxy required.
- */
- sys_mprotect((void *)vdso_rt_parked_at, vdso_vma_size(sym_rt), PROT_WRITE);
- vdso_put_mark((void *)vdso_rt_parked_at, vma_vdso->start, vma_vvar ? vma_vvar->start : VVAR_BAD_ADDR);
- sys_mprotect((void *)vdso_rt_parked_at, vdso_vma_size(sym_rt), VDSO_PROT);
- return 0;
-}
diff --git a/pie/parasite.c b/pie/parasite.c
deleted file mode 100644
index 7b1e324c05b6..000000000000
--- a/pie/parasite.c
+++ /dev/null
@@ -1,727 +0,0 @@
-#include <sys/mman.h>
-#include <errno.h>
-#include <signal.h>
-#include <linux/limits.h>
-#include <linux/capability.h>
-#include <sys/mount.h>
-#include <stdarg.h>
-#include <sys/ioctl.h>
-
-#include "syscall.h"
-#include "parasite.h"
-#include "config.h"
-#include "fcntl.h"
-#include "prctl.h"
-#include "lock.h"
-#include "parasite-vdso.h"
-#include "log.h"
-#include "tty.h"
-
-#include <string.h>
-
-#include "asm/types.h"
-#include "asm/parasite.h"
-#include "asm/restorer.h"
-
-/* Socket used to exchange descriptors and control messages with the dumper. */
-static int tsock = -1;
-
-/* Signal frame fini() returns through when the parasite is done. */
-static struct rt_sigframe *sigframe;
-
-/*
- * PARASITE_CMD_DUMPPAGES is called many times and the parasite args contains
- * an array of VMAs at this time, so VMAs can be unprotected in any moment
- */
-static struct parasite_dump_pages_args *mprotect_args = NULL;
-
-#ifndef SPLICE_F_GIFT
-#define SPLICE_F_GIFT 0x08
-#endif
-
-#ifndef PR_GET_PDEATHSIG
-#define PR_GET_PDEATHSIG 2
-#endif
-
-/*
- * Apply "original protection | args->add_prot" to every VMA listed in
- * @args. The arguments are remembered in mprotect_args while extra
- * protection bits are in effect, so fini() can drop them again.
- *
- * Returns 0 on success or the failing sys_mprotect() result.
- */
-static int mprotect_vmas(struct parasite_dump_pages_args *args)
-{
-	struct parasite_vma_entry *vmas, *vma;
-	int ret = 0, i;
-
-	vmas = pargs_vmas(args);
-	for (i = 0; i < args->nr_vmas; i++) {
-		vma = vmas + i;
-		ret = sys_mprotect((void *)vma->start, vma->len, vma->prot | args->add_prot);
-		if (ret) {
-			/*
-			 * Only conversions the simple printf used by the
-			 * parasite understands: no width flags, no %lu.
-			 */
-			pr_err("mprotect(%lx, %ld) failed with code %d\n",
-			       (unsigned long)vma->start, (long)vma->len, ret);
-			break;
-		}
-	}
-
-	if (args->add_prot)
-		mprotect_args = args;
-	else
-		mprotect_args = NULL;
-
-	return ret;
-}
-
-/*
- * Receive a descriptor over tsock and vmsplice the requested iovec
- * segments into it. Returns 0 when exactly args->nr_pages pages went
- * through, -1 otherwise.
- */
-static int dump_pages(struct parasite_dump_pages_args *args)
-{
-	struct iovec *iovs;
-	int pipe_fd, spliced;
-
-	pipe_fd = recv_fd(tsock);
-	if (pipe_fd < 0)
-		return -1;
-
-	iovs = pargs_iovs(args);
-	spliced = sys_vmsplice(pipe_fd, &iovs[args->off], args->nr_segs,
-			       SPLICE_F_GIFT | SPLICE_F_NONBLOCK);
-
-	/* The descriptor is closed on both the success and the error path. */
-	sys_close(pipe_fd);
-
-	if (spliced != PAGE_SIZE * args->nr_pages) {
-		pr_err("Can't splice pages to pipe (%d/%d)\n", spliced, args->nr_pages);
-		return -1;
-	}
-
-	return 0;
-}
-
-/*
- * Read the current action of every signal in 1..SIGMAX except SIGKILL
- * and SIGSTOP into da->sas[sig - 1]. Stops at the first failure and
- * returns its (negative) result.
- */
-static int dump_sigact(struct parasite_dump_sa_args *da)
-{
-	int sig, ret = 0;
-
-	for (sig = 1; sig <= SIGMAX; sig++) {
-		if (sig != SIGKILL && sig != SIGSTOP) {
-			ret = sys_sigaction(sig, NULL, &da->sas[sig - 1], sizeof(k_rtsigset_t));
-			if (ret < 0) {
-				pr_err("sys_sigaction failed (%d)\n", ret);
-				break;
-			}
-		}
-	}
-
-	return ret;
-}
-
-/*
- * Fetch the REAL, VIRTUAL and PROF interval timers, in that order,
- * stopping at the first failing getitimer call.
- */
-static int dump_itimers(struct parasite_dump_itimers_args *args)
-{
-	int ret;
-
-	ret = sys_getitimer(ITIMER_REAL, &args->real);
-	if (ret)
-		goto fail;
-
-	ret = sys_getitimer(ITIMER_VIRTUAL, &args->virt);
-	if (ret)
-		goto fail;
-
-	ret = sys_getitimer(ITIMER_PROF, &args->prof);
-	if (ret)
-		goto fail;
-
-	return 0;
-
-fail:
-	pr_err("getitimer failed (%d)\n", ret);
-	return ret;
-}
-
-/*
- * Fill in the current value and overrun count of each of the
- * args->timer_n posix timers. A negative result of either syscall is
- * returned right away; otherwise the overrun count of the last timer
- * is returned (0 when there are no timers).
- */
-static int dump_posix_timers(struct parasite_dump_posix_timers_args *args)
-{
-	int idx = 0;
-	int ret = 0;
-
-	while (idx < args->timer_n) {
-		ret = sys_timer_gettime(args->timer[idx].it_id, &args->timer[idx].val);
-		if (ret < 0) {
-			pr_err("sys_timer_gettime failed (%d)\n", ret);
-			return ret;
-		}
-
-		args->timer[idx].overrun = sys_timer_getoverrun(args->timer[idx].it_id);
-		ret = args->timer[idx].overrun;
-		if (ret < 0) {
-			pr_err("sys_timer_getoverrun failed (%d)\n", ret);
-			return ret;
-		}
-
-		idx++;
-	}
-
-	return ret;
-}
-
-static int dump_creds(struct parasite_dump_creds *args);
-
-/*
- * Gather the per-thread state: TLS, tid address, alternate signal stack,
- * parent-death signal and credentials. Returns the first non-zero
- * result, or whatever dump_creds() returns.
- */
-static int dump_thread_common(struct parasite_dump_thread *ti)
-{
-	int ret;
-
-	arch_get_tls(&ti->tls);
-
-	ret = sys_prctl(PR_GET_TID_ADDRESS, (unsigned long) &ti->tid_addr, 0, 0, 0);
-	if (ret)
-		return ret;
-
-	ret = sys_sigaltstack(NULL, &ti->sas);
-	if (ret)
-		return ret;
-
-	ret = sys_prctl(PR_GET_PDEATHSIG, (unsigned long)&ti->pdeath_sig, 0, 0, 0);
-	if (ret)
-		return ret;
-
-	return dump_creds(ti->creds);
-}
-
-/*
- * Collect miscellaneous per-task values (brk, ids, umask, dumpable
- * flag) into @args. Always returns 0; syscall results are stored as is.
- */
-static int dump_misc(struct parasite_dump_misc *args)
-{
- args->brk = sys_brk(0);
-
- args->pid = sys_getpid();
- args->sid = sys_getsid();
- args->pgid = sys_getpgid(0);
- /* The umask is read by setting it to 0 and put back right after. */
- args->umask = sys_umask(0);
- sys_umask(args->umask); /* never fails */
- args->dumpable = sys_prctl(PR_GET_DUMPABLE, 0, 0, 0, 0);
-
- return 0;
-}
-
-/*
- * Collect the credentials of the calling thread into @args:
- * capability sets, secure bits, supplementary groups and the
- * real/effective/saved/fs uids and gids.
- *
- * args->cap_last_cap is an input: bounding-set bits above it are not
- * probed. Returns 0 on success, -1 on failure.
- */
-static int dump_creds(struct parasite_dump_creds *args)
-{
-	int ret, i, j;
-	struct cap_data data[_LINUX_CAPABILITY_U32S_3];
-	struct cap_header hdr = {_LINUX_CAPABILITY_VERSION_3, 0};
-
-	ret = sys_capget(&hdr, data);
-	if (ret < 0) {
-		pr_err("Unable to get capabilities: %d\n", ret);
-		return -1;
-	}
-
-	/*
-	 * Loop through the capability constants until we reach cap_last_cap.
-	 * The cap_bnd set is stored as a bitmask comprised of CR_CAP_SIZE number of
-	 * 32-bit uints, hence the inner loop from 0 to 32.
-	 */
-	for (i = 0; i < CR_CAP_SIZE; i++) {
-		args->cap_eff[i] = data[i].eff;
-		args->cap_prm[i] = data[i].prm;
-		args->cap_inh[i] = data[i].inh;
-		args->cap_bnd[i] = 0;
-
-		for (j = 0; j < 32; j++) {
-			if (j + i * 32 > args->cap_last_cap)
-				break;
-			ret = sys_prctl(PR_CAPBSET_READ, j + i * 32, 0, 0, 0);
-			if (ret < 0) {
-				pr_err("Unable to read capability %d: %d\n",
-				       j + i * 32, ret);
-				return -1;
-			}
-			/* Unsigned one: bit 31 must not shift into a sign bit. */
-			if (ret)
-				args->cap_bnd[i] |= (1U << j);
-		}
-	}
-
-	args->secbits = sys_prctl(PR_GET_SECUREBITS, 0, 0, 0, 0);
-
-	/* First call only asks for the number of supplementary groups. */
-	ret = sys_getgroups(0, NULL);
-	if (ret < 0)
-		goto grps_err;
-
-	args->ngroups = ret;
-	if (args->ngroups >= PARASITE_MAX_GROUPS) {
-		pr_err("Too many groups in task %d\n", (int)args->ngroups);
-		return -1;
-	}
-
-	ret = sys_getgroups(args->ngroups, args->groups);
-	if (ret < 0)
-		goto grps_err;
-
-	if (ret != args->ngroups) {
-		pr_err("Groups changed on the fly %d -> %d\n",
-		       args->ngroups, ret);
-		return -1;
-	}
-
-	ret = sys_getresuid(&args->uids[0], &args->uids[1], &args->uids[2]);
-	if (ret) {
-		pr_err("Unable to get uids: %d\n", ret);
-		return -1;
-	}
-
-	/* The result of setfsuid(-1) is stored as the fsuid. */
-	args->uids[3] = sys_setfsuid(-1L);
-
-	ret = sys_getresgid(&args->gids[0], &args->gids[1], &args->gids[2]);
-	if (ret) {
-		pr_err("Unable to get gids: %d\n", ret);
-		return -1;
-	}
-
-	args->gids[3] = sys_setfsgid(-1L);
-
-	return 0;
-
-grps_err:
-	pr_err("Error calling getgroups (%d)\n", ret);
-	return -1;
-}
-
-/* Send the args->nr_fds descriptors listed in @args over tsock. */
-static int drain_fds(struct parasite_drain_fd *args)
-{
-	int ret = send_fds(tsock, NULL, 0, args->fds, args->nr_fds, true);
-
-	if (ret != 0)
-		pr_err("send_fds failed (%d)\n", ret);
-
-	return ret;
-}
-
-/* Record the caller's tid, then gather the common per-thread state. */
-static int dump_thread(struct parasite_dump_thread *args)
-{
-	int ret;
-
-	args->tid = sys_gettid();
-	ret = dump_thread_common(args);
-
-	return ret;
-}
-
-static char proc_mountpoint[] = "proc.crtools";
-
-/*
- * Open a /proc directory and send the descriptor over tsock.
- *
- * If /proc/self reads as "1" the existing /proc is opened; otherwise a
- * fresh procfs is mounted on a temporary directory and opened through
- * open_detach_mount().
- */
-static int parasite_get_proc_fd()
-{
-	int ret, fd = -1;
-	char buf[2];
-
-	ret = sys_readlinkat(AT_FDCWD, "/proc/self", buf, sizeof(buf));
-	if (ret < 0 && ret != -ENOENT) {
-		pr_err("Can't readlink /proc/self (%d)\n", ret);
-		return ret;
-	}
-
-	if (ret == 1 && buf[0] == '1') {
-		/* Fast path -- if /proc belongs to this pidns */
-		fd = sys_open("/proc", O_RDONLY, 0);
-	} else {
-		ret = sys_mkdir(proc_mountpoint, 0700);
-		if (ret) {
-			pr_err("Can't create a directory (%d)\n", ret);
-			return -1;
-		}
-
-		ret = sys_mount("proc", proc_mountpoint, "proc", MS_MGC_VAL, NULL);
-		if (ret) {
-			pr_err("mount failed (%d)\n", ret);
-			sys_rmdir(proc_mountpoint);
-			return -1;
-		}
-
-		fd = open_detach_mount(proc_mountpoint);
-	}
-
-	if (fd < 0)
-		return fd;
-
-	ret = send_fd(tsock, NULL, 0, fd);
-	sys_close(fd);
-	return ret;
-}
-
-/*
- * ioctl wrapper for tty queries: -ENOTTY is not an error, it just
- * zeroes *@arg. Returns 0 on success or the negative ioctl result.
- */
-static inline int tty_ioctl(int fd, int cmd, int *arg)
-{
-	int ret = sys_ioctl(fd, cmd, (unsigned long)arg);
-
-	if (ret >= 0)
-		return 0;
-
-	if (ret != -ENOTTY)
-		return ret;
-
-	*arg = 0;
-	return 0;
-}
-
-/*
- * Stolen from kernel/fs/aio.c
- *
- * Is it valid to go to memory and check it? Should be,
- * as libaio does the same.
- */
-
-#define AIO_RING_MAGIC 0xa10a10a1
-#define AIO_RING_COMPAT_FEATURES 1
-#define AIO_RING_INCOMPAT_FEATURES 0
-
-/* Header found at the start of an AIO ring; validated by sane_ring(). */
-struct aio_ring {
- unsigned id; /* kernel internal index number */
- unsigned nr; /* number of io_events */
- unsigned head; /* Written to by userland or under ring_lock
- * mutex by aio_read_events_ring(). */
- unsigned tail;
-
- unsigned magic;
- unsigned compat_features;
- unsigned incompat_features;
- unsigned header_length; /* size of aio_ring */
-
-
- /* struct io_event io_events[0]; */
-};
-
-/* Return 1 if the header of @ring carries the expected constants, else 0. */
-static int sane_ring(struct aio_ring *ring)
-{
-	if (ring->magic != AIO_RING_MAGIC)
-		return 0;
-	if (ring->compat_features != AIO_RING_COMPAT_FEATURES)
-		return 0;
-	if (ring->incompat_features != AIO_RING_INCOMPAT_FEATURES)
-		return 0;
-	if (ring->header_length != sizeof(struct aio_ring))
-		return 0;
-
-	return 1;
-}
-
-/*
- * Validate every AIO ring listed in @args and store its size (ring->nr)
- * in args->ring[i].max_reqs.
- *
- * Returns 0 on success, -1 if a ring header does not look valid or a
- * ring has head != tail.
- */
-static int parasite_check_aios(struct parasite_check_aios_args *args)
-{
-	int i;
-
-	for (i = 0; i < args->nr_rings; i++) {
-		struct aio_ring *ring;
-
-		ring = (struct aio_ring *)args->ring[i].ctx;
-		if (!sane_ring(ring)) {
-			pr_err("Not valid ring #%d\n", i);
-			pr_info(" `- magic %x\n", ring->magic);
-			pr_info(" `- cf %d\n", ring->compat_features);
-			pr_info(" `- if %d\n", ring->incompat_features);
-			/*
-			 * The size is cast and printed with %d, as other
-			 * messages in this file do for sizeof values.
-			 */
-			pr_info(" `- size %d (%d)\n", ring->header_length,
-				(int)sizeof(struct aio_ring));
-			return -1;
-		}
-
-		/*
-		 * XXX what else can we do if there are requests
-		 * in the ring?
-		 */
-		if (ring->head != ring->tail) {
-			pr_err("Pending AIO requests in ring #%d\n", i);
-			return -1;
-		}
-
-		args->ring[i].max_reqs = ring->nr;
-	}
-
-	return 0;
-}
-
-/*
- * Query session id, process group and the exclusive/packet/lock states
- * of the tty open at args->fd; the packet and lock states are queried
- * for TTY_TYPE__PTY only.
- *
- * Returns 0 on success. -EIO from any query is also reported as success
- * with args->hangup set; any other failure returns -1.
- */
-static int parasite_dump_tty(struct parasite_tty_args *args)
-{
- int ret;
-
-#ifndef TIOCGPKT
-# define TIOCGPKT _IOR('T', 0x38, int)
-#endif
-
-#ifndef TIOCGPTLCK
-# define TIOCGPTLCK _IOR('T', 0x39, int)
-#endif
-
-#ifndef TIOCGEXCL
-# define TIOCGEXCL _IOR('T', 0x40, int)
-#endif
-
- args->sid = 0;
- args->pgrp = 0;
- args->st_pckt = 0;
- args->st_lock = 0;
- args->st_excl = 0;
-
-/*
- * Run one query and dispatch on its result. tty_ioctl() already turns
- * -ENOTTY into 0, so the -ENOTTY branch below is not expected to trigger.
- */
-#define __tty_ioctl(cmd, arg) \
- do { \
- ret = tty_ioctl(args->fd, cmd, &arg); \
- if (ret < 0) { \
- if (ret == -ENOTTY) \
- arg = 0; \
- else if (ret == -EIO) \
- goto err_io; \
- else \
- goto err; \
- } \
- } while (0)
-
- __tty_ioctl(TIOCGSID, args->sid);
- __tty_ioctl(TIOCGPGRP, args->pgrp);
- __tty_ioctl(TIOCGEXCL, args->st_excl);
-
- if (args->type == TTY_TYPE__PTY) {
- __tty_ioctl(TIOCGPKT, args->st_pckt);
- __tty_ioctl(TIOCGPTLCK, args->st_lock);
- }
-
- args->hangup = false;
- return 0;
-
-err:
- pr_err("tty: Can't fetch params: err = %d\n", ret);
- return -1;
-err_io:
-
- /* kernel reports EIO for get ioctls on pair-less ptys */
- args->hangup = true;
- return 0;
-#undef __tty_ioctl
-}
-
-#ifdef CONFIG_VDSO
-/*
- * Check whether the area at args->start carries a vdso mark and report
- * what was found through @args. Returns 0, or -EINVAL when a mark of an
- * unexpected version is found.
- */
-static int parasite_check_vdso_mark(struct parasite_vdso_vma_entry *args)
-{
-	struct vdso_mark *m = (void *)args->start;
-
-	if (!is_vdso_mark(m)) {
-		args->is_marked = 0;
-		args->proxy_vdso_addr = VDSO_BAD_ADDR;
-		args->proxy_vvar_addr = VVAR_BAD_ADDR;
-
-		if (args->try_fill_symtable) {
-			struct vdso_symtable t;
-
-			/* The area counts as a vdso if its symbols can be parsed. */
-			args->is_vdso = !vdso_fill_symtable((void *)args->start, args->len, &t);
-		}
-
-		return 0;
-	}
-
-	/*
-	 * Make sure we don't meet some corrupted entry
-	 * where signature matches but version is not!
-	 */
-	if (m->version != VDSO_MARK_CUR_VERSION) {
-		pr_err("vdso: Mark version mismatch!\n");
-		return -EINVAL;
-	}
-
-	args->is_marked = 1;
-	args->proxy_vdso_addr = m->proxy_vdso_addr;
-	args->proxy_vvar_addr = m->proxy_vvar_addr;
-
-	return 0;
-}
-#else
-/* Built without vdso support: the command is not expected at all. */
-static inline int parasite_check_vdso_mark(struct parasite_vdso_vma_entry *args)
-{
-	pr_err("Unexpected VDSO check command\n");
-	return -1;
-}
-#endif
-
-/*
- * Send an acknowledgement for @cmd carrying result @err over the control
- * socket. Returns 0 when the whole message was sent, -1 otherwise.
- */
-static int __parasite_daemon_reply_ack(unsigned int cmd, int err)
-{
-	struct ctl_msg m;
-	int sent;
-
-	m = ctl_msg_ack(cmd, err);
-	sent = sys_sendto(tsock, &m, sizeof(m), 0, NULL, 0);
-	if (sent == sizeof(m)) {
-		pr_debug("__sent ack msg: %d %d %d\n",
-			 m.cmd, m.ack, m.err);
-		return 0;
-	}
-
-	pr_err("Sent only %d bytes while %zd expected\n", sent, sizeof(m));
-	return -1;
-}
-
-/*
- * Block until one complete control message arrives on the control socket
- * and store it in @m. Returns 0 on success, -1 when the received size does
- * not match sizeof(*m).
- */
-static int __parasite_daemon_wait_msg(struct ctl_msg *m)
-{
-	int got;
-
-	pr_debug("Daemon waits for command\n");
-
-	/* Start from a zeroed message so stale fields never leak through */
-	*m = (struct ctl_msg){ };
-	got = sys_recvfrom(tsock, m, sizeof(*m), MSG_WAITALL, NULL, 0);
-	if (got == sizeof(*m)) {
-		pr_debug("__fetched msg: %d %d %d\n",
-			 m->cmd, m->ack, m->err);
-		return 0;
-	}
-
-	pr_err("Trimmed message received (%d/%d)\n",
-	       (int)sizeof(*m), got);
-	return -1;
-}
-
-/*
- * Invoke ARCH_RT_SIGRETURN with @new_sp.
- *
- * The address of this function is handed out via args->sigreturn_addr in
- * parasite_init_daemon(), so it is kept as a real, non-inlined function.
- */
-static noinline void fini_sigreturn(unsigned long new_sp)
-{
- ARCH_RT_SIGRETURN(new_sp);
-}
-
-/*
- * Tear the parasite down: undo the protection changes recorded in
- * mprotect_args (if any), close the control socket, detach logging and
- * leave through fini_sigreturn(). Reaching the code after that call is
- * treated as a bug.
- */
-static int fini()
-{
-	unsigned long sp;
-
-	if (mprotect_args) {
-		/* Re-run mprotect_vmas() with no extra protection bits */
-		mprotect_args->add_prot = 0;
-		mprotect_vmas(mprotect_args);
-	}
-
-	sp = (long)sigframe + SIGFRAME_OFFSET;
-	pr_debug("%ld: new_sp=%lx ip %lx\n", sys_gettid(),
-		 sp, RT_SIGFRAME_REGIP(sigframe));
-
-	sys_close(tsock);
-	log_set_fd(-1);
-
-	fini_sigreturn(sp);
-
-	/* fini_sigreturn() is not expected to come back */
-	BUG();
-
-	return -1;
-}
-
-/*
- * Main loop of the parasite daemon.
- *
- * Acknowledges PARASITE_CMD_INIT_DAEMON, then serves commands received over
- * the control socket until PARASITE_CMD_FINI arrives or the socket fails.
- * Every served command is acknowledged with its result. Once a command has
- * failed, the socket is shut down for writing and any further command other
- * than PARASITE_CMD_FINI is rejected. Always leaves through fini().
- *
- * @args is passed unchanged to each command handler.
- */
-static noinline __used int parasite_daemon(void *args)
-{
-	struct ctl_msg m = { };
-	int ret = -1;
-
-	pr_debug("Running daemon thread leader\n");
-
-	/* Reply we're alive */
-	if (__parasite_daemon_reply_ack(PARASITE_CMD_INIT_DAEMON, 0))
-		goto out;
-
-	ret = 0;
-
-	while (1) {
-		if (__parasite_daemon_wait_msg(&m))
-			break;
-
-		/* After a failure only FINI is honoured */
-		if (ret && m.cmd != PARASITE_CMD_FINI) {
-			pr_err("Command rejected\n");
-			continue;
-		}
-
-		switch (m.cmd) {
-		case PARASITE_CMD_FINI:
-			goto out;
-		case PARASITE_CMD_DUMPPAGES:
-			ret = dump_pages(args);
-			break;
-		case PARASITE_CMD_MPROTECT_VMAS:
-			ret = mprotect_vmas(args);
-			break;
-		case PARASITE_CMD_DUMP_SIGACTS:
-			ret = dump_sigact(args);
-			break;
-		case PARASITE_CMD_DUMP_ITIMERS:
-			ret = dump_itimers(args);
-			break;
-		case PARASITE_CMD_DUMP_POSIX_TIMERS:
-			ret = dump_posix_timers(args);
-			break;
-		case PARASITE_CMD_DUMP_THREAD:
-			ret = dump_thread(args);
-			break;
-		case PARASITE_CMD_DUMP_MISC:
-			ret = dump_misc(args);
-			break;
-		case PARASITE_CMD_DRAIN_FDS:
-			ret = drain_fds(args);
-			break;
-		case PARASITE_CMD_GET_PROC_FD:
-			ret = parasite_get_proc_fd();
-			break;
-		case PARASITE_CMD_DUMP_TTY:
-			ret = parasite_dump_tty(args);
-			break;
-		case PARASITE_CMD_CHECK_AIOS:
-			ret = parasite_check_aios(args);
-			break;
-		case PARASITE_CMD_CHECK_VDSO_MARK:
-			ret = parasite_check_vdso_mark(args);
-			break;
-		default:
-			pr_err("Unknown command in parasite daemon thread leader: %d\n", m.cmd);
-			ret = -1;
-			break;
-		}
-
-		if (__parasite_daemon_reply_ack(m.cmd, ret))
-			break;
-
-		if (ret) {
-			pr_err("Close the control socket for writing\n");
-			sys_shutdown(tsock, SHUT_WR);
-		}
-	}
-
-out:
-	fini();
-
-	return 0;
-}
-
-/*
- * Unmap the parasite area described by @data (a struct
- * parasite_unmap_args). Not expected to return; see the comment below.
- */
-static noinline int unmap_itself(void *data)
-{
-	struct parasite_unmap_args *ua = data;
-
-	/*
-	 * This call to sys_munmap must never return. Instead, the controlling
-	 * process must trap us on the exit from munmap.
-	 */
-	sys_munmap(ua->parasite_start, ua->parasite_len);
-
-	BUG();
-	return -1;
-}
-
-/*
- * Bring the parasite up in daemon mode: publish the sigreturn helper
- * address, connect the control socket to the address given in @data,
- * receive the log descriptor and enter parasite_daemon().
- *
- * Every path ends in fini(); reaching the code after it is a bug.
- */
-static noinline __used int parasite_init_daemon(void *data)
-{
-	struct parasite_init_args *args = data;
-	int ret;
-
-	args->sigreturn_addr = fini_sigreturn;
-	sigframe = args->sigframe;
-
-	tsock = sys_socket(PF_UNIX, SOCK_SEQPACKET, 0);
-	if (tsock < 0) {
-		pr_err("Can't create socket: %d\n", tsock);
-		goto err;
-	}
-
-	ret = sys_connect(tsock, (struct sockaddr *)&args->h_addr, args->h_addr_len);
-	if (ret < 0) {
-		pr_err("Can't connect the control socket\n");
-		goto err;
-	}
-
-	/* The first thing received over the socket is the log descriptor */
-	ret = recv_fd(tsock);
-	if (ret < 0)
-		goto err;
-
-	log_set_fd(ret);
-	log_set_loglevel(args->log_level);
-
-	parasite_daemon(data);
-
-err:
-	fini();
-	BUG();
-
-	return -1;
-}
-
-/* Architectures may attach attributes to the entry point; default to none */
-#ifndef __parasite_entry
-# define __parasite_entry
-#endif
-
-/*
- * Entry point of the parasite blob: dispatch @cmd to its handler, passing
- * @args through. Returns the handler's result, or -EINVAL for a command
- * that is not handled here.
- */
-int __used __parasite_entry parasite_service(unsigned int cmd, void *args)
-{
-	int ret;
-
-	pr_info("Parasite cmd %d/%x process\n", cmd, cmd);
-
-	switch (cmd) {
-	case PARASITE_CMD_DUMP_THREAD:
-		ret = dump_thread(args);
-		break;
-	case PARASITE_CMD_INIT_DAEMON:
-		ret = parasite_init_daemon(args);
-		break;
-	case PARASITE_CMD_UNMAP:
-		ret = unmap_itself(args);
-		break;
-	default:
-		pr_err("Unknown command to parasite: %d\n", cmd);
-		ret = -EINVAL;
-		break;
-	}
-
-	return ret;
-}
diff --git a/pie/pie-reloc.lds.S.in b/pie/pie-reloc.lds.S.in
deleted file mode 100644
index 051d1d42740d..000000000000
--- a/pie/pie-reloc.lds.S.in
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * Linker script that merges the input sections of a PIE blob into a few
- * output sections and drops the sections listed under /DISCARD/.
- */
-SECTIONS
-{
- /* Code: .head.text input sections are listed first, then all other text */
- .text : {
- *(.head.text)
- *(.text*)
- }
-
- /* Writable data, with .bss folded into the same output section */
- .data : {
- *(.data*)
- *(.bss*)
- }
-
- /* Read-only data together with the GOT */
- .rodata : {
- *(.rodata*)
- *(.got*)
- }
-
- /* 8-byte aligned TOC output section */
- .toc : ALIGN(8) {
- *(.toc*)
- }
-
- /* Input sections that are dropped from the output */
- /DISCARD/ : {
- *(.debug*)
- *(.comment*)
- *(.note*)
- *(.group*)
- *(.eh_frame*)
- }
-/* Symbol set to the location counter after the sections above */
-__export_parasite_args = .;
-}
diff --git a/pie/pie-relocs.c b/pie/pie-relocs.c
deleted file mode 100644
index 7e825b2320d9..000000000000
--- a/pie/pie-relocs.c
+++ /dev/null
@@ -1,47 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdbool.h>
-#include <unistd.h>
-#include <stdint.h>
-#include <string.h>
-
-#include <fcntl.h>
-#include <elf.h>
-
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/mman.h>
-
-#include "asm-generic/int.h"
-
-#include "compiler.h"
-#include "piegen/uapi/types.h"
-#include "bug.h"
-
-/*
- * Apply the relocation table @elf_relocs (@nr_relocs entries) to the blob
- * copied at @mem, as if the blob were going to run at address @vbase.
- *
- * @size is the blob size; slots for GOTPCREL entries are written right
- * behind the blob, starting at mem + size, one long per such relocation.
- * An entry that is neither PIEGEN_TYPE_LONG nor PIEGEN_TYPE_INT is a bug.
- */
-__maybe_unused void elf_relocs_apply(void *mem, void *vbase, size_t size, elf_reloc_t *elf_relocs, size_t nr_relocs)
-{
-	size_t idx, nr_got = 0;
-
-	for (idx = 0; idx < nr_relocs; idx++) {
-		elf_reloc_t *r = &elf_relocs[idx];
-
-		if (r->type & PIEGEN_TYPE_LONG) {
-			long *where = mem + r->offset;
-			long *got = mem + size;
-
-			if (!(r->type & PIEGEN_TYPE_GOTPCREL)) {
-				*where = r->value + r->addend + (unsigned long)vbase;
-				continue;
-			}
-
-			/*
-			 * Store the absolute address in the next slot behind
-			 * the blob and patch the 32-bit field with the
-			 * distance from the relocation offset to that slot,
-			 * plus the addend.
-			 */
-			{
-				int *value = (int *)where;
-				int rel;
-
-				got[nr_got] = (long)vbase + r->value;
-				rel = (unsigned)((void *)&got[nr_got] - (void *)mem) - r->offset + r->addend;
-
-				*value = rel;
-				nr_got++;
-			}
-		} else if (r->type & PIEGEN_TYPE_INT) {
-			int *where = (mem + r->offset);
-
-			*where = r->value + r->addend + (unsigned long)vbase;
-		} else {
-			BUG();
-		}
-	}
-}
diff --git a/pie/pie-relocs.h b/pie/pie-relocs.h
deleted file mode 100644
index 1449ca630908..000000000000
--- a/pie/pie-relocs.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Helpers for sizing PIE blobs and applying their relocation tables.
- * With CONFIG_PIEGEN the relocations are applied at run time; without it
- * the apply macros expand to nothing.
- */
-#ifndef __PIE_RELOCS_H__
-#define __PIE_RELOCS_H__
-
-#include "piegen/uapi/types.h"
-
-#include "compiler.h"
-#include "config.h"
-
-#ifdef CONFIG_PIEGEN
-
-extern __maybe_unused void elf_relocs_apply(void *mem, void *vbase, size_t size,
- elf_reloc_t *elf_relocs, size_t nr_relocs);
-/* Blob size plus one long per GOTPCREL relocation, rounded up to a page */
-#define pie_size(__blob_name) (round_up(sizeof(__blob_name) + nr_gotpcrel * sizeof(long), page_size()))
-/* Apply the parasite blob's relocation table to the copy at __mem */
-#define ELF_RELOCS_APPLY_PARASITE(__mem, __vbase) \
- elf_relocs_apply(__mem, __vbase, sizeof(parasite_blob), \
- parasite_relocs, ARRAY_SIZE(parasite_relocs))
-/* Apply the restorer blob's relocation table to the copy at __mem */
-#define ELF_RELOCS_APPLY_RESTORER(__mem, __vbase) \
- elf_relocs_apply(__mem, __vbase, sizeof(restorer_blob), \
- restorer_relocs, ARRAY_SIZE(restorer_relocs))
-
-#else
-
-/* No relocation table: the blob size is just rounded up to a page */
-#define pie_size(__blob_name) (round_up(sizeof(__blob_name), page_size()))
-#define ELF_RELOCS_APPLY_PARASITE(__mem, __vbase)
-#define ELF_RELOCS_APPLY_RESTORER(__mem, __vbase)
-
-#endif
-
-#endif /* __PIE_RELOCS_H__ */
diff --git a/pie/pie.lds.S.in b/pie/pie.lds.S.in
deleted file mode 100644
index 9e9c97f003c3..000000000000
--- a/pie/pie.lds.S.in
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Linker script that packs a PIE blob into a single .crblob output
- * section placed at address 0 and discards every other input section.
- */
-SECTIONS
-{
- /* Each group of input sections is followed by padding to 32 bytes */
- .crblob 0x0 : {
- *(.head.text)
- *(.text*)
- . = ALIGN(32);
- *(.data*)
- . = ALIGN(32);
- *(.rodata*)
- . = ALIGN(32);
- *(.bss*)
- . = ALIGN(32);
- *(.got*)
- . = ALIGN(32);
- *(.toc*)
- . = ALIGN(32);
- } =0x00000000,
-
- /* The trailing *(*) drops everything not collected above */
- /DISCARD/ : {
- *(.debug*)
- *(.comment*)
- *(.note*)
- *(.group*)
- *(.eh_frame*)
- *(*)
- }
-
-/* Symbol set to the location counter after .crblob */
-__export_parasite_args = .;
-}
diff --git a/pie/piegen/Makefile b/pie/piegen/Makefile
deleted file mode 100644
index 5c3d68b84817..000000000000
--- a/pie/piegen/Makefile
+++ /dev/null
@@ -1,17 +0,0 @@
-# Build rules for the piegen tool.
-CFLAGS += -iquote pie/piegen
-
-obj-y += main.o
-# ELF handlers are selected per architecture
-ifneq ($(filter ia32 x86, $(ARCH)),)
-obj-y += elf-x86-32.o
-obj-y += elf-x86-64.o
-endif
-ifeq ($(SRCARCH),ppc64)
-obj-y += elf-ppc64.o
-endif
-
-# Files removed on clean
-cleanup-y += $(obj)/piegen
-cleanup-y += $(obj)/*.o
-
-# Dependency tracking is enabled for every goal except "clean"
-ifneq ($(MAKECMDGOALS),clean)
-incdeps := y
-endif
diff --git a/pie/piegen/elf-ppc64.c b/pie/piegen/elf-ppc64.c
deleted file mode 100644
index 472725f9fe7c..000000000000
--- a/pie/piegen/elf-ppc64.c
+++ /dev/null
@@ -1,16 +0,0 @@
-/*
- * PowerPC64 instantiation of the generic ELF handler: selects the 64-bit
- * ELF types and accessors, names the entry point handle_elf_ppc64, then
- * includes the shared implementation from elf.c.
- */
-#define ELF_PPC64
-#define handle_elf handle_elf_ppc64
-
-#define Ehdr_t Elf64_Ehdr
-#define Shdr_t Elf64_Shdr
-#define Sym_t Elf64_Sym
-#define Rel_t Elf64_Rel
-#define Rela_t Elf64_Rela
-
-#define ELF_ST_TYPE ELF64_ST_TYPE
-#define ELF_ST_BIND ELF64_ST_BIND
-
-#define ELF_R_SYM ELF64_R_SYM
-#define ELF_R_TYPE ELF64_R_TYPE
-
-#include "elf.c"
diff --git a/pie/piegen/elf-x86-32.c b/pie/piegen/elf-x86-32.c
deleted file mode 100644
index 413113ef396b..000000000000
--- a/pie/piegen/elf-x86-32.c
+++ /dev/null
@@ -1,16 +0,0 @@
-/*
- * 32-bit x86 instantiation of the generic ELF handler: selects the 32-bit
- * ELF types and accessors, names the entry point handle_elf_x86_32, then
- * includes the shared implementation from elf.c.
- */
-#define ELF_X86_32
-#define handle_elf handle_elf_x86_32
-
-#define Ehdr_t Elf32_Ehdr
-#define Shdr_t Elf32_Shdr
-#define Sym_t Elf32_Sym
-#define Rel_t Elf32_Rel
-#define Rela_t Elf32_Rela
-
-#define ELF_ST_TYPE ELF32_ST_TYPE
-#define ELF_ST_BIND ELF32_ST_BIND
-
-#define ELF_R_SYM ELF32_R_SYM
-#define ELF_R_TYPE ELF32_R_TYPE
-
-#include "elf.c"
diff --git a/pie/piegen/elf-x86-64.c b/pie/piegen/elf-x86-64.c
deleted file mode 100644
index 8ba26672bc82..000000000000
--- a/pie/piegen/elf-x86-64.c
+++ /dev/null
@@ -1,16 +0,0 @@
-/*
- * x86-64 instantiation of the generic ELF handler: selects the 64-bit
- * ELF types and accessors, names the entry point handle_elf_x86_64, then
- * includes the shared implementation from elf.c.
- */
-#define ELF_X86_64
-#define handle_elf handle_elf_x86_64
-
-#define Ehdr_t Elf64_Ehdr
-#define Shdr_t Elf64_Shdr
-#define Sym_t Elf64_Sym
-#define Rel_t Elf64_Rel
-#define Rela_t Elf64_Rela
-
-#define ELF_ST_TYPE ELF64_ST_TYPE
-#define ELF_ST_BIND ELF64_ST_BIND
-
-#define ELF_R_SYM ELF64_R_SYM
-#define ELF_R_TYPE ELF64_R_TYPE
-
-#include "elf.c"
diff --git a/pie/piegen/elf.c b/pie/piegen/elf.c
deleted file mode 100644
index c6b97257ba61..000000000000
--- a/pie/piegen/elf.c
+++ /dev/null
@@ -1,512 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdbool.h>
-#include <unistd.h>
-#include <stdint.h>
-#include <string.h>
-
-#include <fcntl.h>
-#include <elf.h>
-
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/mman.h>
-
-#include "asm-generic/int.h"
-
-#include "compiler.h"
-#include "piegen.h"
-
-/*
- * Tell whether @ptr lies outside the @size bytes starting at @start.
- *
- * The first byte past the region (start + size) is out of bounds too: the
- * pointers checked with this helper are about to be dereferenced.
- */
-static bool __ptr_oob(const void *ptr, const void *start, const size_t size)
-{
-	const void *end = (const void *)((const unsigned long)start + size);
-
-	return ptr >= end || ptr < start;
-}
-
-/*
- * Check @ptr against the region [@start, @start + @size) and log an error
- * naming the expression (@name) and the call site (@file:@line) when it
- * falls outside. Returns true for a bad pointer, false for a good one.
- */
-static bool test_pointer(const void *ptr, const void *start, const size_t size,
-			 const char *name, const char *file, const int line)
-{
-	if (!__ptr_oob(ptr, start, size))
-		return false;
-
-	pr_err("Corrupted pointer %p (%s) at %s:%d\n",
-	       ptr, name, file, line);
-	return true;
-}
-
-/*
- * Validate __ptr against the mapped file and bail out of the calling
- * function on failure: frees sec_hdrs and returns -1.
- *
- * Relies on `mem`, `size` and `sec_hdrs` being in scope at the call site.
- */
-#define ptr_func_exit(__ptr) \
- do { \
- if (test_pointer((__ptr), mem, size, #__ptr, \
- __FILE__, __LINE__)) { \
- free(sec_hdrs); \
- return -1; \
- } \
- } while (0)
-
-#ifdef ELF_PPC64
-/*
- * Patch the bits of *@location selected by @mask with the corresponding
- * bits of @value.
- *
- * With @complain_signed set, a @value for which value + 0x8000 exceeds
- * 0xffff is rejected. A @value with bits set inside the low 16 bits but
- * outside @mask is always rejected. Returns 0 on success, -1 on rejection.
- *
- * NOTE(review): the overflow test is a signed comparison, so large negative
- * values are not reported -- confirm whether that is intended.
- */
-static int do_relative_toc(long value, uint16_t *location,
-			   unsigned long mask, int complain_signed)
-{
-	unsigned long stray = (~mask & 0xffff) & value;
-
-	if (complain_signed && (value + 0x8000 > 0xffff)) {
-		pr_err("TOC16 relocation overflows (%ld)\n", value);
-		return -1;
-	}
-
-	if (stray) {
-		pr_err("bad TOC16 relocation (%ld) (0x%lx)\n", value, stray);
-		return -1;
-	}
-
-	*location = (*location & ~mask) | (value & mask);
-	return 0;
-}
-#endif
-
-/*
- * Parse the relocatable ELF object mapped at @mem (@size bytes) and emit a
- * C description of it through pr_out():
- *   - a #define for every symbol whose name starts with "__export",
- *   - a table of elf_reloc_t entries for relocations that must be applied
- *     at run time (PC-relative ones are patched in place in @mem instead),
- *   - the number of GOTPCREL relocations,
- *   - the bytes of all SHF_ALLOC sections as a char array, with zero bytes
- *     filling the gaps between section addresses.
- *
- * Returns 0 on success and -1 on any malformed or unsupported input.
- */
-int handle_elf(void *mem, size_t size)
-{
-	const char *symstrings = NULL;
-	Shdr_t *symtab_hdr = NULL;
-	Sym_t *symbols = NULL;
-	Ehdr_t *hdr = mem;
-
-	Shdr_t *secstrings_hdr = NULL;
-	Shdr_t *strtab_hdr = NULL;
-	Shdr_t **sec_hdrs = NULL;
-	const char *secstrings;
-
-	size_t i, k, nr_gotpcrel = 0;
-#ifdef ELF_PPC64
-	s64 toc_offset = 0;
-#endif
-
-	pr_debug("Header\n");
-	pr_debug("------------\n");
-	pr_debug("\ttype 0x%x machine 0x%x version 0x%x\n",
-		 (unsigned)hdr->e_type, (unsigned)hdr->e_machine, (unsigned)hdr->e_version);
-
-#ifdef ELF_X86_64
-	if (hdr->e_type != ET_REL || hdr->e_machine != EM_X86_64 || hdr->e_version != EV_CURRENT) {
-		pr_err("Unsupported header detected\n");
-		goto err;
-	}
-#endif
-
-#ifdef ELF_X86_32
-	if (hdr->e_type != ET_REL || hdr->e_machine != EM_386 || hdr->e_version != EV_CURRENT) {
-		pr_err("Unsupported header detected\n");
-		goto err;
-	}
-#endif
-
-	sec_hdrs = malloc(sizeof(*sec_hdrs) * hdr->e_shnum);
-	if (!sec_hdrs) {
-		pr_err("No memory for section headers\n");
-		goto err;
-	}
-
-	secstrings_hdr = mem + hdr->e_shoff + hdr->e_shentsize * hdr->e_shstrndx;
-	/* Validate the header pointer before reading sh_offset through it */
-	ptr_func_exit(secstrings_hdr);
-	secstrings = mem + secstrings_hdr->sh_offset;
-	ptr_func_exit(secstrings);
-
-	pr_debug("Sections\n");
-	pr_debug("------------\n");
-	for (i = 0; i < hdr->e_shnum; i++) {
-		Shdr_t *sh = mem + hdr->e_shoff + hdr->e_shentsize * i;
-		ptr_func_exit(sh);
-
-		if (sh->sh_type == SHT_SYMTAB)
-			symtab_hdr = sh;
-
-		ptr_func_exit(&secstrings[sh->sh_name]);
-		pr_debug("\t index %-2zd type 0x%-2x name %s\n", i,
-			 (unsigned)sh->sh_type, &secstrings[sh->sh_name]);
-
-		sec_hdrs[i] = sh;
-
-#ifdef ELF_PPC64
-		if (!strcmp(&secstrings[sh->sh_name], ".toc")) {
-			toc_offset = sh->sh_addr + 0x8000;
-			pr_debug("\t\tTOC offset 0x%lx\n", toc_offset);
-		}
-#endif
-	}
-
-	if (!symtab_hdr) {
-		pr_err("No symbol table present\n");
-		goto err;
-	}
-
-	if (!symtab_hdr->sh_link || symtab_hdr->sh_link >= hdr->e_shnum) {
-		pr_err("Corrupted symtab header\n");
-		goto err;
-	}
-
-	pr_debug("Symbols\n");
-	pr_debug("------------\n");
-	strtab_hdr = sec_hdrs[symtab_hdr->sh_link];
-	ptr_func_exit(strtab_hdr);
-
-	symbols = mem + symtab_hdr->sh_offset;
-	ptr_func_exit(symbols);
-	symstrings = mem + strtab_hdr->sh_offset;
-	ptr_func_exit(symstrings);
-
-	if (sizeof(*symbols) != symtab_hdr->sh_entsize) {
-		pr_err("Symbol table align differ\n");
-		goto err;
-	}
-
-	pr_out("/* Autogenerated from %s */\n", opts.input_filename);
-	pr_out("#include \"piegen/uapi/types.h\"\n");
-
-	/* Pass 1: export "__export*" symbols as #defines */
-	for (i = 0; i < symtab_hdr->sh_size / symtab_hdr->sh_entsize; i++) {
-		Sym_t *sym = &symbols[i];
-		const char *name;
-		Shdr_t *sh_src;
-
-		ptr_func_exit(sym);
-		name = &symstrings[sym->st_name];
-		ptr_func_exit(name);
-
-		if (*name) {
-			pr_debug("\ttype 0x%-2x bind 0x%-2x shndx 0x%-4x value 0x%-2lx name %s\n",
-				 (unsigned)ELF_ST_TYPE(sym->st_info), (unsigned)ELF_ST_BIND(sym->st_info),
-				 (unsigned)sym->st_shndx, (unsigned long)sym->st_value, name);
-#ifdef ELF_PPC64
-			if (!sym->st_value && !strncmp(name, ".TOC.", 6)) {
-				if (!toc_offset) {
-					pr_err("No TOC pointer\n");
-					goto err;
-				}
-				sym->st_value = toc_offset;
-				continue;
-			}
-#endif
-			if (strncmp(name, "__export", 8))
-				continue;
-			if ((sym->st_shndx && sym->st_shndx < hdr->e_shnum) || sym->st_shndx == SHN_ABS) {
-				if (sym->st_shndx == SHN_ABS) {
-					sh_src = NULL;
-				} else {
-					sh_src = sec_hdrs[sym->st_shndx];
-					ptr_func_exit(sh_src);
-				}
-				pr_out("#define %s%s 0x%lx\n",
-				       opts.prefix_name, name,
-				       (unsigned long)(sym->st_value + (sh_src ? sh_src->sh_addr : 0)));
-			}
-		}
-	}
-
-	pr_out("static __maybe_unused elf_reloc_t %s[] = {\n", opts.var_name);
-
-	/* Pass 2: walk every relocation section */
-	pr_debug("Relocations\n");
-	pr_debug("------------\n");
-	for (i = 0; i < hdr->e_shnum; i++) {
-		Shdr_t *sh = sec_hdrs[i];
-		Shdr_t *sh_rel;
-
-		if (sh->sh_type != SHT_REL && sh->sh_type != SHT_RELA)
-			continue;
-
-		/* sh_info indexes sec_hdrs[], so it must name an existing section */
-		if (sh->sh_info >= hdr->e_shnum) {
-			pr_err("Corrupted relocation section header (info %u/%u)\n",
-			       (unsigned)sh->sh_info, (unsigned)hdr->e_shnum);
-			goto err;
-		}
-
-		/* sh_entsize is used as a divisor below */
-		if (!sh->sh_entsize) {
-			pr_err("Relocation section with zero entry size\n");
-			goto err;
-		}
-
-		sh_rel = sec_hdrs[sh->sh_info];
-		ptr_func_exit(sh_rel);
-
-		pr_debug("\tsection %2zd type 0x%-2x link 0x%-2x info 0x%-2x name %s\n", i,
-			 (unsigned)sh->sh_type, (unsigned)sh->sh_link,
-			 (unsigned)sh->sh_info, &secstrings[sh->sh_name]);
-
-		for (k = 0; k < sh->sh_size / sh->sh_entsize; k++) {
-			s64 __maybe_unused addend64, __maybe_unused value64;
-			s32 addend32, value32;
-			unsigned long place;
-			const char *name;
-			void *where;
-			Sym_t *sym;
-
-			union {
-				Rel_t rel;
-				Rela_t rela;
-			} *r = mem + sh->sh_offset + sh->sh_entsize * k;
-			ptr_func_exit(r);
-
-			sym = &symbols[ELF_R_SYM(r->rel.r_info)];
-			ptr_func_exit(sym);
-
-			name = &symstrings[sym->st_name];
-			ptr_func_exit(name);
-
-			where = mem + sh_rel->sh_offset + r->rel.r_offset;
-			ptr_func_exit(where);
-
-			pr_debug("\t\tr_offset 0x%-4lx r_info 0x%-4lx / sym 0x%-2lx type 0x%-2lx symsecoff 0x%-4lx\n",
-				 (unsigned long)r->rel.r_offset, (unsigned long)r->rel.r_info,
-				 (unsigned long)ELF_R_SYM(r->rel.r_info),
-				 (unsigned long)ELF_R_TYPE(r->rel.r_info),
-				 (unsigned long)sh_rel->sh_addr);
-
-			if (sym->st_shndx == SHN_UNDEF) {
-#ifdef ELF_PPC64
-				/* On PowerPC, TOC symbols appear to be
-				 * undefined but should be processed as well.
-				 * Their type is STT_NOTYPE, so report any
-				 * other one.
-				 */
-				if (ELF32_ST_TYPE(sym->st_info) != STT_NOTYPE
-				    || strncmp(name, ".TOC.", 6)) {
-					pr_err("Unexpected undefined symbol:%s\n", name);
-					goto err;
-				}
-#else
-				continue;
-#endif
-			}
-
-			if (sh->sh_type == SHT_REL) {
-				addend32 = *(s32 *)where;
-				addend64 = *(s64 *)where;
-			} else {
-				addend32 = (s32)r->rela.r_addend;
-				addend64 = (s64)r->rela.r_addend;
-			}
-
-			place = sh_rel->sh_addr + r->rel.r_offset;
-
-			pr_debug("\t\t\tvalue 0x%-8lx addend32 %-4d addend64 %-8ld place %-8lx symname %s\n",
-				 (unsigned long)sym->st_value, addend32, (long)addend64, (long)place, name);
-
-			if (sym->st_shndx == SHN_ABS) {
-				value32 = (s32)sym->st_value;
-				value64 = (s64)sym->st_value;
-			} else {
-				Shdr_t *sh_src;
-
-				/* sec_hdrs[] holds e_shnum entries, so e_shnum itself is out of range */
-				if ((unsigned)sym->st_shndx >= (unsigned)hdr->e_shnum) {
-					pr_err("Unexpected symbol section index %u/%u\n",
-					       (unsigned)sym->st_shndx, hdr->e_shnum);
-					goto err;
-				}
-				sh_src = sec_hdrs[sym->st_shndx];
-				ptr_func_exit(sh_src);
-
-				value32 = (s32)sh_src->sh_addr + (s32)sym->st_value;
-				value64 = (s64)sh_src->sh_addr + (s64)sym->st_value;
-			}
-
-#ifdef ELF_PPC64
-/* Snippet from the OpenPOWER ABI for Linux Supplement:
- * The OpenPOWER ABI uses the three most-significant bits in the symbol
- * st_other field specifies the number of instructions between a function's
- * global entry point and local entry point. The global entry point is used
- * when it is necessary to set up the TOC pointer (r2) for the function. The
- * local entry point is used when r2 is known to already be valid for the
- * function. A value of zero in these bits asserts that the function does
- * not use r2.
- * The st_other values have the following meanings:
- * 0 and 1, the local and global entry points are the same.
- * 2, the local entry point is at 1 instruction past the global entry point.
- * 3, the local entry point is at 2 instructions past the global entry point.
- * 4, the local entry point is at 4 instructions past the global entry point.
- * 5, the local entry point is at 8 instructions past the global entry point.
- * 6, the local entry point is at 16 instructions past the global entry point.
- * 7, reserved.
- *
- * Here we only handle the case '3' which is the most commonly seen.
- */
-#define LOCAL_OFFSET(s)	((s->st_other >> 5) & 0x7)
-			if (LOCAL_OFFSET(sym)) {
-				if (LOCAL_OFFSET(sym) != 3) {
-					pr_err("Unexpected local offset value %d\n",
-					       LOCAL_OFFSET(sym));
-					goto err;
-				}
-				pr_debug("\t\t\tUsing local offset\n");
-				value64 += 8;
-				value32 += 8;
-			}
-#endif
-
-			switch (ELF_R_TYPE(r->rel.r_info)) {
-#ifdef ELF_PPC64
-			case R_PPC64_REL24:
-				/* Update PC relative offset, linker has not done this yet */
-				pr_debug("\t\t\tR_PPC64_REL24 at 0x%-4lx val 0x%lx\n",
-					 place, value64);
-				/* Convert value to relative */
-				value64 -= place;
-				if (value64 + 0x2000000 > 0x3ffffff || (value64 & 3) != 0) {
-					pr_err("REL24 %li out of range!\n", (long int)value64);
-					goto err;
-				}
-				/* Only replace bits 2 through 26 */
-				*(uint32_t *)where = (*(uint32_t *)where & ~0x03fffffc) |
-					(value64 & 0x03fffffc);
-				break;
-
-			case R_PPC64_ADDR32:
-				pr_debug("\t\t\tR_PPC64_ADDR32 at 0x%-4lx val 0x%x\n",
-					 place, (unsigned int)(value32 + addend32));
-				pr_out(" { .offset = 0x%-8x, .type = PIEGEN_TYPE_INT, "
-				       " .addend = %-8d, .value = 0x%-16x, "
-				       "}, /* R_PPC64_ADDR32 */\n",
-				       (unsigned int) place, addend32, value32);
-				break;
-
-			case R_PPC64_ADDR64:
-			case R_PPC64_REL64:
-				pr_debug("\t\t\tR_PPC64_ADDR64 at 0x%-4lx val 0x%lx\n",
-					 place, value64 + addend64);
-				pr_out("\t{ .offset = 0x%-8x, .type = PIEGEN_TYPE_LONG,"
-				       " .addend = %-8ld, .value = 0x%-16lx, "
-				       "}, /* R_PPC64_ADDR64 */\n",
-				       (unsigned int) place, (long)addend64, (long)value64);
-				break;
-
-			case R_PPC64_TOC16_HA:
-				pr_debug("\t\t\tR_PPC64_TOC16_HA at 0x%-4lx val 0x%lx\n",
-					 place, value64 + addend64 - toc_offset + 0x8000);
-				if (do_relative_toc((value64 + addend64 - toc_offset + 0x8000) >> 16,
-						    where, 0xffff, 1))
-					goto err;
-				break;
-
-			case R_PPC64_TOC16_LO:
-				pr_debug("\t\t\tR_PPC64_TOC16_LO at 0x%-4lx val 0x%lx\n",
-					 place, value64 + addend64 - toc_offset);
-				if (do_relative_toc(value64 + addend64 - toc_offset,
-						    where, 0xffff, 1))
-					goto err;
-				break;
-
-			case R_PPC64_TOC16_LO_DS:
-				pr_debug("\t\t\tR_PPC64_TOC16_LO_DS at 0x%-4lx val 0x%lx\n",
-					 place, value64 + addend64 - toc_offset);
-				if (do_relative_toc(value64 + addend64 - toc_offset,
-						    where, 0xfffc, 0))
-					goto err;
-				break;
-
-			case R_PPC64_REL16_HA:
-				value64 += addend64 - place;
-				pr_debug("\t\t\tR_PPC64_REL16_HA at 0x%-4lx val 0x%lx\n",
-					 place, value64);
-				/* check that we are dealing with the addis 2,12 instruction */
-				if (((*(uint32_t*)where) & 0xffff0000) != 0x3c4c0000) {
-					pr_err("Unexpected instruction for R_PPC64_REL16_HA\n");
-					goto err;
-				}
-				*(uint16_t *)where = ((value64 + 0x8000) >> 16) & 0xffff;
-				break;
-
-			case R_PPC64_REL16_LO:
-				value64 += addend64 - place;
-				pr_debug("\t\t\tR_PPC64_REL16_LO at 0x%-4lx val 0x%lx\n",
-					 place, value64);
-				/* check that we are dealing with the addi 2,2 instruction */
-				if (((*(uint32_t*)where) & 0xffff0000) != 0x38420000) {
-					pr_err("Unexpected instruction for R_PPC64_REL16_LO\n");
-					goto err;
-				}
-				*(uint16_t *)where = value64 & 0xffff;
-				break;
-
-#endif /* ELF_PPC64 */
-
-#ifdef ELF_X86_64
-			case R_X86_64_32: /* Symbol + Addend (4 bytes) */
-				pr_debug("\t\t\t\tR_X86_64_32 at 0x%-4lx val 0x%x\n", place, value32);
-				pr_out(" { .offset = 0x%-8x, .type = PIEGEN_TYPE_INT, "
-				       ".addend = %-8d, .value = 0x%-16x, }, /* R_X86_64_32 */\n",
-				       (unsigned int)place, addend32, value32);
-				break;
-			case R_X86_64_64: /* Symbol + Addend (8 bytes) */
-				pr_debug("\t\t\t\tR_X86_64_64 at 0x%-4lx val 0x%lx\n", place, (long)value64);
-				pr_out(" { .offset = 0x%-8x, .type = PIEGEN_TYPE_LONG, "
-				       ".addend = %-8ld, .value = 0x%-16lx, }, /* R_X86_64_64 */\n",
-				       (unsigned int)place, (long)addend64, (long)value64);
-				break;
-			case R_X86_64_PC32: /* Symbol + Addend - Place (4 bytes) */
-				pr_debug("\t\t\t\tR_X86_64_PC32 at 0x%-4lx val 0x%x\n", place, value32 + addend32 - (s32)place);
-				/*
-				 * R_X86_64_PC32 are relative, patch them inplace.
-				 */
-				*((s32 *)where) = value32 + addend32 - place;
-				break;
-			case R_X86_64_PLT32: /* ProcLinkage + Addend - Place (4 bytes) */
-				pr_debug("\t\t\t\tR_X86_64_PLT32 at 0x%-4lx val 0x%x\n", place, value32 + addend32 - (s32)place);
-				/*
-				 * R_X86_64_PLT32 are relative, patch them inplace.
-				 */
-				*((s32 *)where) = value32 + addend32 - place;
-				break;
-			case R_X86_64_GOTPCREL: /* SymbolOffsetInGot + GOT + Addend - Place (4 bytes) */
-				pr_debug("\t\t\t\tR_X86_64_GOTPCREL at 0x%-4lx val 0x%x\n", place, value32);
-				pr_out(" { .offset = 0x%-8x, .type = PIEGEN_TYPE_LONG | PIEGEN_TYPE_GOTPCREL, "
-				       ".addend = %-8d, .value = 0x%-16x, }, /* R_X86_64_GOTPCREL */\n",
-				       (unsigned int)place, addend32, value32);
-				nr_gotpcrel++;
-				break;
-#endif
-
-#ifdef ELF_X86_32
-			case R_386_32: /* Symbol + Addend */
-				pr_debug("\t\t\t\tR_386_32 at 0x%-4lx val 0x%x\n", place, value32 + addend32);
-				pr_out(" { .offset = 0x%-8x, .type = PIEGEN_TYPE_INT, "
-				       ".addend = %-4d, .value = 0x%x, },\n",
-				       (unsigned int)place, addend32, value32);
-				break;
-			case R_386_PC32: /* Symbol + Addend - Place */
-				pr_debug("\t\t\t\tR_386_PC32 at 0x%-4lx val 0x%x\n", place, value32 + addend32 - (s32)place);
-				/*
-				 * R_386_PC32 are relative, patch them inplace.
-				 */
-				*((s32 *)where) = value32 + addend32 - place;
-				break;
-#endif
-
-			default:
-				pr_err("Unsupported relocation\n");
-				goto err;
-			}
-		}
-	}
-	pr_out("};\n");
-	pr_out("static __maybe_unused size_t %s = %zd;\n", opts.nrgotpcrel_name, nr_gotpcrel);
-
-	pr_out("static __maybe_unused const char %s[] = {\n\t", opts.stream_name);
-
-	/* Pass 3: dump loadable sections; k tracks the blob offset written so far */
-	for (i=0, k=0; i < hdr->e_shnum; i++) {
-		Shdr_t *sh = sec_hdrs[i];
-		unsigned char *shdata;
-		size_t j;
-
-		if (!(sh->sh_flags & SHF_ALLOC) || !sh->sh_size)
-			continue;
-
-		shdata = mem + sh->sh_offset;
-		pr_debug("Copying section '%s'\n" \
-			 "\tstart:0x%lx (gap:0x%lx) size:0x%lx\n",
-			 &secstrings[sh->sh_name], (unsigned long) sh->sh_addr,
-			 (unsigned long)(sh->sh_addr - k), (unsigned long) sh->sh_size);
-
-		/* write 0 in the gap between the 2 sections */
-		for (;k < sh->sh_addr; k++) {
-			if (k && (k % 8) == 0)
-				pr_out("\n\t");
-			pr_out("0x00,");
-		}
-
-		for (j=0; j < sh->sh_size; j++, k++) {
-			if (k && (k % 8) == 0)
-				pr_out("\n\t");
-			pr_out("0x%02x,", shdata[j]);
-		}
-	}
-	pr_out("};\n");
-	free(sec_hdrs);
-	return 0;
-err:
-	free(sec_hdrs);
-	return -1;
-}
diff --git a/pie/piegen/main.c b/pie/piegen/main.c
deleted file mode 100644
index d3ad823339bb..000000000000
--- a/pie/piegen/main.c
+++ /dev/null
@@ -1,154 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <stdbool.h>
-#include <unistd.h>
-#include <stdint.h>
-#include <getopt.h>
-#include <string.h>
-
-#include <fcntl.h>
-#include <elf.h>
-
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/mman.h>
-
-#include "compiler.h"
-#include "config.h"
-#include "piegen.h"
-
-/* Tool options with their defaults; main() overrides them from the command line */
-piegen_opt_t opts = {
- .input_filename = "file.o",
- .stream_name = "stream",
- .prefix_name = "__",
- .var_name = "elf_relocs",
- .nrgotpcrel_name = "nr_gotpcrel",
-};
-
-/* Output stream that pr_out() writes to; opened in main() */
-FILE *fout;
-
-/*
- * Dispatch on the ELF identification bytes at the start of @mem and hand
- * the file over to the matching per-format handler.
- *
- * Returns the handler's result, or -1 when no configured handler matches.
- */
-static int handle_elf(void *mem, size_t size)
-{
-#if defined(CONFIG_X86_32) || defined(CONFIG_X86_64)
-	/* e_ident of a 32-bit ELF file (fifth byte 0x01) */
-	unsigned char elf_ident_x86_32[EI_NIDENT] = {
-		0x7f, 0x45, 0x4c, 0x46, 0x01, 0x01, 0x01, 0x00,
-		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-	};
-
-	/* e_ident of a 64-bit ELF file (fifth byte 0x02) */
-	unsigned char elf_ident_x86_64[EI_NIDENT] = {
-		0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00,
-		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-	};
-
-	if (!memcmp(mem, elf_ident_x86_32, sizeof(elf_ident_x86_32)))
-		return handle_elf_x86_32(mem, size);
-	if (!memcmp(mem, elf_ident_x86_64, sizeof(elf_ident_x86_64)))
-		return handle_elf_x86_64(mem, size);
-#endif
-
-#if defined(CONFIG_PPC64)
-	/* The sixth byte follows the byte order this tool was built for */
-	const unsigned char elf_ident[EI_NIDENT] = {
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-		0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00,
-		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-#else
-		0x7f, 0x45, 0x4c, 0x46, 0x02, 0x02, 0x01, 0x00,
-		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
-#endif
-	};
-
-	if (!memcmp(mem, elf_ident, sizeof(elf_ident)))
-		return handle_elf_ppc64(mem, size);
-#endif /* CONFIG_PPC64 */
-
-	pr_err("Unsupported Elf format detected\n");
-	return -1;
-}
-
-/*
- * That's the tool to generate patched object files.
- *
- * Maps the input object (-f), runs the ELF handler over it and writes the
- * generated C source to the output file (-o). The output file is removed
- * again when the handler fails. Returns 0 on success, 1 on any error or
- * when usage is printed.
- */
-int main(int argc, char *argv[])
-{
-	struct stat st;
-	int opt, idx;
-	void *mem;
-	int fd;
-
-	static const char short_opts[] = "f:o:s:p:v:h";
-	static struct option long_opts[] = {
-		{ "file",	required_argument,	0, 'f' },
-		{ "output",	required_argument,	0, 'o' },
-		{ "stream",	required_argument,	0, 's' },
-		{ "sym-prefix",	required_argument,	0, 'p' },
-		{ "variable",	required_argument,	0, 'v' },
-		/* -h takes no argument in short_opts; keep --help consistent */
-		{ "help",	no_argument,		0, 'h' },
-		{ },
-	};
-
-	if (argc < 3)
-		goto usage;
-
-	while (1) {
-		idx = -1;
-		opt = getopt_long(argc, argv, short_opts, long_opts, &idx);
-		if (opt == -1)
-			break;
-		switch (opt) {
-		case 'f':
-			opts.input_filename = optarg;
-			break;
-		case 'o':
-			opts.output_filename = optarg;
-			break;
-		case 's':
-			opts.stream_name = optarg;
-			break;
-		case 'p':
-			opts.prefix_name = optarg;
-			break;
-		case 'v':
-			opts.var_name = optarg;
-			break;
-		case 'h':
-		default:
-			goto usage;
-		}
-	}
-
-	/* There is no default output name, so fopen() would be handed NULL */
-	if (!opts.output_filename) {
-		pr_err("No output file specified\n");
-		goto usage;
-	}
-
-	fd = open(opts.input_filename, O_RDONLY);
-	if (fd < 0) {
-		pr_perror("Can't open file %s", opts.input_filename);
-		goto err;
-	}
-
-	if (fstat(fd, &st)) {
-		pr_perror("Can't stat file %s", opts.input_filename);
-		goto err;
-	}
-
-	fout = fopen(opts.output_filename, "w");
-	if (fout == NULL) {
-		pr_perror("Can't open %s", opts.output_filename);
-		goto err;
-	}
-
-	/* Private writable mapping: relocations patched in place never reach the file */
-	mem = mmap(NULL, st.st_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_FILE, fd, 0);
-	if (mem == MAP_FAILED) {
-		pr_perror("Can't mmap file %s", opts.input_filename);
-		goto err;
-	}
-
-	if (handle_elf(mem, st.st_size)) {
-		/* Do not leave a half-written output file behind */
-		fclose(fout);
-		unlink(opts.output_filename);
-		goto err;
-	}
-	fclose(fout);
-	printf("%s generated successfully.\n", opts.output_filename);
-	return 0;
-usage:
-	fprintf(stderr, "Usage: %s -f filename -o output [-s stream] [-p sym-prefix] [-v variable]\n", argv[0]);
-err:
-	return 1;
-}
diff --git a/pie/piegen/piegen.h b/pie/piegen/piegen.h
deleted file mode 100644
index 8488c0abb989..000000000000
--- a/pie/piegen/piegen.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Shared declarations of the piegen tool: option block, output stream,
- * per-format ELF handlers and the printing helpers.
- */
-#ifndef __ELFTIL_H__
-#define __ELFTIL_H__
-
-#include <stdio.h>
-#include <unistd.h>
-
-/* Command-line configurable names used while generating the output */
-typedef struct {
-	char		*input_filename;	/* object file to read */
-	char		*output_filename;	/* generated file to write */
-	char		*stream_name;		/* name of the emitted byte array; also the log prefix */
-	char		*prefix_name;		/* prefix for emitted symbol #defines */
-	char		*var_name;		/* name of the emitted relocation table */
-	char		*nrgotpcrel_name;	/* name of the emitted GOTPCREL counter */
-} piegen_opt_t;
-
-extern piegen_opt_t opts;
-extern FILE *fout;
-
-#if defined(CONFIG_X86_32) || defined(CONFIG_X86_64)
-extern int handle_elf_x86_32(void *mem, size_t size);
-extern int handle_elf_x86_64(void *mem, size_t size);
-#endif
-
-#if defined(CONFIG_PPC64)
-extern int handle_elf_ppc64(void *mem, size_t size);
-#endif
-
-/* Write generated source text to the output file */
-#define pr_out(fmt, ...)	fprintf(fout, fmt, ##__VA_ARGS__)
-
-/* Diagnostics go to stdout, prefixed with the stream name */
-#define pr_debug(fmt, ...)	printf("%s: "fmt, opts.stream_name, ##__VA_ARGS__)
-
-/* Errors go to stderr with the reporting source location */
-#define pr_err(fmt, ...)	fprintf(stderr, "%s: Error (%s:%d): "fmt, opts.stream_name, __FILE__, __LINE__, ##__VA_ARGS__)
-/* Like pr_err(), followed by ": " and the errno description (%m) */
-#define pr_perror(fmt, ...)	fprintf(stderr, "%s: Error (%s:%d): "fmt ": %m\n", opts.stream_name, __FILE__, __LINE__, ##__VA_ARGS__)
-
-#endif /* __ELFTIL_H__ */
diff --git a/pie/piegen/uapi/types.h b/pie/piegen/uapi/types.h
deleted file mode 100644
index 34696e8c6aa5..000000000000
--- a/pie/piegen/uapi/types.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/* Relocation record format shared by the piegen tool and its generated output */
-#ifndef __PIEGEN_TYPES_H__
-#define __PIEGEN_TYPES_H__
-
-/* Flags for elf_reloc_t.type */
-#define PIEGEN_TYPE_INT (1u << 0)
-#define PIEGEN_TYPE_LONG (1u << 1)
-#define PIEGEN_TYPE_GOTPCREL (1u << 2)
-
-/* One relocation to be applied to a blob at run time */
-typedef struct {
- unsigned int offset; /* byte offset of the patched field inside the blob */
- unsigned int type; /* PIEGEN_TYPE_* flags */
- long addend;
- long value;
-} elf_reloc_t;
-
-#endif /* __PIEGEN_TYPES_H__ */
diff --git a/pie/restorer.c b/pie/restorer.c
deleted file mode 100644
index 8cec7f9c419b..000000000000
--- a/pie/restorer.c
+++ /dev/null
@@ -1,1335 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-
-#include <linux/securebits.h>
-#include <linux/capability.h>
-#include <sys/types.h>
-#include <sys/mman.h>
-#include <sys/stat.h>
-#include <sys/wait.h>
-#include <sys/time.h>
-#include <sys/shm.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include <sched.h>
-#include <sys/resource.h>
-#include <signal.h>
-
-#include "compiler.h"
-#include "asm/types.h"
-#include "syscall.h"
-#include "config.h"
-#include "prctl.h"
-#include "log.h"
-#include "util.h"
-#include "image.h"
-#include "sk-inet.h"
-#include "vma.h"
-
-#include "crtools.h"
-#include "lock.h"
-#include "restorer.h"
-#include "aio.h"
-#include "seccomp.h"
-
-#include "protobuf/creds.pb-c.h"
-#include "protobuf/mm.pb-c.h"
-
-#include "asm/restorer.h"
-
-#ifndef PR_SET_PDEATHSIG
-#define PR_SET_PDEATHSIG 1
-#endif
-
-#define sys_prctl_safe(opcode, val1, val2, val3) \
- ({ \
- long __ret = sys_prctl(opcode, val1, val2, val3, 0); \
- if (__ret) \
- pr_err("prctl failed @%d with %ld\n", __LINE__, __ret);\
- __ret; \
- })
-
-static struct task_entries *task_entries;
-static futex_t thread_inprogress;
-static pid_t *helpers;
-static int n_helpers;
-static pid_t *zombies;
-static int n_zombies;
-
-extern void cr_restore_rt (void) asm ("__cr_restore_rt")
- __attribute__ ((visibility ("hidden")));
-
-static void sigchld_handler(int signal, siginfo_t *siginfo, void *data)
-{
- char *r;
- int i;
-
- /* We can ignore helpers that die, we expect them to after
- * CR_STATE_RESTORE is finished. */
- for (i = 0; i < n_helpers; i++)
- if (siginfo->si_pid == helpers[i])
- return;
-
- for (i = 0; i < n_zombies; i++)
- if (siginfo->si_pid == zombies[i])
- return;
-
- if (siginfo->si_code & CLD_EXITED)
- r = " exited, status=";
- else if (siginfo->si_code & CLD_KILLED)
- r = " killed by signal ";
- else
- r = "disappeared with ";
-
- pr_info("Task %d %s %d\n", siginfo->si_pid, r, siginfo->si_status);
-
- futex_abort_and_wake(&task_entries->nr_in_progress);
- /* sa_restorer may be unmaped, so we can't go back to userspace*/
- sys_kill(sys_getpid(), SIGSTOP);
- sys_exit_group(1);
-}
-
-static int lsm_set_label(char *label, int procfd)
-{
- int ret = -1, len, lsmfd;
- char path[LOG_SIMPLE_CHUNK];
-
- if (!label)
- return 0;
-
- pr_info("restoring lsm profile %s\n", label);
-
- simple_sprintf(path, "self/task/%ld/attr/current", sys_gettid());
-
- lsmfd = sys_openat(procfd, path, O_WRONLY, 0);
- if (lsmfd < 0) {
- pr_err("failed openat %d\n", lsmfd);
- return -1;
- }
-
- for (len = 0; label[len]; len++)
- ;
-
- ret = sys_write(lsmfd, label, len);
- sys_close(lsmfd);
- if (ret < 0) {
- pr_err("can't write lsm profile %d\n", ret);
- return -1;
- }
-
- return 0;
-}
-
-static int restore_creds(struct thread_creds_args *args, int procfd)
-{
- CredsEntry *ce = &args->creds;
- int b, i, ret;
- struct cap_header hdr;
- struct cap_data data[_LINUX_CAPABILITY_U32S_3];
-
- /*
- * We're still root here and thus can do it without failures.
- */
-
- /*
- * Setup supplementary group IDs early.
- */
- if (args->groups) {
- ret = sys_setgroups(ce->n_groups, args->groups);
- if (ret) {
- pr_err("Can't setup supplementary group IDs: %d\n", ret);
- return -1;
- }
- }
-
- /*
- * First -- set the SECURE_NO_SETUID_FIXUP bit not to
- * lose caps bits when changing xids.
- */
-
- ret = sys_prctl(PR_SET_SECUREBITS, 1 << SECURE_NO_SETUID_FIXUP, 0, 0, 0);
- if (ret) {
- pr_err("Unable to set SECURE_NO_SETUID_FIXUP: %d\n", ret);
- return -1;
- }
-
- /*
- * Second -- restore xids. Since we still have the CAP_SETUID
- * capability nothing should fail. But call the setfsXid last
- * to override the setresXid settings.
- */
-
- ret = sys_setresuid(ce->uid, ce->euid, ce->suid);
- if (ret) {
- pr_err("Unable to set real, effective and saved user ID: %d\n", ret);
- return -1;
- }
-
- sys_setfsuid(ce->fsuid);
- if (sys_setfsuid(-1) != ce->fsuid) {
- pr_err("Unable to set fsuid\n");
- return -1;
- }
-
- ret = sys_setresgid(ce->gid, ce->egid, ce->sgid);
- if (ret) {
- pr_err("Unable to set real, effective and saved group ID: %d\n", ret);
- return -1;
- }
-
- sys_setfsgid(ce->fsgid);
- if (sys_setfsgid(-1) != ce->fsgid) {
- pr_err("Unable to set fsgid\n");
- return -1;
- }
-
- /*
- * Third -- restore securebits. We don't need them in any
- * special state any longer.
- */
-
- ret = sys_prctl(PR_SET_SECUREBITS, ce->secbits, 0, 0, 0);
- if (ret) {
- pr_err("Unable to set PR_SET_SECUREBITS: %d\n", ret);
- return -1;
- }
-
- /*
- * Fourth -- trim bset. This can only be done while
- * having the CAP_SETPCAP capablity.
- */
-
- for (b = 0; b < CR_CAP_SIZE; b++) {
- for (i = 0; i < 32; i++) {
- if (b * 32 + i > args->cap_last_cap)
- break;
- if (args->cap_bnd[b] & (1 << i))
- /* already set */
- continue;
- ret = sys_prctl(PR_CAPBSET_DROP, i + b * 32, 0, 0, 0);
- if (ret) {
- pr_err("Unable to drop capability %d: %d\n",
- i + b * 32, ret);
- return -1;
- }
- }
- }
-
- /*
- * Fifth -- restore caps. Nothing but cap bits are changed
- * at this stage, so just do it.
- */
-
- hdr.version = _LINUX_CAPABILITY_VERSION_3;
- hdr.pid = 0;
-
- BUILD_BUG_ON(_LINUX_CAPABILITY_U32S_3 != CR_CAP_SIZE);
-
- for (i = 0; i < CR_CAP_SIZE; i++) {
- data[i].eff = args->cap_eff[i];
- data[i].prm = args->cap_prm[i];
- data[i].inh = args->cap_inh[i];
- }
-
- ret = sys_capset(&hdr, data);
- if (ret) {
- pr_err("Unable to restore capabilities: %d\n", ret);
- return -1;
- }
-
- if (lsm_set_label(args->lsm_profile, procfd) < 0)
- return -1;
- return 0;
-}
-
-/*
- * This should be done after creds restore, as
- * some creds changes might drop the value back
- * to zero.
- */
-
-static inline int restore_pdeath_sig(struct thread_restore_args *ta)
-{
- if (ta->pdeath_sig)
- return sys_prctl(PR_SET_PDEATHSIG, ta->pdeath_sig, 0, 0, 0);
- else
- return 0;
-}
-
-static int restore_dumpable_flag(MmEntry *mme)
-{
- int current_dumpable;
- int ret;
-
- if (!mme->has_dumpable) {
- pr_warn("Dumpable flag not present in criu dump.\n");
- return 0;
- }
-
- if (mme->dumpable == 0 || mme->dumpable == 1) {
- ret = sys_prctl(PR_SET_DUMPABLE, mme->dumpable, 0, 0, 0);
- if (ret) {
- pr_err("Unable to set PR_SET_DUMPABLE: %d\n", ret);
- return -1;
- }
- return 0;
- }
-
- /*
- * If dumpable flag is present but it is not 0 or 1, then we can not
- * use prctl to set it back. Try to see if it is already correct
- * (which is likely if sysctl fs.suid_dumpable is the same when dump
- * and restore are run), in which case there is nothing to do.
- * Otherwise, set dumpable to 0 which should be a secure fallback.
- */
- current_dumpable = sys_prctl(PR_GET_DUMPABLE, 0, 0, 0, 0);
- if (mme->dumpable != current_dumpable) {
- pr_warn("Dumpable flag [%d] does not match current [%d]. "
- "Will fallback to setting it to 0 to disable it.\n",
- mme->dumpable, current_dumpable);
- ret = sys_prctl(PR_SET_DUMPABLE, 0, 0, 0, 0);
- if (ret) {
- pr_err("Unable to set PR_SET_DUMPABLE: %d\n", ret);
- return -1;
- }
- }
- return 0;
-}
-
-static void restore_sched_info(struct rst_sched_param *p)
-{
- struct sched_param parm;
-
- pr_info("Restoring scheduler params %d.%d.%d\n",
- p->policy, p->nice, p->prio);
-
- sys_setpriority(PRIO_PROCESS, 0, p->nice);
- parm.sched_priority = p->prio;
- sys_sched_setscheduler(0, p->policy, &parm);
-}
-
-static void restore_rlims(struct task_restore_args *ta)
-{
- int r;
-
- for (r = 0; r < ta->rlims_n; r++) {
- struct krlimit krlim;
-
- krlim.rlim_cur = ta->rlims[r].rlim_cur;
- krlim.rlim_max = ta->rlims[r].rlim_max;
- sys_setrlimit(r, &krlim);
- }
-}
-
-static int restore_signals(siginfo_t *ptr, int nr, bool group)
-{
- int ret, i;
-
- for (i = 0; i < nr; i++) {
- siginfo_t *info = ptr + i;
-
- pr_info("Restore signal %d group %d\n", info->si_signo, group);
- if (group)
- ret = sys_rt_sigqueueinfo(sys_getpid(), info->si_signo, info);
- else
- ret = sys_rt_tgsigqueueinfo(sys_getpid(),
- sys_gettid(), info->si_signo, info);
- if (ret) {
- pr_err("Unable to send siginfo %d %x with code %d\n",
- info->si_signo, info->si_code, ret);
- return -1;;
- }
- }
-
- return 0;
-}
-
-static int restore_seccomp(struct task_restore_args *args)
-{
- int ret;
-
- switch (args->seccomp_mode) {
- case SECCOMP_MODE_DISABLED:
- return 0;
- case SECCOMP_MODE_STRICT:
- ret = sys_prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, 0, 0, 0);
- if (ret < 0) {
- pr_err("prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT) returned %d\n", ret);
- goto die;
- }
- return 0;
- case SECCOMP_MODE_FILTER: {
- int i;
- void *filter_data;
-
- filter_data = &args->seccomp_filters[args->seccomp_filters_n];
-
- for (i = 0; i < args->seccomp_filters_n; i++) {
- struct sock_fprog *fprog = &args->seccomp_filters[i];
-
- fprog->filter = filter_data;
-
- /* We always TSYNC here, since we require that the
- * creds for all threads be the same; this means we
- * don't have to restore_seccomp() in threads, and that
- * future TSYNC behavior will be correct.
- */
- ret = sys_seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC, (char *) fprog);
- if (ret < 0) {
- pr_err("sys_seccomp() returned %d\n", ret);
- goto die;
- }
-
- filter_data += fprog->len * sizeof(struct sock_filter);
- }
-
- return 0;
- }
- default:
- goto die;
- }
-
- return 0;
-die:
- return -1;
-}
-
-static int restore_thread_common(struct rt_sigframe *sigframe,
- struct thread_restore_args *args)
-{
- sys_set_tid_address((int *)decode_pointer(args->clear_tid_addr));
-
- if (args->has_futex && args->futex_rla_len) {
- int ret;
-
- ret = sys_set_robust_list(decode_pointer(args->futex_rla),
- args->futex_rla_len);
- if (ret) {
- pr_err("Failed to recover futex robust list: %d\n", ret);
- return -1;
- }
- }
-
- restore_sched_info(&args->sp);
-
- if (restore_nonsigframe_gpregs(&args->gpregs))
- return -1;
-
- restore_tls(&args->tls);
-
- return 0;
-}
-
-static void noinline rst_sigreturn(unsigned long new_sp)
-{
- ARCH_RT_SIGRETURN(new_sp);
-}
-
-/*
- * Threads restoration via sigreturn. Note it's locked
- * routine and calls for unlock at the end.
- */
-long __export_restore_thread(struct thread_restore_args *args)
-{
- struct rt_sigframe *rt_sigframe;
- k_rtsigset_t to_block;
- unsigned long new_sp;
- int my_pid = sys_gettid();
- int ret;
-
- if (my_pid != args->pid) {
- pr_err("Thread pid mismatch %d/%d\n", my_pid, args->pid);
- goto core_restore_end;
- }
-
- /* All signals must be handled by thread leader */
- ksigfillset(&to_block);
- ret = sys_sigprocmask(SIG_SETMASK, &to_block, NULL, sizeof(k_rtsigset_t));
- if (ret) {
- pr_err("Unable to block signals %d\n", ret);
- goto core_restore_end;
- }
-
- rt_sigframe = (void *)args->mem_zone.rt_sigframe;
-
- if (restore_thread_common(rt_sigframe, args))
- goto core_restore_end;
-
- ret = restore_creds(args->creds_args, args->ta->proc_fd);
- if (ret)
- goto core_restore_end;
-
- ret = restore_dumpable_flag(&args->ta->mm);
- if (ret)
- goto core_restore_end;
-
- pr_info("%ld: Restored\n", sys_gettid());
-
- restore_finish_stage(CR_STATE_RESTORE);
-
- if (restore_signals(args->siginfo, args->siginfo_n, false))
- goto core_restore_end;
-
- restore_finish_stage(CR_STATE_RESTORE_SIGCHLD);
- restore_pdeath_sig(args);
-
- if (args->ta->seccomp_mode != SECCOMP_MODE_DISABLED)
- pr_info("Restoring seccomp mode %d for %ld\n", args->ta->seccomp_mode, sys_getpid());
-
- restore_finish_stage(CR_STATE_RESTORE_CREDS);
-
- futex_dec_and_wake(&thread_inprogress);
-
- new_sp = (long)rt_sigframe + SIGFRAME_OFFSET;
- rst_sigreturn(new_sp);
-
-core_restore_end:
- pr_err("Restorer abnormal termination for %ld\n", sys_getpid());
- futex_abort_and_wake(&task_entries->nr_in_progress);
- sys_exit_group(1);
- return -1;
-}
-
-static long restore_self_exe_late(struct task_restore_args *args)
-{
- int fd = args->fd_exe_link, ret;
-
- pr_info("Restoring EXE link\n");
- ret = sys_prctl_safe(PR_SET_MM, PR_SET_MM_EXE_FILE, fd, 0);
- if (ret)
- pr_err("Can't restore EXE link (%d)\n", ret);
- sys_close(fd);
-
- return ret;
-}
-
-static unsigned long restore_mapping(const VmaEntry *vma_entry)
-{
- int prot = vma_entry->prot;
- int flags = vma_entry->flags | MAP_FIXED;
- unsigned long addr;
-
- if (vma_entry_is(vma_entry, VMA_AREA_SYSVIPC))
- return sys_shmat(vma_entry->fd, decode_pointer(vma_entry->start),
- (vma_entry->prot & PROT_WRITE) ? 0 : SHM_RDONLY);
-
- /*
- * Restore or shared mappings are tricky, since
- * we open anonymous mapping via map_files/
- * MAP_ANONYMOUS should be eliminated so fd would
- * be taken into account by a kernel.
- */
- if (vma_entry_is(vma_entry, VMA_ANON_SHARED) && (vma_entry->fd != -1UL))
- flags &= ~MAP_ANONYMOUS;
-
- /* A mapping of file with MAP_SHARED is up to date */
- if (vma_entry->fd == -1 || !(vma_entry->flags & MAP_SHARED))
- prot |= PROT_WRITE;
-
- pr_debug("\tmmap(%"PRIx64" -> %"PRIx64", %x %x %d)\n",
- vma_entry->start, vma_entry->end,
- prot, flags, (int)vma_entry->fd);
- /*
- * Should map memory here. Note we map them as
- * writable since we're going to restore page
- * contents.
- */
- addr = sys_mmap(decode_pointer(vma_entry->start),
- vma_entry_len(vma_entry),
- prot, flags,
- vma_entry->fd,
- vma_entry->pgoff);
-
- if (vma_entry->fd != -1)
- sys_close(vma_entry->fd);
-
- return addr;
-}
-
-static void rst_tcp_repair_off(struct rst_tcp_sock *rts)
-{
- int aux, ret;
-
- aux = rts->reuseaddr;
- pr_debug("pie: Turning repair off for %d (reuse %d)\n", rts->sk, aux);
- tcp_repair_off(rts->sk);
-
- ret = sys_setsockopt(rts->sk, SOL_SOCKET, SO_REUSEADDR, &aux, sizeof(aux));
- if (ret < 0)
- pr_err("Failed to restore of SO_REUSEADDR on socket (%d)\n", ret);
-}
-
-static void rst_tcp_socks_all(struct task_restore_args *ta)
-{
- int i;
-
- for (i = 0; i < ta->tcp_socks_n; i++)
- rst_tcp_repair_off(&ta->tcp_socks[i]);
-}
-
-static int vma_remap(unsigned long src, unsigned long dst, unsigned long len)
-{
- unsigned long guard = 0, tmp;
-
- pr_info("Remap %lx->%lx len %lx\n", src, dst, len);
-
- if (src - dst < len)
- guard = dst;
- else if (dst - src < len)
- guard = dst + len - PAGE_SIZE;
-
- if (src == dst)
- return 0;
-
- if (guard != 0) {
- /*
- * mremap() returns an error if a target and source vma-s are
- * overlapped. In this case the source vma are remapped in
- * a temporary place and then remapped to the target address.
- * Here is one hack to find non-ovelapped temporary place.
- *
- * 1. initial placement. We need to move src -> tgt.
- * | |+++++src+++++|
- * |-----tgt-----| |
- *
- * 2. map a guard page at the non-ovelapped border of a target vma.
- * | |+++++src+++++|
- * |G|----tgt----| |
- *
- * 3. remap src to any other place.
- * G prevents src from being remaped on tgt again
- * | |-------------| -> |+++++src+++++|
- * |G|---tgt-----| |
- *
- * 4. remap src to tgt, no overlapping any longer
- * |+++++src+++++| <---- |-------------|
- * |G|---tgt-----| |
- */
-
- unsigned long addr;
-
- /* Map guard page (step 2) */
- tmp = sys_mmap((void *) guard, PAGE_SIZE, PROT_NONE,
- MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
- if (tmp != guard) {
- pr_err("Unable to map a guard page %lx (%lx)\n", guard, tmp);
- return -1;
- }
-
- /* Move src to non-overlapping place (step 3) */
- addr = sys_mmap(NULL, len, PROT_NONE,
- MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
- if (addr == (unsigned long) MAP_FAILED) {
- pr_err("Unable to reserve memory (%lx)\n", addr);
- return -1;
- }
-
- tmp = sys_mremap(src, len, len,
- MREMAP_MAYMOVE | MREMAP_FIXED, addr);
- if (tmp != addr) {
- pr_err("Unable to remap %lx -> %lx (%lx)\n", src, addr, tmp);
- return -1;
- }
-
- src = addr;
- }
-
- tmp = sys_mremap(src, len, len, MREMAP_MAYMOVE | MREMAP_FIXED, dst);
- if (tmp != dst) {
- pr_err("Unable to remap %lx -> %lx\n", src, dst);
- return -1;
- }
-
- return 0;
-}
-
-static int timerfd_arm(struct task_restore_args *args)
-{
- int i;
-
- for (i = 0; i < args->timerfd_n; i++) {
- struct restore_timerfd *t = &args->timerfd[i];
- int ret;
-
- pr_debug("timerfd: arm for fd %d (%d)\n", t->fd, i);
-
- if (t->settime_flags & TFD_TIMER_ABSTIME) {
- struct timespec ts = { };
-
- /*
- * We might need to adjust value because the checkpoint
- * and restore procedure takes some time itself. Note
- * we don't adjust nanoseconds, since the result may
- * overflow the limit NSEC_PER_SEC FIXME
- */
- if (sys_clock_gettime(t->clockid, &ts)) {
- pr_err("Can't get current time\n");
- return -1;
- }
-
- t->val.it_value.tv_sec += (time_t)ts.tv_sec;
-
- pr_debug("Ajust id %#x it_value(%llu, %llu) -> it_value(%llu, %llu)\n",
- t->id, (unsigned long long)ts.tv_sec,
- (unsigned long long)ts.tv_nsec,
- (unsigned long long)t->val.it_value.tv_sec,
- (unsigned long long)t->val.it_value.tv_nsec);
- }
-
- ret = sys_timerfd_settime(t->fd, t->settime_flags, &t->val, NULL);
- if (t->ticks)
- ret |= sys_ioctl(t->fd, TFD_IOC_SET_TICKS, (unsigned long)&t->ticks);
- if (ret) {
- pr_err("Can't restore ticks/time for timerfd - %d\n", i);
- return ret;
- }
- }
- return 0;
-}
-
-static int create_posix_timers(struct task_restore_args *args)
-{
- int ret, i;
- kernel_timer_t next_id;
- struct sigevent sev;
-
- for (i = 0; i < args->posix_timers_n; i++) {
- sev.sigev_notify = args->posix_timers[i].spt.it_sigev_notify;
- sev.sigev_signo = args->posix_timers[i].spt.si_signo;
- sev.sigev_value.sival_ptr = args->posix_timers[i].spt.sival_ptr;
-
- while (1) {
- ret = sys_timer_create(args->posix_timers[i].spt.clock_id, &sev, &next_id);
- if (ret < 0) {
- pr_err("Can't create posix timer - %d\n", i);
- return ret;
- }
-
- if (next_id == args->posix_timers[i].spt.it_id)
- break;
-
- ret = sys_timer_delete(next_id);
- if (ret < 0) {
- pr_err("Can't remove temporaty posix timer 0x%x\n", next_id);
- return ret;
- }
-
- if ((long)next_id > args->posix_timers[i].spt.it_id) {
- pr_err("Can't create timers, kernel don't give them consequently\n");
- return -1;
- }
- }
- }
-
- return 0;
-}
-
-static void restore_posix_timers(struct task_restore_args *args)
-{
- int i;
- struct restore_posix_timer *rt;
-
- for (i = 0; i < args->posix_timers_n; i++) {
- rt = &args->posix_timers[i];
- sys_timer_settime((kernel_timer_t)rt->spt.it_id, 0, &rt->val, NULL);
- }
-}
-static void *bootstrap_start;
-static unsigned int bootstrap_len;
-
-/*
- * sys_munmap must not return here. The controll process must
- * trap us on the exit from sys_munmap.
- */
-#ifdef CONFIG_VDSO
-static unsigned long vdso_rt_size;
-#else
-#define vdso_rt_size (0)
-#endif
-
-void __export_unmap(void)
-{
- sys_munmap(bootstrap_start, bootstrap_len - vdso_rt_size);
-}
-
-/*
- * This function unmaps all VMAs, which don't belong to
- * the restored process or the restorer.
- *
- * The restorer memory is two regions -- area with restorer, its stack
- * and arguments and the one with private vmas of the tasks we restore
- * (a.k.a. premmaped area):
- *
- * 0 task_size
- * +----+====+----+====+---+
- *
- * Thus to unmap old memory we have to do 3 unmaps:
- * [ 0 -- 1st area start ]
- * [ 1st end -- 2nd start ]
- * [ 2nd start -- task_size ]
- */
-static int unmap_old_vmas(void *premmapped_addr, unsigned long premmapped_len,
- void *bootstrap_start, unsigned long bootstrap_len,
- unsigned long task_size)
-{
- unsigned long s1, s2;
- void *p1, *p2;
- int ret;
-
- if (premmapped_addr < bootstrap_start) {
- p1 = premmapped_addr;
- s1 = premmapped_len;
- p2 = bootstrap_start;
- s2 = bootstrap_len;
- } else {
- p2 = premmapped_addr;
- s2 = premmapped_len;
- p1 = bootstrap_start;
- s1 = bootstrap_len;
- }
-
- ret = sys_munmap(NULL, p1 - NULL);
- if (ret) {
- pr_err("Unable to unmap (%p-%p): %d\n", NULL, p1, ret);
- return -1;
- }
-
- ret = sys_munmap(p1 + s1, p2 - (p1 + s1));
- if (ret) {
- pr_err("Unable to unmap (%p-%p): %d\n", p1 + s1, p2, ret);
- return -1;
- }
-
- ret = sys_munmap(p2 + s2, task_size - (unsigned long)(p2 + s2));
- if (ret) {
- pr_err("Unable to unmap (%p-%p): %d\n",
- p2 + s2, (void *)task_size, ret);
- return -1;
- }
-
- return 0;
-}
-
-static int wait_helpers(struct task_restore_args *task_args)
-{
- int i;
-
- for (i = 0; i < task_args->helpers_n; i++) {
- int status;
- pid_t pid = task_args->helpers[i];
-
- /* Check that a helper completed. */
- if (sys_wait4(pid, &status, 0, NULL) == -1) {
- /* It has been waited in sigchld_handler */
- continue;
- }
- if (!WIFEXITED(status) || WEXITSTATUS(status)) {
- pr_err("%d exited with non-zero code (%d,%d)\n", pid,
- WEXITSTATUS(status), WTERMSIG(status));
- return -1;
- }
- }
-
- return 0;
-}
-
-static int wait_zombies(struct task_restore_args *task_args)
-{
- int i;
-
- atomic_add(task_args->zombies_n, &task_entries->nr_zombies);
-
- for (i = 0; i < task_args->zombies_n; i++) {
- if (sys_waitid(P_PID, task_args->zombies[i], NULL, WNOWAIT | WEXITED, NULL) < 0) {
- pr_err("Wait on %d zombie failed\n", task_args->zombies[i]);
- return -1;
- }
- pr_debug("%ld: Collect a zombie with pid %d\n",
- sys_getpid(), task_args->zombies[i]);
- futex_dec_and_wake(&task_entries->nr_in_progress);
- }
-
- return 0;
-}
-
-/*
- * The main routine to restore task via sigreturn.
- * This one is very special, we never return there
- * but use sigreturn facility to restore core registers
- * and jump execution to some predefined ip read from
- * core file.
- */
-long __export_restore_task(struct task_restore_args *args)
-{
- long ret = -1;
- int i;
- VmaEntry *vma_entry;
- unsigned long va;
-
- struct rt_sigframe *rt_sigframe;
- struct prctl_mm_map prctl_map;
- unsigned long new_sp;
- k_rtsigset_t to_block;
- pid_t my_pid = sys_getpid();
- rt_sigaction_t act;
-
- bootstrap_start = args->bootstrap_start;
- bootstrap_len = args->bootstrap_len;
-
-#ifdef CONFIG_VDSO
- vdso_rt_size = args->vdso_rt_size;
-#endif
-
- task_entries = args->task_entries;
- helpers = args->helpers;
- n_helpers = args->helpers_n;
- zombies = args->zombies;
- n_zombies = args->zombies_n;
- *args->breakpoint = rst_sigreturn;
-
- ksigfillset(&act.rt_sa_mask);
- act.rt_sa_handler = sigchld_handler;
- act.rt_sa_flags = SA_SIGINFO | SA_RESTORER | SA_RESTART;
- act.rt_sa_restorer = cr_restore_rt;
- sys_sigaction(SIGCHLD, &act, NULL, sizeof(k_rtsigset_t));
-
- log_set_fd(args->logfd);
- log_set_loglevel(args->loglevel);
-
- pr_info("Switched to the restorer %d\n", my_pid);
-
- if (vdso_do_park(&args->vdso_sym_rt, args->vdso_rt_parked_at, vdso_rt_size))
- goto core_restore_end;
-
- if (unmap_old_vmas((void *)args->premmapped_addr, args->premmapped_len,
- bootstrap_start, bootstrap_len, args->task_size))
- goto core_restore_end;
-
- /* Shift private vma-s to the left */
- for (i = 0; i < args->vmas_n; i++) {
- vma_entry = args->vmas + i;
-
- if (!vma_entry_is_private(vma_entry, args->task_size))
- continue;
-
- if (vma_entry->end >= args->task_size)
- continue;
-
- if (vma_entry->start > vma_entry->shmid)
- break;
-
- if (vma_remap(vma_premmaped_start(vma_entry),
- vma_entry->start, vma_entry_len(vma_entry)))
- goto core_restore_end;
- }
-
- /* Shift private vma-s to the right */
- for (i = args->vmas_n - 1; i >= 0; i--) {
- vma_entry = args->vmas + i;
-
- if (!vma_entry_is_private(vma_entry, args->task_size))
- continue;
-
- if (vma_entry->start > args->task_size)
- continue;
-
- if (vma_entry->start < vma_entry->shmid)
- break;
-
- if (vma_remap(vma_premmaped_start(vma_entry),
- vma_entry->start, vma_entry_len(vma_entry)))
- goto core_restore_end;
- }
-
- /*
- * OK, lets try to map new one.
- */
- for (i = 0; i < args->vmas_n; i++) {
- vma_entry = args->vmas + i;
-
- if (!vma_entry_is(vma_entry, VMA_AREA_REGULAR))
- continue;
-
- if (vma_entry_is_private(vma_entry, args->task_size))
- continue;
-
- va = restore_mapping(vma_entry);
-
- if (va != vma_entry->start) {
- pr_err("Can't restore %"PRIx64" mapping with %lx\n", vma_entry->start, va);
- goto core_restore_end;
- }
- }
-
-#ifdef CONFIG_VDSO
- /*
- * Proxify vDSO.
- */
- for (i = 0; i < args->vmas_n; i++) {
- if (vma_entry_is(&args->vmas[i], VMA_AREA_VDSO) ||
- vma_entry_is(&args->vmas[i], VMA_AREA_VVAR)) {
- if (vdso_proxify("dumpee", &args->vdso_sym_rt,
- args->vdso_rt_parked_at,
- i, args->vmas, args->vmas_n))
- goto core_restore_end;
- break;
- }
- }
-#endif
-
- /*
- * Walk though all VMAs again to drop PROT_WRITE
- * if it was not there.
- */
- for (i = 0; i < args->vmas_n; i++) {
- vma_entry = args->vmas + i;
-
- if (!(vma_entry_is(vma_entry, VMA_AREA_REGULAR)))
- continue;
-
- if (vma_entry->prot & PROT_WRITE)
- continue;
-
- sys_mprotect(decode_pointer(vma_entry->start),
- vma_entry_len(vma_entry),
- vma_entry->prot);
- }
-
- /*
- * Finally restore madivse() bits
- */
- for (i = 0; i < args->vmas_n; i++) {
- unsigned long m;
-
- vma_entry = args->vmas + i;
- if (!vma_entry->has_madv || !vma_entry->madv)
- continue;
-
- for (m = 0; m < sizeof(vma_entry->madv) * 8; m++) {
- if (vma_entry->madv & (1ul << m)) {
- ret = sys_madvise(vma_entry->start,
- vma_entry_len(vma_entry),
- m);
- if (ret) {
- pr_err("madvise(%"PRIx64", %"PRIu64", %ld) "
- "failed with %ld\n",
- vma_entry->start,
- vma_entry_len(vma_entry),
- m, ret);
- goto core_restore_end;
- }
- }
- }
- }
-
- /*
- * Now when all VMAs are in their places time to set
- * up AIO rings.
- */
-
- for (i = 0; i < args->rings_n; i++) {
- struct rst_aio_ring *raio = &args->rings[i];
- unsigned long ctx = 0;
- int ret;
-
- ret = sys_io_setup(raio->nr_req, &ctx);
- if (ret < 0) {
- pr_err("Ring setup failed with %d\n", ret);
- goto core_restore_end;
- }
-
- if (ctx == raio->addr) /* Lucky bastards we are! */
- continue;
-
- /*
- * If we failed to get the proper nr_req right and
- * created smaller or larger ring, then this remap
- * will (should) fail, since AIO rings has immutable
- * size.
- *
- * This is not great, but anyway better than putting
- * a ring of wrong size into correct place.
- */
-
- ctx = sys_mremap(ctx, raio->len, raio->len,
- MREMAP_FIXED | MREMAP_MAYMOVE,
- raio->addr);
- if (ctx != raio->addr) {
- pr_err("Ring remap failed with %ld\n", ctx);
- goto core_restore_end;
- }
-
- /*
- * Now check that kernel not just remapped the
- * ring into new place, but updated the internal
- * context state respectively.
- */
-
- ret = sys_io_getevents(ctx, 0, 1, NULL, NULL);
- if (ret != 0) {
- if (ret < 0)
- pr_err("Kernel doesn't remap AIO rings\n");
- else
- pr_err("AIO context screwed up\n");
-
- goto core_restore_end;
- }
- }
-
- ret = 0;
-
- /*
- * Tune up the task fields.
- */
- ret = sys_prctl_safe(PR_SET_NAME, (long)args->comm, 0, 0);
- if (ret)
- goto core_restore_end;
-
- /*
- * New kernel interface with @PR_SET_MM_MAP will become
- * more widespread once kernel get deployed over the world.
- * Thus lets be opportunistic and use new inteface as a try.
- */
- prctl_map = (struct prctl_mm_map) {
- .start_code = args->mm.mm_start_code,
- .end_code = args->mm.mm_end_code,
- .start_data = args->mm.mm_start_data,
- .end_data = args->mm.mm_end_data,
- .start_stack = args->mm.mm_start_stack,
- .start_brk = args->mm.mm_start_brk,
- .brk = args->mm.mm_brk,
- .arg_start = args->mm.mm_arg_start,
- .arg_end = args->mm.mm_arg_end,
- .env_start = args->mm.mm_env_start,
- .env_end = args->mm.mm_env_end,
- .auxv = (void *)args->mm_saved_auxv,
- .auxv_size = args->mm_saved_auxv_size,
- .exe_fd = args->fd_exe_link,
- };
- ret = sys_prctl(PR_SET_MM, PR_SET_MM_MAP, (long)&prctl_map, sizeof(prctl_map), 0);
- if (ret == -EINVAL) {
- ret = sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_CODE, (long)args->mm.mm_start_code, 0);
- ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_END_CODE, (long)args->mm.mm_end_code, 0);
- ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_DATA, (long)args->mm.mm_start_data, 0);
- ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_END_DATA, (long)args->mm.mm_end_data, 0);
- ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_STACK, (long)args->mm.mm_start_stack, 0);
- ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_BRK, (long)args->mm.mm_start_brk, 0);
- ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_BRK, (long)args->mm.mm_brk, 0);
- ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ARG_START, (long)args->mm.mm_arg_start, 0);
- ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ARG_END, (long)args->mm.mm_arg_end, 0);
- ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ENV_START, (long)args->mm.mm_env_start, 0);
- ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ENV_END, (long)args->mm.mm_env_end, 0);
- ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_AUXV, (long)args->mm_saved_auxv, args->mm_saved_auxv_size);
-
- /*
- * Because of requirements applied from kernel side
- * we need to restore /proc/pid/exe symlink late,
- * after old existing VMAs are superseded with
- * new ones from image file.
- */
- ret |= restore_self_exe_late(args);
- } else
- sys_close(args->fd_exe_link);
-
- if (ret)
- goto core_restore_end;
-
- /*
- * We need to prepare a valid sigframe here, so
- * after sigreturn the kernel will pick up the
- * registers from the frame, set them up and
- * finally pass execution to the new IP.
- */
- rt_sigframe = (void *)args->t->mem_zone.rt_sigframe;
-
- if (restore_thread_common(rt_sigframe, args->t))
- goto core_restore_end;
-
- /*
- * Threads restoration. This requires some more comments. This
- * restorer routine and thread restorer routine has the following
- * memory map, prepared by a caller code.
- *
- * | <-- low addresses high addresses --> |
- * +-------------------------------------------------------+-----------------------+
- * | this proc body | own stack | rt_sigframe space | thread restore zone |
- * +-------------------------------------------------------+-----------------------+
- *
- * where each thread restore zone is the following
- *
- * | <-- low addresses high addresses --> |
- * +--------------------------------------------------------------------------+
- * | thread restore proc | thread1 stack | thread1 rt_sigframe |
- * +--------------------------------------------------------------------------+
- */
-
- if (args->nr_threads > 1) {
- struct thread_restore_args *thread_args = args->thread_args;
- long clone_flags = CLONE_VM | CLONE_FILES | CLONE_SIGHAND |
- CLONE_THREAD | CLONE_SYSVSEM;
- long last_pid_len;
- long parent_tid;
- int i, fd;
-
- fd = sys_openat(args->proc_fd, LAST_PID_PATH, O_RDWR, 0);
- if (fd < 0) {
- pr_err("can't open last pid fd %d\n", fd);
- goto core_restore_end;
- }
-
- ret = sys_flock(fd, LOCK_EX);
- if (ret) {
- pr_err("Can't lock last_pid %d\n", fd);
- sys_close(fd);
- goto core_restore_end;
- }
-
- for (i = 0; i < args->nr_threads; i++) {
- char last_pid_buf[16], *s;
-
- /* skip self */
- if (thread_args[i].pid == args->t->pid)
- continue;
-
- new_sp = restorer_stack(thread_args + i);
- last_pid_len = vprint_num(last_pid_buf, sizeof(last_pid_buf), thread_args[i].pid - 1, &s);
- sys_lseek(fd, 0, SEEK_SET);
- ret = sys_write(fd, s, last_pid_len);
- if (ret < 0) {
- pr_err("Can't set last_pid %ld/%s\n", ret, last_pid_buf);
- sys_close(fd);
- goto core_restore_end;
- }
-
- /*
- * To achieve functionality like libc's clone()
- * we need a pure assembly here, because clone()'ed
- * thread will run with own stack and we must not
- * have any additional instructions... oh, dear...
- */
-
- RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, thread_args, args->clone_restore_fn);
- }
-
- ret = sys_flock(fd, LOCK_UN);
- if (ret) {
- pr_err("Can't unlock last_pid %ld\n", ret);
- sys_close(fd);
- goto core_restore_end;
- }
-
- sys_close(fd);
- }
-
- restore_rlims(args);
-
- ret = create_posix_timers(args);
- if (ret < 0) {
- pr_err("Can't restore posix timers %ld\n", ret);
- goto core_restore_end;
- }
-
- ret = timerfd_arm(args);
- if (ret < 0) {
- pr_err("Can't restore timerfd %ld\n", ret);
- goto core_restore_end;
- }
-
- pr_info("%ld: Restored\n", sys_getpid());
-
- restore_finish_stage(CR_STATE_RESTORE);
-
- if (wait_zombies(args) < 0)
- goto core_restore_end;
-
- if (wait_helpers(args) < 0)
- goto core_restore_end;
-
- ksigfillset(&to_block);
- ret = sys_sigprocmask(SIG_SETMASK, &to_block, NULL, sizeof(k_rtsigset_t));
- if (ret) {
- pr_err("Unable to block signals %ld\n", ret);
- goto core_restore_end;
- }
-
- sys_sigaction(SIGCHLD, &args->sigchld_act, NULL, sizeof(k_rtsigset_t));
-
- ret = restore_signals(args->siginfo, args->siginfo_n, true);
- if (ret)
- goto core_restore_end;
-
- ret = restore_signals(args->t->siginfo, args->t->siginfo_n, false);
- if (ret)
- goto core_restore_end;
-
- restore_finish_stage(CR_STATE_RESTORE_SIGCHLD);
-
- rst_tcp_socks_all(args);
-
- /* The kernel restricts setting seccomp to uid 0 in the current user
- * ns, so we must do this before restore_creds.
- */
- pr_info("restoring seccomp mode %d for %ld\n", args->seccomp_mode, sys_getpid());
- if (restore_seccomp(args))
- goto core_restore_end;
-
- /*
- * Writing to last-pid is CAP_SYS_ADMIN protected,
- * turning off TCP repair is CAP_SYS_NED_ADMIN protected,
- * thus restore* creds _after_ all of the above.
- */
- ret = restore_creds(args->t->creds_args, args->proc_fd);
- ret = ret || restore_dumpable_flag(&args->mm);
- ret = ret || restore_pdeath_sig(args->t);
-
- futex_set_and_wake(&thread_inprogress, args->nr_threads);
-
- restore_finish_stage(CR_STATE_RESTORE_CREDS);
-
- if (ret)
- BUG();
-
- /* Wait until children stop to use args->task_entries */
- futex_wait_while_gt(&thread_inprogress, 1);
-
- sys_close(args->proc_fd);
- log_set_fd(-1);
-
- /*
- * The code that prepared the itimers makes shure the
- * code below doesn't fail due to bad timing values.
- */
-
-#define itimer_armed(args, i) \
- (args->itimers[i].it_interval.tv_sec || \
- args->itimers[i].it_interval.tv_usec)
-
- if (itimer_armed(args, 0))
- sys_setitimer(ITIMER_REAL, &args->itimers[0], NULL);
- if (itimer_armed(args, 1))
- sys_setitimer(ITIMER_VIRTUAL, &args->itimers[1], NULL);
- if (itimer_armed(args, 2))
- sys_setitimer(ITIMER_PROF, &args->itimers[2], NULL);
-
- restore_posix_timers(args);
-
- sys_munmap(args->rst_mem, args->rst_mem_size);
-
- /*
- * Sigframe stack.
- */
- new_sp = (long)rt_sigframe + SIGFRAME_OFFSET;
-
- /*
- * Prepare the stack and call for sigreturn,
- * pure assembly since we don't need any additional
- * code insns from gcc.
- */
- rst_sigreturn(new_sp);
-
-core_restore_end:
- futex_abort_and_wake(&task_entries->nr_in_progress);
- pr_err("Restorer fail %ld\n", sys_getpid());
- sys_exit_group(1);
- return -1;
-}
diff --git a/pie/util-fd.c b/pie/util-fd.c
deleted file mode 100644
index d90fd12b236c..000000000000
--- a/pie/util-fd.c
+++ /dev/null
@@ -1,168 +0,0 @@
-#include <sys/socket.h>
-#include <sys/un.h>
-#include <sys/mount.h>
-
-#include <errno.h>
-
-#include "compiler.h"
-#include "log.h"
-#include "asm/string.h"
-#include "asm/types.h"
-
-#ifdef CR_NOGLIBC
-# include "syscall.h"
-# define __sys(foo) sys_##foo
-#else
-# define __sys(foo) foo
-#endif
-
-#include "util-pie.h"
-#include "fcntl.h"
-
-#include "bug.h"
-
-static void scm_fdset_init_chunk(struct scm_fdset *fdset, int nr_fds)
-{
- struct cmsghdr *cmsg;
-
- fdset->hdr.msg_controllen = CMSG_LEN(sizeof(int) * nr_fds);
-
- cmsg = CMSG_FIRSTHDR(&fdset->hdr);
- cmsg->cmsg_len = fdset->hdr.msg_controllen;
-}
-
-static int *scm_fdset_init(struct scm_fdset *fdset, struct sockaddr_un *saddr,
- int saddr_len, bool with_flags)
-{
- struct cmsghdr *cmsg;
-
- BUILD_BUG_ON(sizeof(fdset->msg_buf) < (CMSG_SPACE(sizeof(int) * CR_SCM_MAX_FD)));
-
- fdset->iov.iov_base = fdset->opts;
- fdset->iov.iov_len = with_flags ? sizeof(fdset->opts) : 1;
-
- fdset->hdr.msg_iov = &fdset->iov;
- fdset->hdr.msg_iovlen = 1;
- fdset->hdr.msg_name = (struct sockaddr *)saddr;
- fdset->hdr.msg_namelen = saddr_len;
-
- fdset->hdr.msg_control = &fdset->msg_buf;
- fdset->hdr.msg_controllen = CMSG_LEN(sizeof(int) * CR_SCM_MAX_FD);
-
- cmsg = CMSG_FIRSTHDR(&fdset->hdr);
- cmsg->cmsg_len = fdset->hdr.msg_controllen;
- cmsg->cmsg_level = SOL_SOCKET;
- cmsg->cmsg_type = SCM_RIGHTS;
-
- return (int *)CMSG_DATA(cmsg);
-}
-
-int send_fds(int sock, struct sockaddr_un *saddr, int len,
- int *fds, int nr_fds, bool with_flags)
-{
- struct scm_fdset fdset;
- int *cmsg_data;
- int i, min_fd, ret;
-
- cmsg_data = scm_fdset_init(&fdset, saddr, len, with_flags);
- for (i = 0; i < nr_fds; i += min_fd) {
- min_fd = min(CR_SCM_MAX_FD, nr_fds - i);
- scm_fdset_init_chunk(&fdset, min_fd);
- builtin_memcpy(cmsg_data, &fds[i], sizeof(int) * min_fd);
-
- if (with_flags) {
- int j;
-
- for (j = 0; j < min_fd; j++) {
- int flags, fd = fds[i + j];
- struct fd_opts *p = fdset.opts + j;
- struct f_owner_ex owner_ex;
- u32 v[2];
-
- flags = __sys(fcntl)(fd, F_GETFD, 0);
- if (flags < 0) {
- pr_err("fcntl(%d, F_GETFD) -> %d\n", fd, flags);
- return -1;
- }
-
- p->flags = (char)flags;
-
- ret = __sys(fcntl)(fd, F_GETOWN_EX, (long)&owner_ex);
- if (ret) {
- pr_err("fcntl(%d, F_GETOWN_EX) -> %d\n", fd, ret);
- return -1;
- }
-
- /*
- * Simple case -- nothing is changed.
- */
- if (owner_ex.pid == 0) {
- p->fown.pid = 0;
- continue;
- }
-
- ret = __sys(fcntl)(fd, F_GETOWNER_UIDS, (long)&v);
- if (ret) {
- pr_err("fcntl(%d, F_GETOWNER_UIDS) -> %d\n", fd, ret);
- return -1;
- }
-
- p->fown.uid = v[0];
- p->fown.euid = v[1];
- p->fown.pid_type = owner_ex.type;
- p->fown.pid = owner_ex.pid;
- }
- }
-
- ret = __sys(sendmsg)(sock, &fdset.hdr, 0);
- if (ret <= 0)
- return ret ? : -1;
- }
-
- return 0;
-}
-
-int recv_fds(int sock, int *fds, int nr_fds, struct fd_opts *opts)
-{
- struct scm_fdset fdset;
- struct cmsghdr *cmsg;
- int *cmsg_data;
- int ret;
- int i, min_fd;
-
- cmsg_data = scm_fdset_init(&fdset, NULL, 0, opts != NULL);
- for (i = 0; i < nr_fds; i += min_fd) {
- min_fd = min(CR_SCM_MAX_FD, nr_fds - i);
- scm_fdset_init_chunk(&fdset, min_fd);
-
- ret = __sys(recvmsg)(sock, &fdset.hdr, 0);
- if (ret <= 0)
- return ret ? : -1;
-
- cmsg = CMSG_FIRSTHDR(&fdset.hdr);
- if (!cmsg || cmsg->cmsg_type != SCM_RIGHTS)
- return -EINVAL;
- if (fdset.hdr.msg_flags & MSG_CTRUNC)
- return -ENFILE;
-
- min_fd = (cmsg->cmsg_len - sizeof(struct cmsghdr)) / sizeof(int);
- /*
- * In case if kernel screwed the recepient, most probably
- * the caller stack frame will be overwriten, just scream
- * and exit.
- *
- * FIXME Need to sanitize util.h to be able to include it
- * into files which do not have glibc and a couple of
- * sys_write_ helpers. Meawhile opencoded BUG_ON here.
- */
- BUG_ON(min_fd > CR_SCM_MAX_FD);
-
- if (unlikely(min_fd <= 0))
- return -1;
- builtin_memcpy(&fds[i], cmsg_data, sizeof(int) * min_fd);
- if (opts)
- builtin_memcpy(opts + i, fdset.opts, sizeof(struct fd_opts) * min_fd);
- }
-
- return 0;
-}
diff --git a/pie/util-vdso.c b/pie/util-vdso.c
deleted file mode 100644
index e93b110fe43b..000000000000
--- a/pie/util-vdso.c
+++ /dev/null
@@ -1,210 +0,0 @@
-#include <stdlib.h>
-#include <stdio.h>
-#include <unistd.h>
-#include <elf.h>
-#include <fcntl.h>
-#include <errno.h>
-
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/mman.h>
-
-#include "asm/string.h"
-#include "asm/types.h"
-
-#include "image.h"
-#include "util-vdso.h"
-#include "vma.h"
-#include "log.h"
-#include "bug.h"
-
-#ifdef LOG_PREFIX
-# undef LOG_PREFIX
-#endif
-#define LOG_PREFIX "vdso: "
-
-/* Check if pointer is out-of-bound */
-static bool __ptr_oob(void *ptr, void *start, size_t size)
-{
- void *end = (void *)((unsigned long)start + size);
- return ptr > end || ptr < start;
-}
-
-/*
- * Elf hash, see format specification.
- */
-static unsigned long elf_hash(const unsigned char *name)
-{
- unsigned long h = 0, g;
-
- while (*name) {
- h = (h << 4) + *name++;
- g = h & 0xf0000000ul;
- if (g)
- h ^= g >> 24;
- h &= ~g;
- }
- return h;
-}
-
-int vdso_fill_symtable(char *mem, size_t size, struct vdso_symtable *t)
-{
- const char *vdso_symbols[VDSO_SYMBOL_MAX] = {
- ARCH_VDSO_SYMBOLS
- };
-
- Elf64_Phdr *dynamic = NULL, *load = NULL;
- Elf64_Ehdr *ehdr = (void *)mem;
- Elf64_Dyn *dyn_strtab = NULL;
- Elf64_Dyn *dyn_symtab = NULL;
- Elf64_Dyn *dyn_strsz = NULL;
- Elf64_Dyn *dyn_syment = NULL;
- Elf64_Dyn *dyn_hash = NULL;
- Elf64_Word *hash = NULL;
- Elf64_Phdr *phdr;
- Elf64_Dyn *d;
-
- Elf64_Word *bucket, *chain;
- Elf64_Word nbucket, nchain;
-
- /*
- * See Elf specification for this magic values.
- */
- static const char elf_ident[] = {
- 0x7f, 0x45, 0x4c, 0x46, 0x02, 0x01, 0x01, 0x00,
- 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
- };
-
- char *dynsymbol_names;
- unsigned int i, j, k;
-
- BUILD_BUG_ON(sizeof(elf_ident) != sizeof(ehdr->e_ident));
-
- pr_debug("Parsing at %lx %lx\n", (long)mem, (long)mem + (long)size);
-
- /*
- * Make sure it's a file we support.
- */
- if (builtin_memcmp(ehdr->e_ident, elf_ident, sizeof(elf_ident))) {
- pr_err("Elf header magic mismatch\n");
- return -EINVAL;
- }
-
- /*
- * We need PT_LOAD and PT_DYNAMIC here. Each once.
- */
- phdr = (void *)&mem[ehdr->e_phoff];
- for (i = 0; i < ehdr->e_phnum; i++, phdr++) {
- if (__ptr_oob(phdr, mem, size))
- goto err_oob;
- switch (phdr->p_type) {
- case PT_DYNAMIC:
- if (dynamic) {
- pr_err("Second PT_DYNAMIC header\n");
- return -EINVAL;
- }
- dynamic = phdr;
- break;
- case PT_LOAD:
- if (load) {
- pr_err("Second PT_LOAD header\n");
- return -EINVAL;
- }
- load = phdr;
- break;
- }
- }
-
- if (!load || !dynamic) {
- pr_err("One of obligated program headers is missed\n");
- return -EINVAL;
- }
-
- pr_debug("PT_LOAD p_vaddr: %lx\n", (unsigned long)load->p_vaddr);
-
- /*
- * Dynamic section tags should provide us the rest of information
- * needed. Note that we're interested in a small set of tags.
- */
- d = (void *)&mem[dynamic->p_offset];
- for (i = 0; i < dynamic->p_filesz / sizeof(*d); i++, d++) {
- if (__ptr_oob(d, mem, size))
- goto err_oob;
-
- if (d->d_tag == DT_NULL) {
- break;
- } else if (d->d_tag == DT_STRTAB) {
- dyn_strtab = d;
- pr_debug("DT_STRTAB: %lx\n", (unsigned long)d->d_un.d_ptr);
- } else if (d->d_tag == DT_SYMTAB) {
- dyn_symtab = d;
- pr_debug("DT_SYMTAB: %lx\n", (unsigned long)d->d_un.d_ptr);
- } else if (d->d_tag == DT_STRSZ) {
- dyn_strsz = d;
- pr_debug("DT_STRSZ: %lx\n", (unsigned long)d->d_un.d_val);
- } else if (d->d_tag == DT_SYMENT) {
- dyn_syment = d;
- pr_debug("DT_SYMENT: %lx\n", (unsigned long)d->d_un.d_val);
- } else if (d->d_tag == DT_HASH) {
- dyn_hash = d;
- pr_debug("DT_HASH: %lx\n", (unsigned long)d->d_un.d_ptr);
- }
- }
-
- if (!dyn_strtab || !dyn_symtab || !dyn_strsz || !dyn_syment || !dyn_hash) {
- pr_err("Not all dynamic entries are present\n");
- return -EINVAL;
- }
-
- dynsymbol_names = &mem[dyn_strtab->d_un.d_val - load->p_vaddr];
- if (__ptr_oob(dynsymbol_names, mem, size))
- goto err_oob;
-
- hash = (void *)&mem[(unsigned long)dyn_hash->d_un.d_ptr - (unsigned long)load->p_vaddr];
- if (__ptr_oob(hash, mem, size))
- goto err_oob;
-
- nbucket = hash[0];
- nchain = hash[1];
- bucket = &hash[2];
- chain = &hash[nbucket + 2];
-
- pr_debug("nbucket %lx nchain %lx bucket %lx chain %lx\n",
- (long)nbucket, (long)nchain, (unsigned long)bucket, (unsigned long)chain);
-
- for (i = 0; i < VDSO_SYMBOL_MAX; i++) {
- const char * symbol = vdso_symbols[i];
- k = elf_hash((const unsigned char *)symbol);
-
- for (j = bucket[k % nbucket]; j < nchain && chain[j] != STN_UNDEF; j = chain[j]) {
- Elf64_Sym *sym = (void *)&mem[dyn_symtab->d_un.d_ptr - load->p_vaddr];
- char *name;
-
- sym = &sym[j];
- if (__ptr_oob(sym, mem, size))
- continue;
-
- if (ELF64_ST_TYPE(sym->st_info) != STT_FUNC &&
- ELF64_ST_BIND(sym->st_info) != STB_GLOBAL)
- continue;
-
- name = &dynsymbol_names[sym->st_name];
- if (__ptr_oob(name, mem, size))
- continue;
-
- if (builtin_strcmp(name, symbol))
- continue;
-
- builtin_memcpy(t->symbols[i].name, name, sizeof(t->symbols[i].name));
- t->symbols[i].offset = (unsigned long)sym->st_value - load->p_vaddr;
- break;
- }
- }
-
- return 0;
-
-err_oob:
- pr_err("Corrupted Elf data\n");
- return -EFAULT;
-}
-
diff --git a/pie/util.c b/pie/util.c
deleted file mode 100644
index 354667294e37..000000000000
--- a/pie/util.c
+++ /dev/null
@@ -1,47 +0,0 @@
-#include <sys/socket.h>
-#include <sys/un.h>
-#include <sys/mount.h>
-#include <unistd.h>
-#include <errno.h>
-
-#include "compiler.h"
-#include "asm/string.h"
-#include "asm/types.h"
-#include "fcntl.h"
-#include "log.h"
-#include "util-pie.h"
-
-#ifdef CR_NOGLIBC
-# include "syscall.h"
-# define __sys(foo) sys_##foo
-#else
-# define __sys(foo) foo
-#endif
-
-int open_detach_mount(char *dir)
-{
- int fd, ret;
-
- fd = __sys(open)(dir, O_RDONLY | O_DIRECTORY, 0);
- if (fd < 0)
- pr_err("Can't open directory %s: %d\n", dir, fd);
-
- ret = __sys(umount2)(dir, MNT_DETACH);
- if (ret) {
- pr_err("Can't detach mount %s: %d\n", dir, ret);
- goto err_close;
- }
-
- ret = __sys(rmdir)(dir);
- if (ret) {
- pr_err("Can't remove tmp dir %s: %d\n", dir, ret);
- goto err_close;
- }
-
- return fd;
-
-err_close:
- if (fd >= 0)
- __sys(close)(fd);
- return -1;
-}
diff --git a/pipes.c b/pipes.c
deleted file mode 100644
index a1552127837f..000000000000
--- a/pipes.c
+++ /dev/null
@@ -1,521 +0,0 @@
-#include <unistd.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <stdlib.h>
-#include <sys/mman.h>
-
-#include "imgset.h"
-#include "image.h"
-#include "files.h"
-#include "pipes.h"
-#include "util-pie.h"
-
-#include "protobuf.h"
-#include "protobuf/pipe.pb-c.h"
-#include "protobuf/pipe-data.pb-c.h"
-
-static LIST_HEAD(pipes);
-
-static void show_saved_pipe_fds(struct pipe_info *pi)
-{
- struct fdinfo_list_entry *fle;
-
- pr_info(" `- ID %p %#xpn", pi, pi->pe->id);
- list_for_each_entry(fle, &pi->d.fd_info_head, desc_list)
- pr_info(" `- FD %d pid %d\n", fle->fe->fd, fle->pid);
-}
-
-static int pipe_data_read(struct cr_img *img, struct pipe_data_rst *r)
-{
- unsigned long bytes = r->pde->bytes;
-
- if (!bytes)
- return 0;
-
- /*
- * We potentially allocate more memory than required for data,
- * but this is OK. Look at restore_pipe_data -- it vmsplice-s
- * this into the kernel with F_GIFT flag (since some time it
- * works on non-aligned data), thus just giving this page to
- * pipe buffer. And since kernel allocates pipe buffers in pages
- * anyway we don't increase memory consumption :)
- */
-
- r->data = mmap(NULL, bytes, PROT_READ | PROT_WRITE,
- MAP_SHARED | MAP_ANON, 0, 0);
- if (r->data == MAP_FAILED) {
- pr_perror("Can't map mem for pipe buffers");
- return -1;
- }
-
- return read_img_buf(img, r->data, bytes);
-}
-
-int collect_pipe_data(int img_type, struct pipe_data_rst **hash)
-{
- int ret;
- struct cr_img *img;
- struct pipe_data_rst *r = NULL;
-
- img = open_image(img_type, O_RSTR);
- if (!img)
- return -1;
-
- while (1) {
- ret = -1;
- r = xmalloc(sizeof(*r));
- if (!r)
- break;
-
- ret = pb_read_one_eof(img, &r->pde, PB_PIPE_DATA);
- if (ret <= 0)
- break;
-
- ret = pipe_data_read(img, r);
- if (ret < 0)
- break;
-
- ret = r->pde->pipe_id & PIPE_DATA_HASH_MASK;
- r->next = hash[ret];
- hash[ret] = r;
-
- pr_info("Collected pipe data for %#x (chain %u)\n",
- r->pde->pipe_id, ret);
- }
-
- if (r && r->pde)
- pipe_data_entry__free_unpacked(r->pde, NULL);
- xfree(r);
-
- close_image(img);
- return ret;
-}
-
-/* Choose who will restore a pipe. */
-void mark_pipe_master(void)
-{
- LIST_HEAD(head);
-
- pr_info("Pipes:\n");
-
- while (1) {
- struct fdinfo_list_entry *fle;
- struct pipe_info *pi, *pic, *p;
- struct pipe_info *pr = NULL, *pw = NULL;
-
- if (list_empty(&pipes))
- break;
-
- pi = list_first_entry(&pipes, struct pipe_info, list);
- list_move(&pi->list, &head);
-
- pr_info(" `- PIPE ID %#x\n", pi->pe->pipe_id);
- show_saved_pipe_fds(pi);
-
- fle = file_master(&pi->d);
- p = pi;
- if (!(pi->pe->flags & O_LARGEFILE)) {
- if (pi->pe->flags & O_WRONLY) {
- if (pw == NULL)
- pw = pi;
- } else {
- if (pr == NULL)
- pr = pi;
- }
- }
-
- list_for_each_entry(pic, &pi->pipe_list, pipe_list) {
- struct fdinfo_list_entry *f;
-
- list_move(&pic->list, &head);
- f = file_master(&pic->d);
- if (fdinfo_rst_prio(f, fle)) {
- p = pic;
- fle = f;
- }
-
- if (!(pic->pe->flags & O_LARGEFILE)) {
- if (pic->pe->flags & O_WRONLY) {
- if (pw == NULL)
- pw = pic;
- } else {
- if (pr == NULL)
- pr = pic;
- }
- }
-
- show_saved_pipe_fds(pic);
- }
- p->create = 1;
- if (pr)
- pr->reopen = 0;
- if (pw)
- pw->reopen = 0;
- pr_info(" by %#x\n", p->pe->id);
- }
-
- list_splice(&head, &pipes);
-}
-
-static struct pipe_data_rst *pd_hash_pipes[PIPE_DATA_HASH_SIZE];
-
-int restore_pipe_data(int img_type, int pfd, u32 id, struct pipe_data_rst **hash)
-{
- int ret;
- struct pipe_data_rst *pd;
- struct iovec iov;
-
- for (pd = hash[id & PIPE_DATA_HASH_MASK]; pd != NULL; pd = pd->next)
- if (pd->pde->pipe_id == id)
- break;
-
- if (!pd) { /* no data for this pipe */
- pr_info("No data for pipe %#x\n", id);
- return 0;
- }
-
- if (!pd->pde->bytes)
- goto out;
-
- if (!pd->data) {
- pr_err("Double data restore occurred on %#x\n", id);
- return -1;
- }
-
- iov.iov_base = pd->data;
- iov.iov_len = pd->pde->bytes;
-
- while (iov.iov_len > 0) {
- ret = vmsplice(pfd, &iov, 1, SPLICE_F_GIFT | SPLICE_F_NONBLOCK);
- if (ret < 0) {
- pr_perror("%#x: Error splicing data", id);
- goto err;
- }
-
- if (ret == 0 || ret > iov.iov_len /* sanity */) {
- pr_err("%#x: Wanted to restore %zu bytes, but got %d\n", id,
- iov.iov_len, ret);
- ret = -1;
- goto err;
- }
-
- iov.iov_base += ret;
- iov.iov_len -= ret;
- }
-
- /*
- * 3 reasons for killing the buffer from our address space:
- *
- * 1. We gifted the pages to the kernel to optimize memory usage, thus
- * accidental memory corruption can change the pipe buffer.
- * 2. This will make the vmas restoration a bit faster due to less self
- * mappings to be unmapped.
- * 3. We can catch bugs with double pipe data restore.
- */
-
- munmap(pd->data, pd->pde->bytes);
- pd->data = NULL;
-out:
- ret = 0;
- if (pd->pde->has_size) {
- pr_info("Restoring size %#x for %#x\n",
- pd->pde->size, pd->pde->pipe_id);
- ret = fcntl(pfd, F_SETPIPE_SZ, pd->pde->size);
- if (ret < 0)
- pr_perror("Can't restore pipe size");
- else
- ret = 0;
- }
-err:
- return ret;
-}
-
-static int reopen_pipe(int fd, int flags)
-{
- int ret;
- char path[PSFDS];
-
- sprintf(path, "/proc/self/fd/%d", fd);
- ret = open(path, flags);
- if (ret < 0)
- pr_perror("Unable to reopen the pipe %s", path);
- close(fd);
-
- return ret;
-}
-
-static int recv_pipe_fd(struct pipe_info *pi)
-{
- struct fdinfo_list_entry *fle;
- int tmp, fd;
-
- fle = file_master(&pi->d);
- fd = fle->fe->fd;
-
- pr_info("\tWaiting fd for %d\n", fd);
-
- tmp = recv_fd(fd);
- if (tmp < 0) {
- pr_err("Can't get fd %d\n", tmp);
- return -1;
- }
- close(fd);
-
- if (pi->reopen)
- fd = reopen_pipe(tmp, pi->pe->flags);
- else
- fd = tmp;
- if (fd >= 0) {
- if (rst_file_params(fd, pi->pe->fown, pi->pe->flags)) {
- close(fd);
- return -1;
- }
- }
-
- return fd;
-}
-
-static char *pipe_d_name(struct file_desc *d, char *buf, size_t s)
-{
- struct pipe_info *pi;
-
- pi = container_of(d, struct pipe_info, d);
- if (snprintf(buf, s, "pipe:[%d]", pi->pe->pipe_id) >= s) {
- pr_err("Not enough room for pipe %d identifier string\n",
- pi->pe->pipe_id);
- return NULL;
- }
-
- return buf;
-}
-
-static int open_pipe(struct file_desc *d)
-{
- struct pipe_info *pi, *p;
- int ret, tmp;
- int pfd[2];
- int sock;
-
- pi = container_of(d, struct pipe_info, d);
- pr_info("\t\tCreating pipe pipe_id=%#x id=%#x\n", pi->pe->pipe_id, pi->pe->id);
- if (inherited_fd(d, &tmp)) {
- if (tmp < 0)
- return tmp;
-
- pi->reopen = 1;
- goto out;
- }
-
- if (!pi->create)
- return recv_pipe_fd(pi);
-
- if (pipe(pfd) < 0) {
- pr_perror("Can't create pipe");
- return -1;
- }
-
- ret = restore_pipe_data(CR_FD_PIPES_DATA, pfd[1],
- pi->pe->pipe_id, pd_hash_pipes);
- if (ret)
- return -1;
-
- sock = socket(PF_UNIX, SOCK_DGRAM, 0);
- if (sock < 0) {
- pr_perror("Can't create socket");
- return -1;
- }
-
- list_for_each_entry(p, &pi->pipe_list, pipe_list) {
- struct fdinfo_list_entry *fle;
- int fd;
-
- fle = file_master(&p->d);
- fd = pfd[p->pe->flags & O_WRONLY];
-
- if (send_fd_to_peer(fd, fle, sock)) {
- pr_perror("Can't send file descriptor");
- return -1;
- }
- }
-
- close(sock);
-
- close(pfd[!(pi->pe->flags & O_WRONLY)]);
- tmp = pfd[pi->pe->flags & O_WRONLY];
-
-out:
- if (pi->reopen)
- tmp = reopen_pipe(tmp, pi->pe->flags);
-
- if (tmp >= 0)
- if (rst_file_params(tmp, pi->pe->fown, pi->pe->flags))
- return -1;
-
- return tmp;
-}
-
-static int want_transport(FdinfoEntry *fe, struct file_desc *d)
-{
- struct pipe_info *pi;
-
- pi = container_of(d, struct pipe_info, d);
- return !pi->create;
-}
-
-static struct file_desc_ops pipe_desc_ops = {
- .type = FD_TYPES__PIPE,
- .open = open_pipe,
- .want_transport = want_transport,
- .name = pipe_d_name,
-};
-
-static int collect_one_pipe(void *o, ProtobufCMessage *base)
-{
- struct pipe_info *pi = o, *tmp;
-
- pi->pe = pb_msg(base, PipeEntry);
-
- pi->create = 0;
- pi->reopen = 1;
- pr_info("Collected pipe entry ID %#x PIPE ID %#x\n",
- pi->pe->id, pi->pe->pipe_id);
-
- if (file_desc_add(&pi->d, pi->pe->id, &pipe_desc_ops))
- return -1;
-
- INIT_LIST_HEAD(&pi->pipe_list);
- if (!inherited_fd(&pi->d, NULL)) {
- list_for_each_entry(tmp, &pipes, list)
- if (pi->pe->pipe_id == tmp->pe->pipe_id)
- break;
-
- if (&tmp->list != &pipes)
- list_add(&pi->pipe_list, &tmp->pipe_list);
- }
-
- list_add_tail(&pi->list, &pipes);
-
- return 0;
-}
-
-struct collect_image_info pipe_cinfo = {
- .fd_type = CR_FD_PIPES,
- .pb_type = PB_PIPE,
- .priv_size = sizeof(struct pipe_info),
- .collect = collect_one_pipe,
-};
-
-int collect_pipes(void)
-{
- return collect_pipe_data(CR_FD_PIPES_DATA, pd_hash_pipes);
-}
-
-int dump_one_pipe_data(struct pipe_data_dump *pd, int lfd, const struct fd_parms *p)
-{
- struct cr_img *img;
- int pipe_size, i, bytes;
- int steal_pipe[2];
- int ret = -1;
- PipeDataEntry pde = PIPE_DATA_ENTRY__INIT;
-
- if (p->flags & O_WRONLY)
- return 0;
-
- /* Maybe we've dumped it already */
- for (i = 0; i < pd->nr; i++) {
- if (pd->ids[i] == pipe_id(p))
- return 0;
- }
-
- pr_info("Dumping data from pipe %#x fd %d\n", pipe_id(p), lfd);
-
- if (pd->nr >= NR_PIPES_WITH_DATA) {
- pr_err("OOM storing pipe\n");
- return -1;
- }
-
- img = img_from_set(glob_imgset, pd->img_type);
- pd->ids[pd->nr++] = pipe_id(p);
-
- pipe_size = fcntl(lfd, F_GETPIPE_SZ);
- if (pipe_size < 0) {
- pr_err("Can't obtain piped data size\n");
- goto err;
- }
-
- if (pipe(steal_pipe) < 0) {
- pr_perror("Can't create pipe for stealing data");
- goto err;
- }
-
- bytes = tee(lfd, steal_pipe[1], pipe_size, SPLICE_F_NONBLOCK);
- if (bytes < 0) {
- if (errno != EAGAIN) {
- pr_perror("Can't pick pipe data");
- goto err_close;
- }
-
- bytes = 0;
- }
-
- pde.pipe_id = pipe_id(p);
- pde.bytes = bytes;
- pde.has_size = true;
- pde.size = pipe_size;
-
- if (pb_write_one(img, &pde, PB_PIPE_DATA))
- goto err_close;
-
- if (bytes) {
- int wrote;
-
- wrote = splice(steal_pipe[0], NULL, img_raw_fd(img), NULL, bytes, 0);
- if (wrote < 0) {
- pr_perror("Can't push pipe data");
- goto err_close;
- } else if (wrote != bytes) {
- pr_err("%#x: Wanted to write %d bytes, but wrote %d\n",
- pipe_id(p), bytes, wrote);
- goto err_close;
- }
- }
-
- ret = 0;
-
-err_close:
- close(steal_pipe[0]);
- close(steal_pipe[1]);
-err:
- return ret;
-}
-
-static struct pipe_data_dump pd_pipes = { .img_type = CR_FD_PIPES_DATA, };
-
-static int dump_one_pipe(int lfd, u32 id, const struct fd_parms *p)
-{
- PipeEntry pe = PIPE_ENTRY__INIT;
-
- pr_info("Dumping pipe %d with id %#x pipe_id %#x\n",
- lfd, id, pipe_id(p));
-
- if (p->flags & O_DIRECT) {
- pr_err("The packetized mode for pipes is not supported yet\n");
- return -1;
- }
-
- pe.id = id;
- pe.pipe_id = pipe_id(p);
- pe.flags = p->flags;
- pe.fown = (FownEntry *)&p->fown;
-
- if (pb_write_one(img_from_set(glob_imgset, CR_FD_PIPES), &pe, PB_PIPE))
- return -1;
-
- return dump_one_pipe_data(&pd_pipes, lfd, p);
-}
-
-const struct fdtype_ops pipe_dump_ops = {
- .type = FD_TYPES__PIPE,
- .dump = dump_one_pipe,
-};
diff --git a/plugin.c b/plugin.c
deleted file mode 100644
index f764ae7d3e63..000000000000
--- a/plugin.c
+++ /dev/null
@@ -1,247 +0,0 @@
-#include <unistd.h>
-#include <stdlib.h>
-#include <string.h>
-#include <dirent.h>
-#include <stdio.h>
-#include <errno.h>
-#include <dlfcn.h>
-
-#include "cr_options.h"
-#include "compiler.h"
-#include "xmalloc.h"
-#include "plugin.h"
-#include "list.h"
-#include "log.h"
-
-cr_plugin_ctl_t cr_plugin_ctl;
-
-/*
- * If we met old version of a plugin, selfgenerate a plugin descriptor for it.
- */
-static cr_plugin_desc_t *cr_gen_plugin_desc(void *h, char *path)
-{
- cr_plugin_desc_t *d;
-
- d = xzalloc(sizeof(*d));
- if (!d)
- return NULL;
-
- d->name = strdup(path);
- d->max_hooks = CR_PLUGIN_HOOK__MAX;
- d->version = CRIU_PLUGIN_VERSION_OLD;
-
- pr_warn("Generating dynamic descriptor for plugin `%s'."
- "Won't work in next version of the program."
- "Please update your plugin.\n", path);
-
-#define __assign_hook(__hook, __name) \
- do { \
- void *name; \
- name = dlsym(h, __name); \
- if (name) \
- d->hooks[CR_PLUGIN_HOOK__ ##__hook] = name; \
- } while (0)
-
- __assign_hook(DUMP_UNIX_SK, "cr_plugin_dump_unix_sk");
- __assign_hook(RESTORE_UNIX_SK, "cr_plugin_restore_unix_sk");
- __assign_hook(DUMP_EXT_FILE, "cr_plugin_dump_file");
- __assign_hook(RESTORE_EXT_FILE, "cr_plugin_restore_file");
- __assign_hook(DUMP_EXT_MOUNT, "cr_plugin_dump_ext_mount");
- __assign_hook(RESTORE_EXT_MOUNT, "cr_plugin_restore_ext_mount");
- __assign_hook(DUMP_EXT_LINK, "cr_plugin_dump_ext_link");
-
-#undef __assign_hook
-
- d->init = dlsym(h, "cr_plugin_init");
- d->exit = dlsym(h, "cr_plugin_fini");
-
- return d;
-}
-
-static void show_plugin_desc(cr_plugin_desc_t *d)
-{
- size_t i;
-
- pr_debug("Plugin \"%s\" (version %u hooks %u)\n",
- d->name, d->version, d->max_hooks);
- for (i = 0; i < d->max_hooks; i++) {
- if (d->hooks[i])
- pr_debug("\t%4zu -> %p\n", i, d->hooks[i]);
- }
-}
-
-static int verify_plugin(cr_plugin_desc_t *d)
-{
- if (d->version > CRIU_PLUGIN_VERSION) {
- pr_debug("Plugin %s has version %x while max %x supported\n",
- d->name, d->version, CRIU_PLUGIN_VERSION);
- return -1;
- }
-
- if (d->max_hooks > CR_PLUGIN_HOOK__MAX) {
- pr_debug("Plugin %s has %u assigned while max %u supported\n",
- d->name, d->max_hooks, CR_PLUGIN_HOOK__MAX);
- return -1;
- }
-
- return 0;
-}
-
-static int cr_lib_load(int stage, char *path)
-{
- cr_plugin_desc_t *d;
- plugin_desc_t *this;
- size_t i;
- void *h;
-
- h = dlopen(path, RTLD_LAZY);
- if (h == NULL) {
- pr_err("Unable to load %s: %s\n", path, dlerror());
- return -1;
- }
-
- /*
- * Load plugin descriptor. If plugin is too old -- create
- * dynamic plugin descriptor. In most cases this won't
- * be a common operation and plugins are not supposed to
- * be changing own format frequently.
- */
- d = dlsym(h, "CR_PLUGIN_DESC");
- if (!d)
- d = cr_gen_plugin_desc(h, path);
- if (!d) {
- pr_err("Can't load plugin %s\n", path);
- dlclose(h);
- return -1;
- }
-
- this = xzalloc(sizeof(*this));
- if (!this) {
- dlclose(h);
- return -1;
- }
-
- if (verify_plugin(d)) {
- pr_err("Corrupted plugin %s\n", path);
- xfree(this);
- dlclose(h);
- return -1;
- }
-
- this->d = d;
- this->dlhandle = h;
- INIT_LIST_HEAD(&this->list);
-
- for (i = 0; i < d->max_hooks; i++)
- INIT_LIST_HEAD(&this->link[i]);
-
- list_add_tail(&this->list, &cr_plugin_ctl.head);
- show_plugin_desc(d);
-
- if (d->init && d->init(stage)) {
- pr_err("Failed in init(%d) of \"%s\"\n", stage, d->name);
- list_del(&this->list);
- xfree(this);
- dlclose(h);
- return -1;
- }
-
- /*
- * Chain hooks into appropriate places for
- * fast handler access.
- */
- for (i = 0; i < d->max_hooks; i++) {
- if (!d->hooks[i])
- continue;
- list_add_tail(&this->link[i], &cr_plugin_ctl.hook_chain[i]);
- }
-
- return 0;
-}
-
-void cr_plugin_fini(int stage, int ret)
-{
- plugin_desc_t *this, *tmp;
-
- list_for_each_entry_safe(this, tmp, &cr_plugin_ctl.head, list) {
- void *h = this->dlhandle;
- size_t i;
-
- list_del(&this->list);
- if (this->d->exit)
- this->d->exit(stage, ret);
-
- for (i = 0; i < this->d->max_hooks; i++) {
- if (!list_empty(&this->link[i]))
- list_del(&this->link[i]);
- }
-
- if (this->d->version == CRIU_PLUGIN_VERSION_OLD)
- xfree(this->d);
- dlclose(h);
- }
-}
-
-int cr_plugin_init(int stage)
-{
- int exit_code = -1;
- char *path;
- size_t i;
- DIR *d;
-
- INIT_LIST_HEAD(&cr_plugin_ctl.head);
- for (i = 0; i < ARRAY_SIZE(cr_plugin_ctl.hook_chain); i++)
- INIT_LIST_HEAD(&cr_plugin_ctl.hook_chain[i]);
-
- if (opts.libdir == NULL) {
- path = getenv("CRIU_LIBS_DIR");
- if (path)
- opts.libdir = path;
- else {
- if (access(CR_PLUGIN_DEFAULT, F_OK))
- return 0;
-
- opts.libdir = CR_PLUGIN_DEFAULT;
- }
- }
-
- d = opendir(opts.libdir);
- if (d == NULL) {
- pr_perror("Unable to open directory %s", opts.libdir);
- return -1;
- }
-
- while (1) {
- char path[PATH_MAX];
- struct dirent *de;
- int len;
-
- errno = 0;
- de = readdir(d);
- if (de == NULL) {
- if (errno == 0)
- break;
- pr_perror("Unable to read the libraries directory");
- goto err;
- }
-
- len = strlen(de->d_name);
-
- if (len < 3 || strncmp(de->d_name + len - 3, ".so", 3))
- continue;
-
- snprintf(path, sizeof(path), "%s/%s", opts.libdir, de->d_name);
-
- if (cr_lib_load(stage, path))
- goto err;
- }
-
- exit_code = 0;
-err:
- closedir(d);
-
- if (exit_code)
- cr_plugin_fini(stage, exit_code);
-
- return exit_code;
-}
diff --git a/proc_parse.c b/proc_parse.c
deleted file mode 100644
index c7c577554bae..000000000000
--- a/proc_parse.c
+++ /dev/null
@@ -1,2444 +0,0 @@
-#include <stdio.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/mman.h>
-#include <sys/types.h>
-#include <dirent.h>
-#include <errno.h>
-#include <sys/stat.h>
-#include <string.h>
-#include <ctype.h>
-#include <linux/fs.h>
-
-#include "asm/types.h"
-#include "list.h"
-#include "util.h"
-#include "mount.h"
-#include "mman.h"
-#include "cpu.h"
-#include "file-lock.h"
-#include "pstree.h"
-#include "fsnotify.h"
-#include "posix-timer.h"
-#include "kerndat.h"
-#include "vdso.h"
-#include "vma.h"
-#include "bfd.h"
-#include "proc_parse.h"
-#include "cr_options.h"
-#include "sysfs_parse.h"
-#include "seccomp.h"
-#include "string.h"
-#include "namespaces.h"
-#include "files-reg.h"
-
-#include "protobuf.h"
-#include "protobuf/fdinfo.pb-c.h"
-#include "protobuf/mnt.pb-c.h"
-
-#include <stdlib.h>
-
-struct buffer {
- char buf[PAGE_SIZE];
- char end; /* '\0' */
-};
-
-static struct buffer __buf;
-static char *buf = __buf.buf;
-
-#define BUF_SIZE sizeof(__buf.buf)
-
-/*
- * This is how AIO ring buffers look like in proc
- */
-
-#define AIO_FNAME "/[aio]"
-
-/* check the @line starts with "%lx-%lx" format */
-static bool is_vma_range_fmt(char *line)
-{
-#define ____is_vma_addr_char(__c) \
- (((__c) <= '9' && (__c) >= '0') || \
- ((__c) <= 'f' && (__c) >= 'a'))
-
- while (*line && ____is_vma_addr_char(*line))
- line++;
-
- if (*line++ != '-')
- return false;
-
- while (*line && ____is_vma_addr_char(*line))
- line++;
-
- if (*line++ != ' ')
- return false;
-
- return true;
-#undef ____is_vma_addr_char
-}
-
-static int parse_vmflags(char *buf, struct vma_area *vma_area)
-{
- char *tok;
- bool shared = false;
- bool maywrite = false;
-
- if (!buf[0])
- return 0;
-
- tok = strtok(buf, " \n");
- if (!tok)
- return 0;
-
-#define _vmflag_match(_t, _s) (_t[0] == _s[0] && _t[1] == _s[1])
-
- do {
- /* open() block */
- if (_vmflag_match(tok, "sh"))
- shared = true;
- else if (_vmflag_match(tok, "mw"))
- maywrite = true;
-
- /* mmap() block */
- if (_vmflag_match(tok, "gd"))
- vma_area->e->flags |= MAP_GROWSDOWN;
- else if (_vmflag_match(tok, "lo"))
- vma_area->e->flags |= MAP_LOCKED;
- else if (_vmflag_match(tok, "nr"))
- vma_area->e->flags |= MAP_NORESERVE;
- else if (_vmflag_match(tok, "ht"))
- vma_area->e->flags |= MAP_HUGETLB;
-
- /* madvise() block */
- if (_vmflag_match(tok, "sr"))
- vma_area->e->madv |= (1ul << MADV_SEQUENTIAL);
- else if (_vmflag_match(tok, "rr"))
- vma_area->e->madv |= (1ul << MADV_RANDOM);
- else if (_vmflag_match(tok, "dc"))
- vma_area->e->madv |= (1ul << MADV_DONTFORK);
- else if (_vmflag_match(tok, "dd"))
- vma_area->e->madv |= (1ul << MADV_DONTDUMP);
- else if (_vmflag_match(tok, "mg"))
- vma_area->e->madv |= (1ul << MADV_MERGEABLE);
- else if (_vmflag_match(tok, "hg"))
- vma_area->e->madv |= (1ul << MADV_HUGEPAGE);
- else if (_vmflag_match(tok, "nh"))
- vma_area->e->madv |= (1ul << MADV_NOHUGEPAGE);
-
- /* vmsplice doesn't work for VM_IO and VM_PFNMAP mappings. */
- if (_vmflag_match(tok, "io") || _vmflag_match(tok, "pf")) {
- /*
- * VVAR area mapped by the kernel as
- * VM_IO | VM_PFNMAP| VM_DONTEXPAND | VM_DONTDUMP
- */
- if (!vma_area_is(vma_area, VMA_AREA_VVAR))
- vma_area->e->status |= VMA_UNSUPP;
- }
-
- /*
- * Anything else is just ignored.
- */
- } while ((tok = strtok(NULL, " \n")));
-
-#undef _vmflag_match
-
- if (shared && maywrite)
- vma_area->e->fdflags = O_RDWR;
- else
- vma_area->e->fdflags = O_RDONLY;
- vma_area->e->has_fdflags = true;
-
- if (vma_area->e->madv)
- vma_area->e->has_madv = true;
-
- return 0;
-}
-
-static inline int is_anon_shmem_map(dev_t dev)
-{
- return kdat.shmem_dev == dev;
-}
-
-struct vma_file_info {
- int dev_maj;
- int dev_min;
- unsigned long ino;
- struct vma_area *vma;
-};
-
-static inline int vfi_equal(struct vma_file_info *a, struct vma_file_info *b)
-{
- return ((a->ino ^ b->ino) |
- (a->dev_maj ^ b->dev_maj) |
- (a->dev_min ^ b->dev_min)) == 0;
-}
-
-static int vma_get_mapfile(char *fname, struct vma_area *vma, DIR *mfd,
- struct vma_file_info *vfi, struct vma_file_info *prev_vfi)
-{
- char path[32];
- int flags;
-
- if (prev_vfi->vma && vfi_equal(vfi, prev_vfi)) {
- struct vma_area *prev = prev_vfi->vma;
-
- /*
- * If vfi is equal (!) and negative @vm_file_fd --
- * we have nothing to borrow for sure.
- */
- if (prev->vm_file_fd < 0)
- return 0;
-
- pr_debug("vma %"PRIx64" borrows vfi from previous %"PRIx64"\n",
- vma->e->start, prev->e->start);
- vma->vm_file_fd = prev->vm_file_fd;
- if (prev->e->status & VMA_AREA_SOCKET)
- vma->e->status |= VMA_AREA_SOCKET | VMA_AREA_REGULAR;
-
- /*
- * FIXME -- in theory there can be vmas that have
- * dev:ino match, but live in different mount
- * namespaces. However, we only borrow files for
- * subsequent vmas. These are _very_ likely to
- * have files from the same namespaces.
- */
- vma->file_borrowed = true;
-
- return 0;
- }
-
- /* Figure out if it's file mapping */
- snprintf(path, sizeof(path), "%"PRIx64"-%"PRIx64, vma->e->start, vma->e->end);
-
- /*
- * Note that we "open" it in dumper process space
- * so later we might refer to it via /proc/self/fd/vm_file_fd
- * if needed.
- */
- flags = O_PATH;
- if (vfi->dev_maj == 0)
- /*
- * Opening with O_PATH omits calling kernel ->open
- * method, thus for some special files their type
- * detection might be broken. Thus we open those with
- * the O_RDONLY to potentially get ENXIO and check
- * it below.
- */
- flags = O_RDONLY;
-
- vma->vm_file_fd = openat(dirfd(mfd), path, flags);
- if (vma->vm_file_fd < 0) {
- if (errno == ENOENT)
- /* Just mapping w/o map_files link */
- return 0;
-
- if (errno == ENXIO) {
- struct stat buf;
-
- if (fstatat(dirfd(mfd), path, &buf, 0))
- return -1;
-
- if (S_ISSOCK(buf.st_mode)) {
- pr_info("Found socket mapping @%"PRIx64"\n", vma->e->start);
- vma->vm_socket_id = buf.st_ino;
- vma->e->status |= VMA_AREA_SOCKET | VMA_AREA_REGULAR;
- return 0;
- }
-
- if ((buf.st_mode & S_IFMT) == 0 && !strncmp(fname, AIO_FNAME, sizeof(AIO_FNAME) - 1)) {
- /* AIO ring, let's try */
- close(vma->vm_file_fd);
- vma->aio_nr_req = -1;
- vma->e->status = VMA_AREA_AIORING;
- return 0;
- }
-
- pr_err("Unknown shit %o (%s)\n", buf.st_mode, fname);
- return -1;
- }
-
- if (errno == EPERM && !opts.aufs) {
- int fd;
- dev_t vfi_dev;
-
- /*
- * Kernel prohibits reading map_files for users. The
- * best we can do here is fill stat using the information
- * from smaps file and ... hope for the better :\
- *
- * Here we'll miss AIO-s and sockets :(
- */
-
- if (fname[0] == '\0') {
- /*
- * Another bad thing is that kernel first checks
- * for permission access to ANY map_files link,
- * then checks for its existance. So we have to
- * check for file path being empty to "emulate"
- * the ENOENT case.
- */
-
- if (vfi->dev_maj != 0 || vfi->dev_min != 0 || vfi->ino != 0) {
- pr_err("Strange file mapped at %lx [%s]:%d.%d.%ld\n",
- (unsigned long)vma->e->start, fname,
- vfi->dev_maj, vfi->dev_min, vfi->ino);
- return -1;
- }
-
- return 0;
- } else if (fname[0] != '/') {
- /*
- * This should be some kind of
- * special mapping like [heap], [vdso]
- * and such, the caller should take care
- * of the @fname and vma status.
- */
- return 0;
- }
-
- vfi_dev = makedev(vfi->dev_maj, vfi->dev_min);
- if (is_anon_shmem_map(vfi_dev)) {
- if (!(vma->e->flags & MAP_SHARED))
- return -1;
-
- vma->e->flags |= MAP_ANONYMOUS;
- vma->e->status |= VMA_ANON_SHARED;
- vma->e->shmid = vfi->ino;
-
- if (!strncmp(fname, "/SYSV", 5))
- vma->e->status |= VMA_AREA_SYSVIPC;
-
- return 0;
- }
-
- pr_info("Failed to open map_files/%s, try to go via [%s] path\n", path, fname);
- fd = open(fname, O_RDONLY);
- if (fd < 0) {
- pr_perror("Can't open mapped [%s]", fname);
- return -1;
- }
-
- vma->vmst = xmalloc(sizeof(struct stat));
- if (!vma->vmst) {
- close(fd);
- return -1;
- }
-
- if (fstat(fd, vma->vmst) < 0) {
- pr_perror("Can't stat [%s]\n", fname);
- close(fd);
- return -1;
- }
-
- if (vma->vmst->st_dev != vfi_dev ||
- vma->vmst->st_ino != vfi->ino) {
- pr_err("Failed to resolve mapping %lx filename\n",
- (unsigned long)vma->e->start);
- close(fd);
- return -1;
- }
-
- vma->vm_file_fd = fd;
- return 0;
- }
-
- pr_perror("Can't open map_files");
- return -1;
- }
-
- vma->vmst = xmalloc(sizeof(struct stat));
- if (!vma->vmst)
- return -1;
-
- /*
- * For AUFS support, we need to check if the symbolic link
- * points to a branch. If it does, we cannot fstat() its file
- * descriptor because it would return a different dev/ino than
- * the real file. If fixup_aufs_vma_fd() returns positive,
- * it means that it has stat()'ed using the full pathname.
- * Zero return means that the symbolic link does not point to
- * a branch and we can do fstat() below.
- */
- if (opts.aufs) {
- int ret;
-
- ret = fixup_aufs_vma_fd(vma);
- if (ret < 0)
- return -1;
- if (ret > 0)
- return 0;
- }
-
- if (fstat(vma->vm_file_fd, vma->vmst) < 0) {
- pr_perror("Failed fstat on map %"PRIx64"", vma->e->start);
- return -1;
- }
-
- return 0;
-}
-
-int parse_self_maps_lite(struct vm_area_list *vms)
-{
- FILE *maps;
-
- vm_area_list_init(vms);
-
- maps = fopen_proc(PROC_SELF, "maps");
- if (maps == NULL) {
- pr_perror("Can't open self maps");
- return -1;
- }
-
- while (fgets(buf, BUF_SIZE, maps) != NULL) {
- struct vma_area *vma;
- char *end;
-
- vma = alloc_vma_area();
- if (!vma) {
- fclose(maps);
- return -1;
- }
-
- vma->e->start = strtoul(buf, &end, 16);
- vma->e->end = strtoul(end + 1, NULL, 16);
- list_add_tail(&vma->list, &vms->h);
- vms->nr++;
-
- pr_debug("Parsed %"PRIx64"-%"PRIx64" vma\n", vma->e->start, vma->e->end);
- }
-
- fclose(maps);
- return 0;
-}
-
-#ifdef CONFIG_VDSO
-static inline int handle_vdso_vma(struct vma_area *vma)
-{
- vma->e->status |= VMA_AREA_REGULAR;
- if ((vma->e->prot & VDSO_PROT) == VDSO_PROT)
- vma->e->status |= VMA_AREA_VDSO;
- return 0;
-}
-
-static inline int handle_vvar_vma(struct vma_area *vma)
-{
- vma->e->status |= VMA_AREA_REGULAR;
- if ((vma->e->prot & VVAR_PROT) == VVAR_PROT)
- vma->e->status |= VMA_AREA_VVAR;
- return 0;
-}
-#else
-static inline int handle_vdso_vma(struct vma_area *vma)
-{
- pr_warn_once("Found vDSO area without support\n");
- return -1;
-}
-
-static inline int handle_vvar_vma(struct vma_area *vma)
-{
- pr_warn_once("Found VVAR area without support\n");
- return -1;
-}
-#endif
-
-static int handle_vma(pid_t pid, struct vma_area *vma_area,
- char *file_path, DIR *map_files_dir,
- struct vma_file_info *vfi,
- struct vma_file_info *prev_vfi,
- struct vm_area_list *vma_area_list)
-{
- if (vma_get_mapfile(file_path, vma_area, map_files_dir, vfi, prev_vfi))
- goto err_bogus_mapfile;
-
- if (vma_area->e->status != 0) {
- if (vma_area->e->status & VMA_AREA_AIORING)
- vma_area_list->nr_aios++;
- return 0;
- } else if (!strcmp(file_path, "[vsyscall]") ||
- !strcmp(file_path, "[vectors]")) {
- vma_area->e->status |= VMA_AREA_VSYSCALL;
- } else if (!strcmp(file_path, "[vdso]")) {
- if (handle_vdso_vma(vma_area))
- goto err;
- } else if (!strcmp(file_path, "[vvar]")) {
- if (handle_vvar_vma(vma_area))
- goto err;
- } else if (!strcmp(file_path, "[heap]")) {
- vma_area->e->status |= VMA_AREA_REGULAR | VMA_AREA_HEAP;
- } else {
- vma_area->e->status = VMA_AREA_REGULAR;
- }
-
- /*
- * Some mapping hints for restore, we save this on
- * disk and restore might need to analyze it.
- */
- if (vma_area->file_borrowed) {
- struct vma_area *prev = prev_vfi->vma;
-
- /*
- * Pick-up flags that might be set in the branch below.
- * Status is copied as-is as it should be zero here,
- * and have full match with the previous.
- */
- vma_area->e->flags |= (prev->e->flags & MAP_ANONYMOUS);
- vma_area->e->status = prev->e->status;
- vma_area->e->shmid = prev->e->shmid;
- vma_area->vmst = prev->vmst;
- vma_area->mnt_id = prev->mnt_id;
- } else if (vma_area->vm_file_fd >= 0) {
- struct stat *st_buf = vma_area->vmst;
-
- if (S_ISREG(st_buf->st_mode))
- /* regular file mapping -- supported */;
- else if (S_ISCHR(st_buf->st_mode) && (st_buf->st_rdev == DEVZERO))
- /* devzero mapping -- also makes sense */;
- else {
- pr_err("Can't handle non-regular mapping on %d's map %"PRIx64"\n", pid, vma_area->e->start);
- goto err;
- }
-
- /*
- * /dev/zero stands for anon-shared mapping
- * otherwise it's some file mapping.
- */
- if (is_anon_shmem_map(st_buf->st_dev)) {
- if (!(vma_area->e->flags & MAP_SHARED))
- goto err_bogus_mapping;
- vma_area->e->flags |= MAP_ANONYMOUS;
- vma_area->e->status |= VMA_ANON_SHARED;
- vma_area->e->shmid = st_buf->st_ino;
-
- if (!strncmp(file_path, "/SYSV", 5)) {
- pr_info("path: %s\n", file_path);
- vma_area->e->status |= VMA_AREA_SYSVIPC;
- }
- } else {
- if (vma_area->e->flags & MAP_PRIVATE)
- vma_area->e->status |= VMA_FILE_PRIVATE;
- else
- vma_area->e->status |= VMA_FILE_SHARED;
- }
-
- /*
- * We cannot use the mnt_id value provided by the kernel
- * for vm_file_fd if it is an AUFS file (the value is
- * wrong). In such a case, fixup_aufs_vma_fd() has set
- * mnt_id to -1 to mimic pre-3.15 kernels that didn't
- * have mnt_id.
- */
- if (vma_area->mnt_id != -1 &&
- get_fd_mntid(vma_area->vm_file_fd, &vma_area->mnt_id))
- return -1;
- } else {
- /*
- * No file but mapping -- anonymous one.
- */
- if (vma_area->e->flags & MAP_SHARED) {
- vma_area->e->status |= VMA_ANON_SHARED;
- vma_area->e->shmid = vfi->ino;
- } else {
- vma_area->e->status |= VMA_ANON_PRIVATE;
- }
- vma_area->e->flags |= MAP_ANONYMOUS;
- }
-
- return 0;
-err:
- return -1;
-err_bogus_mapping:
- pr_err("Bogus mapping 0x%"PRIx64"-0x%"PRIx64" (flags: %#x vm_file_fd: %d)\n",
- vma_area->e->start, vma_area->e->end,
- vma_area->e->flags, vma_area->vm_file_fd);
- goto err;
-
-err_bogus_mapfile:
- pr_perror("Can't open %d's mapfile link %"PRIx64, pid, vma_area->e->start);
- goto err;
-}
-
-static int vma_list_add(struct vma_area *vma_area,
- struct vm_area_list *vma_area_list,
- unsigned long *prev_end,
- struct vma_file_info *vfi, struct vma_file_info *prev_vfi)
-{
- if (vma_area->e->status & VMA_UNSUPP) {
- pr_err("Unsupported mapping found %016"PRIx64"-%016"PRIx64"\n",
- vma_area->e->start, vma_area->e->end);
- return -1;
- }
-
- /* Add a guard page only if here is enough space for it */
- if ((vma_area->e->flags & MAP_GROWSDOWN) &&
- *prev_end < vma_area->e->start)
- vma_area->e->start -= PAGE_SIZE; /* Guard page */
- *prev_end = vma_area->e->end;
-
- list_add_tail(&vma_area->list, &vma_area_list->h);
- vma_area_list->nr++;
- if (vma_area_is_private(vma_area, kdat.task_size)) {
- unsigned long pages;
-
- pages = vma_area_len(vma_area) / PAGE_SIZE;
- vma_area_list->priv_size += pages;
- vma_area_list->longest = max(vma_area_list->longest, pages);
- }
-
- *prev_vfi = *vfi;
- prev_vfi->vma = vma_area;
-
- return 0;
-}
-
-int parse_smaps(pid_t pid, struct vm_area_list *vma_area_list)
-{
- struct vma_area *vma_area = NULL;
- unsigned long start, end, pgoff, prev_end = 0;
- char r, w, x, s;
- int ret = -1;
- struct vma_file_info vfi;
- struct vma_file_info prev_vfi = {};
-
- DIR *map_files_dir = NULL;
- struct bfd f;
-
- vma_area_list->nr = 0;
- vma_area_list->nr_aios = 0;
- vma_area_list->longest = 0;
- vma_area_list->priv_size = 0;
- INIT_LIST_HEAD(&vma_area_list->h);
-
- f.fd = open_proc(pid, "smaps");
- if (f.fd < 0)
- goto err_n;
-
- if (bfdopenr(&f))
- goto err_n;
-
- map_files_dir = opendir_proc(pid, "map_files");
- if (!map_files_dir) /* old kernel? */
- goto err;
-
- while (1) {
- int num, path_off;
- bool eof;
- char *str;
-
- str = breadline(&f);
- if (IS_ERR(str))
- goto err;
- eof = (str == NULL);
-
- if (!eof && !is_vma_range_fmt(str)) {
- if (!strncmp(str, "Nonlinear", 9)) {
- BUG_ON(!vma_area);
- pr_err("Nonlinear mapping found %016"PRIx64"-%016"PRIx64"\n",
- vma_area->e->start, vma_area->e->end);
- /*
- * VMA is already on list and will be
- * freed later as list get destroyed.
- */
- vma_area = NULL;
- goto err;
- } else if (!strncmp(str, "VmFlags: ", 9)) {
- BUG_ON(!vma_area);
- if (parse_vmflags(&str[9], vma_area))
- goto err;
- continue;
- } else
- continue;
- }
-
- if (vma_area && vma_list_add(vma_area, vma_area_list,
- &prev_end, &vfi, &prev_vfi))
- goto err;
-
- if (eof)
- break;
-
- vma_area = alloc_vma_area();
- if (!vma_area)
- goto err;
-
- num = sscanf(str, "%lx-%lx %c%c%c%c %lx %x:%x %lu %n",
- &start, &end, &r, &w, &x, &s, &pgoff,
- &vfi.dev_maj, &vfi.dev_min, &vfi.ino, &path_off);
- if (num < 10) {
- pr_err("Can't parse: %s\n", str);
- goto err;
- }
-
- vma_area->e->start = start;
- vma_area->e->end = end;
- vma_area->e->pgoff = pgoff;
- vma_area->e->prot = PROT_NONE;
-
- if (r == 'r')
- vma_area->e->prot |= PROT_READ;
- if (w == 'w')
- vma_area->e->prot |= PROT_WRITE;
- if (x == 'x')
- vma_area->e->prot |= PROT_EXEC;
-
- if (s == 's')
- vma_area->e->flags = MAP_SHARED;
- else if (s == 'p')
- vma_area->e->flags = MAP_PRIVATE;
- else {
- pr_err("Unexpected VMA met (%c)\n", s);
- goto err;
- }
-
- if (handle_vma(pid, vma_area, str + path_off, map_files_dir,
- &vfi, &prev_vfi, vma_area_list))
- goto err;
- }
-
- vma_area = NULL;
- ret = 0;
-
-err:
- bclose(&f);
-err_n:
- if (map_files_dir)
- closedir(map_files_dir);
-
- xfree(vma_area);
- return ret;
-
-}
-
-int parse_pid_stat(pid_t pid, struct proc_pid_stat *s)
-{
- char *tok, *p;
- int fd;
- int n;
-
- fd = open_proc(pid, "stat");
- if (fd < 0)
- return -1;
-
- n = read(fd, buf, BUF_SIZE);
- close(fd);
- if (n < 1) {
- pr_err("stat for %d is corrupted\n", pid);
- return -1;
- }
-
- memset(s, 0, sizeof(*s));
-
- tok = strchr(buf, ' ');
- if (!tok)
- goto err;
- *tok++ = '\0';
- if (*tok != '(')
- goto err;
-
- s->pid = atoi(buf);
-
- p = strrchr(tok + 1, ')');
- if (!p)
- goto err;
- *tok = '\0';
- *p = '\0';
-
- strlcpy(s->comm, tok + 1, sizeof(s->comm));
-
- n = sscanf(p + 1,
- " %c %d %d %d %d %d %u %lu %lu %lu %lu "
- "%lu %lu %ld %ld %ld %ld %d %d %llu %lu %ld %lu %lu %lu %lu "
- "%lu %lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld "
- "%lu %lu %lu %lu %lu %lu %lu %d",
- &s->state,
- &s->ppid,
- &s->pgid,
- &s->sid,
- &s->tty_nr,
- &s->tty_pgrp,
- &s->flags,
- &s->min_flt,
- &s->cmin_flt,
- &s->maj_flt,
- &s->cmaj_flt,
- &s->utime,
- &s->stime,
- &s->cutime,
- &s->cstime,
- &s->priority,
- &s->nice,
- &s->num_threads,
- &s->zero0,
- &s->start_time,
- &s->vsize,
- &s->mm_rss,
- &s->rsslim,
- &s->start_code,
- &s->end_code,
- &s->start_stack,
- &s->esp,
- &s->eip,
- &s->sig_pending,
- &s->sig_blocked,
- &s->sig_ignored,
- &s->sig_handled,
- &s->wchan,
- &s->zero1,
- &s->zero2,
- &s->exit_signal,
- &s->task_cpu,
- &s->rt_priority,
- &s->policy,
- &s->delayacct_blkio_ticks,
- &s->gtime,
- &s->cgtime,
- &s->start_data,
- &s->end_data,
- &s->start_brk,
- &s->arg_start,
- &s->arg_end,
- &s->env_start,
- &s->env_end,
- &s->exit_code);
- if (n < 50)
- goto err;
-
- return 0;
-
-err:
- pr_err("Parsing %d's stat failed (#fields do not match)\n", pid);
- return -1;
-}
-
-int prepare_loginuid(unsigned int value, unsigned int loglevel)
-{
- int fd, ret = 0;
- char buf[11]; /* 4294967295 is maximum for u32 */
-
- fd = open_proc_rw(PROC_SELF, "loginuid");
- if (fd < 0)
- return -1;
-
- snprintf(buf, 11, "%u", value);
-
- if (write(fd, buf, 11) < 0) {
- print_on_level(loglevel,
- "Write %s to /proc/self/loginuid failed: %s",
- buf, strerror(errno));
- ret = -1;
- }
- close(fd);
- return ret;
-}
-
-unsigned int parse_pid_loginuid(pid_t pid, int *err, bool ignore_noent)
-{
- int fd;
- ssize_t num;
-
- *err = 0;
- fd = __open_proc(pid, (ignore_noent) ? ENOENT : 0,
- O_RDONLY, "loginuid");
- if (fd < 0)
- goto out;
-
- num = read(fd, buf, 10);
- close(fd);
- if (num < 0) {
- pr_perror("Unable to read /proc/%d/loginuid", pid);
- goto out;
- }
- buf[num] = '\0';
-
- return strtol(buf, NULL, 10);
-
-out:
- *err = -1;
- return INVALID_UID; /* unset value */
-}
-
-int parse_pid_oom_score_adj(pid_t pid, int *err)
-{
- int fd;
- ssize_t num;
-
- *err = 0;
- fd = open_proc(pid, "oom_score_adj");
- if (fd < 0)
- goto out;
-
- num = read(fd, buf, 10);
- close(fd);
- if (num < 0) {
- pr_perror("Unable to read /proc/%d/oom_score_adj", pid);
- goto out;
- }
- buf[num] = '\0';
-
- return strtol(buf, NULL, 10);
-
-out:
- *err = -1;
- return 0;
-}
-
-static int ids_parse(char *str, unsigned int *arr)
-{
- char *end;
-
- arr[0] = strtol(str, &end, 10);
- arr[1] = strtol(end + 1, &end, 10);
- arr[2] = strtol(end + 1, &end, 10);
- arr[3] = strtol(end + 1, &end, 10);
- if (*end)
- return -1;
- else
- return 0;
-}
-
-static int cap_parse(char *str, unsigned int *res)
-{
- int i, ret;
-
- for (i = 0; i < PROC_CAP_SIZE; i++) {
- ret = sscanf(str, "%08x", &res[PROC_CAP_SIZE - 1 - i]);
- if (ret != 1)
- return -1;
- str += 8;
- }
-
- return 0;
-}
-
-int parse_pid_status(pid_t pid, struct proc_status_creds *cr)
-{
- struct bfd f;
- int done = 0;
- int ret = -1;
- char *str;
- bool parsed_seccomp = false;
-
- f.fd = open_proc(pid, "status");
- if (f.fd < 0) {
- pr_perror("Can't open proc status");
- return -1;
- }
-
- cr->sigpnd = 0;
- cr->shdpnd = 0;
-
- if (bfdopenr(&f))
- return -1;
-
- while (done < 12) {
- str = breadline(&f);
- if (str == NULL)
- break;
- if (IS_ERR(str))
- goto err_parse;
-
- if (!strncmp(str, "State:", 6)) {
- cr->state = str[7];
- done++;
- continue;
- }
-
- if (!strncmp(str, "PPid:", 5)) {
- if (sscanf(str, "PPid:\t%d", &cr->ppid) != 1) {
- pr_err("Unable to parse: %s\n", str);
- goto err_parse;
- }
- done++;
- continue;
- }
-
- if (!strncmp(str, "Uid:", 4)) {
- if (ids_parse(str + 5, cr->uids))
- goto err_parse;
-
- done++;
- continue;
- }
-
- if (!strncmp(str, "Gid:", 4)) {
- if (ids_parse(str + 5, cr->gids))
- goto err_parse;
-
- done++;
- continue;
- }
-
- if (!strncmp(str, "CapInh:", 7)) {
- if (cap_parse(str + 8, cr->cap_inh))
- goto err_parse;
-
- done++;
- continue;
- }
-
- if (!strncmp(str, "CapEff:", 7)) {
- if (cap_parse(str + 8, cr->cap_eff))
- goto err_parse;
-
- done++;
- continue;
- }
-
- if (!strncmp(str, "CapPrm:", 7)) {
- if (cap_parse(str + 8, cr->cap_prm))
- goto err_parse;
-
- done++;
- continue;
- }
-
- if (!strncmp(str, "CapBnd:", 7)) {
- if (cap_parse(str + 8, cr->cap_bnd))
- goto err_parse;
-
- done++;
- continue;
- }
-
- if (!strncmp(str, "Seccomp:", 8)) {
- if (sscanf(str + 9, "%d", &cr->seccomp_mode) != 1) {
- goto err_parse;
- }
-
- parsed_seccomp = true;
- done++;
- continue;
- }
-
- if (!strncmp(str, "ShdPnd:", 7)) {
- unsigned long long sigpnd;
-
- if (sscanf(str + 7, "%llx", &sigpnd) != 1)
- goto err_parse;
- cr->shdpnd |= sigpnd;
-
- done++;
- continue;
- }
- if (!strncmp(str, "SigPnd:", 7)) {
- unsigned long long sigpnd;
-
- if (sscanf(str + 7, "%llx", &sigpnd) != 1)
- goto err_parse;
- cr->sigpnd |= sigpnd;
-
- done++;
- continue;
- }
- }
-
- /* seccomp is optional */
- if (done >= 11 || (done == 10 && !parsed_seccomp))
- ret = 0;
-
-err_parse:
- if (ret)
- pr_err("Error parsing proc status file\n");
- bclose(&f);
- return ret;
-}
-
-struct opt2flag {
- char *opt;
- unsigned flag;
-};
-
-static bool sb_opt_cb(char *opt, char *unknown, size_t *uoff)
-{
- unsigned int id;
-
- if (sscanf(opt, "gid=%d", &id) == 1) {
- *uoff += sprintf(unknown + *uoff, "gid=%d", userns_gid(id));
- unknown[*uoff] = ',';
- (*uoff)++;
- return true;
- } else if (sscanf(opt, "uid=%d", &id) == 1) {
- *uoff += sprintf(unknown + *uoff, "uid=%d", userns_uid(id));
- unknown[*uoff] = ',';
- (*uoff)++;
- return true;
- }
- return false;
-}
-
-static int do_opt2flag(char *opt, unsigned *flags,
- const struct opt2flag *opts, char *unknown,
- bool (*cb)(char *opt, char *unknown, size_t *uoff))
-{
- int i;
- char *end;
- size_t uoff = 0;
-
- while (1) {
- end = strchr(opt, ',');
- if (end)
- *end = '\0';
-
- for (i = 0; opts[i].opt != NULL; i++)
- if (!strcmp(opts[i].opt, opt)) {
- (*flags) |= opts[i].flag;
- break;
- }
-
- if (opts[i].opt == NULL && cb && !cb(opt, unknown, &uoff)) {
- if (!unknown) {
- pr_err("Unknown option [%s]\n", opt);
- return -1;
- }
-
- strcpy(unknown + uoff, opt);
- uoff += strlen(opt);
- unknown[uoff] = ',';
- uoff++;
- }
-
- if (!end) {
- if (uoff)
- uoff--;
- if (unknown)
- unknown[uoff] = '\0';
- break;
- } else
- opt = end + 1;
- }
-
- return 0;
-}
-
-static int parse_mnt_flags(char *opt, unsigned *flags)
-{
- static const struct opt2flag mnt_opt2flag[] = {
- { "rw", 0, },
- { "ro", MS_RDONLY, },
- { "nosuid", MS_NOSUID, },
- { "nodev", MS_NODEV, },
- { "noexec", MS_NOEXEC, },
- { "noatime", MS_NOATIME, },
- { "nodiratime", MS_NODIRATIME, },
- { "relatime", MS_RELATIME, },
- { },
- };
-
- if (do_opt2flag(opt, flags, mnt_opt2flag, NULL, NULL))
- return -1;
-
- /* Otherwise the kernel assumes RELATIME by default */
- if ((*flags & (MS_RELATIME | MS_NOATIME)) == 0)
- *flags = MS_STRICTATIME;
-
- return 0;
-}
-
-static int parse_sb_opt(char *opt, unsigned *flags, char *uopt)
-{
- static const struct opt2flag sb_opt2flag[] = {
- { "rw", 0, },
- { "ro", MS_RDONLY, },
- { "sync", MS_SYNC, },
- { "dirsync", MS_DIRSYNC, },
- { "mad", MS_MANDLOCK, },
- { },
- };
-
- return do_opt2flag(opt, flags, sb_opt2flag, uopt, sb_opt_cb);
-}
-
-static int parse_mnt_opt(char *str, struct mount_info *mi, int *off)
-{
- char *istr = str, *end;
-
- while (1) {
- end = strchr(str, ' ');
- if (!end) {
- pr_err("Error parsing mount options\n");
- return -1;
- }
-
- *end = '\0';
- if (!strncmp(str, "-", 1))
- break;
- else if (!strncmp(str, "shared:", 7)) {
- mi->flags |= MS_SHARED;
- mi->shared_id = atoi(str + 7);
- } else if (!strncmp(str, "master:", 7)) {
- mi->flags |= MS_SLAVE;
- mi->master_id = atoi(str + 7);
- } else if (!strncmp(str, "propagate_from:", 15)) {
- /* skip */;
- } else if (!strncmp(str, "unbindable", 11))
- mi->flags |= MS_UNBINDABLE;
- else {
- pr_err("Unknown option [%s]\n", str);
- return -1;
- }
-
- str = end + 1;
- }
-
- *off = end - istr + 1;
- return 0;
-}
-
-/*
- * mountinfo contains mangled paths. space, tab and back slash were replaced
- * with usual octal escape. This function replaces these symbols back.
- */
-static void cure_path(char *path)
-{
- int i, len, off = 0;
-
- if (strchr(path, '\\') == NULL) /* fast path */
- return;
-
- len = strlen(path);
- for (i = 0; i < len; i++) {
- if (!strncmp(path + i, "\\040", 4)) {
- path[i - off] = ' ';
- goto replace;
- } else if (!strncmp(path + i, "\\011", 4)) {
- path[i - off] = '\t';
- goto replace;
- } else if (!strncmp(path + i, "\\134", 4)) {
- path[i - off] = '\\';
- goto replace;
- }
- if (off)
- path[i - off] = path[i];
- continue;
-replace:
- off += 3;
- i += 3;
- }
- path[len - off] = 0;
-}
-
-static int parse_mountinfo_ent(char *str, struct mount_info *new, char **fsname)
-{
- struct fd_link root_link;
- unsigned int kmaj, kmin;
- int ret, n;
- char *sub, *opt = NULL;
-
- new->mountpoint = xmalloc(PATH_MAX);
- if (new->mountpoint == NULL)
- goto err;
- new->ns_mountpoint = new->mountpoint;
-
- new->mountpoint[0] = '.';
- ret = sscanf(str, "%i %i %u:%u %ms %s %ms %n",
- &new->mnt_id, &new->parent_mnt_id,
- &kmaj, &kmin, &new->root, new->mountpoint + 1,
- &opt, &n);
- if (ret != 7)
- goto err;
-
- cure_path(new->mountpoint);
- cure_path(new->root);
-
- root_link.len = strlen(new->root);
- strcpy(root_link.name, new->root);
- if (strip_deleted(&root_link)) {
- strcpy(new->root, root_link.name);
- new->deleted = true;
- }
-
- new->mountpoint = xrealloc(new->mountpoint, strlen(new->mountpoint) + 1);
- if (!new->mountpoint)
- goto err;
-
- new->s_dev = new->s_dev_rt = MKKDEV(kmaj, kmin);
- new->flags = 0;
- if (parse_mnt_flags(opt, &new->flags))
- goto err;
-
- free(opt); /* we are going to reallocate/reuse this buffer */
- opt = NULL;
-
- str += n;
- if (parse_mnt_opt(str, new, &n))
- goto err;
-
- str += n;
- ret = sscanf(str, "%ms %ms %ms", fsname, &new->source, &opt);
- if (ret == 2) {
- /* src may be empty */
- opt = new->source;
- new->source = xstrdup("");
- if (new->source == NULL)
- goto err;
- } else if (ret != 3)
- goto err;
-
- cure_path(new->source);
-
- /*
- * The kernel reports "subtypes" sometimes and the valid
- * type-vs-subtype delimiter is the dot symbol. We disregard
- * any subtypes for the purpose of finding the fstype.
- */
- sub = strchr(*fsname, '.');
- if (sub)
- *sub = 0;
-
- new->fstype = find_fstype_by_name(*fsname);
-
- new->options = xmalloc(strlen(opt) + 1);
- if (!new->options)
- goto err;
-
- if (parse_sb_opt(opt, &new->sb_flags, new->options))
- goto err;
-
- ret = 0;
-ret:
- xfree(opt);
- return ret;
-err:
- ret = -1;
- goto ret;
-}
-
-static LIST_HEAD(skip_mount_list);
-
-struct str_node {
- struct list_head node;
- char string[];
-};
-
-bool add_skip_mount(const char *mountpoint)
-{
- struct str_node *skip = xmalloc(sizeof(struct str_node) +
- strlen(mountpoint) + 1);
- if (!skip)
- return false;
-
- strcpy(skip->string, mountpoint);
- list_add(&skip->node, &skip_mount_list);
- return true;
-}
-
-static bool should_skip_mount(const char *mountpoint)
-{
- struct str_node *pos;
-
- list_for_each_entry(pos, &skip_mount_list, node) {
- if (strcmp(mountpoint, pos->string) == 0)
- return true;
- }
-
- return false;
-}
-
-struct mount_info *parse_mountinfo(pid_t pid, struct ns_id *nsid, bool for_dump)
-{
- struct mount_info *list = NULL;
- FILE *f;
- char str[1024];
-
- f = fopen_proc(pid, "mountinfo");
- if (!f) {
- pr_perror("Can't open %d mountinfo", pid);
- return NULL;
- }
-
- while (fgets(str, sizeof(str), f)) {
- struct mount_info *new;
- int ret = -1;
- char *fsname = NULL;
-
- new = mnt_entry_alloc();
- if (!new)
- goto end;
-
- new->nsid = nsid;
-
- ret = parse_mountinfo_ent(str, new, &fsname);
- if (ret < 0) {
- pr_err("Bad format in %d mountinfo: '%s'\n", pid, str);
- goto end;
- }
-
- /*
- * Drop this mountpoint early, so that lookup_mnt_id/etc will
- * fail loudly at "dump" stage if an opened file or another mnt
- * depends on this one.
- */
- if (for_dump && should_skip_mount(new->mountpoint + 1)) {
- pr_info("\tskip %s @ %s\n", fsname, new->mountpoint);
- mnt_entry_free(new);
- new = NULL;
- goto end;
- }
-
- pr_info("\ttype %s source %s mnt_id %d s_dev %#x %s @ %s flags %#x options %s\n",
- fsname, new->source,
- new->mnt_id, new->s_dev, new->root, new->mountpoint,
- new->flags, new->options);
-
- if (new->fstype->parse) {
- ret = new->fstype->parse(new);
- if (ret) {
- pr_err("Failed to parse FS specific data on %s\n",
- new->mountpoint);
- goto end;
- }
- }
-end:
- if (fsname)
- free(fsname);
-
- if (new) {
- new->next = list;
- list = new;
- }
-
- if (ret)
- goto err;
- }
-out:
- fclose(f);
- return list;
-
-err:
- while (list) {
- struct mount_info *next = list->next;
- mnt_entry_free(list);
- list = next;
- }
- goto out;
-}
-
-static char nybble(const char n)
-{
- if (n >= '0' && n <= '9')
- return n - '0';
- else if (n >= 'A' && n <= 'F')
- return n - ('A' - 10);
- else if (n >= 'a' && n <= 'f')
- return n - ('a' - 10);
- return 0;
-}
-
-static int alloc_fhandle(FhEntry *fh)
-{
- fh->n_handle = FH_ENTRY_SIZES__min_entries;
- fh->handle = xmalloc(pb_repeated_size(fh, handle));
-
- return fh->handle == NULL ? -1 : 0;
-}
-
-static void free_fhandle(FhEntry *fh)
-{
- if (fh->handle)
- xfree(fh->handle);
-}
-
-void free_inotify_wd_entry(union fdinfo_entries *e)
-{
- free_fhandle(e->ify.e.f_handle);
- xfree(e);
-}
-
-void free_fanotify_mark_entry(union fdinfo_entries *e)
-{
- if (e->ffy.e.ie)
- free_fhandle(e->ffy.ie.f_handle);
- xfree(e);
-}
-
-void free_event_poll_entry(union fdinfo_entries *e)
-{
- xfree(e);
-}
-
-static void parse_fhandle_encoded(char *tok, FhEntry *fh)
-{
- char *d = (char *)fh->handle;
- int i = 0;
-
- memzero(d, pb_repeated_size(fh, handle));
-
- while (*tok == ' ')
- tok++;
-
- while (*tok) {
- if (i >= pb_repeated_size(fh, handle))
- break;
- d[i++] = (nybble(tok[0]) << 4) | nybble(tok[1]);
- if (tok[1])
- tok += 2;
- else
- break;
- }
-}
-
-static int parse_timerfd(struct bfd *f, char *str, TimerfdEntry *tfy)
-{
- /*
- * Format is
- * clockid: 0
- * ticks: 0
- * settime flags: 01
- * it_value: (0, 49406829)
- * it_interval: (1, 0)
- */
- if (sscanf(str, "clockid: %d", &tfy->clockid) != 1)
- goto parse_err;
-
- str = breadline(f);
- if (IS_ERR_OR_NULL(str))
- goto nodata;
- if (sscanf(str, "ticks: %llu", (unsigned long long *)&tfy->ticks) != 1)
- goto parse_err;
-
- str = breadline(f);
- if (IS_ERR_OR_NULL(str))
- goto nodata;
- if (sscanf(str, "settime flags: 0%o", &tfy->settime_flags) != 1)
- goto parse_err;
-
- str = breadline(f);
- if (IS_ERR_OR_NULL(str))
- goto nodata;
- if (sscanf(str, "it_value: (%llu, %llu)",
- (unsigned long long *)&tfy->vsec,
- (unsigned long long *)&tfy->vnsec) != 2)
- goto parse_err;
-
- str = breadline(f);
- if (IS_ERR_OR_NULL(str))
- goto nodata;
- if (sscanf(str, "it_interval: (%llu, %llu)",
- (unsigned long long *)&tfy->isec,
- (unsigned long long *)&tfy->insec) != 2)
- goto parse_err;
- return 0;
-
-parse_err:
- return -1;
-nodata:
- pr_err("No data left in proc file while parsing timerfd\n");
- goto parse_err;
-}
-
-#define fdinfo_field(str, field) !strncmp(str, field":", sizeof(field))
-
-static int parse_file_lock_buf(char *buf, struct file_lock *fl,
- bool is_blocked);
-static int parse_fdinfo_pid_s(int pid, int fd, int type,
- int (*cb)(union fdinfo_entries *e, void *arg), void *arg)
-{
- struct bfd f;
- char *str;
- bool entry_met = false;
- int ret, exit_code = -1;;
-
- f.fd = open_proc(pid, "fdinfo/%d", fd);
- if (f.fd < 0) {
- pr_perror("Can't open fdinfo/%d to parse", fd);
- return -1;
- }
-
- if (bfdopenr(&f))
- return -1;
-
- while (1) {
- union fdinfo_entries entry;
-
- str = breadline(&f);
- if (!str)
- break;
- if (IS_ERR(str))
- goto out;
-
- if (fdinfo_field(str, "pos") ||
- fdinfo_field(str, "flags") ||
- fdinfo_field(str, "mnt_id")) {
- unsigned long long val;
- struct fdinfo_common *fdinfo = arg;
-
- if (type != FD_TYPES__UND)
- continue;
- ret = sscanf(str, "%*s %lli", &val);
- if (ret != 1)
- goto parse_err;
-
- if (fdinfo_field(str, "pos"))
- fdinfo->pos = val;
- else if (fdinfo_field(str, "flags"))
- fdinfo->flags = val;
- else if (fdinfo_field(str, "mnt_id"))
- fdinfo->mnt_id = val;
-
- entry_met = true;
- continue;
- }
-
- if (fdinfo_field(str, "lock")) {
- struct file_lock *fl;
- struct fdinfo_common *fdinfo = arg;
-
- if (type != FD_TYPES__UND)
- continue;
-
- fl = alloc_file_lock();
- if (!fl) {
- pr_perror("Alloc file lock failed!");
- goto out;
- }
-
- if (parse_file_lock_buf(str + 6, fl, 0)) {
- xfree(fl);
- goto parse_err;
- }
-
- pr_info("lockinfo: %lld:%d %x %d %02x:%02x:%ld %lld %s\n",
- fl->fl_id, fl->fl_kind, fl->fl_ltype,
- fl->fl_owner, fl->maj, fl->min, fl->i_no,
- fl->start, fl->end);
-
-
- if (fl->fl_kind == FL_UNKNOWN) {
- pr_err("Unknown file lock!\n");
- xfree(fl);
- goto out;
- }
-
- fl->real_owner = fdinfo->owner;
- fl->owners_fd = fd;
- list_add_tail(&fl->list, &file_lock_list);
- }
-
- if (type == FD_TYPES__UND)
- continue;
-
- if (fdinfo_field(str, "eventfd-count")) {
- eventfd_file_entry__init(&entry.efd);
-
- if (type != FD_TYPES__EVENTFD)
- goto parse_err;
- ret = sscanf(str, "eventfd-count: %"PRIx64,
- &entry.efd.counter);
- if (ret != 1)
- goto parse_err;
- ret = cb(&entry, arg);
- if (ret)
- goto out;
-
- entry_met = true;
- continue;
- }
- if (fdinfo_field(str, "clockid")) {
- timerfd_entry__init(&entry.tfy);
-
- if (type != FD_TYPES__TIMERFD)
- goto parse_err;
- ret = parse_timerfd(&f, str, &entry.tfy);
- if (ret)
- goto parse_err;
- ret = cb(&entry, arg);
- if (ret)
- goto out;
-
- entry_met = true;
- continue;
- }
- if (fdinfo_field(str, "tfd")) {
- union fdinfo_entries *e;
-
- if (type != FD_TYPES__EVENTPOLL)
- goto parse_err;
-
- e = xmalloc(sizeof(union fdinfo_entries));
- if (!e)
- goto out;
-
- eventpoll_tfd_entry__init(&e->epl.e);
-
- ret = sscanf(str, "tfd: %d events: %x data: %"PRIx64,
- &e->epl.e.tfd, &e->epl.e.events, &e->epl.e.data);
- if (ret != 3) {
- free_event_poll_entry(e);
- goto parse_err;
- }
- ret = cb(e, arg);
- if (ret)
- goto out;
-
- entry_met = true;
- continue;
- }
- if (fdinfo_field(str, "sigmask")) {
- signalfd_entry__init(&entry.sfd);
-
- if (type != FD_TYPES__SIGNALFD)
- goto parse_err;
- ret = sscanf(str, "sigmask: %Lx",
- (unsigned long long *)&entry.sfd.sigmask);
- if (ret != 1)
- goto parse_err;
- ret = cb(&entry, arg);
- if (ret)
- goto out;
-
- entry_met = true;
- continue;
- }
- if (fdinfo_field(str, "fanotify flags")) {
- struct fsnotify_params *p = arg;
-
- if (type != FD_TYPES__FANOTIFY)
- goto parse_err;
-
- ret = sscanf(str, "fanotify flags:%x event-flags:%x",
- &p->faflags, &p->evflags);
- if (ret != 2)
- goto parse_err;
- entry_met = true;
- continue;
- }
- if (fdinfo_field(str, "fanotify ino")) {
- union fdinfo_entries *e;
- int hoff = 0;
-
- if (type != FD_TYPES__FANOTIFY)
- goto parse_err;
-
- e = xmalloc(sizeof(*e));
- if (!e)
- goto parse_err;
-
- fanotify_mark_entry__init(&e->ffy.e);
- fanotify_inode_mark_entry__init(&e->ffy.ie);
- fh_entry__init(&e->ffy.f_handle);
- e->ffy.e.ie = &e->ffy.ie;
- e->ffy.ie.f_handle = &e->ffy.f_handle;
-
- ret = sscanf(str,
- "fanotify ino:%"PRIx64" sdev:%x mflags:%x mask:%x ignored_mask:%x "
- "fhandle-bytes:%x fhandle-type:%x f_handle: %n",
- &e->ffy.ie.i_ino, &e->ffy.e.s_dev,
- &e->ffy.e.mflags, &e->ffy.e.mask, &e->ffy.e.ignored_mask,
- &e->ffy.f_handle.bytes, &e->ffy.f_handle.type,
- &hoff);
- if (ret != 7 || hoff == 0) {
- free_fanotify_mark_entry(e);
- goto parse_err;
- }
-
- if (alloc_fhandle(&e->ffy.f_handle)) {
- free_fanotify_mark_entry(e);
- goto out;
- }
- parse_fhandle_encoded(str + hoff, &e->ffy.f_handle);
-
- e->ffy.e.type = MARK_TYPE__INODE;
- ret = cb(e, arg);
-
-
- if (ret)
- goto out;
-
- entry_met = true;
- continue;
- }
- if (fdinfo_field(str, "fanotify mnt_id")) {
- union fdinfo_entries *e;
-
- if (type != FD_TYPES__FANOTIFY)
- goto parse_err;
-
- e = xmalloc(sizeof(*e));
- if (!e)
- goto parse_err;
-
- fanotify_mark_entry__init(&e->ffy.e);
- fanotify_mount_mark_entry__init(&e->ffy.me);
- e->ffy.e.me = &e->ffy.me;
-
- ret = sscanf(str,
- "fanotify mnt_id:%x mflags:%x mask:%x ignored_mask:%x",
- &e->ffy.e.me->mnt_id, &e->ffy.e.mflags,
- &e->ffy.e.mask, &e->ffy.e.ignored_mask);
- if (ret != 4)
- goto parse_err;
-
- e->ffy.e.type = MARK_TYPE__MOUNT;
- ret = cb(e, arg);
- if (ret)
- goto out;
-
- entry_met = true;
- continue;
- }
- if (fdinfo_field(str, "inotify wd")) {
- InotifyWdEntry *ify;
- union fdinfo_entries *e;
- int hoff;
-
- if (type != FD_TYPES__INOTIFY)
- goto parse_err;
-
- e = xmalloc(sizeof(*e));
- if (!e)
- goto parse_err;
- ify = &e->ify.e;
-
- inotify_wd_entry__init(ify);
- ify->f_handle = &e->ify.f_handle;
- fh_entry__init(ify->f_handle);
-
- ret = sscanf(str,
- "inotify wd:%x ino:%"PRIx64" sdev:%x "
- "mask:%x ignored_mask:%x "
- "fhandle-bytes:%x fhandle-type:%x "
- "f_handle: %n",
- &ify->wd, &ify->i_ino, &ify->s_dev,
- &ify->mask, &ify->ignored_mask,
- &ify->f_handle->bytes, &ify->f_handle->type,
- &hoff);
- if (ret != 7) {
- free_inotify_wd_entry(e);
- goto parse_err;
- }
-
- if (alloc_fhandle(ify->f_handle)) {
- free_inotify_wd_entry(e);
- goto out;
- }
-
- parse_fhandle_encoded(str + hoff, ify->f_handle);
-
- ret = cb(e, arg);
-
- if (ret)
- goto out;
-
- entry_met = true;
- continue;
- }
- }
-
- exit_code = 0;
- if (entry_met)
- goto out;
- /*
- * An eventpoll/inotify file may have no target fds set thus
- * resulting in no tfd: lines in proc. This is normal.
- */
- if (type == FD_TYPES__EVENTPOLL || type == FD_TYPES__INOTIFY)
- goto out;
-
- pr_err("No records of type %d found in fdinfo file\n", type);
-parse_err:
- exit_code = -1;
- pr_perror("%s: error parsing [%s] for %d", __func__, str, type);
-out:
- bclose(&f);
- return exit_code;
-}
-
-int parse_fdinfo_pid(int pid, int fd, int type,
- int (*cb)(union fdinfo_entries *e, void *arg), void *arg)
-{
- return parse_fdinfo_pid_s(pid, fd, type, cb, arg);
-}
-
-int parse_fdinfo(int fd, int type,
- int (*cb)(union fdinfo_entries *e, void *arg), void *arg)
-{
- return parse_fdinfo_pid_s(PROC_SELF, fd, type, cb, arg);
-}
-
-int get_fd_mntid(int fd, int *mnt_id)
-{
- struct fdinfo_common fdinfo = { .mnt_id = -1};
-
- if (parse_fdinfo(fd, FD_TYPES__UND, NULL, &fdinfo))
- return -1;
-
- *mnt_id = fdinfo.mnt_id;
- return 0;
-}
-
-static int parse_file_lock_buf(char *buf, struct file_lock *fl,
- bool is_blocked)
-{
- int num;
- char fl_flag[10], fl_type[15], fl_option[10];
-
- if (is_blocked) {
- num = sscanf(buf, "%lld: -> %s %s %s %d %x:%x:%ld %lld %s",
- &fl->fl_id, fl_flag, fl_type, fl_option,
- &fl->fl_owner, &fl->maj, &fl->min, &fl->i_no,
- &fl->start, fl->end);
- } else {
- num = sscanf(buf, "%lld:%s %s %s %d %x:%x:%ld %lld %s",
- &fl->fl_id, fl_flag, fl_type, fl_option,
- &fl->fl_owner, &fl->maj, &fl->min, &fl->i_no,
- &fl->start, fl->end);
- }
-
- if (num < 10) {
- pr_err("Invalid file lock info (%d): %s\n", num, buf);
- return -1;
- }
-
- if (!strcmp(fl_flag, "POSIX"))
- fl->fl_kind = FL_POSIX;
- else if (!strcmp(fl_flag, "FLOCK"))
- fl->fl_kind = FL_FLOCK;
- else
- fl->fl_kind = FL_UNKNOWN;
-
- if (!strcmp(fl_type, "MSNFS")) {
- fl->fl_ltype |= LOCK_MAND;
-
- if (!strcmp(fl_option, "READ")) {
- fl->fl_ltype |= LOCK_READ;
- } else if (!strcmp(fl_option, "RW")) {
- fl->fl_ltype |= LOCK_RW;
- } else if (!strcmp(fl_option, "WRITE")) {
- fl->fl_ltype |= LOCK_WRITE;
- } else {
- pr_err("Unknown lock option!\n");
- return -1;
- }
- } else {
- if (!strcmp(fl_option, "UNLCK")) {
- fl->fl_ltype |= F_UNLCK;
- } else if (!strcmp(fl_option, "WRITE")) {
- fl->fl_ltype |= F_WRLCK;
- } else if (!strcmp(fl_option, "READ")) {
- fl->fl_ltype |= F_RDLCK;
- } else {
- pr_err("Unknown lock option!\n");
- return -1;
- }
- }
-
- return 0;
-}
-
-int parse_file_locks(void)
-{
- struct file_lock *fl;
-
- FILE *fl_locks;
- int exit_code = -1;
- bool is_blocked;
-
- if (kdat.has_fdinfo_lock)
- return 0;
-
- fl_locks = fopen_proc(PROC_GEN, "locks");
- if (!fl_locks) {
- pr_perror("Can't open file locks file!");
- return -1;
- }
-
- while (fgets(buf, BUF_SIZE, fl_locks)) {
- is_blocked = strstr(buf, "->") != NULL;
-
- fl = alloc_file_lock();
- if (!fl) {
- pr_perror("Alloc file lock failed!");
- goto err;
- }
-
- if (parse_file_lock_buf(buf, fl, is_blocked)) {
- xfree(fl);
- goto err;
- }
-
- pr_info("lockinfo: %lld:%d %x %d %02x:%02x:%ld %lld %s\n",
- fl->fl_id, fl->fl_kind, fl->fl_ltype,
- fl->fl_owner, fl->maj, fl->min, fl->i_no,
- fl->start, fl->end);
-
-
- if (fl->fl_kind == FL_UNKNOWN) {
- pr_err("Unknown file lock: %s!\n", buf);
- xfree(fl);
- goto err;
- }
-
- if (is_blocked) {
- /*
- * All target processes are stopped in this moment and
- * can't wait any locks.
- */
- pr_debug("Skip blocked processes\n");
- xfree(fl);
- continue;
- }
-
- if ((fl->fl_kind == FL_POSIX) &&
- !pid_in_pstree(fl->fl_owner)) {
- /*
- * We only care about tasks which are taken
- * into dump, so we only collect file locks
- * belong to these tasks.
- */
- xfree(fl);
- continue;
- }
-
- list_add_tail(&fl->list, &file_lock_list);
- }
-
- exit_code = 0;
-err:
- fclose(fl_locks);
- return exit_code;
-}
-
-void free_posix_timers(struct proc_posix_timers_stat *st)
-{
- while (!list_empty(&st->timers)) {
- struct proc_posix_timer *timer;
- timer = list_first_entry(&st->timers, struct proc_posix_timer, list);
- list_del(&timer->list);
- xfree(timer);
- }
-}
-
-int parse_posix_timers(pid_t pid, struct proc_posix_timers_stat *args)
-{
- int exit_code = -1;
- int pid_t;
- int i = 0;
-
- struct bfd f;
- char *s;
- char sigpid[7];
- char tidpid[4];
-
- struct proc_posix_timer *timer = NULL;
-
- INIT_LIST_HEAD(&args->timers);
- args->timer_n = 0;
-
- f.fd = open_proc(pid, "timers");
- if (f.fd < 0) {
- pr_perror("Can't open posix timers file!");
- return -1;
- }
-
- if (bfdopenr(&f))
- return -1;
-
- while (1) {
- char pbuf[17]; /* 16 + eol */
-
- s = breadline(&f);
- if (!s)
- break;
- if (IS_ERR(s))
- goto err;
-
- switch (i % 4) {
- case 0:
- timer = xzalloc(sizeof(struct proc_posix_timer));
- if (timer == NULL)
- goto err;
-
- if (sscanf(s, "ID: %ld",
- &timer->spt.it_id) != 1)
- goto err;
- break;
- case 1:
- if (sscanf(s, "signal: %d/%16s",
- &timer->spt.si_signo, pbuf) != 2)
- goto err;
- break;
- case 2:
- if (sscanf(s, "notify: %6[a-z]/%3[a-z].%d\n",
- sigpid, tidpid, &pid_t) != 3)
- goto err;
- break;
- case 3:
- if (sscanf(s, "ClockID: %d\n",
- &timer->spt.clock_id) != 1)
- goto err;
-
- timer->spt.sival_ptr = NULL;
- if (sscanf(pbuf, "%p", &timer->spt.sival_ptr) != 1 &&
- strcmp(pbuf, "(null)")) {
- pr_err("Unable to parse '%s'\n", pbuf);
- goto err;
- }
-
- if ( tidpid[0] == 't') {
- timer->spt.it_sigev_notify = SIGEV_THREAD_ID;
- } else {
- switch (sigpid[0]) {
- case 's' :
- timer->spt.it_sigev_notify = SIGEV_SIGNAL;
- break;
- case 't' :
- timer->spt.it_sigev_notify = SIGEV_THREAD;
- break;
- default :
- timer->spt.it_sigev_notify = SIGEV_NONE;
- break;
- }
- }
-
- list_add(&timer->list, &args->timers);
- timer = NULL;
- args->timer_n++;
- break;
- }
- i++;
- }
-
- exit_code = 0;
-out:
- bclose(&f);
- return exit_code;
-err:
- xfree(timer);
- free_posix_timers(args);
- pr_perror("Parse error in posix timers proc file!");
- goto out;
-}
-
-int parse_threads(int pid, struct pid **_t, int *_n)
-{
- struct dirent *de;
- DIR *dir;
- struct pid *t = NULL;
- int nr = 1;
-
- if (*_t)
- t = *_t;
-
- dir = opendir_proc(pid, "task");
- if (!dir)
- return -1;
-
- while ((de = readdir(dir))) {
- struct pid *tmp;
-
- /* We expect numbers only here */
- if (de->d_name[0] == '.')
- continue;
-
- if (*_t == NULL) {
- tmp = xrealloc(t, nr * sizeof(struct pid));
- if (!tmp) {
- xfree(t);
- return -1;
- }
- t = tmp;
- t[nr - 1].virt = -1;
- }
- t[nr - 1].real = atoi(de->d_name);
- nr++;
- }
-
- closedir(dir);
-
- if (*_t == NULL) {
- *_t = t;
- *_n = nr - 1;
- } else
- BUG_ON(nr - 1 != *_n);
-
- return 0;
-}
-
-int parse_task_cgroup(int pid, struct list_head *retl, unsigned int *n)
-{
- FILE *f;
-
- f = fopen_proc(pid, "cgroup");
- if (f == NULL)
- return -1;
- while (fgets(buf, BUF_SIZE, f)) {
- struct cg_ctl *ncc, *cc;
- char *name, *path = NULL, *e;
-
- ncc = xmalloc(sizeof(*cc));
- if (!ncc)
- goto err;
-
- /*
- * Typical output (':' is a separator here)
- *
- * 4:cpu,cpuacct:/
- * 3:cpuset:/
- * 2:name=systemd:/user.slice/user-1000.slice/session-1.scope
- */
- name = strchr(buf, ':');
- if (name)
- path = strchr(++name, ':');
- if (!name || !path) {
- pr_err("Failed parsing cgroup %s\n", buf);
- xfree(ncc);
- goto err;
- }
- e = strchr(name, '\n');
- *path++ = '\0';
- if (e)
- *e = '\0';
-
- ncc->name = xstrdup(name);
- ncc->path = xstrdup(path);
- if (!ncc->name || !ncc->path) {
- xfree(ncc->name);
- xfree(ncc->path);
- xfree(ncc);
- goto err;
- }
-
- list_for_each_entry(cc, retl, l)
- if (strcmp(cc->name, name) >= 0)
- break;
-
- list_add_tail(&ncc->l, &cc->l);
- (*n)++;
- }
-
- fclose(f);
- return 0;
-
-err:
- put_ctls(retl);
- fclose(f);
- return -1;
-}
-
-void put_ctls(struct list_head *l)
-{
- struct cg_ctl *c, *n;
-
- list_for_each_entry_safe(c, n, l, l) {
- xfree(c->name);
- xfree(c->path);
- xfree(c);
- }
-}
-
-/* Parse and create all the real controllers. This does not include things with
- * the "name=" prefix, e.g. systemd.
- */
-int collect_controllers(struct list_head *cgroups, unsigned int *n_cgroups)
-{
- int exit_code = -1;
- FILE *f;
-
- f = fopen_proc(PROC_SELF, "cgroup");
- if (f == NULL)
- return -1;
-
- while (fgets(buf, BUF_SIZE, f)) {
- struct cg_controller *nc = NULL;
- char *controllers, *off;
-
- controllers = strchr(buf, ':');
- if (!controllers) {
- pr_err("Unable to parse \"%s\"\n", buf);
- goto err;
- }
- controllers++;
-
- off = strchr(controllers, ':');
- if (!off) {
- pr_err("Unable to parse \"%s\"\n", buf);
- goto err;
- }
- *off = '\0';
- while (1) {
- off = strchr(controllers, ',');
- if (off)
- *off = '\0';
-
- if (!strncmp("name=", controllers, 5))
- goto skip;
-
- if (!nc) {
- nc = new_controller(controllers);
- if (!nc)
- goto err;
- list_add_tail(&nc->l, cgroups);
- (*n_cgroups)++;
- } else {
- void *m;
- char *n;
-
- nc->n_controllers++;
- m = xrealloc(nc->controllers, sizeof(char *) * nc->n_controllers);
- if (!m)
- goto err;
-
- nc->controllers = m;
-
- n = xstrdup(controllers);
- if (!n)
- goto err;
-
- nc->controllers[nc->n_controllers-1] = n;
- }
-
-skip:
- if (!off)
- break;
- controllers = off + 1;
- }
- }
-
- exit_code = 0;
-err:
- fclose(f);
- return exit_code;
-}
-
-/*
- * If an OverlayFS mountpoint is found in the mountinfo table,
- * we enable opts.overlayfs, which is a workaround for the
- * OverlayFS Kernel bug.
- *
- * See fixup_overlayfs for details.
- */
-int overlayfs_parse(struct mount_info *new)
-{
- opts.overlayfs = true;
- return 0;
-}
-
-/*
- * AUFS callback function to "fix up" the root pathname.
- * See sysfs_parse.c for details.
- */
-int aufs_parse(struct mount_info *new)
-{
- int ret = 0;
-
- if (!strcmp(new->mountpoint, "./")) {
- opts.aufs = true;
- ret = parse_aufs_branches(new);
- }
-
- return ret;
-}
-
-bool proc_status_creds_dumpable(struct proc_status_creds *parent,
- struct proc_status_creds *child)
-{
- const size_t size = sizeof(struct proc_status_creds) -
- offsetof(struct proc_status_creds, cap_inh);
-
- /*
- * The comparision rules are the following
- *
- * - CAPs can be different
- * - seccomp filters should be passed via
- * semantic comparision (FIXME) but for
- * now we require them to be exactly
- * identical
- * - the rest of members must match
- */
-
- if (memcmp(parent, child, size)) {
- if (!pr_quelled(LOG_DEBUG)) {
- pr_debug("Creds undumpable (parent:child)\n"
- " uids: %d:%d %d:%d %d:%d %d:%d\n"
- " gids: %d:%d %d:%d %d:%d %d:%d\n"
- " state: %d:%d"
- " ppid: %d:%d\n"
- " sigpnd: %llu:%llu\n"
- " shdpnd: %llu:%llu\n"
- " seccomp_mode: %d:%d\n"
- " last_filter: %u:%u\n",
- parent->uids[0], child->uids[0],
- parent->uids[1], child->uids[1],
- parent->uids[2], child->uids[2],
- parent->uids[3], child->uids[3],
- parent->gids[0], child->gids[0],
- parent->gids[1], child->gids[1],
- parent->gids[2], child->gids[2],
- parent->gids[3], child->gids[3],
- parent->state, child->state,
- parent->ppid, child->ppid,
- parent->sigpnd, child->sigpnd,
- parent->shdpnd, child->shdpnd,
- parent->seccomp_mode, child->seccomp_mode,
- parent->last_filter, child->last_filter);
- }
- return false;
- }
-
- return true;
-}
-
-int parse_children(pid_t pid, pid_t **_c, int *_n)
-{
- pid_t *ch = NULL;
- int nr = 0;
- DIR *dir;
- struct dirent *de;
- struct bfd f;
-
- dir = opendir_proc(pid, "task");
- if (dir == NULL)
- return -1;
-
- while ((de = readdir(dir))) {
- char *pos, *end;
-
- if (dir_dots(de))
- continue;
-
- f.fd = open_proc(pid, "task/%s/children", de->d_name);
- if (f.fd < 0)
- goto err;
-
- if (bfdopenr(&f))
- goto err;
-
- while (1) {
- pid_t val, *tmp;
-
- pos = breadchr(&f, ' ');
- if (IS_ERR(pos))
- goto err_close;
- if (pos == NULL)
- break;
-
- val = strtol(pos, &end, 0);
-
- if (*end != 0 && *end != ' ') {
- pr_err("Unable to parse %s\n", end);
- goto err_close;
- }
-
- tmp = xrealloc(ch, (nr + 1) * sizeof(pid_t));
- if (!tmp)
- goto err_close;
-
- ch = tmp;
- ch[nr] = val;
- nr++;
- }
- bclose(&f);
- }
-
- *_c = ch;
- *_n = nr;
-
- closedir(dir);
- return 0;
-err_close:
- bclose(&f);
-err:
- closedir(dir);
- xfree(ch);
- return -1;
-}
-
diff --git a/protobuf-desc.c b/protobuf-desc.c
deleted file mode 100644
index c80ebb794671..000000000000
--- a/protobuf-desc.c
+++ /dev/null
@@ -1,104 +0,0 @@
-#include <stdlib.h>
-#include <unistd.h>
-
-#include <sys/types.h>
-#include <sys/stat.h>
-
-#include <fcntl.h>
-#include <arpa/inet.h>
-#include <ctype.h>
-
-#include "asm/types.h"
-
-#include "compiler.h"
-#include "log.h"
-
-#include "protobuf-desc.h"
-
-#include "protobuf/inventory.pb-c.h"
-#include "protobuf/stats.pb-c.h"
-#include "protobuf/regfile.pb-c.h"
-#include "protobuf/ext-file.pb-c.h"
-#include "protobuf/ns.pb-c.h"
-#include "protobuf/eventfd.pb-c.h"
-#include "protobuf/eventpoll.pb-c.h"
-#include "protobuf/signalfd.pb-c.h"
-#include "protobuf/fsnotify.pb-c.h"
-#include "protobuf/core.pb-c.h"
-#include "protobuf/mm.pb-c.h"
-#include "protobuf/pipe.pb-c.h"
-#include "protobuf/fifo.pb-c.h"
-#include "protobuf/fdinfo.pb-c.h"
-#include "protobuf/pipe-data.pb-c.h"
-#include "protobuf/pstree.pb-c.h"
-#include "protobuf/sa.pb-c.h"
-#include "protobuf/sk-unix.pb-c.h"
-#include "protobuf/sk-inet.pb-c.h"
-#include "protobuf/packet-sock.pb-c.h"
-#include "protobuf/sk-packet.pb-c.h"
-#include "protobuf/creds.pb-c.h"
-#include "protobuf/timer.pb-c.h"
-#include "protobuf/utsns.pb-c.h"
-#include "protobuf/ipc-var.pb-c.h"
-#include "protobuf/ipc-shm.pb-c.h"
-#include "protobuf/ipc-msg.pb-c.h"
-#include "protobuf/ipc-sem.pb-c.h"
-#include "protobuf/fs.pb-c.h"
-#include "protobuf/remap-file-path.pb-c.h"
-#include "protobuf/ghost-file.pb-c.h"
-#include "protobuf/mnt.pb-c.h"
-#include "protobuf/netdev.pb-c.h"
-#include "protobuf/tcp-stream.pb-c.h"
-#include "protobuf/tty.pb-c.h"
-#include "protobuf/file-lock.pb-c.h"
-#include "protobuf/rlimit.pb-c.h"
-#include "protobuf/pagemap.pb-c.h"
-#include "protobuf/siginfo.pb-c.h"
-#include "protobuf/sk-netlink.pb-c.h"
-#include "protobuf/vma.pb-c.h"
-#include "protobuf/tun.pb-c.h"
-#include "protobuf/cgroup.pb-c.h"
-#include "protobuf/timerfd.pb-c.h"
-#include "protobuf/cpuinfo.pb-c.h"
-#include "protobuf/userns.pb-c.h"
-#include "protobuf/seccomp.pb-c.h"
-#include "protobuf/binfmt-misc.pb-c.h"
-
-struct cr_pb_message_desc cr_pb_descs[PB_MAX];
-
-#define CR_PB_DESC(__type, __vtype, __ftype) \
- CR_PB_MDESC_INIT(cr_pb_descs[PB_##__type], \
- __vtype##Entry, \
- __ftype##_entry)
-
-#define PB_PACK_TYPECHECK(__o, __fn) ({ if (0) __fn##__pack(__o, NULL); (pb_pack_t)&__fn##__pack; })
-#define PB_GPS_TYPECHECK(__o, __fn) ({ if (0) __fn##__get_packed_size(__o); (pb_getpksize_t)&__fn##__get_packed_size; })
-#define PB_UNPACK_TYPECHECK(__op, __fn) ({ if (0) *__op = __fn##__unpack(NULL, 0, NULL); (pb_unpack_t)&__fn##__unpack; })
-#define PB_FREE_TYPECHECK(__o, __fn) ({ if (0) __fn##__free_unpacked(__o, NULL); (pb_free_t)&__fn##__free_unpacked; })
-
-/*
- * This should be explicitly "called" to do type-checking
- */
-
-#define CR_PB_MDESC_INIT(__var, __type, __name) \
- do { \
- __var.getpksize = PB_GPS_TYPECHECK((__type *)NULL, __name); \
- __var.pack = PB_PACK_TYPECHECK((__type *)NULL, __name); \
- __var.unpack = PB_UNPACK_TYPECHECK((__type **)NULL, __name); \
- __var.free = PB_FREE_TYPECHECK((__type *)NULL, __name); \
- __var.pb_desc = &__name##__descriptor; \
- } while (0)
-
-void cr_pb_init(void)
-{
- CR_PB_DESC(IDS, TaskKobjIds, task_kobj_ids);
- CR_PB_DESC(SIGACT, Sa, sa);
- CR_PB_DESC(SK_QUEUES, SkPacket, sk_packet);
- CR_PB_MDESC_INIT(cr_pb_descs[PB_IPCNS_MSG], IpcMsg, ipc_msg);
- CR_PB_DESC(IPCNS_MSG_ENT, IpcMsg, ipc_msg);
- CR_PB_DESC(REMAP_FPATH, RemapFilePath, remap_file_path);
- CR_PB_DESC(NETDEV, NetDevice, net_device);
- CR_PB_MDESC_INIT(cr_pb_descs[PB_PAGEMAP_HEAD], PagemapHead, pagemap_head);
-
-#include "protobuf-desc-gen.h"
-}
diff --git a/protobuf.c b/protobuf.c
deleted file mode 100644
index ae003da44158..000000000000
--- a/protobuf.c
+++ /dev/null
@@ -1,692 +0,0 @@
-#include <unistd.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <stdlib.h>
-#include <arpa/inet.h>
-#include <ctype.h>
-
-#include <google/protobuf-c/protobuf-c.h>
-
-#include "image.h"
-#include "servicefd.h"
-#include "compiler.h"
-#include "asm/types.h"
-#include "log.h"
-#include "util.h"
-#include "string.h"
-#include "sockets.h"
-#include "cr_options.h"
-#include "bfd.h"
-#include "protobuf.h"
-
-/*
- * To speed up reading of packed objects
- * by providing space on stack, this should
- * be more than enough for most objects.
- */
-#define PB_PKOBJ_LOCAL_SIZE 1024
-
-#define INET_ADDR_LEN 40
-
-typedef struct {
- void *data;
- int number;
- int depth;
- int count;
- char fmt[32];
-} pb_pr_field_t;
-
-typedef struct {
- void *arg;
- int single_entry;
- const char *pretty_fmt;
- pb_pr_field_t cur;
-} pb_pr_ctl_t;
-
-typedef int (*pb_pr_show_t)(pb_pr_field_t *field);
-
-/*
- * This one describes how fields should be shown
- * @fsize is the size of the field entry
- * @show is the callback to print the entry
- */
-struct pb_shower {
- size_t fsize;
- pb_pr_show_t show;
-};
-
-static int pb_msg_int32x(pb_pr_field_t *field)
-{
- pr_msg("%#x", *(int *)field->data);
- return 0;
-}
-
-static int pb_msg_int64x(pb_pr_field_t *field)
-{
- pr_msg("%#016lx", *(long *)field->data);
- return 0;
-}
-
-static int pb_msg_int64x_r(pb_pr_field_t *field)
-{
- long val = *(long *)field->data;
- if (val)
- pr_msg("%#016lx", val);
- else
- pr_msg("0");
- return 0;
-}
-
-static int pb_msg_string(pb_pr_field_t *field)
-{
- pr_msg("\"%s\"", *(char **)field->data);
- return 0;
-}
-
-static int pb_msg_unk(pb_pr_field_t *field)
-{
- pr_msg("unknown object %p", field->data);
- return 0;
-}
-
-static inline void print_tabs(pb_pr_ctl_t *ctl)
-{
- int counter = ctl->cur.depth;
-
- if (!ctl->single_entry)
- return;
-
- while (counter--)
- pr_msg("\t");
-}
-
-static void print_nested_message_braces(pb_pr_ctl_t *ctl, int right_brace)
-{
- print_tabs(ctl);
- pr_msg("%s%s", (right_brace) ? "}" : "{", (ctl->single_entry) ? "\n" : " ");
-}
-
-static void pb_show_msg(const void *msg, pb_pr_ctl_t *ctl);
-
-static int show_nested_message(pb_pr_field_t *field)
-{
- pb_pr_ctl_t *ctl = container_of(field, pb_pr_ctl_t, cur);
- void *arg = ctl->arg;
-
- print_nested_message_braces(ctl, 0);
- field->depth++;
- pb_show_msg(field->data, ctl);
- field->depth--;
- print_nested_message_braces(ctl, 1);
- ctl->arg = arg;
- return 0;
-}
-
-static int show_enum(pb_pr_field_t *field)
-{
- pb_pr_ctl_t *ctl = container_of(field, pb_pr_ctl_t, cur);
- ProtobufCEnumDescriptor *d = ctl->arg;
- const char *val_name = NULL;
- int val, i;
-
- val = *(int *)field->data;
- for (i = 0; i < d->n_values; i++)
- if (d->values[i].value == val) {
- val_name = d->values[i].name;
- break;
- }
-
- if (val_name != NULL)
- pr_msg("%s", val_name);
- else
- pr_msg("%d", val);
- return 0;
-}
-
-static int show_bool(pb_pr_field_t *field)
-{
- protobuf_c_boolean val = *(protobuf_c_boolean *)field->data;
-
- if (val)
- pr_msg("True");
- else
- pr_msg("False");
- return 0;
-}
-
-static int show_bytes(pb_pr_field_t *field)
-{
- ProtobufCBinaryData *bytes = (ProtobufCBinaryData *)field->data;
- int i = 0;
-
- while (i < bytes->len)
- pr_msg("%02x ", bytes->data[i++]);
- return 0;
-}
-
-static int pb_show_pretty(pb_pr_field_t *field)
-{
- switch (field->fmt[0]) {
- case '%':
- pr_msg(field->fmt, *(long *)field->data);
- break;
- case 'S':
- {
- ProtobufCBinaryData *name = (ProtobufCBinaryData *)field->data;
- int i;
-
- for (i = 0; i < name->len; i++) {
- char c = (char)name->data[i];
-
- if (isprint(c))
- pr_msg("%c", c);
- else if (c != 0)
- pr_msg(".");
- }
- break;
- }
- case 'A':
- {
- char addr[INET_ADDR_LEN] = "<unknown>";
- int family = (field->count == 1) ? AF_INET : AF_INET6;
-
- if (inet_ntop(family, (void *)field->data, addr,
- INET_ADDR_LEN) == NULL)
- pr_msg("failed to translate");
- else
- pr_msg("%s", addr);
- }
- return 1;
- }
- return 0;
-}
-
-static void pb_copy_fmt(const char *fmt, char *to)
-{
- while (*fmt != ' ' && *fmt != '\0') {
- *to = *fmt;
- to++;
- fmt++;
- }
-
- *to = '\0';
-}
-
-static const char *pb_next_pretty(const char *pfmt)
-{
- pfmt = strchr(pfmt, ' ');
- if (pfmt) {
- while (*pfmt == ' ')
- pfmt++;
-
- if (*pfmt == '\0')
- pfmt = NULL;
- }
-
- return pfmt;
-}
-
-static int pb_find_fmt(char *what, pb_pr_ctl_t *ctl)
-{
- int len;
- const char *pretty = ctl->pretty_fmt;
-
- len = strlen(what);
- while (1) {
- if (!strncmp(pretty, what, len)) {
- pb_copy_fmt(pretty + len, ctl->cur.fmt);
- return 1;
- }
-
- pretty = pb_next_pretty(pretty + len);
- if (!pretty)
- return 0;
- }
-}
-
-static int pb_field_show_pretty(const ProtobufCFieldDescriptor *fd, pb_pr_ctl_t *ctl)
-{
- char cookie[32];
-
- if (!ctl->pretty_fmt)
- return 0;
-
- sprintf(cookie, "%s:", fd->name);
- if (pb_find_fmt(cookie, ctl))
- return 1;
-
- if (!ctl->cur.depth)
- sprintf(cookie, "%d:", ctl->cur.number);
- else
- sprintf(cookie, "%d.%d:", ctl->cur.depth, ctl->cur.number);
-
- if (pb_find_fmt(cookie, ctl))
- return 1;
-
- sprintf(cookie, "*:");
- if (pb_find_fmt(cookie, ctl))
- return 1;
-
- return 0;
-}
-
-static void pb_prepare_shower(const ProtobufCFieldDescriptor *fd,
- pb_pr_ctl_t *ctl, struct pb_shower *sh)
-{
- sh->fsize = 0;
- sh->show = pb_msg_unk;
-
- switch (fd->type) {
- case PROTOBUF_C_TYPE_INT32:
- case PROTOBUF_C_TYPE_SINT32:
- case PROTOBUF_C_TYPE_UINT32:
- case PROTOBUF_C_TYPE_SFIXED32:
- sh->fsize = 4;
- sh->show = pb_msg_int32x;
- break;
-
- case PROTOBUF_C_TYPE_INT64:
- case PROTOBUF_C_TYPE_SINT64:
- case PROTOBUF_C_TYPE_SFIXED64:
- case PROTOBUF_C_TYPE_FIXED32:
- case PROTOBUF_C_TYPE_UINT64:
- case PROTOBUF_C_TYPE_FIXED64:
- sh->fsize = 8;
- sh->show = (fd->label == PROTOBUF_C_LABEL_REPEATED ?
- pb_msg_int64x_r : pb_msg_int64x);
- break;
-
- case PROTOBUF_C_TYPE_STRING:
- sh->fsize = sizeof (void *);
- sh->show = pb_msg_string;
- break;
- case PROTOBUF_C_TYPE_MESSAGE:
- sh->fsize = sizeof (void *);
- sh->show = show_nested_message;
- ctl->arg = (void *)fd->descriptor;
- break;
- case PROTOBUF_C_TYPE_ENUM:
- sh->fsize = 4;
- sh->show = show_enum;
- ctl->arg = (void *)fd->descriptor;
- break;
-
- case PROTOBUF_C_TYPE_BOOL:
- sh->fsize = sizeof (protobuf_c_boolean);
- sh->show = show_bool;
- break;
- case PROTOBUF_C_TYPE_BYTES:
- sh->fsize = sizeof (ProtobufCBinaryData);
- sh->show = show_bytes;
- break;
- case PROTOBUF_C_TYPE_FLOAT:
- sh->fsize = 4;
- break;
- case PROTOBUF_C_TYPE_DOUBLE:
- sh->fsize = 8;
- break;
-
- default:
- BUG();
- }
-
- if (pb_field_show_pretty(fd, ctl))
- sh->show = pb_show_pretty;
-}
-
-static void pb_show_repeated(const ProtobufCFieldDescriptor *fd,
- pb_pr_ctl_t *ctl, struct pb_shower *sh)
-{
- pb_pr_field_t *field = &ctl->cur;
- unsigned long i, nr_fields = field->count;
-
- if (nr_fields == 0) {
- pr_msg("<empty>");
- return;
- }
-
- if (fd->type == PROTOBUF_C_TYPE_MESSAGE) {
- void *p = field->data;
-
- for (i = 0; i < nr_fields; i++) {
- field->data = (void *)(*(long *)p);
- sh->show(field);
- p += sh->fsize;
- }
-
- return;
- }
-
- for (i = 0; i < nr_fields; i++) {
- if (i)
- pr_msg(":");
- if (sh->show(field))
- break;
- field->data += sh->fsize;
- }
-}
-
-static void pb_show_field(const ProtobufCFieldDescriptor *fd, pb_pr_ctl_t *ctl)
-{
- struct pb_shower sh;
-
- print_tabs(ctl);
- pr_msg("%s: ", fd->name);
-
- pb_prepare_shower(fd, ctl, &sh);
- pb_show_repeated(fd, ctl, &sh);
-
- if (ctl->single_entry)
- pr_msg("\n");
- else
- pr_msg(" ");
-}
-
-static int pb_optional_field_present(const ProtobufCFieldDescriptor *field,
- const void *msg)
-{
- if ((field->type == PROTOBUF_C_TYPE_MESSAGE) ||
- (field->type == PROTOBUF_C_TYPE_STRING)) {
- const void *opt_flag = * (const void * const *)(msg + field->offset);
-
- if ((opt_flag == NULL) || (opt_flag == field->default_value))
- return 0;
- } else {
- const protobuf_c_boolean *has = msg + field->quantifier_offset;
-
- if (!*has)
- return 0;
- }
- return 1;
-}
-
-static bool should_show_field(const char *name)
-{
- char *s, *e;
- int len;
-
- if (!opts.show_fmt)
- return true;
-
- len = strlen(name);
- s = opts.show_fmt;
-
- while (1) {
- e = strchrnul(s, ',');
- if (e - s == len) {
- if (!strncmp(name, s, len))
- return true;
- }
- if (*e == '\0')
- return false;
- s = e + 1;
- }
-}
-
-static void pb_show_msg(const void *msg, pb_pr_ctl_t *ctl)
-{
- int i;
- const ProtobufCMessageDescriptor *md = ctl->arg;
-
- BUG_ON(md == NULL);
-
- for (i = 0; i < md->n_fields; i++) {
- const ProtobufCFieldDescriptor fd = md->fields[i];
- unsigned long *data;
- size_t nr_fields;
-
- nr_fields = 1;
- data = (unsigned long *)(msg + fd.offset);
-
- if (fd.label == PROTOBUF_C_LABEL_OPTIONAL) {
- if (!pb_optional_field_present(&fd, msg))
- continue;
- }
-
- if (!should_show_field(fd.name))
- continue;
-
- if (fd.label == PROTOBUF_C_LABEL_REPEATED) {
- nr_fields = *(size_t *)(msg + fd.quantifier_offset);
- data = (unsigned long *)*data;
- }
-
- ctl->cur.data = data;
- ctl->cur.number = i + 1;
- ctl->cur.count = nr_fields;
-
- pb_show_field(&fd, ctl);
- }
-}
-
-static inline void pb_no_payload(struct cr_img *i, void *obj) { }
-
-void do_pb_show_plain(struct cr_img *img, int type, int single_entry,
- void (*payload_hadler)(struct cr_img *, void *obj),
- const char *pretty_fmt)
-{
- pb_pr_ctl_t ctl = {NULL, single_entry, pretty_fmt};
- void (*handle_payload)(struct cr_img *, void *obj);
-
- if (!cr_pb_descs[type].pb_desc) {
- pr_err("Wrong object requested %d\n", type);
- return;
- }
-
- handle_payload = (payload_hadler) ? : pb_no_payload;
-
- while (1) {
- void *obj;
-
- if (pb_read_one_eof(img, &obj, type) <= 0)
- break;
-
- ctl.arg = (void *)cr_pb_descs[type].pb_desc;
- pb_show_msg(obj, &ctl);
- handle_payload(img, obj);
- cr_pb_descs[type].free(obj, NULL);
- if (single_entry)
- break;
- pr_msg("\n");
- }
-}
-
-static char *image_name(struct cr_img *img)
-{
- int fd = img->_x.fd;
- static char image_path[PATH_MAX];
-
- if (read_fd_link(fd, image_path, sizeof(image_path)) > 0)
- return image_path;
- return NULL;
-}
-
-/*
- * Reads PB record (header + packed object) from file @fd and unpack
- * it with @unpack procedure to the pointer @pobj
- *
- * 1 on success
- * -1 on error (or EOF met and @eof set to false)
- * 0 on EOF and @eof set to true
- *
- * Don't forget to free memory granted to unpacked object in calling code if needed
- */
-
-int do_pb_read_one(struct cr_img *img, void **pobj, int type, bool eof)
-{
- u8 local[PB_PKOBJ_LOCAL_SIZE];
- void *buf = (void *)&local;
- u32 size;
- int ret;
-
- if (!cr_pb_descs[type].pb_desc) {
- pr_err("Wrong object requested %d on %s\n",
- type, image_name(img));
- return -1;
- }
-
- *pobj = NULL;
-
- if (unlikely(empty_image(img)))
- ret = 0;
- else
- ret = bread(&img->_x, &size, sizeof(size));
- if (ret == 0) {
- if (eof) {
- return 0;
- } else {
- pr_err("Unexpected EOF on %s\n",
- image_name(img));
- return -1;
- }
- } else if (ret < sizeof(size)) {
- pr_perror("Read %d bytes while %d expected on %s",
- ret, (int)sizeof(size),
- image_name(img));
- return -1;
- }
-
- if (size > sizeof(local)) {
- ret = -1;
- buf = xmalloc(size);
- if (!buf)
- goto err;
- }
-
- ret = bread(&img->_x, buf, size);
- if (ret < 0) {
- pr_perror("Can't read %d bytes from file %s",
- size, image_name(img));
- goto err;
- } else if (ret != size) {
- pr_perror("Read %d bytes while %d expected from %s",
- ret, size, image_name(img));
- ret = -1;
- goto err;
- }
-
- *pobj = cr_pb_descs[type].unpack(NULL, size, buf);
- if (!*pobj) {
- ret = -1;
- pr_err("Failed unpacking object %p from %s\n",
- pobj, image_name(img));
- goto err;
- }
-
- ret = 1;
-err:
- if (buf != (void *)&local)
- xfree(buf);
-
- return ret;
-}
-
-/*
- * Writes PB record (header + packed object pointed by @obj)
- * to file @fd, using @getpksize to get packed size and @pack
- * to implement packing
- *
- * 0 on success
- * -1 on error
- */
-int pb_write_one(struct cr_img *img, void *obj, int type)
-{
- u8 local[PB_PKOBJ_LOCAL_SIZE];
- void *buf = (void *)&local;
- u32 size, packed;
- int ret = -1;
- struct iovec iov[2];
-
- if (!cr_pb_descs[type].pb_desc) {
- pr_err("Wrong object requested %d\n", type);
- return -1;
- }
-
- if (lazy_image(img) && open_image_lazy(img))
- return -1;
-
- size = cr_pb_descs[type].getpksize(obj);
- if (size > (u32)sizeof(local)) {
- buf = xmalloc(size);
- if (!buf)
- goto err;
- }
-
- packed = cr_pb_descs[type].pack(obj, buf);
- if (packed != size) {
- pr_err("Failed packing PB object %p\n", obj);
- goto err;
- }
-
- iov[0].iov_base = &size;
- iov[0].iov_len = sizeof(size);
- iov[1].iov_base = buf;
- iov[1].iov_len = size;
-
- ret = bwritev(&img->_x, iov, 2);
- if (ret != size + sizeof(size)) {
- pr_perror("Can't write %d bytes", (int)(size + sizeof(size)));
- goto err;
- }
-
- ret = 0;
-err:
- if (buf != (void *)&local)
- xfree(buf);
- return ret;
-}
-
-int collect_image(struct collect_image_info *cinfo)
-{
- int ret;
- struct cr_img *img;
- void *(*o_alloc)(size_t size) = malloc;
- void (*o_free)(void *ptr) = free;
-
- pr_info("Collecting %d/%d (flags %x)\n",
- cinfo->fd_type, cinfo->pb_type, cinfo->flags);
-
- img = open_image(cinfo->fd_type, O_RSTR);
- if (!img)
- return -1;
-
- cinfo->flags |= COLLECT_HAPPENED;
- if (cinfo->flags & COLLECT_SHARED) {
- o_alloc = shmalloc;
- o_free = shfree_last;
- }
-
- while (1) {
- void *obj;
- ProtobufCMessage *msg;
-
- if (cinfo->priv_size) {
- ret = -1;
- obj = o_alloc(cinfo->priv_size);
- if (!obj)
- break;
- } else
- obj = NULL;
-
- ret = pb_read_one_eof(img, &msg, cinfo->pb_type);
- if (ret <= 0) {
- o_free(obj);
- break;
- }
-
- ret = cinfo->collect(obj, msg);
- if (ret < 0) {
- o_free(obj);
- cr_pb_descs[cinfo->pb_type].free(msg, NULL);
- break;
- }
-
- if (!cinfo->priv_size)
- cr_pb_descs[cinfo->pb_type].free(msg, NULL);
- }
-
- close_image(img);
- pr_debug(" `- ... done\n");
- return ret;
-}
diff --git a/pstree.c b/pstree.c
deleted file mode 100644
index 06bc5f84b5be..000000000000
--- a/pstree.c
+++ /dev/null
@@ -1,846 +0,0 @@
-#include <sys/mman.h>
-#include <unistd.h>
-#include <stdlib.h>
-#include <sched.h>
-
-#include "cr_options.h"
-#include "pstree.h"
-#include "util.h"
-#include "lock.h"
-#include "namespaces.h"
-#include "files.h"
-#include "tty.h"
-#include "mount.h"
-#include "asm/dump.h"
-
-#include "protobuf.h"
-#include "protobuf/pstree.pb-c.h"
-
-struct pstree_item *root_item;
-
-#define CLONE_ALLNS (CLONE_NEWPID | CLONE_NEWNET | CLONE_NEWIPC | CLONE_NEWUTS | CLONE_NEWNS | CLONE_NEWUSER)
-
-void core_entry_free(CoreEntry *core)
-{
- if (core->tc && core->tc->timers)
- xfree(core->tc->timers->posix);
- if (core->thread_core)
- xfree(core->thread_core->creds->groups);
- arch_free_thread_info(core);
- xfree(core);
-}
-
-#ifndef RLIM_NLIMITS
-# define RLIM_NLIMITS 16
-#endif
-
-CoreEntry *core_entry_alloc(int th, int tsk)
-{
- size_t sz;
- CoreEntry *core = NULL;
- void *m;
-
- sz = sizeof(CoreEntry);
- if (tsk) {
- sz += sizeof(TaskCoreEntry) + TASK_COMM_LEN;
- if (th) {
- sz += sizeof(TaskRlimitsEntry);
- sz += RLIM_NLIMITS * sizeof(RlimitEntry *);
- sz += RLIM_NLIMITS * sizeof(RlimitEntry);
- sz += sizeof(TaskTimersEntry);
- sz += 3 * sizeof(ItimerEntry); /* 3 for real, virt and prof */
- }
- }
- if (th) {
- CredsEntry *ce = NULL;
-
- sz += sizeof(ThreadCoreEntry) + sizeof(ThreadSasEntry) + sizeof(CredsEntry);
-
- sz += CR_CAP_SIZE * sizeof(ce->cap_inh[0]);
- sz += CR_CAP_SIZE * sizeof(ce->cap_prm[0]);
- sz += CR_CAP_SIZE * sizeof(ce->cap_eff[0]);
- sz += CR_CAP_SIZE * sizeof(ce->cap_bnd[0]);
- /*
- * @groups are dynamic and allocated
- * on demand.
- */
- }
-
- m = xmalloc(sz);
- if (m) {
- core = xptr_pull(&m, CoreEntry);
- core_entry__init(core);
- core->mtype = CORE_ENTRY__MARCH;
-
- if (tsk) {
- core->tc = xptr_pull(&m, TaskCoreEntry);
- task_core_entry__init(core->tc);
- core->tc->comm = xptr_pull_s(&m, TASK_COMM_LEN);
- memzero(core->tc->comm, TASK_COMM_LEN);
-
- if (th) {
- TaskRlimitsEntry *rls;
- TaskTimersEntry *tte;
- int i;
-
- rls = core->tc->rlimits = xptr_pull(&m, TaskRlimitsEntry);
- task_rlimits_entry__init(rls);
-
- rls->n_rlimits = RLIM_NLIMITS;
- rls->rlimits = xptr_pull_s(&m, sizeof(RlimitEntry *) * RLIM_NLIMITS);
-
- for (i = 0; i < RLIM_NLIMITS; i++) {
- rls->rlimits[i] = xptr_pull(&m, RlimitEntry);
- rlimit_entry__init(rls->rlimits[i]);
- }
-
- tte = core->tc->timers = xptr_pull(&m, TaskTimersEntry);
- task_timers_entry__init(tte);
- tte->real = xptr_pull(&m, ItimerEntry);
- itimer_entry__init(tte->real);
- tte->virt = xptr_pull(&m, ItimerEntry);
- itimer_entry__init(tte->virt);
- tte->prof = xptr_pull(&m, ItimerEntry);
- itimer_entry__init(tte->prof);
- }
- }
-
- if (th) {
- CredsEntry *ce;
-
- core->thread_core = xptr_pull(&m, ThreadCoreEntry);
- thread_core_entry__init(core->thread_core);
- core->thread_core->sas = xptr_pull(&m, ThreadSasEntry);
- thread_sas_entry__init(core->thread_core->sas);
- ce = core->thread_core->creds = xptr_pull(&m, CredsEntry);
- creds_entry__init(ce);
-
- ce->n_cap_inh = CR_CAP_SIZE;
- ce->n_cap_prm = CR_CAP_SIZE;
- ce->n_cap_eff = CR_CAP_SIZE;
- ce->n_cap_bnd = CR_CAP_SIZE;
- ce->cap_inh = xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_inh[0]));
- ce->cap_prm = xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_prm[0]));
- ce->cap_eff = xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_eff[0]));
- ce->cap_bnd = xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_bnd[0]));
-
- if (arch_alloc_thread_info(core)) {
- xfree(core);
- core = NULL;
- }
- }
- }
-
- return core;
-}
-
-int pstree_alloc_cores(struct pstree_item *item)
-{
- unsigned int i;
-
- item->core = xzalloc(sizeof(*item->core) * item->nr_threads);
- if (!item->core)
- return -1;
-
- for (i = 0; i < item->nr_threads; i++) {
- if (item->threads[i].real == item->pid.real)
- item->core[i] = core_entry_alloc(1, 1);
- else
- item->core[i] = core_entry_alloc(1, 0);
-
- if (!item->core[i])
- goto err;
- }
-
- return 0;
-err:
- pstree_free_cores(item);
- return -1;
-}
-
-void pstree_free_cores(struct pstree_item *item)
-{
- unsigned int i;
-
- if (item->core) {
- for (i = 1; i < item->nr_threads; i++)
- core_entry_free(item->core[i]);
- xfree(item->core);
- item->core = NULL;
- }
-}
-
-void free_pstree(struct pstree_item *root_item)
-{
- struct pstree_item *item = root_item, *parent;
-
- while (item) {
- if (!list_empty(&item->children)) {
- item = list_first_entry(&item->children, struct pstree_item, sibling);
- continue;
- }
-
- parent = item->parent;
- list_del(&item->sibling);
- pstree_free_cores(item);
- xfree(item->threads);
- xfree(item);
- item = parent;
- }
-}
-
-struct pstree_item *__alloc_pstree_item(bool rst)
-{
- struct pstree_item *item;
- int sz;
-
- if (!rst) {
- sz = sizeof(*item) + sizeof(struct dmp_info);
- item = xzalloc(sz);
- if (!item)
- return NULL;
- } else {
- sz = sizeof(*item) + sizeof(struct rst_info);
- item = shmalloc(sz);
- if (!item)
- return NULL;
-
- memset(item, 0, sz);
- vm_area_list_init(&rsti(item)->vmas);
- }
-
- INIT_LIST_HEAD(&item->children);
- INIT_LIST_HEAD(&item->sibling);
-
- item->pid.virt = -1;
- item->pid.real = -1;
- item->born_sid = -1;
-
- return item;
-}
-
-struct pstree_item *alloc_pstree_helper(void)
-{
- struct pstree_item *ret;
-
- ret = alloc_pstree_item_with_rst();
- if (ret) {
- ret->state = TASK_HELPER;
- rsti(ret)->clone_flags = CLONE_FILES | CLONE_FS;
- task_entries->nr_helpers++;
- }
-
- return ret;
-}
-
-/* Deep first search on children */
-struct pstree_item *pstree_item_next(struct pstree_item *item)
-{
- if (!list_empty(&item->children))
- return list_first_entry(&item->children, struct pstree_item, sibling);
-
- while (item->parent) {
- if (item->sibling.next != &item->parent->children)
- return list_entry(item->sibling.next, struct pstree_item, sibling);
- item = item->parent;
- }
-
- return NULL;
-}
-
-/* Preorder traversal of pstree item */
-int preorder_pstree_traversal(struct pstree_item *item, int (*f)(struct pstree_item *))
-{
- struct pstree_item *cursor;
-
- if (f(item) < 0)
- return -1;
-
- list_for_each_entry(cursor, &item->children, sibling) {
- if (preorder_pstree_traversal(cursor, f) < 0)
- return -1;
- }
-
- return 0;
-}
-
-int dump_pstree(struct pstree_item *root_item)
-{
- struct pstree_item *item = root_item;
- PstreeEntry e = PSTREE_ENTRY__INIT;
- int ret = -1, i;
- struct cr_img *img;
-
- pr_info("\n");
- pr_info("Dumping pstree (pid: %d)\n", root_item->pid.real);
- pr_info("----------------------------------------\n");
-
- /*
- * Make sure we're dumping session leader, if not an
- * appropriate option must be passed.
- *
- * Also note that if we're not a session leader we
- * can't get the situation where the leader sits somewhere
- * deeper in process tree, thus top-level checking for
- * leader is enough.
- */
- if (root_item->pid.virt != root_item->sid) {
- if (!opts.shell_job) {
- pr_err("The root process %d is not a session leader. "
- "Consider using --" OPT_SHELL_JOB " option\n", item->pid.virt);
- return -1;
- }
- }
-
- img = open_image(CR_FD_PSTREE, O_DUMP);
- if (!img)
- return -1;
-
- for_each_pstree_item(item) {
- pr_info("Process: %d(%d)\n", item->pid.virt, item->pid.real);
-
- e.pid = item->pid.virt;
- e.ppid = item->parent ? item->parent->pid.virt : 0;
- e.pgid = item->pgid;
- e.sid = item->sid;
- e.n_threads = item->nr_threads;
-
- e.threads = xmalloc(sizeof(e.threads[0]) * e.n_threads);
- if (!e.threads)
- goto err;
-
- for (i = 0; i < item->nr_threads; i++)
- e.threads[i] = item->threads[i].virt;
-
- ret = pb_write_one(img, &e, PB_PSTREE);
- xfree(e.threads);
-
- if (ret)
- goto err;
- }
- ret = 0;
-
-err:
- pr_info("----------------------------------------\n");
- close_image(img);
- return ret;
-}
-
-static int max_pid = 0;
-
-static int prepare_pstree_for_shell_job(void)
-{
- pid_t current_sid = getsid(getpid());
- pid_t current_gid = getpgid(getpid());
-
- struct pstree_item *pi;
-
- pid_t old_sid;
- pid_t old_gid;
-
- if (!opts.shell_job)
- return 0;
-
- if (root_item->sid == root_item->pid.virt)
- return 0;
-
- /*
- * Migration of a root task group leader is a bit tricky.
- * When a task yields SIGSTOP, the kernel notifies the parent
- * with SIGCHLD. This means when task is running in a
- * shell, the shell obtains SIGCHLD and sends a task to
- * the background.
- *
- * The situation gets changed once we restore the
- * program -- our tool become an additional stub between
- * the restored program and the shell. So to be able to
- * notify the shell with SIGCHLD from our restored
- * program -- we make the root task to inherit the
- * process group from us.
- *
- * Not that clever solution but at least it works.
- */
-
- old_sid = root_item->sid;
- old_gid = root_item->pgid;
-
- pr_info("Migrating process tree (GID %d->%d SID %d->%d)\n",
- old_gid, current_gid, old_sid, current_sid);
-
- for_each_pstree_item(pi) {
- if (pi->pgid == old_gid)
- pi->pgid = current_gid;
- if (pi->sid == old_sid)
- pi->sid = current_sid;
- }
-
- max_pid = max((int)current_sid, max_pid);
- max_pid = max((int)current_gid, max_pid);
-
- return 0;
-}
-
-static int read_pstree_image(void)
-{
- int ret = 0, i;
- struct cr_img *img;
- struct pstree_item *pi, *parent = NULL;
-
- pr_info("Reading image tree\n");
-
- img = open_image(CR_FD_PSTREE, O_RSTR);
- if (!img)
- return -1;
-
- while (1) {
- PstreeEntry *e;
-
- ret = pb_read_one_eof(img, &e, PB_PSTREE);
- if (ret <= 0)
- break;
-
- ret = -1;
- pi = alloc_pstree_item_with_rst();
- if (pi == NULL)
- break;
-
- pi->pid.virt = e->pid;
- max_pid = max((int)e->pid, max_pid);
-
- pi->pgid = e->pgid;
- max_pid = max((int)e->pgid, max_pid);
-
- pi->sid = e->sid;
- max_pid = max((int)e->sid, max_pid);
-
- if (e->ppid == 0) {
- if (root_item) {
- pr_err("Parent missed on non-root task "
- "with pid %d, image corruption!\n", e->pid);
- goto err;
- }
- root_item = pi;
- pi->parent = NULL;
- } else {
- /*
- * Fast path -- if the pstree image is not edited, the
- * parent of any item should have already being restored
- * and sit among the last item's ancestors.
- */
- while (parent) {
- if (parent->pid.virt == e->ppid)
- break;
- parent = parent->parent;
- }
-
- if (parent == NULL) {
- for_each_pstree_item(parent) {
- if (parent->pid.virt == e->ppid)
- break;
- }
-
- if (parent == NULL) {
- pr_err("Can't find a parent for %d\n", pi->pid.virt);
- pstree_entry__free_unpacked(e, NULL);
- xfree(pi);
- goto err;
- }
- }
-
- pi->parent = parent;
- list_add(&pi->sibling, &parent->children);
- }
-
- parent = pi;
-
- pi->nr_threads = e->n_threads;
- pi->threads = xmalloc(e->n_threads * sizeof(struct pid));
- if (!pi->threads)
- break;
-
- for (i = 0; i < e->n_threads; i++) {
- pi->threads[i].real = -1;
- pi->threads[i].virt = e->threads[i];
- max_pid = max((int)e->threads[i], max_pid);
- }
-
- task_entries->nr_threads += e->n_threads;
- task_entries->nr_tasks++;
-
- pstree_entry__free_unpacked(e, NULL);
-
- {
- struct cr_img *img;
-
- img = open_image(CR_FD_IDS, O_RSTR, pi->pid.virt);
- if (!img)
- goto err;
- ret = pb_read_one_eof(img, &pi->ids, PB_IDS);
- close_image(img);
- }
-
- if (ret == 0)
- continue;
- if (ret < 0)
- goto err;
-
- if (pi->ids->has_mnt_ns_id) {
- if (rst_add_ns_id(pi->ids->mnt_ns_id, pi, &mnt_ns_desc))
- goto err;
- }
- }
-err:
- close_image(img);
- return ret;
-}
-
-static int prepare_pstree_ids(void)
-{
- struct pstree_item *item, *child, *helper, *tmp;
- LIST_HEAD(helpers);
-
- pid_t current_pgid = getpgid(getpid());
-
- /*
- * Some task can be reparented to init. A helper task should be added
- * for restoring sid of such tasks. The helper tasks will be exited
- * immediately after forking children and all children will be
- * reparented to init.
- */
- list_for_each_entry(item, &root_item->children, sibling) {
-
- /*
- * If a child belongs to the root task's session or it's
- * a session leader himself -- this is a simple case, we
- * just proceed in a normal way.
- */
- if (item->sid == root_item->sid || item->sid == item->pid.virt)
- continue;
-
- helper = alloc_pstree_helper();
- if (helper == NULL)
- return -1;
- helper->sid = item->sid;
- helper->pgid = item->sid;
- helper->pid.virt = item->sid;
- helper->parent = root_item;
- helper->ids = root_item->ids;
- list_add_tail(&helper->sibling, &helpers);
-
- pr_info("Add a helper %d for restoring SID %d\n",
- helper->pid.virt, helper->sid);
-
- child = list_entry(item->sibling.prev, struct pstree_item, sibling);
- item = child;
-
- /*
- * Stack on helper task all children with target sid.
- */
- list_for_each_entry_safe_continue(child, tmp, &root_item->children, sibling) {
- if (child->sid != helper->sid)
- continue;
- if (child->sid == child->pid.virt)
- continue;
-
- pr_info("Attach %d to the temporary task %d\n",
- child->pid.virt, helper->pid.virt);
-
- child->parent = helper;
- list_move(&child->sibling, &helper->children);
- }
- }
-
- /* Try to connect helpers to session leaders */
- for_each_pstree_item(item) {
- if (!item->parent) /* skip the root task */
- continue;
-
- if (item->state == TASK_HELPER)
- continue;
-
- if (item->sid != item->pid.virt) {
- struct pstree_item *parent;
-
- if (item->parent->sid == item->sid)
- continue;
-
- /* the task could fork a child before and after setsid() */
- parent = item->parent;
- while (parent && parent->pid.virt != item->sid) {
- if (parent->born_sid != -1 && parent->born_sid != item->sid) {
- pr_err("Can't determinate with which sid (%d or %d)"
- "the process %d was born\n",
- parent->born_sid, item->sid, parent->pid.virt);
- return -1;
- }
- parent->born_sid = item->sid;
- pr_info("%d was born with sid %d\n", parent->pid.virt, item->sid);
- parent = parent->parent;
- }
-
- if (parent == NULL) {
- pr_err("Can't find a session leader for %d\n", item->sid);
- return -1;
- }
-
- continue;
- }
-
- pr_info("Session leader %d\n", item->sid);
-
- /* Try to find helpers, who should be connected to the leader */
- list_for_each_entry(child, &helpers, sibling) {
- if (child->state != TASK_HELPER)
- continue;
-
- if (child->sid != item->sid)
- continue;
-
- child->pgid = item->pgid;
- child->pid.virt = ++max_pid;
- child->parent = item;
- list_move(&child->sibling, &item->children);
-
- pr_info("Attach %d to the task %d\n",
- child->pid.virt, item->pid.virt);
-
- break;
- }
- }
-
- /* All other helpers are session leaders for own sessions */
- list_splice(&helpers, &root_item->children);
-
- /* Add a process group leader if it is absent */
- for_each_pstree_item(item) {
- struct pstree_item *gleader;
-
- if (!item->pgid || item->pid.virt == item->pgid)
- continue;
-
- for_each_pstree_item(gleader) {
- if (gleader->pid.virt == item->pgid)
- break;
- }
-
- if (gleader) {
- rsti(item)->pgrp_leader = gleader;
- continue;
- }
-
- /*
- * If the PGID is eq to current one -- this
- * means we're inheriting group from the current
- * task so we need to escape creating a helper here.
- */
- if (current_pgid == item->pgid)
- continue;
-
- helper = alloc_pstree_helper();
- if (helper == NULL)
- return -1;
- helper->sid = item->sid;
- helper->pgid = item->pgid;
- helper->pid.virt = item->pgid;
- helper->parent = item;
- helper->ids = item->ids;
- list_add(&helper->sibling, &item->children);
- rsti(item)->pgrp_leader = helper;
-
- pr_info("Add a helper %d for restoring PGID %d\n",
- helper->pid.virt, helper->pgid);
- }
-
- return 0;
-}
-
-static unsigned long get_clone_mask(TaskKobjIdsEntry *i,
- TaskKobjIdsEntry *p)
-{
- unsigned long mask = 0;
-
- if (i->files_id == p->files_id)
- mask |= CLONE_FILES;
- if (i->pid_ns_id != p->pid_ns_id)
- mask |= CLONE_NEWPID;
- if (i->net_ns_id != p->net_ns_id)
- mask |= CLONE_NEWNET;
- if (i->ipc_ns_id != p->ipc_ns_id)
- mask |= CLONE_NEWIPC;
- if (i->uts_ns_id != p->uts_ns_id)
- mask |= CLONE_NEWUTS;
- if (i->mnt_ns_id != p->mnt_ns_id)
- mask |= CLONE_NEWNS;
- if (i->user_ns_id != p->user_ns_id)
- mask |= CLONE_NEWUSER;
-
- return mask;
-}
-
-static int prepare_pstree_kobj_ids(void)
-{
- struct pstree_item *item;
-
- /* Find a process with minimal pid for shared fd tables */
- for_each_pstree_item(item) {
- struct pstree_item *parent = item->parent;
- TaskKobjIdsEntry *ids;
- unsigned long cflags;
-
- if (!item->ids) {
- if (item == root_item) {
- cflags = opts.rst_namespaces_flags;
- goto set_mask;
- }
-
- continue;
- }
-
- if (parent)
- ids = parent->ids;
- else
- ids = root_ids;
-
- /*
- * Add some sanity check on image data.
- */
- if (unlikely(!ids)) {
- pr_err("No kIDs provided, image corruption\n");
- return -1;
- }
-
- cflags = get_clone_mask(item->ids, ids);
-
- if (cflags & CLONE_FILES) {
- int ret;
-
- /*
- * There might be a case when kIDs for
- * root task are the same as in root_ids,
- * thus it's image corruption and we should
- * exit out.
- */
- if (unlikely(!item->parent)) {
- pr_err("Image corruption on kIDs data\n");
- return -1;
- }
-
- ret = shared_fdt_prepare(item);
- if (ret)
- return ret;
- }
-
-set_mask:
- rsti(item)->clone_flags = cflags;
- if (parent)
- /*
- * Mount namespaces are setns()-ed at
- * restore_task_mnt_ns() explicitly,
- * no need in creating it with its own
- * temporary namespace.
- *
- * Root task is exceptional -- it will
- * be born in a fresh new mount namespace
- * which will be populated with all other
- * namespaces' entries.
- */
- rsti(item)->clone_flags &= ~CLONE_NEWNS;
-
- cflags &= CLONE_ALLNS;
-
- if (item == root_item) {
- pr_info("Will restore in %lx namespaces\n", cflags);
- root_ns_mask = cflags;
- } else if (cflags & ~(root_ns_mask & CLONE_SUBNS)) {
- /*
- * Namespaces from CLONE_SUBNS can be nested, but in
- * this case nobody can't share external namespaces of
- * these types.
- *
- * Workaround for all other namespaces --
- * all tasks should be in one namespace. And
- * this namespace is either inherited from the
- * criu or is created for the init task (only)
- */
- pr_err("Can't restore sub-task in NS\n");
- return -1;
- }
- }
-
- pr_debug("NS mask to use %lx\n", root_ns_mask);
- return 0;
-}
-
-int prepare_pstree(void)
-{
- int ret;
-
- ret = read_pstree_image();
- if (!ret)
- /*
- * Shell job may inherit sid/pgid from the current
- * shell, not from image. Set things up for this.
- */
- ret = prepare_pstree_for_shell_job();
- if (!ret)
- /*
- * Walk the collected tree and prepare for restoring
- * of shared objects at clone time
- */
- ret = prepare_pstree_kobj_ids();
- if (!ret)
- /*
- * Session/Group leaders might be dead. Need to fix
- * pstree with properly injected helper tasks.
- */
- ret = prepare_pstree_ids();
-
- return ret;
-}
-
-bool restore_before_setsid(struct pstree_item *child)
-{
- int csid = child->born_sid == -1 ? child->sid : child->born_sid;
-
- if (child->parent->born_sid == csid)
- return true;
-
- return false;
-}
-
-struct pstree_item *pstree_item_by_virt(pid_t virt)
-{
- struct pstree_item *item;
-
- for_each_pstree_item(item) {
- if (item->pid.virt == virt)
- return item;
- }
- return NULL;
-}
-
-struct pstree_item *pstree_item_by_real(pid_t real)
-{
- struct pstree_item *item;
-
- for_each_pstree_item(item) {
- if (item->pid.real == real)
- return item;
- }
- return NULL;
-}
-
-int pid_to_virt(pid_t real)
-{
- struct pstree_item *item;
-
- item = pstree_item_by_real(real);
- if (item)
- return item->pid.virt;
- return 0;
-}
-
-bool pid_in_pstree(pid_t pid)
-{
- return pstree_item_by_real(pid) != NULL;
-}
diff --git a/ptrace.c b/ptrace.c
deleted file mode 100644
index 25970fc4eb57..000000000000
--- a/ptrace.c
+++ /dev/null
@@ -1,331 +0,0 @@
-#include <stdlib.h>
-#include <stdio.h>
-#include <stdarg.h>
-#include <string.h>
-#include <errno.h>
-#include <unistd.h>
-#include <stdbool.h>
-#include <limits.h>
-#include <signal.h>
-
-#include <sys/ptrace.h>
-#include <sys/types.h>
-#include <sys/time.h>
-#include <sys/resource.h>
-#include <sys/wait.h>
-
-#include "compiler.h"
-#include "asm/types.h"
-#include "util.h"
-#include "ptrace.h"
-#include "proc_parse.h"
-#include "crtools.h"
-#include "seccomp.h"
-#include "cr_options.h"
-
-int unseize_task(pid_t pid, int orig_st, int st)
-{
- pr_debug("\tUnseizing %d into %d\n", pid, st);
-
- if (st == TASK_DEAD) {
- kill(pid, SIGKILL);
- return 0;
- } else if (st == TASK_STOPPED) {
- /*
- * Task might have had STOP in queue. We detected such
- * guy as TASK_STOPPED, but cleared signal to run the
- * parasite code. hus after detach the task will become
- * running. That said -- STOP everyone regardless of
- * the initial state.
- */
- kill(pid, SIGSTOP);
- } else if (st == TASK_ALIVE) {
- /*
- * Same as in the comment above -- there might be a
- * task with STOP in queue that would get lost after
- * detach, so stop it again.
- */
- if (orig_st == TASK_STOPPED)
- kill(pid, SIGSTOP);
- } else
- pr_err("Unknown final state %d\n", st);
-
- if (ptrace(PTRACE_DETACH, pid, NULL, NULL)) {
- pr_perror("Unable to detach from %d", pid);
- return -1;
- }
-
- return 0;
-}
-
-int suspend_seccomp(pid_t pid)
-{
- if (ptrace(PTRACE_SETOPTIONS, pid, NULL, PTRACE_O_SUSPEND_SECCOMP) < 0) {
- pr_perror("suspending seccomp failed");
- return -1;
- }
-
- return 0;
-}
-
-int seize_catch_task(pid_t pid)
-{
- int ret;
-
- ret = ptrace(PTRACE_SEIZE, pid, NULL, 0);
- if (ret) {
- /*
- * ptrace API doesn't allow to distinguish
- * attaching to zombie from other errors.
- * All errors will be handled in seize_wait_task().
- */
- pr_warn("Unable to interrupt task: %d (%s)\n", pid, strerror(errno));
- return ret;
- }
-
- /*
- * If we SEIZE-d the task stop it before going
- * and reading its stat from proc. Otherwise task
- * may die _while_ we're doing it and we'll have
- * inconsistent seize/state pair.
- *
- * If task dies after we seize it but before we
- * do this interrupt, we'll notice it via proc.
- */
- ret = ptrace(PTRACE_INTERRUPT, pid, NULL, NULL);
- if (ret < 0) {
- pr_warn("SEIZE %d: can't interrupt task: %s", pid, strerror(errno));
- if (ptrace(PTRACE_DETACH, pid, NULL, NULL))
- pr_perror("Unable to detach from %d", pid);
- }
-
- return ret;
-}
-
-static int skip_sigstop(int pid, int nr_signals)
-{
- int i, status, ret;
-
- /*
- * 1) SIGSTOP is queued, but isn't handled yet:
- * SGISTOP can't be blocked, so we need to wait when the kernel
- * handles this signal.
- *
- * Otherwise the process will be stopped immediatly after
- * starting it.
- *
- * 2) A seized task was stopped:
- * PTRACE_SEIZE doesn't affect signal or group stop state.
- * Currently ptrace reported that task is in stopped state.
- * We need to start task again, and it will be trapped
- * immediately, because we sent PTRACE_INTERRUPT to it.
- */
- for (i = 0; i < nr_signals; i++) {
- ret = ptrace(PTRACE_CONT, pid, 0, 0);
- if (ret) {
- pr_perror("Unable to start process");
- return -1;
- }
-
- ret = wait4(pid, &status, __WALL, NULL);
- if (ret < 0) {
- pr_perror("SEIZE %d: can't wait task", pid);
- return -1;
- }
-
- if (!WIFSTOPPED(status)) {
- pr_err("SEIZE %d: task not stopped after seize\n", pid);
- return -1;
- }
- }
- return 0;
-}
-
-/*
- * This routine seizes task putting it into a special
- * state where we can manipulate the task via ptrace
- * interface, and finally we can detach ptrace out of
- * of it so the task would not know if it was saddled
- * up with someone else.
- */
-int seize_wait_task(pid_t pid, pid_t ppid, struct proc_status_creds **creds)
-{
- siginfo_t si;
- int status, nr_sigstop;
- int ret = 0, ret2, wait_errno = 0;
- struct proc_status_creds cr;
-
- /*
- * For the comparison below, let's zero out any padding.
- */
- memzero(&cr, sizeof(struct proc_status_creds));
-
- /*
- * It's ugly, but the ptrace API doesn't allow to distinguish
- * attaching to zombie from other errors. Thus we have to parse
- * the target's /proc/pid/stat. Sad, but parse whatever else
- * we might need at that early point.
- */
-
- processes_to_wait--;
-try_again:
-
- ret = wait4(pid, &status, __WALL, NULL);
- if (ret < 0) {
- /*
- * wait4() can expectedly fail only in a first time
- * if a task is zombie. If we are here from try_again,
- * this means that we are tracing this task.
- *
- * processes_to_wait should be descrimented only once in this
- * function if a first wait was success.
- */
- processes_to_wait++;
- wait_errno = errno;
- }
-
- ret2 = parse_pid_status(pid, &cr);
- if (ret2)
- goto err;
-
- if (ret < 0 || WIFEXITED(status) || WIFSIGNALED(status)) {
- if (cr.state != 'Z') {
- if (pid == getpid())
- pr_err("The criu itself is within dumped tree.\n");
- else
- pr_err("Unseizable non-zombie %d found, state %c, err %d/%d\n",
- pid, cr.state, ret, wait_errno);
- return -1;
- }
-
- return TASK_DEAD;
- }
-
- if ((ppid != -1) && (cr.ppid != ppid)) {
- pr_err("Task pid reused while suspending (%d: %d -> %d)\n",
- pid, ppid, cr.ppid);
- goto err;
- }
-
- if (!WIFSTOPPED(status)) {
- pr_err("SEIZE %d: task not stopped after seize\n", pid);
- goto err;
- }
-
- ret = ptrace(PTRACE_GETSIGINFO, pid, NULL, &si);
- if (ret < 0) {
- pr_perror("SEIZE %d: can't read signfo", pid);
- goto err;
- }
-
- if (SI_EVENT(si.si_code) != PTRACE_EVENT_STOP) {
- /*
- * Kernel notifies us about the task being seized received some
- * event other than the STOP, i.e. -- a signal. Let the task
- * handle one and repeat.
- */
-
- if (ptrace(PTRACE_CONT, pid, NULL,
- (void *)(unsigned long)si.si_signo)) {
- pr_perror("Can't continue signal handling, aborting");
- goto err;
- }
-
- ret = 0;
- goto try_again;
- }
-
- if (*creds == NULL) {
- *creds = xzalloc(sizeof(struct proc_status_creds));
- if (!*creds)
- goto err;
-
- **creds = cr;
-
- } else if (!proc_status_creds_dumpable(*creds, &cr)) {
- pr_err("creds don't match %d %d\n", pid, ppid);
- goto err;
- }
-
- if (cr.seccomp_mode != SECCOMP_MODE_DISABLED && suspend_seccomp(pid) < 0)
- goto err;
-
- nr_sigstop = 0;
- if (cr.sigpnd & (1 << (SIGSTOP - 1)))
- nr_sigstop++;
- if (cr.shdpnd & (1 << (SIGSTOP - 1)))
- nr_sigstop++;
- if (si.si_signo == SIGSTOP)
- nr_sigstop++;
-
- if (nr_sigstop) {
- if (skip_sigstop(pid, nr_sigstop))
- goto err_stop;
-
- return TASK_STOPPED;
- }
-
- if (si.si_signo == SIGTRAP)
- return TASK_ALIVE;
- else {
- pr_err("SEIZE %d: unsupported stop signal %d\n", pid, si.si_signo);
- goto err;
- }
-
-err_stop:
- kill(pid, SIGSTOP);
-err:
- if (ptrace(PTRACE_DETACH, pid, NULL, NULL))
- pr_perror("Unable to detach from %d", pid);
- return -1;
-}
-
-int ptrace_peek_area(pid_t pid, void *dst, void *addr, long bytes)
-{
- unsigned long w;
- if (bytes & (sizeof(long) - 1))
- return -1;
- for (w = 0; w < bytes / sizeof(long); w++) {
- unsigned long *d = dst, *a = addr;
- d[w] = ptrace(PTRACE_PEEKDATA, pid, a + w, NULL);
- if (d[w] == -1U && errno)
- goto err;
- }
- return 0;
-err:
- return -2;
-}
-
-int ptrace_poke_area(pid_t pid, void *src, void *addr, long bytes)
-{
- unsigned long w;
- if (bytes & (sizeof(long) - 1))
- return -1;
- for (w = 0; w < bytes / sizeof(long); w++) {
- unsigned long *s = src, *a = addr;
- if (ptrace(PTRACE_POKEDATA, pid, a + w, s[w]))
- goto err;
- }
- return 0;
-err:
- return -2;
-}
-
-/* don't swap big space, it might overflow the stack */
-int ptrace_swap_area(pid_t pid, void *dst, void *src, long bytes)
-{
- void *t = alloca(bytes);
-
- if (ptrace_peek_area(pid, t, dst, bytes))
- return -1;
-
- if (ptrace_poke_area(pid, src, dst, bytes)) {
- if (ptrace_poke_area(pid, t, dst, bytes))
- return -2;
- return -1;
- }
-
- memcpy(src, t, bytes);
-
- return 0;
-}
diff --git a/rbtree.c b/rbtree.c
deleted file mode 100644
index 64a38ea76a48..000000000000
--- a/rbtree.c
+++ /dev/null
@@ -1,357 +0,0 @@
-/*
- * RBtree implementation adopted from the Linux kernel sources.
- */
-
-#include <sys/types.h>
-#include "rbtree.h"
-
-static void __rb_rotate_left(struct rb_node *node, struct rb_root *root)
-{
- struct rb_node *right = node->rb_right;
- struct rb_node *parent = rb_parent(node);
-
- node->rb_right = right->rb_left;
- if (node->rb_right)
- rb_set_parent(right->rb_left, node);
- right->rb_left = node;
-
- rb_set_parent(right, parent);
-
- if (parent) {
- if (node == parent->rb_left)
- parent->rb_left = right;
- else
- parent->rb_right = right;
- } else
- root->rb_node = right;
- rb_set_parent(node, right);
-}
-
-static void __rb_rotate_right(struct rb_node *node, struct rb_root *root)
-{
- struct rb_node *left = node->rb_left;
- struct rb_node *parent = rb_parent(node);
-
- node->rb_left = left->rb_right;
- if (node->rb_left)
- rb_set_parent(left->rb_right, node);
- left->rb_right = node;
-
- rb_set_parent(left, parent);
-
- if (parent) {
- if (node == parent->rb_right)
- parent->rb_right = left;
- else
- parent->rb_left = left;
- } else
- root->rb_node = left;
- rb_set_parent(node, left);
-}
-
-void rb_insert_color(struct rb_node *node, struct rb_root *root)
-{
- struct rb_node *parent, *gparent;
-
- while ((parent = rb_parent(node)) && rb_is_red(parent)) {
- gparent = rb_parent(parent);
-
- if (parent == gparent->rb_left) {
- {
- register struct rb_node *uncle = gparent->rb_right;
- if (uncle && rb_is_red(uncle)) {
- rb_set_black(uncle);
- rb_set_black(parent);
- rb_set_red(gparent);
- node = gparent;
- continue;
- }
- }
-
- if (parent->rb_right == node) {
- register struct rb_node *tmp;
- __rb_rotate_left(parent, root);
- tmp = parent;
- parent = node;
- node = tmp;
- }
-
- rb_set_black(parent);
- rb_set_red(gparent);
- __rb_rotate_right(gparent, root);
- } else {
- {
- register struct rb_node *uncle = gparent->rb_left;
- if (uncle && rb_is_red(uncle)) {
- rb_set_black(uncle);
- rb_set_black(parent);
- rb_set_red(gparent);
- node = gparent;
- continue;
- }
- }
-
- if (parent->rb_left == node) {
- register struct rb_node *tmp;
- __rb_rotate_right(parent, root);
- tmp = parent;
- parent = node;
- node = tmp;
- }
-
- rb_set_black(parent);
- rb_set_red(gparent);
- __rb_rotate_left(gparent, root);
- }
- }
-
- rb_set_black(root->rb_node);
-}
-
-static void __rb_erase_color(struct rb_node *node, struct rb_node *parent,
- struct rb_root *root)
-{
- struct rb_node *other;
-
- while ((!node || rb_is_black(node)) && node != root->rb_node) {
- if (parent->rb_left == node) {
- other = parent->rb_right;
- if (rb_is_red(other)) {
- rb_set_black(other);
- rb_set_red(parent);
- __rb_rotate_left(parent, root);
- other = parent->rb_right;
- }
- if ((!other->rb_left || rb_is_black(other->rb_left)) &&
- (!other->rb_right || rb_is_black(other->rb_right))) {
- rb_set_red(other);
- node = parent;
- parent = rb_parent(node);
- } else {
- if (!other->rb_right || rb_is_black(other->rb_right)) {
- rb_set_black(other->rb_left);
- rb_set_red(other);
- __rb_rotate_right(other, root);
- other = parent->rb_right;
- }
- rb_set_color(other, rb_color(parent));
- rb_set_black(parent);
- rb_set_black(other->rb_right);
- __rb_rotate_left(parent, root);
- node = root->rb_node;
- break;
- }
- } else {
- other = parent->rb_left;
- if (rb_is_red(other)) {
- rb_set_black(other);
- rb_set_red(parent);
- __rb_rotate_right(parent, root);
- other = parent->rb_left;
- }
- if ((!other->rb_left || rb_is_black(other->rb_left)) &&
- (!other->rb_right || rb_is_black(other->rb_right))) {
- rb_set_red(other);
- node = parent;
- parent = rb_parent(node);
- } else {
- if (!other->rb_left || rb_is_black(other->rb_left)) {
- rb_set_black(other->rb_right);
- rb_set_red(other);
- __rb_rotate_left(other, root);
- other = parent->rb_left;
- }
- rb_set_color(other, rb_color(parent));
- rb_set_black(parent);
- rb_set_black(other->rb_left);
- __rb_rotate_right(parent, root);
- node = root->rb_node;
- break;
- }
- }
- }
-
- if (node)
- rb_set_black(node);
-}
-
-void rb_erase(struct rb_node *node, struct rb_root *root)
-{
- struct rb_node *child, *parent;
- int color;
-
- if (!node->rb_left)
- child = node->rb_right;
- else if (!node->rb_right)
- child = node->rb_left;
- else {
- struct rb_node *old = node, *left;
-
- node = node->rb_right;
- while ((left = node->rb_left))
- node = left;
-
- if (rb_parent(old)) {
- if (rb_parent(old)->rb_left == old)
- rb_parent(old)->rb_left = node;
- else
- rb_parent(old)->rb_right = node;
- } else
- root->rb_node = node;
-
- child = node->rb_right;
- parent = rb_parent(node);
- color = rb_color(node);
-
- if (parent == old) {
- parent = node;
- } else {
- if (child)
- rb_set_parent(child, parent);
- parent->rb_left = child;
-
- node->rb_right = old->rb_right;
- rb_set_parent(old->rb_right, node);
- }
-
- node->rb_parent_color = old->rb_parent_color;
- node->rb_left = old->rb_left;
- rb_set_parent(old->rb_left, node);
-
- goto color;
- }
-
- parent = rb_parent(node);
- color = rb_color(node);
-
- if (child)
- rb_set_parent(child, parent);
-
- if (parent) {
- if (parent->rb_left == node)
- parent->rb_left = child;
- else
- parent->rb_right = child;
- } else
- root->rb_node = child;
-
-color:
- if (color == RB_BLACK)
- __rb_erase_color(child, parent, root);
-}
-
-/*
- * This function returns the first node (in sort order) of the tree.
- */
-struct rb_node *rb_first(const struct rb_root *root)
-{
- struct rb_node *n;
-
- n = root->rb_node;
- if (!n)
- return NULL;
-
- while (n->rb_left)
- n = n->rb_left;
-
- return n;
-}
-
-struct rb_node *rb_last(const struct rb_root *root)
-{
- struct rb_node *n;
-
- n = root->rb_node;
- if (!n)
- return NULL;
-
- while (n->rb_right)
- n = n->rb_right;
-
- return n;
-}
-
-struct rb_node *rb_next(const struct rb_node *node)
-{
- struct rb_node *parent;
-
- if (rb_parent(node) == node)
- return NULL;
-
- /*
- * If we have a right-hand child, go down and
- * then left as far as we can.
- */
- if (node->rb_right) {
- node = node->rb_right;
- while (node->rb_left)
- node=node->rb_left;
- return (struct rb_node *)node;
- }
-
- /*
- * No right-hand children. Everything down and left is
- * smaller than us, so any 'next' node must be in the general
- * direction of our parent. Go up the tree; any time the
- * ancestor is a right-hand child of its parent, keep going
- * up. First time it's a left-hand child of its parent, said
- * parent is our 'next' node.
- */
- while ((parent = rb_parent(node)) && node == parent->rb_right)
- node = parent;
-
- return parent;
-}
-
-struct rb_node *rb_prev(const struct rb_node *node)
-{
- struct rb_node *parent;
-
- if (rb_parent(node) == node)
- return NULL;
-
- /*
- * If we have a left-hand child, go down and
- * then right as far as we can.
- */
- if (node->rb_left) {
- node = node->rb_left;
- while (node->rb_right)
- node = node->rb_right;
- return (struct rb_node *)node;
- }
-
- /*
- * No left-hand children. Go up till we find
- * an ancestor which is a right-hand child of its parent.
- */
- while ((parent = rb_parent(node)) && node == parent->rb_left)
- node = parent;
-
- return parent;
-}
-
-void rb_replace_node(struct rb_node *victim,
- struct rb_node *new,
- struct rb_root *root)
-{
- struct rb_node *parent = rb_parent(victim);
-
- /* Set the surrounding nodes to point to the replacement */
- if (parent) {
- if (victim == parent->rb_left)
- parent->rb_left = new;
- else
- parent->rb_right = new;
- } else
- root->rb_node = new;
-
- if (victim->rb_left)
- rb_set_parent(victim->rb_left, new);
-
- if (victim->rb_right)
- rb_set_parent(victim->rb_right, new);
-
- /* Copy the pointers/colour from the victim to the replacement */
- *new = *victim;
-}
diff --git a/rst-malloc.c b/rst-malloc.c
deleted file mode 100644
index d39499729112..000000000000
--- a/rst-malloc.c
+++ /dev/null
@@ -1,223 +0,0 @@
-#include <stdio.h>
-#include <stdbool.h>
-#include <sys/mman.h>
-
-#include "rst-malloc.h"
-#include "bug.h"
-#include "asm/types.h"
-
-struct rst_mem_type_s {
- bool remapable;
- bool enabled;
- unsigned long free_bytes;
- void *free_mem;
- int (*grow)(struct rst_mem_type_s *, unsigned long size);
- unsigned long last;
-
- void *buf;
- unsigned long size;
-};
-
-static inline unsigned long rst_mem_grow(unsigned long need_size)
-{
- int rst_mem_batch = 2 * page_size();
-
- need_size = round_up(need_size, page_size());
- if (likely(need_size < rst_mem_batch))
- need_size = rst_mem_batch;
- else
- pr_debug("Growing rst memory %lu pages\n", need_size / page_size());
- return need_size;
-}
-
-static int grow_shared(struct rst_mem_type_s *t, unsigned long size)
-{
- void *aux;
-
- size = rst_mem_grow(size);
-
- /*
- * This buffer will not get remapped into
- * restorer, thus we can just forget the
- * previous chunk location and allocate a
- * new one
- */
- aux = mmap(NULL, size, PROT_READ | PROT_WRITE,
- MAP_SHARED | MAP_ANON, 0, 0);
- if (aux == MAP_FAILED)
- return -1;
-
- t->free_mem = aux;
- t->free_bytes = size;
- t->last = 0;
-
- return 0;
-}
-
-static int grow_remap(struct rst_mem_type_s *t, int flag, unsigned long size)
-{
- void *aux;
-
- size = rst_mem_grow(size);
-
- if (!t->buf)
- /*
- * Can't call mremap with NULL address :(
- */
- aux = mmap(NULL, size, PROT_READ | PROT_WRITE,
- flag | MAP_ANON, 0, 0);
- else
- /*
- * We'll have to remap all objects into restorer
- * address space and get their new addresses. Since
- * we allocate many objects as one linear array, it's
- * simpler just to grow the buffer and let callers
- * find out new array addresses, rather than allocate
- * a completely new one and force callers use objects'
- * cpos-s.
- */
- aux = mremap(t->buf, t->size,
- t->size + size, MREMAP_MAYMOVE);
- if (aux == MAP_FAILED)
- return -1;
-
- t->free_mem += (aux - t->buf);
- t->free_bytes += size;
- t->size += size;
- t->buf = aux;
-
- return 0;
-}
-
-static int grow_shremap(struct rst_mem_type_s *t, unsigned long size)
-{
- return grow_remap(t, MAP_SHARED, size);
-}
-
-static int grow_private(struct rst_mem_type_s *t, unsigned long size)
-{
- return grow_remap(t, MAP_PRIVATE, size);
-}
-
-static struct rst_mem_type_s rst_mems[RST_MEM_TYPES] = {
- [RM_SHARED] = {
- .grow = grow_shared,
- .remapable = false,
- .enabled = true,
- },
- [RM_SHREMAP] = {
- .grow = grow_shremap,
- .remapable = true,
- .enabled = true,
- },
- [RM_PRIVATE] = {
- .grow = grow_private,
- .remapable = true,
- .enabled = false,
- },
-};
-
-void rst_mem_switch_to_private(void)
-{
- rst_mems[RM_SHARED].enabled = false;
- rst_mems[RM_SHREMAP].enabled = false;
- rst_mems[RM_PRIVATE].enabled = true;
-}
-
-unsigned long rst_mem_align_cpos(int type)
-{
- struct rst_mem_type_s *t = &rst_mems[type];
- BUG_ON(!t->remapable || !t->enabled);
-
- t->free_mem = (void *) round_up((unsigned long)t->free_mem, sizeof(void *));
-
- return t->free_mem - t->buf;
-}
-
-void *rst_mem_remap_ptr(unsigned long pos, int type)
-{
- struct rst_mem_type_s *t = &rst_mems[type];
- BUG_ON(!t->remapable);
- return t->buf + pos;
-}
-
-void *rst_mem_alloc(unsigned long size, int type)
-{
- struct rst_mem_type_s *t = &rst_mems[type];
- void *ret;
-
- BUG_ON(!t->enabled);
-
- if ((t->free_bytes < size) && t->grow(t, size)) {
- pr_perror("Can't grow rst mem");
- return NULL;
- }
-
- ret = t->free_mem;
- t->free_mem += size;
- t->free_bytes -= size;
- t->last = size;
-
- return ret;
-}
-
-void rst_mem_free_last(int type)
-{
- struct rst_mem_type_s *t = &rst_mems[type];
-
- BUG_ON(!t->enabled);
-
- t->free_mem -= t->last;
- t->free_bytes += t->last;
- t->last = 0; /* next free_last would be no-op */
-}
-
-unsigned long rst_mem_lock(void)
-{
- /*
- * Don't allow further allocations from rst_mem since we're
- * going to get the bootstrap area and remap all the stuff
- * into it. The SHREMAP and SHARED should be already locked
- * in the rst_mem_switch_to_private().
- */
- rst_mems[RM_PRIVATE].enabled = false;
- return rst_mems[RM_PRIVATE].size + rst_mems[RM_SHREMAP].size;
-}
-
-static int rst_mem_remap_one(struct rst_mem_type_s *t, void *to)
-{
- void *aux;
-
- BUG_ON(!t->remapable || t->enabled);
-
- if (!t->buf)
- /*
- * No allocations happenned from this buffer.
- * It's safe just to do nothing.
- */
- return 0;
-
- pr_debug("\tcall mremap(%p, %lu, %lu, MAYMOVE | FIXED, %p)\n",
- t->buf, t->size, t->size, to);
- aux = mremap(t->buf, t->size, t->size, MREMAP_MAYMOVE | MREMAP_FIXED, to);
- if (aux == MAP_FAILED) {
- pr_perror("Can't mremap rst mem");
- return -1;
- }
-
- t->buf = aux;
- return 0;
-}
-
-int rst_mem_remap(void *to)
-{
- int ret;
-
- ret = rst_mem_remap_one(&rst_mems[RM_PRIVATE], to);
- if (!ret) {
- to += rst_mems[RM_PRIVATE].size;
- ret = rst_mem_remap_one(&rst_mems[RM_SHREMAP], to);
- }
-
- return ret;
-}
diff --git a/scripts/Makefile.build b/scripts/Makefile.build
deleted file mode 100644
index ee0b800063e1..000000000000
--- a/scripts/Makefile.build
+++ /dev/null
@@ -1,251 +0,0 @@
-##
-## General helpers for simplified Makefiles.
-##
-MAKEFLAGS := -r -R --no-print-directory
-
-targets :=
-deps :=
-deps-after :=
-all-objs :=
-incdeps :=
-_all :=
-_cleanup-y :=
-
-include scripts/Makefile.rules
-include $(obj)/$(makefile)
-
-##
-## Append targets to be auto-cleanuped
-define add-cleanup-obj-c-by-name
-_cleanup-y+= $(1).o
-_cleanup-y+= $(1).i
-_cleanup-y+= $(1).d
-_cleanup-y+= $(1).s
-endef
-
-define add-cleanup-obj-S-by-name
-_cleanup-y+= $(1).o
-_cleanup-y+= $(1).d
-_cleanup-y+= $(1).i
-endef
-
-##
-##
-## Generate a bundle of rules for C files
-define gen-target-c-bundle
-$(eval $(call gen-rule-o-from-c-by-name,$(1),$(2),$(3)))
-$(eval $(call gen-rule-i-from-c-by-name,$(1),$(2),$(3)))
-$(eval $(call gen-rule-d-from-c-by-name,$(1),$(2),$(3)))
-$(eval $(call gen-rule-s-from-c-by-name,$(1),$(2),$(3)))
-$(eval $(call add-cleanup-obj-c-by-name,$(1)))
-endef
-
-##
-##
-## Generate a bundle of rules for S files
-define gen-target-S-bundle
-$(eval $(call gen-rule-o-from-S-by-name,$(1),$(2),$(3)))
-$(eval $(call gen-rule-d-from-S-by-name,$(1),$(2),$(3)))
-$(eval $(call gen-rule-i-from-S-by-name,$(1),$(2),$(3)))
-$(eval $(call add-cleanup-obj-S-by-name,$(1)))
-endef
-
-##
-##
-## Shared or standalone targets
-ifneq ($(obj-y),)
-obj-y := $(addprefix $(obj)/, $(obj-y))
-$(foreach file, \
- $(obj-y), \
- $(eval \
- $(call gen-target-c-bundle, \
- $(file:.o=),$(file:.o=))))
-all-objs += $(obj-y)
-deps += $(obj-y:.o=.d)
-endif
-
-ifneq ($(obj-x),)
-obj-x := $(addprefix $(obj)/, $(obj-x))
-obj-x := $(addsuffix $(xsuffix).o, $(obj-x:.o=))
-$(foreach file, \
- $(obj-x), \
- $(eval \
- $(call gen-target-c-bundle, \
- $(file:$(xsuffix).o=),$(file:.o=))))
-all-objs += $(obj-x)
-deps += $(obj-x:.o=.d)
-cleanup-y += $(obj-x) $(obj-x:.o=.d)
-endif
-
-ifneq ($(obj-e),)
-$(foreach file, \
- $(obj-e), \
- $(eval \
- $(call gen-target-c-bundle, \
- $(file:.o=),$(file:.o=))))
-all-objs += $(obj-e)
-deps += $(obj-e:.o=.d)
-endif
-
-ifneq ($(asm-y),)
-asm-y := $(addprefix $(obj)/, $(asm-y))
-$(foreach file, \
- $(asm-y), \
- $(eval \
- $(call gen-target-S-bundle, \
- $(file:.o=),$(file:.o=))))
-all-objs += $(asm-y)
-deps += $(asm-y:.o=.d)
-endif
-
-ifneq ($(asm-e),)
-$(foreach file, \
- $(asm-e), \
- $(eval \
- $(call gen-target-S-bundle, \
- $(file:.o=),$(file:.o=))))
-all-objs += $(asm-e)
-deps += $(asm-e:.o=.d)
-endif
-
-##
-##
-## Standalone files where sources are kept in external
-## directories. Usually needed when same source files
-## are compiled with different flags.
-ifneq ($(obj-ext-src-y),)
-__obj-ext-src-y := $(addprefix $(obj)/, $(notdir $(obj-ext-src-y)))
-$(foreach file, \
- $(obj-ext-src-y), \
- $(eval \
- $(call gen-target-c-bundle, \
- $(file:.o=), \
- $(addprefix $(obj)/,$(notdir $(file:.o=))))))
-all-objs += $(__obj-ext-src-y)
-deps += $(__obj-ext-src-y:.o=.d)
-cleanup-y += $(__obj-ext-src-y) $(__obj-ext-src-y:.o=.d)
-endif
-
-##
-##
-## Generate rules for a target
-define gen-target-rules
-
-$(1)-all-objs :=
-
-ifneq ($($(1)-obj-y),)
- $(foreach file, \
- $($(1)-obj-y), \
- $(eval \
- $(call gen-target-c-bundle, \
- $(obj)/$(file:.o=), \
- $(obj)/$(file:.o=), \
- $($(1)-obj-y-cflags))))
- $(1)-all-objs += $$(addprefix $(obj)/, $($(1)-obj-y))
- deps += $$(addprefix $(obj)/, $($(1)-obj-y:.o=.d))
-endif
-
-ifneq ($($(1)-obj-e),)
- $(foreach file, \
- $($(1)-obj-e), \
- $(eval \
- $(call gen-target-c-bundle, \
- $(file:.o=), \
- $(file:.o=), \
- $($(1)-obj-e-cflags))))
- $(1)-all-objs += $$($(1)-obj-e)
- deps += $$($(1)-obj-e:.o=.d)
-endif
-
-ifneq ($($(1)-asm-y),)
- $(foreach file, \
- $($(1)-asm-y), \
- $(eval \
- $(call gen-target-S-bundle, \
- $(obj)/$(file:.o=), \
- $(obj)/$(file:.o=), \
- $($(1)-asm-y-asmflags))))
- $(1)-all-objs += $$(addprefix $(obj)/, $($(1)-asm-y))
- deps += $$($(1)-asm-y:.o=.d)
-endif
-
-ifneq ($($(1)-asm-e),)
- $(foreach file, \
- $($(1)-asm-e), \
- $(eval \
- $(call gen-target-S-bundle, \
- $(file:.o=), \
- $(file:.o=), \
- $($(1)-asm-e-asmflags))))
- $(1)-all-objs += $$($(1)-asm-e)
- deps += $$($(1)-asm-e:.o=.d)
-endif
-
-$(1)-all-objs += $(all-objs)
-
-$$(obj)/$(1).built-in.o: $$($(1)-all-objs) $$($(1)-libs-e) $(libs-e)
- $$(E) " LINK " $$@
- $$(Q) $$(LD) $$(ldflags-y) -r -o $$@ $$^
-
-_all += $$(obj)/$(1).built-in.o
-cleanup-y += $$(obj)/$(1).built-in.o
-endef
-
-##
-##
-## Walk over all targets and generate rules they require
-$(foreach target, \
- $(targets), \
- $(eval \
- $(call gen-target-rules,$(target))))
-
-##
-##
-## No targets -- just builtin default one
-ifeq ($(targets),)
-ifneq ($(all-objs),)
-$(obj)/built-in.o: $(all-objs) $(libs-e)
- $(E) " LINK " $@
- $(Q) $(LD) $(ldflags-y) -r -o $@ $^
-
-_all += $(obj)/built-in.o
-cleanup-y += $(obj)/built-in.o
-endif
-endif
-
-##
-## A rule for building library.
-ifneq ($(lib-so),)
-$(obj)/$(lib-so).so: $(all-objs) $(libs-e)
- $(E) " LINK " $@
- $(Q) $(CC) -shared $(cflags-so) -o $@ $^ $(ldflags-so) $(LDFLAGS)
-
-_all += $(obj)/$(lib-so).so
-cleanup-y += $(obj)/$(lib-so).so
-endif
-
-##
-##
-## Include deps if requested
-ifneq ($(incdeps),)
-ifneq ($(deps-after),)
-$(deps): | $(deps-after)
-endif
--include $(deps)
-endif
-
-##
-##
-## Autocomplete cleanups
-cleanup-y += $(_cleanup-y)
-
-##
-## Predefined .PHONY targets
-.PHONY: all clean
-
-all: $(_all)
- @true
-
-clean:
- $(E) " CLEANUP " $(obj)
- $(Q) $(RM) $(cleanup-y)
diff --git a/scripts/Makefile.rules b/scripts/Makefile.rules
deleted file mode 100644
index 229131091e6f..000000000000
--- a/scripts/Makefile.rules
+++ /dev/null
@@ -1,52 +0,0 @@
-##
-##
-## These are per-file generators.
-##
-define gen-rule-o-from-c-by-name
-$(2).o: $(1).c
- $$(E) " CC " $$@
- $$(Q) $$(CC) -c $$(CFLAGS) $$(cflags-y) $(3) $$< -o $$@
-endef
-
-define gen-rule-i-from-c-by-name
-$(2).i: $(1).c
- $$(E) " CC " $$@
- $$(Q) $$(CC) -E $$(CFLAGS) $$(cflags-y) $(3) $$< -o $$@
-endef
-
-define gen-rule-s-from-c-by-name
-$(2).s: $(1).c
- $$(E) " CC " $$@
- $$(Q) $$(CC) -S $$(CFLAGS) $$(cflags-y) $(3) -fverbose-asm $$< -o $$@
-endef
-
-define gen-rule-o-from-S-by-name
-$(2).o: $(1).S
- $$(E) " CC " $$@
- $$(Q) $$(CC) -c $$(CFLAGS) $$(cflags-y) $(3) $$(ASMFLAGS) $(4) $$< -o $$@
-endef
-
-define gen-rule-d-from-c-by-name
-$(2).d: $(1).c
- $$(E) " DEP " $$@
- $$(Q) $$(CC) -M -MT $$@ -MT $$(patsubst %.d,%.o,$$@) $$(CFLAGS) $$(cflags-y) $(3) $$< -o $$@
-endef
-
-define gen-rule-d-from-S-by-name
-$(2).d: $(1).S
- $$(E) " DEP " $$@
- $$(Q) $$(CC) -M -MT $$@ -MT $$(patsubst %.d,%.o,$$@) $$(CFLAGS) $$(cflags-y) $(3) $$< -o $$@
-endef
-
-define gen-rule-i-from-S-by-name
-$(2).i: $(1).S
- $$(E) " CC " $$@
- $$(Q) $$(CC) -E $$(CFLAGS) $$(cflags-y) $(3) $$< -o $$@
-endef
-
-##
-## In case if someone add last resort rule
-## together with .SUFFIXES not cleaned, this
-## will slow down the build procedure
-scripts/Makefile.rules::
- @true
diff --git a/scripts/Makefile.version b/scripts/Makefile.version
deleted file mode 100644
index 8905bd9b63a9..000000000000
--- a/scripts/Makefile.version
+++ /dev/null
@@ -1,36 +0,0 @@
-CRTOOLSVERSION := $(VERSION_MAJOR)$(if $(VERSION_MINOR),.$(VERSION_MINOR))$(if $(VERSION_SUBLEVEL),.$(VERSION_SUBLEVEL))
-
-VERSION_HEADER := include/version.h
-GITID_FILE := .gitid
-GITID := $(shell if [ -d ".git" ]; then git describe; fi)
-
-ifeq ($(GITID),)
- GITID := 0
-else
- GITID_FILE_VALUE := $(shell if [ -f '.gitid' ]; then if [ `cat .gitid` = $(GITID) ]; then echo y; fi; fi)
- ifneq ($(GITID_FILE_VALUE),y)
- .PHONY: $(GITID_FILE)
- endif
-endif
-
-$(GITID_FILE):
- $(E) " GEN " $@
- $(Q) echo "$(GITID)" > $(GITID_FILE)
-
-$(VERSION_HEADER): Makefile scripts/Makefile.version $(GITID_FILE)
- $(E) " GEN " $@
- $(Q) echo "/* Autogenerated, do not edit */" > $(VERSION_HEADER)
- $(Q) echo "#ifndef __CR_VERSION_H__" >> $(VERSION_HEADER)
- $(Q) echo "#define __CR_VERSION_H__" >> $(VERSION_HEADER)
- $(Q) echo "#define CRIU_VERSION \"$(CRTOOLSVERSION)\"" >> $(VERSION_HEADER)
- $(Q) echo "#define CRIU_VERSION_MAJOR " $(VERSION_MAJOR) >> $(VERSION_HEADER)
- $(Q) echo "#define CRIU_VERSION_MINOR " $(VERSION_MINOR) >> $(VERSION_HEADER)
- $(Q) echo "#define CRIU_GITID \"$(GITID)\"" >> $(VERSION_HEADER)
- $(Q) echo "#endif /* __CR_VERSION_H__ */" >> $(VERSION_HEADER)
-
-##
-## In case if someone add last resort rule
-## together with .SUFFIXES not cleaned, this
-## will slow down the build procedure
-scripts/Makefile.version::
- @true
diff --git a/seccomp.c b/seccomp.c
deleted file mode 100644
index 9fd545d677b6..000000000000
--- a/seccomp.c
+++ /dev/null
@@ -1,272 +0,0 @@
-#include <linux/filter.h>
-#include <sys/types.h>
-#include <sys/wait.h>
-#include <unistd.h>
-
-#include "config.h"
-#include "imgset.h"
-#include "kcmp.h"
-#include "pstree.h"
-#include "ptrace.h"
-#include "proc_parse.h"
-#include "seccomp.h"
-#include "servicefd.h"
-#include "util.h"
-#include "rst-malloc.h"
-
-#include "protobuf.h"
-#include "protobuf/seccomp.pb-c.h"
-
-/* populated on dump during collect_seccomp_filters() */
-static int next_filter_id = 0;
-static struct seccomp_info **filters = NULL;
-
-static struct seccomp_info *find_inherited(struct pstree_item *parent,
- struct sock_filter *filter, int len)
-{
- struct seccomp_info *info;
-
- /* if we have no filters yet, this one has no parent */
- if (!filters)
- return NULL;
-
- for (info = filters[dmpi(parent)->pi_creds->last_filter]; info; info = info->prev) {
-
- if (len != info->filter.filter.len)
- continue;
- if (!memcmp(filter, info->filter.filter.data, len))
- return info;
- }
-
- return NULL;
-}
-
-static int collect_filter_for_pstree(struct pstree_item *item)
-{
- struct seccomp_info *infos = NULL, *cursor;
- int info_count, i, ret = -1;
- struct sock_filter buf[BPF_MAXINSNS];
- void *m;
-
- if (item->state == TASK_DEAD ||
- dmpi(item)->pi_creds->seccomp_mode != SECCOMP_MODE_FILTER)
- return 0;
-
- for (i = 0; true; i++) {
- int len;
- struct seccomp_info *info, *inherited = NULL;
-
- len = ptrace(PTRACE_SECCOMP_GET_FILTER, item->pid.real, i, buf);
- if (len < 0) {
- if (errno == ENOENT) {
- /* end of the search */
- BUG_ON(i == 0);
- goto save_infos;
- } else if (errno == EINVAL) {
- pr_err("dumping seccomp infos not supported\n");
- goto out;
- } else {
- pr_perror("couldn't dump seccomp filter");
- goto out;
- }
- }
-
- inherited = find_inherited(item->parent, buf, len);
- if (inherited) {
- bool found = false;
-
- /* Small sanity check: if infos is already populated,
- * we should have inherited that filter too. */
- for (cursor = infos; cursor; cursor = cursor->prev) {
- if (inherited->prev== cursor) {
- found = true;
- break;
- }
- }
-
- BUG_ON(!found);
-
- infos = inherited;
- continue;
- }
-
- info = xmalloc(sizeof(*info));
- if (!info)
- goto out;
- seccomp_filter__init(&info->filter);
-
- info->filter.filter.len = len * sizeof(struct sock_filter);
- info->filter.filter.data = xmalloc(info->filter.filter.len);
- if (!info->filter.filter.data) {
- xfree(info);
- goto out;
- }
-
- memcpy(info->filter.filter.data, buf, info->filter.filter.len);
-
- info->prev = infos;
- infos = info;
- }
-
-save_infos:
- info_count = i;
-
- m = xrealloc(filters, sizeof(*filters) * (next_filter_id + info_count));
- if (!m)
- goto out;
- filters = m;
-
- for (cursor = infos, i = info_count + next_filter_id - 1;
- i >= next_filter_id; i--) {
- BUG_ON(!cursor);
- cursor->id = i;
- filters[i] = cursor;
- cursor = cursor->prev;
- }
-
- next_filter_id += info_count;
-
- dmpi(item)->pi_creds->last_filter = infos->id;
-
- /* Don't free the part of the tree we just successfully acquired */
- infos = NULL;
- ret = 0;
-out:
- while (infos) {
- struct seccomp_info *freeme = infos;
- infos = infos->prev;
- xfree(freeme->filter.filter.data);
- xfree(freeme);
- }
-
- return ret;
-}
-
-static int dump_seccomp_filters(void)
-{
- SeccompEntry se = SECCOMP_ENTRY__INIT;
- int ret = -1, i;
-
- /* If we didn't collect any filters, don't create a seccomp image at all. */
- if (next_filter_id == 0)
- return 0;
-
- se.seccomp_filters = xzalloc(sizeof(*se.seccomp_filters) * next_filter_id);
- if (!se.seccomp_filters)
- return -1;
-
- se.n_seccomp_filters = next_filter_id;
-
- for (i = 0; i < next_filter_id; i++) {
- SeccompFilter *sf;
- struct seccomp_info *cur = filters[i];
-
- sf = se.seccomp_filters[cur->id] = &cur->filter;
- if (cur->prev) {
- sf->has_prev = true;
- sf->prev = cur->prev->id;
- }
- }
-
- ret = pb_write_one(img_from_set(glob_imgset, CR_FD_SECCOMP), &se, PB_SECCOMP);
-
- xfree(se.seccomp_filters);
-
- for (i = 0; i < next_filter_id; i++) {
- struct seccomp_info *freeme = filters[i];
-
- xfree(freeme->filter.filter.data);
- xfree(freeme);
- }
- xfree(filters);
-
- return ret;
-}
-
-int collect_seccomp_filters(void)
-{
- if (preorder_pstree_traversal(root_item, collect_filter_for_pstree) < 0)
- return -1;
-
- if (dump_seccomp_filters())
- return -1;
-
- return 0;
-}
-
-/* Populated on restore by prepare_seccomp_filters */
-static SeccompEntry *se;
-
-int prepare_seccomp_filters(void)
-{
- struct cr_img *img;
- int ret;
-
- img = open_image(CR_FD_SECCOMP, O_RSTR);
- if (!img)
- return -1;
-
- ret = pb_read_one_eof(img, &se, PB_SECCOMP);
- close_image(img);
- if (ret <= 0)
- return 0; /* there were no filters */
-
- BUG_ON(!se);
-
- return 0;
-}
-
-int seccomp_filters_get_rst_pos(CoreEntry *core, int *count, unsigned long *pos)
-{
- SeccompFilter *sf = NULL;
- struct sock_fprog *arr = NULL;
- void *filter_data = NULL;
- int ret = -1, i;
- size_t filter_size = 0;
-
- if (!core->tc->has_seccomp_filter) {
- *count = 0;
- return 0;
- }
-
- *count = 0;
- *pos = rst_mem_align_cpos(RM_PRIVATE);
-
- BUG_ON(core->tc->seccomp_filter > se->n_seccomp_filters);
- sf = se->seccomp_filters[core->tc->seccomp_filter];
-
- while (1) {
- (*count)++;
-
- filter_size += sf->filter.len;
-
- if (!sf->has_prev)
- break;
-
- sf = se->seccomp_filters[sf->prev];
- }
-
- arr = rst_mem_alloc(sizeof(struct sock_fprog) * (*count) + filter_size, RM_PRIVATE);
- if (!arr)
- goto out;
-
- filter_data = &arr[*count];
- sf = se->seccomp_filters[core->tc->seccomp_filter];
- for (i = 0; i < *count; i++) {
- struct sock_fprog *fprog = &arr[i];
-
- BUG_ON(sf->filter.len % sizeof(struct sock_filter));
- fprog->len = sf->filter.len / sizeof(struct sock_filter);
-
- memcpy(filter_data, sf->filter.data, sf->filter.len);
-
- filter_data += sf->filter.len;
- sf = se->seccomp_filters[sf->prev];
- }
-
- ret = 0;
-
-out:
- seccomp_entry__free_unpacked(se, NULL);
- return ret;
-}
diff --git a/seize.c b/seize.c
deleted file mode 100644
index 7d1f77c46dab..000000000000
--- a/seize.c
+++ /dev/null
@@ -1,688 +0,0 @@
-#include <stdbool.h>
-#include <stdlib.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <unistd.h>
-#include <sys/time.h>
-#include <sys/resource.h>
-#include <sys/wait.h>
-#include <time.h>
-
-#include "compiler.h"
-#include "cr_options.h"
-#include "cr-errno.h"
-#include "pstree.h"
-#include "ptrace.h"
-#include "seize.h"
-#include "stats.h"
-#include "xmalloc.h"
-#include "util.h"
-
-#define NR_ATTEMPTS 5
-
-static const char frozen[] = "FROZEN";
-static const char freezing[] = "FREEZING";
-static const char thawed[] = "THAWED";
-
-static const char *get_freezer_state(int fd)
-{
- char state[32];
- int ret;
-
- BUILD_BUG_ON((sizeof(state) < sizeof(frozen)) ||
- (sizeof(state) < sizeof(freezing)) ||
- (sizeof(state) < sizeof(thawed)));
-
- lseek(fd, 0, SEEK_SET);
- ret = read(fd, state, sizeof(state) - 1);
- if (ret <= 0) {
- pr_perror("Unable to get a current state");
- goto err;
- }
- if (state[ret - 1] == '\n')
- state[ret - 1] = 0;
- else
- state[ret] = 0;
-
- pr_debug("freezer.state=%s\n", state);
- if (strcmp(state, frozen) == 0)
- return frozen;
- else if (strcmp(state, freezing) == 0)
- return freezing;
- else if (strcmp(state, thawed) == 0)
- return thawed;
-
- pr_err("Unknown freezer state: %s\n", state);
-err:
- return NULL;
-}
-
-static bool freezer_thawed;
-
-const char *get_real_freezer_state(void)
-{
- return freezer_thawed ? thawed : frozen;
-}
-
-static int freezer_restore_state(void)
-{
- int fd;
- char path[PATH_MAX];
-
- if (!opts.freeze_cgroup || freezer_thawed)
- return 0;
-
- snprintf(path, sizeof(path), "%s/freezer.state", opts.freeze_cgroup);
- fd = open(path, O_RDWR);
- if (fd < 0) {
- pr_perror("Unable to open %s", path);
- return -1;
- }
-
- if (write(fd, frozen, sizeof(frozen)) != sizeof(frozen)) {
- pr_perror("Unable to freeze tasks");
- close(fd);
- return -1;
- }
- close(fd);
- return 0;
-}
-
-static int seize_cgroup_tree(char *root_path, const char *state)
-{
- DIR *dir;
- struct dirent *de;
- char path[PATH_MAX];
- FILE *f;
-
- /*
- * New tasks can appear while a freezer state isn't
- * frozen, so we need to catch all new tasks.
- */
- snprintf(path, sizeof(path), "%s/tasks", root_path);
- f = fopen(path, "r");
- if (f == NULL) {
- pr_perror("Unable to open %s", path);
- return -1;
- }
- while (fgets(path, sizeof(path), f)) {
- pid_t pid;
- int ret;
-
- pid = atoi(path);
-
- /* Here we are going to skip tasks which are already traced. */
- ret = ptrace(PTRACE_INTERRUPT, pid, NULL, NULL);
- if (ret == 0)
- continue;
- if (errno != ESRCH) {
- pr_perror("Unexpected error");
- fclose(f);
- return -1;
- }
-
- if (!seize_catch_task(pid)) {
- pr_debug("SEIZE %d: success\n", pid);
- processes_to_wait++;
- } else if (state == frozen) {
- char buf[] = "/proc/XXXXXXXXXX/exe";
- struct stat st;
-
- /* skip kernel threads */
- snprintf(buf, sizeof(buf), "/proc/%d/exe", pid);
- if (stat(buf, &st) == -1 && errno == ENOENT)
- continue;
-
- /* fails when meets a zombie */
- pr_err("zombie found while seizing\n");
- fclose(f);
- return -1;
- }
- }
- fclose(f);
-
- dir = opendir(root_path);
- if (!dir) {
- pr_perror("Unable to open %s", root_path);
- return -1;
- }
-
- while ((de = readdir(dir))) {
- struct stat st;
-
- if (dir_dots(de))
- continue;
-
- sprintf(path, "%s/%s", root_path, de->d_name);
-
- if (fstatat(dirfd(dir), de->d_name, &st, 0) < 0) {
- pr_perror("stat of %s failed", path);
- closedir(dir);
- return -1;
- }
-
- if (!S_ISDIR(st.st_mode))
- continue;
-
- if (seize_cgroup_tree(path, state) < 0) {
- closedir(dir);
- return -1;
- }
- }
- closedir(dir);
-
- return 0;
-}
-
-/* A number of tasks in a freezer cgroup which are not going to be dumped */
-int processes_to_wait;
-static pid_t *processes_to_wait_pids;
-
-/*
- * A freezer cgroup can contain tasks which will not be dumped
- * and we need to wait them, because the are interupted them by ptrace.
- */
-static int freezer_wait_processes()
-{
- int i;
-
- processes_to_wait_pids = xmalloc(sizeof(pid_t) * processes_to_wait);
- if (processes_to_wait_pids == NULL)
- return -1;
-
- for (i = 0; i < processes_to_wait; i++) {
- int status;
- pid_t pid;
-
- /*
- * Here we are going to skip tasks which are already traced.
- * Ptraced tasks looks like children for us, so if
- * a task isn't ptraced yet, waitpid() will return a error.
- */
- pid = waitpid(-1, &status, 0);
- if (pid < 0) {
- pr_perror("Unable to wait processes");
- xfree(processes_to_wait_pids);
- return -1;
- }
- pr_warn("Unexpected process %d in the freezer cgroup (status 0x%x)\n", pid, status);
-
- processes_to_wait_pids[i] = pid;
- }
-
- return 0;
-}
-
-static int freezer_detach(void)
-{
- int i;
-
- if (!opts.freeze_cgroup)
- return 0;
-
- for (i = 0; i < processes_to_wait; i++) {
- pid_t pid = processes_to_wait_pids[i];
- int status, save_errno;
-
- if (ptrace(PTRACE_DETACH, pid, NULL, NULL) == 0)
- continue;
-
- save_errno = errno;
-
- /* A process may be killed by SIGKILL */
- if (wait4(pid, &status, __WALL, NULL) == pid) {
- pr_warn("The %d process returned 0x %x\n", pid, status);
- continue;
- }
- errno = save_errno;
- pr_perror("Unable to detach from %d", pid);
- }
-
- return 0;
-}
-
-static int freeze_processes(void)
-{
- int i, fd, exit_code = -1;
- char path[PATH_MAX];
- const char *state = thawed;
-
- snprintf(path, sizeof(path), "%s/freezer.state", opts.freeze_cgroup);
- fd = open(path, O_RDWR);
- if (fd < 0) {
- pr_perror("Unable to open %s", path);
- return -1;
- }
- state = get_freezer_state(fd);
- if (!state) {
- close(fd);
- return -1;
- }
- if (state == thawed) {
- freezer_thawed = true;
-
- lseek(fd, 0, SEEK_SET);
- if (write(fd, frozen, sizeof(frozen)) != sizeof(frozen)) {
- pr_perror("Unable to freeze tasks");
- close(fd);
- return -1;
- }
- }
-
- /*
- * There is not way to wait a specified state, so we need to poll the
- * freezer.state.
- * Here is one extra attempt to check that everything are frozen.
- */
- for (i = 0; i <= NR_ATTEMPTS; i++) {
- struct timespec req = {};
- u64 timeout;
-
- if (seize_cgroup_tree(opts.freeze_cgroup, state) < 0)
- goto err;
-
- if (state == frozen)
- break;
-
- state = get_freezer_state(fd);
- if (!state)
- goto err;
-
- if (state == frozen) {
- /*
- * Enumerate all tasks one more time to collect all new
- * tasks, which can be born while the cgroup is being frozen.
- */
-
- continue;
- }
-
- timeout = 100000000 * (i + 1); /* 100 msec */
- req.tv_nsec = timeout % 1000000000;
- req.tv_sec = timeout / 1000000000;
- nanosleep(&req, NULL);
- }
-
- if (i > NR_ATTEMPTS) {
- pr_err("Unable to freeze cgroup %s\n", opts.freeze_cgroup);
- goto err;
- }
-
- exit_code = 0;
-err:
- if (exit_code == 0 || freezer_thawed) {
- lseek(fd, 0, SEEK_SET);
- if (write(fd, thawed, sizeof(thawed)) != sizeof(thawed)) {
- pr_perror("Unable to thaw tasks");
- exit_code = -1;
- }
- }
- if (close(fd)) {
- pr_perror("Unable to thaw tasks");
- return -1;
- }
-
- return exit_code;
-}
-
-static inline bool child_collected(struct pstree_item *i, pid_t pid)
-{
- struct pstree_item *c;
-
- list_for_each_entry(c, &i->children, sibling)
- if (c->pid.real == pid)
- return true;
-
- return false;
-}
-
-static int collect_task(struct pstree_item *item);
-static int collect_children(struct pstree_item *item)
-{
- pid_t *ch;
- int ret, i, nr_children, nr_inprogress;
-
- ret = parse_children(item->pid.real, &ch, &nr_children);
- if (ret < 0)
- return ret;
-
- nr_inprogress = 0;
- for (i = 0; i < nr_children; i++) {
- struct pstree_item *c;
- pid_t pid = ch[i];
-
- /* Is it already frozen? */
- if (child_collected(item, pid))
- continue;
-
- nr_inprogress++;
-
- pr_info("Seized task %d, state %d\n", pid, ret);
-
- c = alloc_pstree_item();
- if (c == NULL) {
- ret = -1;
- goto free;
- }
-
- if (!opts.freeze_cgroup)
- /* fails when meets a zombie */
- seize_catch_task(pid);
-
- ret = seize_wait_task(pid, item->pid.real, &dmpi(c)->pi_creds);
- if (ret < 0) {
- /*
- * Here is a race window between parse_children() and seize(),
- * so the task could die for these time.
- * Don't worry, will try again on the next attempt. The number
- * of attempts is restricted, so it will exit if something
- * really wrong.
- */
- ret = 0;
- xfree(c);
- continue;
- }
-
- c->pid.real = pid;
- c->parent = item;
- c->state = ret;
- list_add_tail(&c->sibling, &item->children);
-
- /* Here is a recursive call (Depth-first search) */
- ret = collect_task(c);
- if (ret < 0)
- goto free;
- }
-free:
- xfree(ch);
- return ret < 0 ? ret : nr_inprogress;
-}
-
-static void unseize_task_and_threads(const struct pstree_item *item, int st)
-{
- int i;
-
- if (item->state == TASK_DEAD)
- return;
-
- /*
- * The st is the state we want to switch tasks into,
- * the item->state is the state task was in when we seized one.
- */
-
- unseize_task(item->pid.real, item->state, st);
-
- if (st == TASK_DEAD)
- return;
-
- for (i = 1; i < item->nr_threads; i++)
- if (ptrace(PTRACE_DETACH, item->threads[i].real, NULL, NULL))
- pr_perror("Unable to detach from %d", item->threads[i].real);
-}
-
-static void pstree_wait(struct pstree_item *root_item)
-{
- struct pstree_item *item = root_item;
- int pid, status, i;
-
- for_each_pstree_item(item) {
-
- if (item->state == TASK_DEAD)
- continue;
-
- for (i = 0; i < item->nr_threads; i++) {
- pid = wait4(-1, &status, __WALL, NULL);
- if (pid < 0) {
- pr_perror("wait4 failed");
- break;
- } else {
- if (!WIFSIGNALED(status) || WTERMSIG(status) != SIGKILL) {
- pr_err("Unexpected exit code %d of %d\n", status, pid);
- BUG();
- }
- }
- }
- }
-
- pid = wait4(-1, &status, __WALL, NULL);
- if (pid > 0) {
- pr_err("Unexpected child %d\n", pid);
- BUG();
- }
-}
-
-void pstree_switch_state(struct pstree_item *root_item, int st)
-{
- struct pstree_item *item = root_item;
-
- if (st != TASK_DEAD)
- freezer_restore_state();
-
- /*
- * We need to detach from all processes before waiting the init
- * process, because one of these processes may collect processes from a
- * target pid namespace. The pid namespace is destroyed only when all
- * processes have been killed and collected.
- */
- freezer_detach();
-
- pr_info("Unfreezing tasks into %d\n", st);
- for_each_pstree_item(item)
- unseize_task_and_threads(item, st);
-
- if (st == TASK_DEAD)
- pstree_wait(root_item);
-}
-
-static pid_t item_ppid(const struct pstree_item *item)
-{
- item = item->parent;
- return item ? item->pid.real : -1;
-}
-
-static inline bool thread_collected(struct pstree_item *i, pid_t tid)
-{
- int t;
-
- if (i->pid.real == tid) /* thread leader is collected as task */
- return true;
-
- for (t = 0; t < i->nr_threads; t++)
- if (tid == i->threads[t].real)
- return true;
-
- return false;
-}
-
-static int collect_threads(struct pstree_item *item)
-{
- struct pid *threads = NULL;
- int nr_threads = 0, i = 0, ret, nr_inprogress, nr_stopped = 0;
-
- ret = parse_threads(item->pid.real, &threads, &nr_threads);
- if (ret < 0)
- goto err;
-
- if ((item->state == TASK_DEAD) && (nr_threads > 1)) {
- pr_err("Zombies with threads are not supported\n");
- goto err;
- }
-
- /* The number of threads can't be less than allready frozen */
- item->threads = xrealloc(item->threads, nr_threads * sizeof(struct pid));
- if (item->threads == NULL)
- return -1;
-
- if (item->nr_threads == 0) {
- item->threads[0].real = item->pid.real;
- item->nr_threads = 1;
- }
-
- nr_inprogress = 0;
- for (i = 0; i < nr_threads; i++) {
- pid_t pid = threads[i].real;
-
- if (thread_collected(item, pid))
- continue;
-
- nr_inprogress++;
-
- pr_info("\tSeizing %d's %d thread\n",
- item->pid.real, pid);
-
- if (!opts.freeze_cgroup && seize_catch_task(pid))
- continue;
-
- ret = seize_wait_task(pid, item_ppid(item), &dmpi(item)->pi_creds);
- if (ret < 0) {
- /*
- * Here is a race window between parse_threads() and seize(),
- * so the task could die for these time.
- * Don't worry, will try again on the next attempt. The number
- * of attempts is restricted, so it will exit if something
- * really wrong.
- */
- continue;
- }
-
- BUG_ON(item->nr_threads + 1 > nr_threads);
- item->threads[item->nr_threads].real = pid;
- item->nr_threads++;
-
- if (ret == TASK_DEAD) {
- pr_err("Zombie thread not supported\n");
- goto err;
- }
-
- if (ret == TASK_STOPPED) {
- nr_stopped++;
- }
- }
-
- if (nr_stopped && nr_stopped != nr_inprogress) {
- pr_err("Individually stopped threads not supported\n");
- goto err;
- }
-
- xfree(threads);
- return nr_inprogress;
-
-err:
- xfree(threads);
- return -1;
-}
-
-static int collect_loop(struct pstree_item *item,
- int (*collect)(struct pstree_item *))
-{
- int attempts = NR_ATTEMPTS, nr_inprogress = 1;
-
- if (opts.freeze_cgroup)
- attempts = 1;
-
- /*
- * While we scan the proc and seize the children/threads
- * new ones can appear (with clone(CLONE_PARENT) or with
- * pthread_create). Thus, after one go, we need to repeat
- * the scan-and-freeze again collecting new arrivals. As
- * new guys may appear again we do NR_ATTEMPTS passes and
- * fail to seize the item if new tasks/threads still
- * appear.
- */
-
- while (nr_inprogress > 0 && attempts >= 0) {
- attempts--;
- nr_inprogress = collect(item);
- }
-
- pr_info("Collected (%d attempts, %d in_progress)\n", attempts, nr_inprogress);
-
- /*
- * We may fail to collect items or run out of attempts.
- * In the former case nr_inprogress will be negative, in
- * the latter -- positive. Thus it's enough just to check
- * for "no more new stuff" and say "we're OK" if so.
- */
-
- return (nr_inprogress == 0) ? 0 : -1;
-}
-
-static int collect_task(struct pstree_item *item)
-{
- int ret;
-
- ret = collect_loop(item, collect_threads);
- if (ret < 0)
- goto err_close;
-
- /* Depth-first search (DFS) is used for traversing a process tree. */
- ret = collect_loop(item, collect_children);
- if (ret < 0)
- goto err_close;
-
- if ((item->state == TASK_DEAD) && !list_empty(&item->children)) {
- pr_err("Zombie with children?! O_o Run, run, run!\n");
- goto err_close;
- }
-
- if (pstree_alloc_cores(item))
- goto err_close;
-
- pr_info("Collected %d in %d state\n", item->pid.real, item->state);
- return 0;
-
-err_close:
- close_pid_proc();
- return -1;
-}
-
-int collect_pstree(pid_t pid)
-{
- int ret = -1;
-
- timing_start(TIME_FREEZING);
-
- if (opts.freeze_cgroup && freeze_processes())
- goto err;
-
- root_item = alloc_pstree_item();
- if (root_item == NULL)
- goto err;
-
- root_item->pid.real = pid;
-
- if (!opts.freeze_cgroup && seize_catch_task(pid)) {
- set_cr_errno(ESRCH);
- goto err;
- }
-
- /*
- * wait4() may hang for some reason. Enable timer and fire SIGALRM
- * if timeout reached. SIGALRM handler will do the necessary
- * cleanups and terminate current process.
- */
- alarm(opts.timeout);
-
- ret = seize_wait_task(pid, -1, &dmpi(root_item)->pi_creds);
- if (ret < 0)
- goto err;
- pr_info("Seized task %d, state %d\n", pid, ret);
- root_item->state = ret;
-
- ret = collect_task(root_item);
- if (ret < 0)
- goto err;
-
- if (opts.freeze_cgroup && freezer_wait_processes())
- goto err;
-
- ret = 0;
- timing_stop(TIME_FREEZING);
- timing_start(TIME_FROZEN);
-
-err:
- /* Freezing stage finished in time - disable timer. */
- alarm(0);
- return ret;
-}
-
diff --git a/shmem.c b/shmem.c
deleted file mode 100644
index ad3cdbbdeedd..000000000000
--- a/shmem.c
+++ /dev/null
@@ -1,449 +0,0 @@
-#include <unistd.h>
-#include <sys/mman.h>
-#include <stdlib.h>
-#include <fcntl.h>
-
-#include "pid.h"
-#include "shmem.h"
-#include "image.h"
-#include "cr_options.h"
-#include "kerndat.h"
-#include "page-pipe.h"
-#include "page-xfer.h"
-#include "rst-malloc.h"
-#include "vma.h"
-#include "config.h"
-
-#include "protobuf.h"
-#include "protobuf/pagemap.pb-c.h"
-
-/*
- * pid is a pid of a creater
- * start, end are used for open mapping
- * fd is a file discriptor, which is valid for creater,
- * it's opened in cr-restor, because pgoff may be non zero
- */
-struct shmem_info {
- unsigned long shmid;
- unsigned long size;
- int pid;
- int fd;
-
- /*
- * 0. lock is initilized to zero
- * 1. the master opens a descriptor and set lock to 1
- * 2. slaves open their descriptors and increment lock
- * 3. the master waits all slaves on lock. After that
- * it can close the descriptor.
- */
- futex_t lock;
-
- /*
- * Here is a problem, that we don't know, which process will restore
- * an region. Each time when we found a process with a smaller pid,
- * we reset self_count, so we can't have only one counter.
- */
- int count; /* the number of regions */
- int self_count; /* the number of regions, which belongs to "pid" */
-
- struct list_head l;
-};
-
-/*
- * This list is filled with shared objects before we fork
- * any tasks. Thus the head is private (COW-ed) and the
- * entries are all in shmem.
- */
-static LIST_HEAD(shmems); /* XXX hash? tree? */
-
-void show_saved_shmems(void)
-{
- struct shmem_info *si;
-
- pr_info("\tSaved shmems:\n");
- list_for_each_entry(si, &shmems, l)
- pr_info("\t\tshmid: 0x%lx pid: %d\n", si->shmid, si->pid);
-}
-
-static struct shmem_info *find_shmem_by_id(unsigned long shmid)
-{
- struct shmem_info *si;
-
- list_for_each_entry(si, &shmems, l)
- if (si->shmid == shmid)
- return si;
-
- return NULL;
-}
-
-int collect_shmem(int pid, VmaEntry *vi)
-{
- unsigned long size = vi->pgoff + vi->end - vi->start;
- struct shmem_info *si;
-
- si = find_shmem_by_id(vi->shmid);
- if (si) {
-
- if (si->size < size)
- si->size = size;
- si->count++;
-
- /*
- * Only the shared mapping with a lowest
- * pid will be created in real, other processes
- * will wait until the kernel propagate this mapping
- * into /proc
- */
- if (!pid_rst_prio(pid, si->pid)) {
- if (si->pid == pid)
- si->self_count++;
-
- return 0;
- }
-
- si->pid = pid;
- si->self_count = 1;
-
- return 0;
- }
-
- si = shmalloc(sizeof(struct shmem_info));
- if (!si)
- return -1;
-
- pr_info("Add new shmem 0x%"PRIx64" (0x%016"PRIx64"-0x%016"PRIx64")\n",
- vi->shmid, vi->start, vi->end);
-
- si->shmid = vi->shmid;
- si->pid = pid;
- si->size = size;
- si->fd = -1;
- si->count = 1;
- si->self_count = 1;
- futex_init(&si->lock);
- list_add_tail(&si->l, &shmems);
-
- return 0;
-}
-
-static int shmem_wait_and_open(int pid, struct shmem_info *si)
-{
- char path[128];
- int ret;
-
- pr_info("Waiting for the %lx shmem to appear\n", si->shmid);
- futex_wait_while(&si->lock, 0);
-
- snprintf(path, sizeof(path), "/proc/%d/fd/%d",
- si->pid, si->fd);
-
- pr_info("Opening shmem [%s] \n", path);
- ret = open_proc_rw(si->pid, "fd/%d", si->fd);
- if (ret < 0)
- pr_perror(" %d: Can't stat shmem at %s",
- si->pid, path);
- futex_inc_and_wake(&si->lock);
- return ret;
-}
-
-static int restore_shmem_content(void *addr, struct shmem_info *si)
-{
- int ret = 0, fd_pg;
- struct page_read pr;
- unsigned long off_real;
-
- ret = open_page_read(si->shmid, &pr, PR_SHMEM);
- if (ret <= 0)
- return -1;
-
- fd_pg = img_raw_fd(pr.pi);
- while (1) {
- unsigned long vaddr;
- unsigned nr_pages;
- struct iovec iov;
-
- ret = pr.get_pagemap(&pr, &iov);
- if (ret <= 0)
- break;
-
- vaddr = (unsigned long)iov.iov_base;
- nr_pages = iov.iov_len / PAGE_SIZE;
-
- if (vaddr + nr_pages * PAGE_SIZE > si->size)
- break;
-
- off_real = lseek(fd_pg, 0, SEEK_CUR);
-
- ret = read(fd_pg, addr + vaddr, nr_pages * PAGE_SIZE);
- if (ret != nr_pages * PAGE_SIZE) {
- ret = -1;
- break;
- }
-
- if (opts.auto_dedup) {
- ret = punch_hole(&pr, off_real, nr_pages * PAGE_SIZE, false);
- if (ret == -1) {
- break;
- }
- }
-
- if (pr.put_pagemap)
- pr.put_pagemap(&pr);
- }
-
- pr.close(&pr);
- return ret;
-}
-
-int get_shmem_fd(int pid, VmaEntry *vi)
-{
- struct shmem_info *si;
- void *addr = MAP_FAILED;
- int f = -1;
- int flags;
-
- si = find_shmem_by_id(vi->shmid);
- pr_info("Search for 0x%016"PRIx64" shmem 0x%"PRIx64" %p/%d\n", vi->start, vi->shmid, si, si ? si->pid : -1);
- if (!si) {
- pr_err("Can't find my shmem 0x%016"PRIx64"\n", vi->start);
- return -1;
- }
-
- if (si->pid != pid)
- return shmem_wait_and_open(pid, si);
-
- if (si->fd != -1)
- return dup(si->fd);
-
- flags = MAP_SHARED;
-#ifdef CONFIG_HAS_MEMFD
- if (kdat.has_memfd) {
- f = syscall(SYS_memfd_create, "", 0);
- if (f < 0) {
- pr_perror("Unable to create memfd");
- goto err;
- }
-
- if (ftruncate(f, si->size)) {
- pr_perror("Unable to truncate memfd");
- goto err;
- }
- flags |= MAP_FILE;
- } else
-#endif
- flags |= MAP_ANONYMOUS;
-
- /*
- * The following hack solves problems:
- * vi->pgoff may be not zero in a target process.
- * This mapping may be mapped more then once.
- * The restorer doesn't have snprintf.
- * Here is a good place to restore content
- */
- addr = mmap(NULL, si->size, PROT_WRITE | PROT_READ, flags, f, 0);
- if (addr == MAP_FAILED) {
- pr_err("Can't mmap shmid=0x%"PRIx64" size=%ld\n",
- vi->shmid, si->size);
- goto err;
- }
-
- if (restore_shmem_content(addr, si) < 0) {
- pr_err("Can't restore shmem content\n");
- goto err;
- }
-
- if (f == -1) {
- f = open_proc_rw(getpid(), "map_files/%lx-%lx",
- (unsigned long) addr,
- (unsigned long) addr + si->size);
- if (f < 0)
- goto err;
- }
- munmap(addr, si->size);
-
- si->fd = f;
-
- /* Send signal to slaves, that they can open fd for this shmem */
- futex_inc_and_wake(&si->lock);
- /*
- * All other regions in this process will duplicate
- * the file descriptor, so we don't wait them.
- */
- futex_wait_until(&si->lock, si->count - si->self_count + 1);
-
- return f;
-err:
- if (addr != MAP_FAILED)
- munmap(addr, si->size);
- close_safe(&f);
- return -1;
-}
-
-struct shmem_info_dump {
- unsigned long size;
- unsigned long shmid;
- unsigned long start;
- unsigned long end;
- int pid;
-
- struct shmem_info_dump *next;
-};
-
-#define SHMEM_HASH_SIZE 32
-static struct shmem_info_dump *shmems_hash[SHMEM_HASH_SIZE];
-
-static struct shmem_info_dump *shmem_find(struct shmem_info_dump **chain,
- unsigned long shmid)
-{
- struct shmem_info_dump *sh;
-
- for (sh = *chain; sh; sh = sh->next)
- if (sh->shmid == shmid)
- return sh;
-
- return NULL;
-}
-
-int add_shmem_area(pid_t pid, VmaEntry *vma)
-{
- struct shmem_info_dump *si, **chain;
- unsigned long size = vma->pgoff + (vma->end - vma->start);
-
- chain = &shmems_hash[vma->shmid % SHMEM_HASH_SIZE];
- si = shmem_find(chain, vma->shmid);
- if (si) {
- if (si->size < size)
- si->size = size;
- return 0;
- }
-
- si = xmalloc(sizeof(*si));
- if (!si)
- return -1;
-
- si->next = *chain;
- *chain = si;
-
- si->size = size;
- si->pid = pid;
- si->start = vma->start;
- si->end = vma->end;
- si->shmid = vma->shmid;
-
- return 0;
-}
-
-static int dump_pages(struct page_pipe *pp, struct page_xfer *xfer, void *addr)
-{
- struct page_pipe_buf *ppb;
-
- list_for_each_entry(ppb, &pp->bufs, l)
- if (vmsplice(ppb->p[1], ppb->iov, ppb->nr_segs,
- SPLICE_F_GIFT | SPLICE_F_NONBLOCK) !=
- ppb->pages_in * PAGE_SIZE) {
- pr_perror("Can't get shmem into page-pipe");
- return -1;
- }
-
- return page_xfer_dump_pages(xfer, pp, (unsigned long)addr);
-}
-
-static int dump_one_shmem(struct shmem_info_dump *si)
-{
- struct iovec *iovs;
- struct page_pipe *pp;
- struct page_xfer xfer;
- int err, ret = -1, fd;
- unsigned char *map = NULL;
- void *addr = NULL;
- unsigned long pfn, nrpages;
-
- pr_info("Dumping shared memory %ld\n", si->shmid);
-
- nrpages = (si->size + PAGE_SIZE - 1) / PAGE_SIZE;
- map = xmalloc(nrpages * sizeof(*map));
- if (!map)
- goto err;
-
- fd = open_proc(si->pid, "map_files/%lx-%lx", si->start, si->end);
- if (fd < 0)
- goto err;
-
- addr = mmap(NULL, si->size, PROT_READ, MAP_SHARED, fd, 0);
- close(fd);
- if (addr == MAP_FAILED) {
- pr_err("Can't map shmem 0x%lx (0x%lx-0x%lx)\n",
- si->shmid, si->start, si->end);
- goto err;
- }
-
- /*
- * We can't use pagemap here, because this vma is
- * not mapped to us at all, but mincore reports the
- * pagecache status of a file, which is correct in
- * this case.
- */
-
- err = mincore(addr, si->size, map);
- if (err)
- goto err_unmap;
-
- iovs = xmalloc(((nrpages + 1) / 2) * sizeof(struct iovec));
- if (!iovs)
- goto err_unmap;
-
- pp = create_page_pipe((nrpages + 1) / 2, iovs, true);
- if (!pp)
- goto err_iovs;
-
- err = open_page_xfer(&xfer, CR_FD_SHMEM_PAGEMAP, si->shmid);
- if (err)
- goto err_pp;
-
- for (pfn = 0; pfn < nrpages; pfn++) {
- if (!(map[pfn] & PAGE_RSS))
- continue;
-again:
- ret = page_pipe_add_page(pp, (unsigned long)addr + pfn * PAGE_SIZE);
- if (ret == -EAGAIN) {
- ret = dump_pages(pp, &xfer, addr);
- if (ret)
- goto err_xfer;
- page_pipe_reinit(pp);
- goto again;
- } else if (ret)
- goto err_xfer;
- }
-
- ret = dump_pages(pp, &xfer, addr);
-
-err_xfer:
- xfer.close(&xfer);
-err_pp:
- destroy_page_pipe(pp);
-err_iovs:
- xfree(iovs);
-err_unmap:
- munmap(addr, si->size);
-err:
- xfree(map);
- return ret;
-}
-
-#define for_each_shmem_dump(_i, _si) \
- for (i = 0; i < SHMEM_HASH_SIZE; i++) \
- for (si = shmems_hash[i]; si; si = si->next)
-
-int cr_dump_shmem(void)
-{
- int ret = 0, i;
- struct shmem_info_dump *si;
-
- for_each_shmem_dump (i, si) {
- ret = dump_one_shmem(si);
- if (ret)
- break;
- }
-
- return ret;
-}
diff --git a/sigframe.c b/sigframe.c
deleted file mode 100644
index 448749320231..000000000000
--- a/sigframe.c
+++ /dev/null
@@ -1,36 +0,0 @@
-#include <unistd.h>
-#include <string.h>
-
-#include "asm/restore.h"
-#include "asm/restorer.h"
-
-#include "protobuf/core.pb-c.h"
-
-int construct_sigframe(struct rt_sigframe *sigframe,
- struct rt_sigframe *rsigframe,
- CoreEntry *core)
-{
- k_rtsigset_t *blk_sigset = (k_rtsigset_t*)&RT_SIGFRAME_UC(sigframe).uc_sigmask;
-
- if (core->tc)
- memcpy(blk_sigset, &core->tc->blk_sigset, sizeof(k_rtsigset_t));
- else if (core->thread_core->has_blk_sigset) {
- memcpy(blk_sigset,
- &core->thread_core->blk_sigset, sizeof(k_rtsigset_t));
- } else
- memset(blk_sigset, 0, sizeof(k_rtsigset_t));
-
- if (restore_fpu(sigframe, core))
- return -1;
-
- if (RT_SIGFRAME_HAS_FPU(sigframe))
- if (sigreturn_prep_fpu_frame(sigframe, &RT_SIGFRAME_FPU(rsigframe)))
- return -1;
-
- if (restore_gpregs(sigframe, CORE_THREAD_ARCH_INFO(core)->gpregs))
- return -1;
-
- setup_sas(sigframe, core->thread_core->sas);
-
- return 0;
-}
diff --git a/signalfd.c b/signalfd.c
deleted file mode 100644
index 6d686d44c443..000000000000
--- a/signalfd.c
+++ /dev/null
@@ -1,123 +0,0 @@
-#include <unistd.h>
-#include <signal.h>
-#include <sys/signalfd.h>
-
-#include "compiler.h"
-#include "asm/types.h"
-#include "signalfd.h"
-#include "proc_parse.h"
-#include "imgset.h"
-#include "image.h"
-#include "util.h"
-#include "log.h"
-#include "files.h"
-
-#include "protobuf.h"
-#include "protobuf/signalfd.pb-c.h"
-
-struct signalfd_info {
- SignalfdEntry *sfe;
- struct file_desc d;
-};
-
-int is_signalfd_link(char *link)
-{
- return is_anon_link_type(link, "[signalfd]");
-}
-
-struct signalfd_dump_arg {
- u32 id;
- const struct fd_parms *p;
- bool dumped;
-};
-
-static int dump_signalfd_entry(union fdinfo_entries *e, void *arg)
-{
- struct signalfd_dump_arg *da = arg;
-
- if (da->dumped) {
- pr_err("Several counters in a file?\n");
- return -1;
- }
-
- da->dumped = true;
- e->sfd.id = da->id;
- e->sfd.flags = da->p->flags;
- e->sfd.fown = (FownEntry *)&da->p->fown;
-
- return pb_write_one(img_from_set(glob_imgset, CR_FD_SIGNALFD),
- &e->sfd, PB_SIGNALFD);
-}
-
-static int dump_one_signalfd(int lfd, u32 id, const struct fd_parms *p)
-{
- struct signalfd_dump_arg da = { .id = id, .p = p, };
- return parse_fdinfo(lfd, FD_TYPES__SIGNALFD, dump_signalfd_entry, &da);
-}
-
-const struct fdtype_ops signalfd_dump_ops = {
- .type = FD_TYPES__SIGNALFD,
- .dump = dump_one_signalfd,
-};
-
-static void sigset_fill(sigset_t *to, unsigned long long from)
-{
- int sig;
-
- pr_info("\tCalculating sigmask for %Lx\n", from);
- sigemptyset(to);
- for (sig = 1; sig < NSIG; sig++)
- if (from & (1ULL << (sig - 1))) {
- pr_debug("\t\tAdd %d signal to mask\n", sig);
- sigaddset(to, sig);
- }
-}
-
-static int signalfd_open(struct file_desc *d)
-{
- struct signalfd_info *info;
- int tmp;
- sigset_t mask;
-
- info = container_of(d, struct signalfd_info, d);
- pr_info("Restoring signalfd %#x\n", info->sfe->id);
-
- sigset_fill(&mask, info->sfe->sigmask);
- tmp = signalfd(-1, &mask, 0);
- if (tmp < 0) {
- pr_perror("Can't create signalfd %#08x", info->sfe->id);
- return -1;
- }
-
- if (rst_file_params(tmp, info->sfe->fown, info->sfe->flags)) {
- pr_perror("Can't restore params on signalfd %#08x",
- info->sfe->id);
- goto err_close;
- }
-
- return tmp;
-
-err_close:
- close(tmp);
- return -1;
-}
-
-static struct file_desc_ops signalfd_desc_ops = {
- .type = FD_TYPES__SIGNALFD,
- .open = signalfd_open,
-};
-
-static int collect_one_sigfd(void *o, ProtobufCMessage *msg)
-{
- struct signalfd_info *info = o;
-
- info->sfe = pb_msg(msg, SignalfdEntry);
- return file_desc_add(&info->d, info->sfe->id, &signalfd_desc_ops);
-}
-
-struct collect_image_info signalfd_cinfo = {
- .fd_type = CR_FD_SIGNALFD,
- .pb_type = PB_SIGNALFD,
- .priv_size = sizeof(struct signalfd_info),
- .collect = collect_one_sigfd,
-};
diff --git a/sk-inet.c b/sk-inet.c
deleted file mode 100644
index 4d1110767c33..000000000000
--- a/sk-inet.c
+++ /dev/null
@@ -1,758 +0,0 @@
-#include <sys/types.h>
-#include <sys/socket.h>
-#include <linux/netlink.h>
-#include <linux/rtnetlink.h>
-#include <net/if.h>
-#include <sys/mman.h>
-#include <unistd.h>
-#include <netinet/tcp.h>
-#include <arpa/inet.h>
-#include <string.h>
-#include <stdlib.h>
-
-#include "asm/types.h"
-#include "libnetlink.h"
-#include "cr_options.h"
-#include "imgset.h"
-#include "inet_diag.h"
-#include "files.h"
-#include "image.h"
-#include "log.h"
-#include "util.h"
-#include "sockets.h"
-#include "sk-inet.h"
-
-#define PB_ALEN_INET 1
-#define PB_ALEN_INET6 4
-
-static LIST_HEAD(inet_ports);
-
-struct inet_port {
- int port;
- int type;
- futex_t users;
- mutex_t reuseaddr_lock;
- struct list_head list;
-};
-
-static struct inet_port *port_add(int type, int port)
-{
- struct inet_port *e;
-
- list_for_each_entry(e, &inet_ports, list)
- if (e->type == type && e->port == port) {
- futex_inc(&e->users);
- return e;
- }
-
- e = shmalloc(sizeof(*e));
- if (e == NULL) {
- pr_err("Not enough memory\n");
- return NULL;
- }
-
- e->port = port;
- e->type = type;
- futex_init(&e->users);
- futex_inc(&e->users);
- mutex_init(&e->reuseaddr_lock);
-
- list_add(&e->list, &inet_ports);
-
- return e;
-}
-
-static void show_one_inet(const char *act, const struct inet_sk_desc *sk)
-{
- char src_addr[INET_ADDR_LEN] = "<unknown>";
-
- if (inet_ntop(sk->sd.family, (void *)sk->src_addr, src_addr,
- INET_ADDR_LEN) == NULL) {
- pr_perror("Failed to translate address");
- }
-
- pr_debug("\t%s: ino 0x%8x family %4d type %4d port %8d "
- "state %2d src_addr %s\n",
- act, sk->sd.ino, sk->sd.family, sk->type, sk->src_port,
- sk->state, src_addr);
-}
-
-static void show_one_inet_img(const char *act, const InetSkEntry *e)
-{
- char src_addr[INET_ADDR_LEN] = "<unknown>";
-
- if (inet_ntop(e->family, (void *)e->src_addr, src_addr,
- INET_ADDR_LEN) == NULL) {
- pr_perror("Failed to translate address");
- }
-
- pr_debug("\t%s: family %d type %d proto %d port %d "
- "state %d src_addr %s\n",
- act, e->family, e->type, e->proto, e->src_port,
- e->state, src_addr);
-}
-
-static int can_dump_ipproto(int ino, int proto)
-{
- /* Make sure it's a proto we support */
- switch (proto) {
- case IPPROTO_IP:
- case IPPROTO_TCP:
- case IPPROTO_UDP:
- case IPPROTO_UDPLITE:
- break;
- default:
- pr_err("Unsupported proto %d for socket %x\n", proto, ino);
- return 0;
- }
-
- return 1;
-}
-
-static int can_dump_inet_sk(const struct inet_sk_desc *sk)
-{
- BUG_ON((sk->sd.family != AF_INET) && (sk->sd.family != AF_INET6));
-
- if (sk->shutdown) {
- pr_err("Can't dump shutdown inet socket %x\n",
- sk->sd.ino);
- return 0;
- }
-
- if (sk->type == SOCK_DGRAM) {
- if (sk->wqlen != 0) {
- pr_err("Can't dump corked dgram socket %x\n",
- sk->sd.ino);
- return 0;
- }
-
- if (sk->rqlen)
- pr_warn("Read queue is dropped for socket %x\n",
- sk->sd.ino);
-
- return 1;
- }
-
- if (sk->type != SOCK_STREAM) {
- pr_err("Can't dump %d inet socket %x. "
- "Only can stream and dgram.\n",
- sk->type, sk->sd.ino);
- return 0;
- }
-
- switch (sk->state) {
- case TCP_LISTEN:
- if (sk->rqlen != 0) {
- /*
- * Currently the ICONS nla reports the conn
- * requests for listen sockets. Need to pick
- * those up and fix the connect job respectively
- */
- pr_err("In-flight connection (l) for %x\n",
- sk->sd.ino);
- return 0;
- }
- break;
- case TCP_ESTABLISHED:
- if (!opts.tcp_established_ok) {
- pr_err("Connected TCP socket, consider using --%s option.\n",
- SK_EST_PARAM);
- return 0;
- }
- break;
- case TCP_CLOSE:
- /* Trivial case, we just need to create a socket on restore */
- break;
- default:
- pr_err("Unknown inet socket %x state %d\n", sk->sd.ino, sk->state);
- return 0;
- }
-
- return 1;
-}
-
-static struct inet_sk_desc *gen_uncon_sk(int lfd, const struct fd_parms *p, int proto)
-{
- struct inet_sk_desc *sk;
- char address;
- socklen_t aux;
- int ret;
-
- sk = xzalloc(sizeof(*sk));
- if (!sk)
- goto err;
-
- /* It should has no peer name */
- aux = sizeof(address);
- ret = getsockopt(lfd, SOL_SOCKET, SO_PEERNAME, &address, &aux);
- if (ret < 0) {
- if (errno != ENOTCONN) {
- pr_perror("Unexpected error returned from unconnected socket");
- goto err;
- }
- } else if (ret == 0) {
- pr_err("Name resolved on unconnected socket\n");
- goto err;
- }
-
- sk->sd.ino = p->stat.st_ino;
-
- ret = do_dump_opt(lfd, SOL_SOCKET, SO_DOMAIN, &sk->sd.family, sizeof(sk->sd.family));
- ret |= do_dump_opt(lfd, SOL_SOCKET, SO_TYPE, &sk->type, sizeof(sk->type));
- if (ret)
- goto err;
-
- if (proto == IPPROTO_TCP) {
- struct tcp_info info;
-
- aux = sizeof(info);
- ret = getsockopt(lfd, SOL_TCP, TCP_INFO, &info, &aux);
- if (ret) {
- pr_perror("Failed to obtain TCP_INFO");
- goto err;
- }
-
- if (info.tcpi_state != TCP_CLOSE) {
- pr_err("Socket state %d obtained but expected %d\n",
- info.tcpi_state, TCP_CLOSE);
- goto err;
- }
-
- sk->wqlen = info.tcpi_backoff;
- }
-
- sk->state = TCP_CLOSE;
-
- sk_collect_one(sk->sd.ino, sk->sd.family, &sk->sd);
-
- return sk;
-err:
- xfree(sk);
- return NULL;
-}
-
-static int dump_ip_opts(int sk, IpOptsEntry *ioe)
-{
- int ret = 0;
-
- ret |= dump_opt(sk, SOL_IP, IP_FREEBIND, &ioe->freebind);
- ioe->has_freebind = ioe->freebind;
-
- return ret;
-}
-
-/* Stolen from the kernel's __ipv6_addr_type/__ipv6_addr_needs_scopeid;
- * link local and (multicast + loopback + linklocal) addrs require a
- * scope id.
- */
-#define IPV6_ADDR_SCOPE_NODELOCAL 0x01
-#define IPV6_ADDR_SCOPE_LINKLOCAL 0x02
-static bool needs_scope_id(uint32_t *src_addr)
-{
- if ((src_addr[0] & htonl(0xFF00000)) == htonl(0xFF000000)) {
- if (src_addr[1] & (IPV6_ADDR_SCOPE_LINKLOCAL|IPV6_ADDR_SCOPE_NODELOCAL))
- return true;
- }
-
- if ((src_addr[0] & htonl(0xFFC00000)) == htonl(0xFE800000))
- return true;
-
- return false;
-}
-
-static int do_dump_one_inet_fd(int lfd, u32 id, const struct fd_parms *p, int family)
-{
- struct inet_sk_desc *sk;
- InetSkEntry ie = INET_SK_ENTRY__INIT;
- IpOptsEntry ipopts = IP_OPTS_ENTRY__INIT;
- SkOptsEntry skopts = SK_OPTS_ENTRY__INIT;
- int ret = -1, err = -1, proto;
-
- ret = do_dump_opt(lfd, SOL_SOCKET, SO_PROTOCOL,
- &proto, sizeof(proto));
- if (ret)
- goto err;
-
- if (!can_dump_ipproto(p->stat.st_ino, proto))
- goto err;
-
- sk = (struct inet_sk_desc *)lookup_socket(p->stat.st_ino, family, proto);
- if (IS_ERR(sk))
- goto err;
- if (!sk) {
- sk = gen_uncon_sk(lfd, p, proto);
- if (!sk)
- goto err;
- }
-
- if (!can_dump_inet_sk(sk))
- goto err;
-
- BUG_ON(sk->sd.already_dumped);
-
- ie.id = id;
- ie.ino = sk->sd.ino;
- ie.family = family;
- ie.proto = proto;
- ie.type = sk->type;
- ie.state = sk->state;
- ie.src_port = sk->src_port;
- ie.dst_port = sk->dst_port;
- ie.backlog = sk->wqlen;
- ie.flags = p->flags;
-
- ie.fown = (FownEntry *)&p->fown;
- ie.opts = &skopts;
- ie.ip_opts = &ipopts;
-
- ie.n_src_addr = PB_ALEN_INET;
- ie.n_dst_addr = PB_ALEN_INET;
- if (ie.family == AF_INET6) {
- int val;
- char device[IFNAMSIZ];
- socklen_t len = sizeof(device);
-
- ie.n_src_addr = PB_ALEN_INET6;
- ie.n_dst_addr = PB_ALEN_INET6;
-
- ret = dump_opt(lfd, SOL_IPV6, IPV6_V6ONLY, &val);
- if (ret < 0)
- goto err;
-
- ie.v6only = val ? true : false;
- ie.has_v6only = true;
-
- /* ifindex only matters on source ports for bind, so let's
- * find only that ifindex. */
- if (sk->src_port && needs_scope_id(sk->src_addr)) {
- if (getsockopt(lfd, SOL_SOCKET, SO_BINDTODEVICE, device, &len) < 0) {
- pr_perror("can't get ifname");
- goto err;
- }
-
- if (len > 0) {
- ie.ifname = xstrdup(device);
- if (!ie.ifname)
- goto err;
- } else {
- pr_err("couldn't find ifname for %d, can't bind\n", id);
- goto err;
- }
- }
- }
-
- ie.src_addr = xmalloc(pb_repeated_size(&ie, src_addr));
- ie.dst_addr = xmalloc(pb_repeated_size(&ie, dst_addr));
-
- if (!ie.src_addr || !ie.dst_addr)
- goto err;
-
- memcpy(ie.src_addr, sk->src_addr, pb_repeated_size(&ie, src_addr));
- memcpy(ie.dst_addr, sk->dst_addr, pb_repeated_size(&ie, dst_addr));
-
- if (dump_ip_opts(lfd, &ipopts))
- goto err;
-
- if (dump_socket_opts(lfd, &skopts))
- goto err;
-
- if (pb_write_one(img_from_set(glob_imgset, CR_FD_INETSK), &ie, PB_INET_SK))
- goto err;
-
- pr_info("Dumping inet socket at %d\n", p->fd);
- show_one_inet("Dumping", sk);
- show_one_inet_img("Dumped", &ie);
- sk->sd.already_dumped = 1;
- sk->cpt_reuseaddr = skopts.reuseaddr;
-
- switch (proto) {
- case IPPROTO_TCP:
- err = dump_one_tcp(lfd, sk);
- break;
- default:
- err = 0;
- break;
- }
-err:
- release_skopts(&skopts);
- xfree(ie.src_addr);
- xfree(ie.dst_addr);
- return err;
-}
-
-static int dump_one_inet_fd(int lfd, u32 id, const struct fd_parms *p)
-{
- return do_dump_one_inet_fd(lfd, id, p, PF_INET);
-}
-
-const struct fdtype_ops inet_dump_ops = {
- .type = FD_TYPES__INETSK,
- .dump = dump_one_inet_fd,
-};
-
-static int dump_one_inet6_fd(int lfd, u32 id, const struct fd_parms *p)
-{
- return do_dump_one_inet_fd(lfd, id, p, PF_INET6);
-}
-
-const struct fdtype_ops inet6_dump_ops = {
- .type = FD_TYPES__INETSK,
- .dump = dump_one_inet6_fd,
-};
-
-int inet_collect_one(struct nlmsghdr *h, int family, int type)
-{
- struct inet_sk_desc *d;
- struct inet_diag_msg *m = NLMSG_DATA(h);
- struct rtattr *tb[INET_DIAG_MAX+1];
- int ret;
-
- parse_rtattr(tb, INET_DIAG_MAX, (struct rtattr *)(m + 1),
- h->nlmsg_len - NLMSG_LENGTH(sizeof(*m)));
-
- d = xzalloc(sizeof(*d));
- if (!d)
- return -1;
-
- d->type = type;
- d->src_port = ntohs(m->id.idiag_sport);
- d->dst_port = ntohs(m->id.idiag_dport);
- d->state = m->idiag_state;
- d->rqlen = m->idiag_rqueue;
- d->wqlen = m->idiag_wqueue;
- memcpy(d->src_addr, m->id.idiag_src, sizeof(u32) * 4);
- memcpy(d->dst_addr, m->id.idiag_dst, sizeof(u32) * 4);
-
- if (tb[INET_DIAG_SHUTDOWN])
- d->shutdown = *(u8 *)RTA_DATA(tb[INET_DIAG_SHUTDOWN]);
- else
- pr_err_once("Can't check shutdown state of inet socket\n");
-
- ret = sk_collect_one(m->idiag_inode, family, &d->sd);
-
- show_one_inet("Collected", d);
-
- return ret;
-}
-
-static int open_inet_sk(struct file_desc *d);
-static int post_open_inet_sk(struct file_desc *d, int sk);
-
-static struct file_desc_ops inet_desc_ops = {
- .type = FD_TYPES__INETSK,
- .open = open_inet_sk,
- .post_open = post_open_inet_sk,
-};
-
-static inline int tcp_connection(InetSkEntry *ie)
-{
- return (ie->proto == IPPROTO_TCP) && (ie->state == TCP_ESTABLISHED);
-}
-
-static int collect_one_inetsk(void *o, ProtobufCMessage *base)
-{
- struct inet_sk_info *ii = o;
-
- ii->ie = pb_msg(base, InetSkEntry);
- if (tcp_connection(ii->ie))
- tcp_locked_conn_add(ii);
-
- /*
- * A socket can reuse addr only if all previous sockets allow that,
- * so a value of SO_REUSEADDR can be restored after restoring all
- * sockets.
- */
- ii->port = port_add(ii->ie->type, ii->ie->src_port);
- if (ii->port == NULL)
- return -1;
-
- return file_desc_add(&ii->d, ii->ie->id, &inet_desc_ops);
-}
-
-struct collect_image_info inet_sk_cinfo = {
- .fd_type = CR_FD_INETSK,
- .pb_type = PB_INET_SK,
- .priv_size = sizeof(struct inet_sk_info),
- .collect = collect_one_inetsk,
-};
-
-int collect_inet_sockets(void)
-{
- return collect_image(&inet_sk_cinfo);
-}
-
-static int inet_validate_address(InetSkEntry *ie)
-{
- if ((ie->family == AF_INET) &&
- /* v0.1 had 4 in ipv4 addr len */
- (ie->n_src_addr >= PB_ALEN_INET) &&
- (ie->n_dst_addr >= PB_ALEN_INET))
- return 0;
-
- if ((ie->family == AF_INET6) &&
- (ie->n_src_addr == PB_ALEN_INET6) &&
- (ie->n_dst_addr == PB_ALEN_INET6))
- return 0;
-
- pr_err("Addr len mismatch f %d ss %zu ds %zu\n", ie->family,
- pb_repeated_size(ie, src_addr),
- pb_repeated_size(ie, dst_addr));
-
- return -1;
-}
-
-static int post_open_inet_sk(struct file_desc *d, int sk)
-{
- struct inet_sk_info *ii;
- int val;
-
- ii = container_of(d, struct inet_sk_info, d);
-
- /*
- * TCP sockets are handled at the last moment
- * after unlocking connections.
- */
- if (tcp_connection(ii->ie)) {
- pr_debug("Schedule %d socket for repair off\n", sk);
- BUG_ON(ii->sk_fd != -1);
- ii->sk_fd = sk;
- return 0;
- }
-
- /* SO_REUSEADDR is set for all sockets */
- if (ii->ie->opts->reuseaddr)
- return 0;
-
- futex_wait_until(&ii->port->users, 0);
-
- val = ii->ie->opts->reuseaddr;
- if (restore_opt(sk, SOL_SOCKET, SO_REUSEADDR, &val))
- return -1;
-
- return 0;
-}
-
-int restore_ip_opts(int sk, IpOptsEntry *ioe)
-{
- int ret = 0;
-
- if (ioe->has_freebind)
- ret |= restore_opt(sk, SOL_IP, IP_FREEBIND, &ioe->freebind);
-
- return ret;
-}
-static int open_inet_sk(struct file_desc *d)
-{
- struct inet_sk_info *ii;
- InetSkEntry *ie;
- int sk, yes = 1;
-
- ii = container_of(d, struct inet_sk_info, d);
- ie = ii->ie;
-
- show_one_inet_img("Restore", ie);
-
- if (ie->family != AF_INET && ie->family != AF_INET6) {
- pr_err("Unsupported socket family: %d\n", ie->family);
- return -1;
- }
-
- if ((ie->type != SOCK_STREAM) && (ie->type != SOCK_DGRAM)) {
- pr_err("Unsupported socket type: %d\n", ie->type);
- return -1;
- }
-
- if (inet_validate_address(ie))
- return -1;
-
- sk = socket(ie->family, ie->type, ie->proto);
- if (sk < 0) {
- pr_perror("Can't create inet socket");
- return -1;
- }
-
- if (ie->v6only) {
- if (restore_opt(sk, SOL_IPV6, IPV6_V6ONLY, &yes) == -1)
- goto err;
- }
-
- /*
- * Set SO_REUSEADDR, because some sockets can be bound to one addr.
- * The origin value of SO_REUSEADDR will be restored in post_open.
- */
- if (restore_opt(sk, SOL_SOCKET, SO_REUSEADDR, &yes))
- goto err;
-
- if (tcp_connection(ie)) {
- if (!opts.tcp_established_ok) {
- pr_err("Connected TCP socket in image\n");
- goto err;
- }
-
- if (restore_one_tcp(sk, ii))
- goto err;
-
- goto done;
- }
-
- /*
- * Listen sockets are easiest ones -- simply
- * bind() and listen(), and that's all.
- */
-
- if (ie->src_port) {
- if (inet_bind(sk, ii))
- goto err;
- }
-
- if (ie->state == TCP_LISTEN) {
- if (ie->proto != IPPROTO_TCP) {
- pr_err("Wrong socket in listen state %d\n", ie->proto);
- goto err;
- }
-
- mutex_lock(&ii->port->reuseaddr_lock);
- if (listen(sk, ie->backlog) == -1) {
- pr_perror("Can't listen on a socket");
- mutex_unlock(&ii->port->reuseaddr_lock);
- goto err;
- }
- mutex_unlock(&ii->port->reuseaddr_lock);
- }
-
- if (ie->state == TCP_ESTABLISHED &&
- inet_connect(sk, ii))
- goto err;
-done:
- futex_dec_and_wake(&ii->port->users);
-
- if (rst_file_params(sk, ie->fown, ie->flags))
- goto err;
-
- if (ie->ip_opts && restore_ip_opts(sk, ie->ip_opts))
- goto err;
-
- if (restore_socket_opts(sk, ie->opts))
- goto err;
-
- return sk;
-
-err:
- close(sk);
- return -1;
-}
-
-union sockaddr_inet {
- struct sockaddr_in v4;
- struct sockaddr_in6 v6;
-};
-
-static int restore_sockaddr(union sockaddr_inet *sa,
- int family, u32 pb_port, u32 *pb_addr, u32 ifindex)
-{
- BUILD_BUG_ON(sizeof(sa->v4.sin_addr.s_addr) > PB_ALEN_INET * sizeof(u32));
- BUILD_BUG_ON(sizeof(sa->v6.sin6_addr.s6_addr) > PB_ALEN_INET6 * sizeof(u32));
-
- memzero(sa, sizeof(*sa));
-
- if (family == AF_INET) {
- sa->v4.sin_family = AF_INET;
- sa->v4.sin_port = htons(pb_port);
- memcpy(&sa->v4.sin_addr.s_addr, pb_addr, sizeof(sa->v4.sin_addr.s_addr));
- return sizeof(sa->v4);
- }
-
- if (family == AF_INET6) {
- sa->v6.sin6_family = AF_INET6;
- sa->v6.sin6_port = htons(pb_port);
- memcpy(sa->v6.sin6_addr.s6_addr, pb_addr, sizeof(sa->v6.sin6_addr.s6_addr));
-
- /* Here although the struct member is called scope_id, the
- * kernel really wants ifindex. See
- * /net/ipv6/af_inet6.c:inet6_bind for details.
- */
- sa->v6.sin6_scope_id = ifindex;
- return sizeof(sa->v6);
- }
-
- BUG();
- return -1;
-}
-
-int inet_bind(int sk, struct inet_sk_info *ii)
-{
- bool rst_freebind = false;
- union sockaddr_inet addr;
- int addr_size, ifindex = 0;
-
- if (ii->ie->ifname) {
- ifindex = if_nametoindex(ii->ie->ifname);
- if (!ifindex) {
- pr_err("couldn't find ifindex for %s\n", ii->ie->ifname);
- return -1;
- }
- }
-
- addr_size = restore_sockaddr(&addr, ii->ie->family,
- ii->ie->src_port, ii->ie->src_addr, ifindex);
-
- /*
- * ipv6 addresses go through a “tentative” phase and
- * sockets could not be bound to them in this moment
- * without setting IP_FREEBIND.
- */
- if (ii->ie->family == AF_INET6) {
- int yes = 1;
-
- if (restore_opt(sk, SOL_IP, IP_FREEBIND, &yes))
- return -1;
-
- if (ii->ie->ip_opts && ii->ie->ip_opts->freebind)
- /*
- * The right value is already set, so
- * don't need to restore it in restore_ip_opts()
- */
- ii->ie->ip_opts->has_freebind = false;
- else
- rst_freebind = true;
- }
-
- if (bind(sk, (struct sockaddr *)&addr, addr_size) == -1) {
- pr_perror("Can't bind inet socket (id %d)", ii->ie->id);
- return -1;
- }
-
- if (rst_freebind) {
- int no = 0;
-
- /*
- * The "no" value is default, so it will not be
- * restore in restore_ip_opts()
- */
- if (restore_opt(sk, SOL_IP, IP_FREEBIND, &no))
- return -1;
- }
-
- return 0;
-}
-
-int inet_connect(int sk, struct inet_sk_info *ii)
-{
- union sockaddr_inet addr;
- int addr_size;
-
- addr_size = restore_sockaddr(&addr, ii->ie->family,
- ii->ie->dst_port, ii->ie->dst_addr, 0);
-
- if (connect(sk, (struct sockaddr *)&addr, addr_size) == -1) {
- pr_perror("Can't connect inet socket back");
- return -1;
- }
-
- return 0;
-}
-
-mutex_t *inet_get_reuseaddr_lock(struct inet_sk_info *ii)
-{
- return &ii->port->reuseaddr_lock;
-}
diff --git a/sk-netlink.c b/sk-netlink.c
deleted file mode 100644
index a98b26dc82b4..000000000000
--- a/sk-netlink.c
+++ /dev/null
@@ -1,233 +0,0 @@
-#include <unistd.h>
-#include <linux/netlink.h>
-#include <linux/rtnetlink.h>
-
-#include "imgset.h"
-#include "files.h"
-#include "sockets.h"
-#include "util.h"
-
-#include "protobuf.h"
-#include "protobuf/sk-netlink.pb-c.h"
-#include "netlink_diag.h"
-#include "libnetlink.h"
-
-struct netlink_sk_desc {
- struct socket_desc sd;
- u32 portid;
- u32 *groups;
- u32 gsize;
- u32 dst_portid;
- u32 dst_group;
- u8 state;
- u8 protocol;
-};
-
-int netlink_receive_one(struct nlmsghdr *hdr, void *arg)
-{
- struct rtattr *tb[NETLINK_DIAG_MAX+1];
- struct netlink_diag_msg *m;
- struct netlink_sk_desc *sd;
- unsigned long *groups;
-
- m = NLMSG_DATA(hdr);
- pr_debug("Collect netlink sock 0x%x\n", m->ndiag_ino);
-
- sd = xmalloc(sizeof(*sd));
- if (!sd)
- return -1;
-
- sd->protocol = m->ndiag_protocol;
- sd->portid = m->ndiag_portid;
- sd->dst_portid = m->ndiag_dst_portid;
- sd->dst_group = m->ndiag_dst_group;
- sd->state = m->ndiag_state;
-
- parse_rtattr(tb, NETLINK_DIAG_MAX, (struct rtattr *)(m + 1),
- hdr->nlmsg_len - NLMSG_LENGTH(sizeof(*m)));
-
- if (tb[NETLINK_DIAG_GROUPS]) {
- sd->gsize = RTA_PAYLOAD(tb[NETLINK_DIAG_GROUPS]);
- groups = RTA_DATA(tb[NETLINK_DIAG_GROUPS]);
-
- sd->groups = xmalloc(sd->gsize);
- if (!sd->groups) {
- xfree(sd);
- return -1;
- }
- memcpy(sd->groups, groups, sd->gsize);
- } else {
- sd->groups = NULL;
- sd->gsize = 0;
- }
-
- return sk_collect_one(m->ndiag_ino, PF_NETLINK, &sd->sd);
-}
-
-static bool can_dump_netlink_sk(int lfd)
-{
- int ret;
-
- ret = fd_has_data(lfd);
- if (ret == 1)
- pr_err("The socket has data to read\n");
-
- return ret == 0;
-}
-
-static int dump_one_netlink_fd(int lfd, u32 id, const struct fd_parms *p)
-{
- struct netlink_sk_desc *sk;
- NetlinkSkEntry ne = NETLINK_SK_ENTRY__INIT;
- SkOptsEntry skopts = SK_OPTS_ENTRY__INIT;
-
- sk = (struct netlink_sk_desc *)lookup_socket(p->stat.st_ino, PF_NETLINK, 0);
- if (IS_ERR(sk))
- goto err;
-
- ne.id = id;
- ne.ino = p->stat.st_ino;
-
- if (!can_dump_netlink_sk(lfd))
- goto err;
-
- if (sk) {
- BUG_ON(sk->sd.already_dumped);
-
- ne.protocol = sk->protocol;
- ne.portid = sk->portid;
- ne.groups = sk->groups;
-
-
- ne.n_groups = sk->gsize / sizeof(ne.groups[0]);
- /*
- * On 64-bit sk->gsize is multiple to 8 bytes (sizeof(long)),
- * so remove the last 4 bytes if they are empty.
- */
- if (ne.n_groups && sk->groups[ne.n_groups - 1] == 0)
- ne.n_groups -= 1;
-
- if (ne.n_groups > 1) {
- pr_err("%d %x\n", sk->gsize, sk->groups[1]);
- pr_err("The netlink socket 0x%x has more than 32 groups\n", ne.ino);
- return -1;
- }
- if (sk->groups && !sk->portid) {
- pr_err("The netlink socket 0x%x is bound to groups but not to portid\n", ne.ino);
- return -1;
- }
- ne.state = sk->state;
- ne.dst_portid = sk->dst_portid;
- ne.dst_group = sk->dst_group;
- } else { /* unconnected and unbound socket */
- int val;
- socklen_t aux = sizeof(val);
-
- if (getsockopt(lfd, SOL_SOCKET, SO_PROTOCOL, &val, &aux) < 0) {
- pr_perror("Unable to get protocol for netlink socket");
- goto err;
- }
-
- ne.protocol = val;
- }
-
- ne.fown = (FownEntry *)&p->fown;
- ne.opts = &skopts;
-
- if (dump_socket_opts(lfd, &skopts))
- goto err;
-
- if (pb_write_one(img_from_set(glob_imgset, CR_FD_NETLINK_SK), &ne, PB_NETLINK_SK))
- goto err;
-
- return 0;
-err:
- return -1;
-}
-
-const struct fdtype_ops netlink_dump_ops = {
- .type = FD_TYPES__NETLINKSK,
- .dump = dump_one_netlink_fd,
-};
-
-struct netlink_sock_info {
- NetlinkSkEntry *nse;
- struct file_desc d;
-};
-
-static int open_netlink_sk(struct file_desc *d)
-{
- struct netlink_sock_info *nsi;
- NetlinkSkEntry *nse;
- struct sockaddr_nl addr;
- int sk = -1;
-
- nsi = container_of(d, struct netlink_sock_info, d);
- nse = nsi->nse;
-
- pr_info("Opening netlink socket id %#x\n", nse->id);
-
- sk = socket(PF_NETLINK, SOCK_RAW, nse->protocol);
- if (sk < 0) {
- pr_perror("Can't create netlink sock");
- goto err;
- }
-
- if (nse->portid) {
- memset(&addr, 0, sizeof(addr));
- addr.nl_family = AF_NETLINK;
- if (nse->n_groups > 1) {
- pr_err("Groups above 32 are not supported yet\n");
- goto err;
- }
- if (nse->n_groups)
- addr.nl_groups = nse->groups[0];
- addr.nl_pid = nse->portid;
-
- if (bind(sk, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
- pr_perror("Can't bind netlink socket");
- goto err;
- }
- }
-
- if (nse->state == NETLINK_CONNECTED) {
- addr.nl_family = AF_NETLINK;
- addr.nl_groups = 1 << (nse->dst_group - 1);
- addr.nl_pid = nse->dst_portid;
- if (connect(sk, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
- pr_perror("Can't connect netlink socket");
- goto err;
- }
- }
-
- if (rst_file_params(sk, nse->fown, nse->flags))
- goto err;
-
- if (restore_socket_opts(sk, nse->opts))
- goto err;
-
- return sk;
-err:
- close(sk);
- return -1;
-}
-
-static struct file_desc_ops netlink_sock_desc_ops = {
- .type = FD_TYPES__NETLINKSK,
- .open = open_netlink_sk,
-};
-
-static int collect_one_netlink_sk(void *o, ProtobufCMessage *base)
-{
- struct netlink_sock_info *si = o;
-
- si->nse = pb_msg(base, NetlinkSkEntry);
- return file_desc_add(&si->d, si->nse->id, &netlink_sock_desc_ops);
-}
-
-struct collect_image_info netlink_sk_cinfo = {
- .fd_type = CR_FD_NETLINK_SK,
- .pb_type = PB_NETLINK_SK,
- .priv_size = sizeof(struct netlink_sock_info),
- .collect = collect_one_netlink_sk,
-};
diff --git a/sk-packet.c b/sk-packet.c
deleted file mode 100644
index a296dfa16a7f..000000000000
--- a/sk-packet.c
+++ /dev/null
@@ -1,504 +0,0 @@
-#include <linux/if_packet.h>
-#include <sys/socket.h>
-#include <linux/netlink.h>
-#include <linux/rtnetlink.h>
-#include <unistd.h>
-#include <string.h>
-#include "asm/types.h"
-#include "imgset.h"
-#include "files.h"
-#include "sockets.h"
-#include "libnetlink.h"
-#include "sk-packet.h"
-#include "packet_diag.h"
-#include "vma.h"
-#include <arpa/inet.h>
-
-#include "protobuf.h"
-#include "protobuf/packet-sock.pb-c.h"
-#include "protobuf/fdinfo.pb-c.h"
-
-struct packet_sock_info {
- PacketSockEntry *pse;
- struct file_desc d;
-};
-
-struct packet_mreq_max {
- int mr_ifindex;
- unsigned short mr_type;
- unsigned short mr_alen;
- unsigned char mr_address[MAX_ADDR_LEN];
-};
-
-struct packet_sock_desc {
- struct socket_desc sd;
- unsigned int file_id;
- unsigned int type;
- unsigned short proto;
- struct packet_diag_info nli;
- int mreq_n;
- struct packet_diag_mclist *mreqs;
- unsigned int fanout;
- struct packet_diag_ring *rx, *tx;
-};
-
-#define NO_FANOUT ((unsigned int)-1)
-
-static int dump_mreqs(PacketSockEntry *psk, struct packet_sock_desc *sd)
-{
- int i;
-
- if (!sd->mreq_n)
- return 0;
-
- pr_debug("\tdumping %d mreqs\n", sd->mreq_n);
- psk->mclist = xmalloc(sd->mreq_n * sizeof(psk->mclist[0]));
- if (!psk->mclist)
- return -1;
-
- for (i = 0; i < sd->mreq_n; i++) {
- struct packet_diag_mclist *m = &sd->mreqs[i];
- PacketMclist *im;
-
- if (m->pdmc_count != 1) {
- pr_err("Multiple MC membership not supported (but can be)\n");
- goto err;
- }
-
- pr_debug("\tmr%d: idx %d type %d\n", i,
- m->pdmc_index, m->pdmc_type);
-
- im = xmalloc(sizeof(*im));
- if (!im)
- goto err;
-
- packet_mclist__init(im);
- psk->mclist[i] = im;
- psk->n_mclist++;
-
- im->index = m->pdmc_index;
- im->type = m->pdmc_type;
-
- switch (m->pdmc_type) {
- case PACKET_MR_MULTICAST:
- case PACKET_MR_UNICAST:
- im->addr.len = m->pdmc_alen;
- im->addr.data = xmalloc(m->pdmc_alen);
- if (!im->addr.data)
- goto err;
-
- memcpy(im->addr.data, m->pdmc_addr, m->pdmc_alen);
- break;
- case PACKET_MR_PROMISC:
- case PACKET_MR_ALLMULTI:
- break;
- default:
- pr_err("Unknown mc membership type %d\n", m->pdmc_type);
- goto err;
- }
- }
-
- return 0;
-err:
- return -1;
-}
-
-static PacketRing *dump_ring(struct packet_diag_ring *dr)
-{
- PacketRing *ring;
-
- ring = xmalloc(sizeof(*ring));
- if (!ring)
- return NULL;
-
- packet_ring__init(ring);
-
- ring->block_size = dr->pdr_block_size;
- ring->block_nr = dr->pdr_block_nr;
- ring->frame_size = dr->pdr_frame_size;
- ring->frame_nr = dr->pdr_frame_nr;
- ring->retire_tmo = dr->pdr_retire_tmo;
- ring->sizeof_priv = dr->pdr_sizeof_priv;
- ring->features = dr->pdr_features;
-
- return ring;
-}
-
-static int dump_rings(PacketSockEntry *psk, struct packet_sock_desc *sd)
-{
- if (sd->rx) {
- psk->rx_ring = dump_ring(sd->rx);
- if (!psk->rx_ring)
- return -1;
- }
-
- if (sd->tx) {
- psk->tx_ring = dump_ring(sd->tx);
- if (!psk->tx_ring)
- return -1;
- }
-
- return 0;
-}
-
-static int dump_one_packet_fd(int lfd, u32 id, const struct fd_parms *p)
-{
- PacketSockEntry psk = PACKET_SOCK_ENTRY__INIT;
- SkOptsEntry skopts = SK_OPTS_ENTRY__INIT;
- struct packet_sock_desc *sd;
- int i, ret;
-
- sd = (struct packet_sock_desc *)lookup_socket(p->stat.st_ino, PF_PACKET, 0);
- if (IS_ERR_OR_NULL(sd)) {
- pr_err("Can't find packet socket %"PRIu64"\n", p->stat.st_ino);
- return -1;
- }
-
- pr_info("Dumping packet socket fd %d id %#x\n", lfd, id);
- BUG_ON(sd->sd.already_dumped);
- sd->sd.already_dumped = 1;
-
- psk.id = sd->file_id = id;
- psk.type = sd->type;
- psk.flags = p->flags;
- psk.fown = (FownEntry *)&p->fown;
- psk.opts = &skopts;
-
- if (dump_socket_opts(lfd, &skopts))
- return -1;
-
- psk.protocol = sd->proto;
- psk.ifindex = sd->nli.pdi_index;
- psk.version = sd->nli.pdi_version;
- psk.reserve = sd->nli.pdi_reserve;
- psk.timestamp = sd->nli.pdi_tstamp;
- psk.copy_thresh = sd->nli.pdi_copy_thresh;
- psk.aux_data = (sd->nli.pdi_flags & PDI_AUXDATA ? true : false);
- psk.orig_dev = (sd->nli.pdi_flags & PDI_ORIGDEV ? true : false);
- psk.vnet_hdr = (sd->nli.pdi_flags & PDI_VNETHDR ? true : false);
- psk.loss = (sd->nli.pdi_flags & PDI_LOSS ? true : false);
-
- ret = dump_mreqs(&psk, sd);
- if (ret)
- goto out;
-
- if (sd->fanout != NO_FANOUT) {
- psk.has_fanout = true;
- psk.fanout = sd->fanout;
- }
-
- ret = dump_rings(&psk, sd);
- if (ret)
- goto out;
-
- ret = pb_write_one(img_from_set(glob_imgset, CR_FD_PACKETSK), &psk, PB_PACKET_SOCK);
-out:
- release_skopts(&skopts);
- xfree(psk.rx_ring);
- xfree(psk.tx_ring);
- for (i = 0; i < psk.n_mclist; i++)
- xfree(psk.mclist[i]->addr.data);
- xfree(psk.mclist);
- return ret;
-}
-
-const struct fdtype_ops packet_dump_ops = {
- .type = FD_TYPES__PACKETSK,
- .dump = dump_one_packet_fd,
-};
-
-int dump_socket_map(struct vma_area *vma)
-{
- struct packet_sock_desc *sd;
-
- sd = (struct packet_sock_desc *)lookup_socket(vma->vm_socket_id, PF_PACKET, 0);
- if (IS_ERR_OR_NULL(sd)) {
- pr_err("Can't find packet socket %u to mmap\n", vma->vm_socket_id);
- return -1;
- }
-
- if (!sd->file_id) {
- pr_err("Mmap-ed socket %u not open\n", vma->vm_socket_id);
- return -1;
- }
-
- pr_info("Dumping socket map %x -> %"PRIx64"\n", sd->file_id, vma->e->start);
- vma->e->shmid = sd->file_id;
- return 0;
-}
-
-static int packet_save_mreqs(struct packet_sock_desc *sd, struct rtattr *mc)
-{
- sd->mreq_n = RTA_PAYLOAD(mc) / sizeof(struct packet_diag_mclist);
- pr_debug("\tGot %d mreqs\n", sd->mreq_n);
- sd->mreqs = xmalloc(RTA_PAYLOAD(mc));
- if (!sd->mreqs)
- return -1;
-
- memcpy(sd->mreqs, RTA_DATA(mc), RTA_PAYLOAD(mc));
- return 0;
-}
-
-int packet_receive_one(struct nlmsghdr *hdr, void *arg)
-{
- struct packet_diag_msg *m;
- struct rtattr *tb[PACKET_DIAG_MAX + 1];
- struct packet_sock_desc *sd;
-
- m = NLMSG_DATA(hdr);
- parse_rtattr(tb, PACKET_DIAG_MAX, (struct rtattr *)(m + 1),
- hdr->nlmsg_len - NLMSG_LENGTH(sizeof(*m)));
- pr_info("Collect packet sock %u %u\n", m->pdiag_ino, (unsigned int)m->pdiag_num);
-
- if (!tb[PACKET_DIAG_INFO]) {
- pr_err("No packet sock info in nlm\n");
- return -1;
- }
-
- if (!tb[PACKET_DIAG_MCLIST]) {
- pr_err("No packet sock mclist in nlm\n");
- return -1;
- }
-
- sd = xmalloc(sizeof(*sd));
- if (!sd)
- return -1;
-
- sd->file_id = 0;
- sd->type = m->pdiag_type;
- sd->proto = htons(m->pdiag_num);
- sd->rx = NULL;
- sd->tx = NULL;
- memcpy(&sd->nli, RTA_DATA(tb[PACKET_DIAG_INFO]), sizeof(sd->nli));
-
- if (packet_save_mreqs(sd, tb[PACKET_DIAG_MCLIST]))
- goto err;
-
- if (tb[PACKET_DIAG_FANOUT])
- sd->fanout = *(__u32 *)RTA_DATA(tb[PACKET_DIAG_FANOUT]);
- else
- sd->fanout = NO_FANOUT;
-
- if (tb[PACKET_DIAG_RX_RING]) {
- sd->rx = xmalloc(sizeof(*sd->rx));
- if (sd->rx == NULL)
- goto err;
- memcpy(sd->rx, RTA_DATA(tb[PACKET_DIAG_RX_RING]), sizeof(*sd->rx));
- }
-
- if (tb[PACKET_DIAG_TX_RING]) {
- sd->tx = xmalloc(sizeof(*sd->tx));
- if (sd->tx == NULL)
- goto err;
- memcpy(sd->tx, RTA_DATA(tb[PACKET_DIAG_TX_RING]), sizeof(*sd->tx));
- }
-
- return sk_collect_one(m->pdiag_ino, PF_PACKET, &sd->sd);
-err:
- xfree(sd->tx);
- xfree(sd->rx);
- xfree(sd);
- return -1;
-}
-
-int get_socket_fd(int pid, VmaEntry *vma)
-{
- struct file_desc *fd;
- struct fdinfo_list_entry *le;
-
- pr_info("Getting packet socket fd for %d:%x\n",
- pid, (int)vma->shmid);
- fd = find_file_desc_raw(FD_TYPES__PACKETSK, vma->shmid);
- if (!fd) {
- pr_err("No packet socket %x\n", (int)vma->shmid);
- return -1;
- }
-
- list_for_each_entry(le, &fd->fd_info_head, desc_list)
- if (le->pid == pid) {
- int fd;
-
- /*
- * Restorer will close the mmap-ed fd
- */
-
- fd = dup(le->fe->fd);
- if (!fd) {
- pr_perror("Can't dup packet sk");
- return -1;
- }
-
- return fd;
- }
-
- pr_err("No open packet socket %x by %d\n", (int)vma->shmid, pid);
- return -1;
-}
-
-static int restore_mreqs(int sk, PacketSockEntry *pse)
-{
- int i;
-
- for (i = 0; i < pse->n_mclist; i++) {
- PacketMclist *ml;
- struct packet_mreq_max mreq;
-
- ml = pse->mclist[i];
- pr_info("Restoring mreq type %d\n", ml->type);
-
- if (ml->addr.len > sizeof(mreq.mr_address)) {
- pr_err("To big mcaddr %zu\n", ml->addr.len);
- return -1;
- }
-
- mreq.mr_ifindex = ml->index;
- mreq.mr_type = ml->type;
- mreq.mr_alen = ml->addr.len;
- memcpy(mreq.mr_address, ml->addr.data, ml->addr.len);
-
- if (restore_opt(sk, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq))
- return -1;
- }
-
- return 0;
-}
-
-static int restore_ring(int sk, int type, PacketRing *ring)
-{
- struct tpacket_req3 req;
-
- if (!ring)
- return 0;
-
- pr_debug("\tRestoring %d ring\n", type);
-
- req.tp_block_size = ring->block_size;
- req.tp_block_nr = ring->block_nr;
- req.tp_frame_size = ring->frame_size;
- req.tp_frame_nr = ring->frame_nr;
- req.tp_retire_blk_tov = ring->retire_tmo;
- req.tp_sizeof_priv = ring->sizeof_priv;
- req.tp_feature_req_word = ring->features;
-
- return restore_opt(sk, SOL_PACKET, type, &req);
-}
-
-static int restore_rings(int sk, PacketSockEntry *psk)
-{
- if (restore_ring(sk, PACKET_RX_RING, psk->rx_ring))
- return -1;
-
- if (restore_ring(sk, PACKET_TX_RING, psk->tx_ring))
- return -1;
-
- return 0;
-}
-
-static int open_packet_sk(struct file_desc *d)
-{
- struct packet_sock_info *psi;
- PacketSockEntry *pse;
- struct sockaddr_ll addr;
- int sk, yes;
-
- psi = container_of(d, struct packet_sock_info, d);
- pse = psi->pse;
-
- pr_info("Opening packet socket id %#x\n", pse->id);
-
- sk = socket(PF_PACKET, pse->type, pse->protocol);
- if (sk < 0) {
- pr_perror("Can't create packet sock");
- goto err;
- }
-
- memset(&addr, 0, sizeof(addr));
- addr.sll_family = AF_PACKET;
- addr.sll_ifindex = pse->ifindex;
-
- if (bind(sk, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
- pr_perror("Can't bind packet socket");
- goto err_cl;
- }
-
- if (restore_opt(sk, SOL_PACKET, PACKET_VERSION, &pse->version))
- goto err_cl;
-
- if (restore_opt(sk, SOL_PACKET, PACKET_RESERVE, &pse->reserve))
- goto err_cl;
-
- if (restore_opt(sk, SOL_PACKET, PACKET_TIMESTAMP, &pse->timestamp))
- goto err_cl;
-
- if (restore_opt(sk, SOL_PACKET, PACKET_COPY_THRESH, &pse->copy_thresh))
- goto err_cl;
-
- if (pse->aux_data) {
- yes = 1;
- if (restore_opt(sk, SOL_PACKET, PACKET_AUXDATA, &yes))
- goto err_cl;
- }
-
- if (pse->orig_dev) {
- yes = 1;
- if (restore_opt(sk, SOL_PACKET, PACKET_ORIGDEV, &yes))
- goto err_cl;
- }
-
- if (pse->vnet_hdr) {
- yes = 1;
- if (restore_opt(sk, SOL_PACKET, PACKET_VNET_HDR, &yes))
- goto err_cl;
- }
-
- if (pse->loss) {
- yes = 1;
- if (restore_opt(sk, SOL_PACKET, PACKET_LOSS, &yes))
- goto err_cl;
- }
-
- if (restore_mreqs(sk, pse))
- goto err_cl;
-
- if (restore_rings(sk, pse))
- goto err_cl;
-
- if (pse->has_fanout) {
- pr_info("Restoring fanout %x\n", pse->fanout);
- if (restore_opt(sk, SOL_PACKET, PACKET_FANOUT, &pse->fanout))
- goto err_cl;
- }
-
- if (rst_file_params(sk, pse->fown, pse->flags))
- goto err_cl;
-
- if (restore_socket_opts(sk, pse->opts))
- goto err_cl;
-
- return sk;
-
-err_cl:
- close(sk);
-err:
- return -1;
-}
-
-static struct file_desc_ops packet_sock_desc_ops = {
- .type = FD_TYPES__PACKETSK,
- .open = open_packet_sk,
-};
-
-static int collect_one_packet_sk(void *o, ProtobufCMessage *base)
-{
- struct packet_sock_info *si = o;
-
- si->pse = pb_msg(base, PacketSockEntry);
- return file_desc_add(&si->d, si->pse->id, &packet_sock_desc_ops);
-}
-
-struct collect_image_info packet_sk_cinfo = {
- .fd_type = CR_FD_PACKETSK,
- .pb_type = PB_PACKET_SOCK,
- .priv_size = sizeof(struct packet_sock_info),
- .collect = collect_one_packet_sk,
-};
diff --git a/sk-queue.c b/sk-queue.c
deleted file mode 100644
index 6a39c4b35a8c..000000000000
--- a/sk-queue.c
+++ /dev/null
@@ -1,256 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <errno.h>
-#include <unistd.h>
-#include <fcntl.h>
-#include <limits.h>
-
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/socket.h>
-#include <sys/sendfile.h>
-
-#include "asm/types.h"
-#include "list.h"
-#include "imgset.h"
-#include "image.h"
-#include "servicefd.h"
-#include "cr_options.h"
-#include "util.h"
-#include "util-pie.h"
-#include "sockets.h"
-
-#include "sk-queue.h"
-
-#include "protobuf.h"
-#include "protobuf/sk-packet.pb-c.h"
-
-struct sk_packet {
- struct list_head list;
- SkPacketEntry *entry;
- off_t img_off;
-};
-
-static LIST_HEAD(packets_list);
-
-int read_sk_queues(void)
-{
- struct sk_packet *pkt;
- int ret;
- struct cr_img *img;
-
- pr_info("Trying to read socket queues image\n");
-
- img = open_image(CR_FD_SK_QUEUES, O_RSTR);
- if (!img)
- return -1;
-
- while (1) {
- ret = -1;
- pkt = xmalloc(sizeof(*pkt));
- if (!pkt) {
- pr_err("Failed to allocate packet header\n");
- break;
- }
- ret = pb_read_one_eof(img, &pkt->entry, PB_SK_QUEUES);
- if (ret <= 0)
- break;
-
- pkt->img_off = lseek(img_raw_fd(img), 0, SEEK_CUR);
- /*
- * NOTE: packet must be added to the tail. Otherwise sequence
- * will be broken.
- */
- list_add_tail(&pkt->list, &packets_list);
- lseek(img_raw_fd(img), pkt->entry->length, SEEK_CUR);
- }
- close_image(img);
- xfree(pkt);
-
- return ret;
-}
-
-int dump_sk_queue(int sock_fd, int sock_id)
-{
- SkPacketEntry pe = SK_PACKET_ENTRY__INIT;
- int ret, size, orig_peek_off;
- void *data;
- socklen_t tmp;
-
- /*
- * Save original peek offset.
- */
- tmp = sizeof(orig_peek_off);
- orig_peek_off = 0;
- ret = getsockopt(sock_fd, SOL_SOCKET, SO_PEEK_OFF, &orig_peek_off, &tmp);
- if (ret < 0) {
- pr_perror("getsockopt failed");
- return ret;
- }
- /*
- * Discover max DGRAM size
- */
- tmp = sizeof(size);
- size = 0;
- ret = getsockopt(sock_fd, SOL_SOCKET, SO_SNDBUF, &size, &tmp);
- if (ret < 0) {
- pr_perror("getsockopt failed");
- return ret;
- }
-
- /* Note: 32 bytes will be used by kernel for protocol header. */
- size -= 32;
-
- /*
- * Allocate data for a stream.
- */
- data = xmalloc(size);
- if (!data)
- return -1;
-
- /*
- * Enable peek offset incrementation.
- */
- ret = setsockopt(sock_fd, SOL_SOCKET, SO_PEEK_OFF, &ret, sizeof(int));
- if (ret < 0) {
- pr_perror("setsockopt fail");
- goto err_brk;
- }
-
- pe.id_for = sock_id;
-
- while (1) {
- struct iovec iov = {
- .iov_base = data,
- .iov_len = size,
- };
- struct msghdr msg = {
- .msg_iov = &iov,
- .msg_iovlen = 1,
- };
-
- ret = pe.length = recvmsg(sock_fd, &msg, MSG_DONTWAIT | MSG_PEEK);
- if (!ret)
- /*
- * It means, that peer has performed an
- * orderly shutdown, so we're done.
- */
- break;
- else if (ret < 0) {
- if (errno == EAGAIN)
- break; /* we're done */
- pr_perror("recvmsg fail: error");
- goto err_set_sock;
- }
- if (msg.msg_flags & MSG_TRUNC) {
- /*
- * DGRAM truncated. This should not happen. But we have
- * to check...
- */
- pr_err("sys_recvmsg failed: truncated\n");
- ret = -E2BIG;
- goto err_set_sock;
- }
-
- ret = pb_write_one(img_from_set(glob_imgset, CR_FD_SK_QUEUES), &pe, PB_SK_QUEUES);
- if (ret < 0) {
- ret = -EIO;
- goto err_set_sock;
- }
-
- ret = write_img_buf(img_from_set(glob_imgset, CR_FD_SK_QUEUES), data, pe.length);
- if (ret < 0) {
- ret = -EIO;
- goto err_set_sock;
- }
- }
- ret = 0;
-
-err_set_sock:
- /*
- * Restore original peek offset.
- */
- if (setsockopt(sock_fd, SOL_SOCKET, SO_PEEK_OFF, &orig_peek_off, sizeof(int))) {
- pr_perror("setsockopt failed on restore");
- ret = -1;
- }
-err_brk:
- xfree(data);
- return ret;
-}
-
-void sk_queue_data_handler(struct cr_img *img, void *obj)
-{
- SkPacketEntry *e = obj;
- print_image_data(img, e->length, opts.show_pages_content);
-}
-
-int restore_sk_queue(int fd, unsigned int peer_id)
-{
- struct sk_packet *pkt, *tmp;
- int ret;
- struct cr_img *img;
-
- pr_info("Trying to restore recv queue for %u\n", peer_id);
-
- if (restore_prepare_socket(fd))
- return -1;
-
- img = open_image(CR_FD_SK_QUEUES, O_RSTR);
- if (!img)
- return -1;
-
- list_for_each_entry_safe(pkt, tmp, &packets_list, list) {
- SkPacketEntry *entry = pkt->entry;
- char *buf;
-
- if (entry->id_for != peer_id)
- continue;
-
- pr_info("\tRestoring %d-bytes skb for %u\n",
- (unsigned int)entry->length, peer_id);
-
- /*
- * Don't try to use sendfile here, because it use sendpage() and
- * all data are split on pages and a new skb is allocated for
- * each page. It creates a big overhead on SNDBUF.
- * sendfile() isn't suitable for DGRAM sockets, because message
- * boundaries messages should be saved.
- */
-
- buf = xmalloc(entry->length);
- if (buf ==NULL)
- goto err;
-
- if (lseek(img_raw_fd(img), pkt->img_off, SEEK_SET) == -1) {
- pr_perror("lseek() failed");
- xfree(buf);
- goto err;
- }
- if (read_img_buf(img, buf, entry->length) != 1) {
- xfree(buf);
- goto err;
- }
-
- ret = write(fd, buf, entry->length);
- xfree(buf);
- if (ret < 0) {
- pr_perror("Failed to send packet");
- goto err;
- }
- if (ret != entry->length) {
- pr_err("Restored skb trimmed to %d/%d\n",
- ret, (unsigned int)entry->length);
- goto err;
- }
- list_del(&pkt->list);
- sk_packet_entry__free_unpacked(entry, NULL);
- xfree(pkt);
- }
-
- close_image(img);
- return 0;
-err:
- close_image(img);
- return -1;
-}
diff --git a/sk-tcp.c b/sk-tcp.c
deleted file mode 100644
index f653446a3604..000000000000
--- a/sk-tcp.c
+++ /dev/null
@@ -1,771 +0,0 @@
-#include <netinet/tcp.h>
-#include <sys/ioctl.h>
-#include <linux/sockios.h>
-#include <unistd.h>
-#include <stdlib.h>
-#include <sys/mman.h>
-#include <string.h>
-#include <sched.h>
-#include <netinet/in.h>
-
-#include "cr_options.h"
-#include "util.h"
-#include "list.h"
-#include "log.h"
-#include "asm/types.h"
-#include "files.h"
-#include "sockets.h"
-#include "sk-inet.h"
-#include "netfilter.h"
-#include "image.h"
-#include "namespaces.h"
-#include "xmalloc.h"
-#include "config.h"
-#include "cr-show.h"
-#include "kerndat.h"
-#include "rst-malloc.h"
-
-#include "protobuf.h"
-#include "protobuf/tcp-stream.pb-c.h"
-
-#ifndef SIOCOUTQNSD
-/* MAO - Define SIOCOUTQNSD ioctl if we don't have it */
-#define SIOCOUTQNSD 0x894B
-#endif
-
-#ifndef CONFIG_HAS_TCP_REPAIR
-/*
- * It's been reported that both tcp_repair_opt
- * and TCP_ enum already shipped in netinet/tcp.h
- * system header by some distros thus we need a
- * test if we can use predefined ones or provide
- * our own.
- */
-struct tcp_repair_opt {
- u32 opt_code;
- u32 opt_val;
-};
-
-enum {
- TCP_NO_QUEUE,
- TCP_RECV_QUEUE,
- TCP_SEND_QUEUE,
- TCP_QUEUES_NR,
-};
-#endif
-
-#ifndef TCP_TIMESTAMP
-#define TCP_TIMESTAMP 24
-#endif
-
-#ifndef TCPOPT_SACK_PERM
-#define TCPOPT_SACK_PERM TCPOPT_SACK_PERMITTED
-#endif
-
-static LIST_HEAD(cpt_tcp_repair_sockets);
-static LIST_HEAD(rst_tcp_repair_sockets);
-
-static int tcp_repair_on(int fd)
-{
- int ret, aux = 1;
-
- ret = setsockopt(fd, SOL_TCP, TCP_REPAIR, &aux, sizeof(aux));
- if (ret < 0)
- pr_perror("Can't turn TCP repair mode ON");
-
- return ret;
-}
-
-static int refresh_inet_sk(struct inet_sk_desc *sk)
-{
- int size;
- struct tcp_info info;
-
- if (dump_opt(sk->rfd, SOL_TCP, TCP_INFO, &info)) {
- pr_perror("Failed to obtain TCP_INFO");
- return -1;
- }
-
- switch (info.tcpi_state) {
- case TCP_ESTABLISHED:
- case TCP_CLOSE:
- break;
- default:
- pr_err("Unknown state %d\n", sk->state);
- return -1;
- }
-
- if (ioctl(sk->rfd, SIOCOUTQ, &size) == -1) {
- pr_perror("Unable to get size of snd queue");
- return -1;
- }
-
- sk->wqlen = size;
-
- if (ioctl(sk->rfd, SIOCOUTQNSD, &size) == -1) {
- pr_perror("Unable to get size of unsent data");
- return -1;
- }
-
- sk->uwqlen = size;
-
- if (ioctl(sk->rfd, SIOCINQ, &size) == -1) {
- pr_perror("Unable to get size of recv queue");
- return -1;
- }
-
- sk->rqlen = size;
-
- return 0;
-}
-
-static int tcp_repair_establised(int fd, struct inet_sk_desc *sk)
-{
- int ret;
-
- pr_info("\tTurning repair on for socket %x\n", sk->sd.ino);
- /*
- * Keep the socket open in criu till the very end. In
- * case we close this fd after one task fd dumping and
- * fail we'll have to turn repair mode off
- */
- sk->rfd = dup(fd);
- if (sk->rfd < 0) {
- pr_perror("Can't save socket fd for repair");
- goto err1;
- }
-
- if (!(root_ns_mask & CLONE_NEWNET)) {
- ret = nf_lock_connection(sk);
- if (ret < 0)
- goto err2;
- }
-
- ret = tcp_repair_on(sk->rfd);
- if (ret < 0)
- goto err3;
-
- list_add_tail(&sk->rlist, &cpt_tcp_repair_sockets);
-
- ret = refresh_inet_sk(sk);
- if (ret < 0)
- goto err1;
-
- return 0;
-
-err3:
- if (!(root_ns_mask & CLONE_NEWNET))
- nf_unlock_connection(sk);
-err2:
- close(sk->rfd);
-err1:
- return -1;
-}
-
-static void tcp_unlock_one(struct inet_sk_desc *sk)
-{
- int ret;
-
- list_del(&sk->rlist);
-
- if (!(root_ns_mask & CLONE_NEWNET)) {
- ret = nf_unlock_connection(sk);
- if (ret < 0)
- pr_perror("Failed to unlock TCP connection");
- }
-
- tcp_repair_off(sk->rfd);
-
- /*
- * tcp_repair_off modifies SO_REUSEADDR so
- * don't forget to restore original value.
- */
- restore_opt(sk->rfd, SOL_SOCKET, SO_REUSEADDR, &sk->cpt_reuseaddr);
-
- close(sk->rfd);
-}
-
-void cpt_unlock_tcp_connections(void)
-{
- struct inet_sk_desc *sk, *n;
-
- list_for_each_entry_safe(sk, n, &cpt_tcp_repair_sockets, rlist)
- tcp_unlock_one(sk);
-}
-
-/*
- * TCP queues sequences and their relations to the code below
- *
- * output queue
- * net <----------------------------- sk
- * ^ ^ ^ seq >>
- * snd_una snd_nxt write_seq
- *
- * input queue
- * net -----------------------------> sk
- * << seq ^ ^
- * rcv_nxt copied_seq
- *
- *
- * inq_len = rcv_nxt - copied_seq = SIOCINQ
- * outq_len = write_seq - snd_una = SIOCOUTQ
- * inq_seq = rcv_nxt
- * outq_seq = write_seq
- *
- * On restore kernel moves the option we configure with setsockopt,
- * thus we should advance them on the _len value in restore_tcp_seqs.
- *
- */
-
-static int tcp_stream_get_queue(int sk, int queue_id,
- u32 *seq, u32 len, char **bufp)
-{
- int ret, aux;
- socklen_t auxl;
- char *buf;
-
- pr_debug("\tSet repair queue %d\n", queue_id);
- aux = queue_id;
- auxl = sizeof(aux);
- ret = setsockopt(sk, SOL_TCP, TCP_REPAIR_QUEUE, &aux, auxl);
- if (ret < 0)
- goto err_sopt;
-
- pr_debug("\tGet queue seq\n");
- auxl = sizeof(*seq);
- ret = getsockopt(sk, SOL_TCP, TCP_QUEUE_SEQ, seq, &auxl);
- if (ret < 0)
- goto err_sopt;
-
- pr_info("\t`- seq %u len %u\n", *seq, len);
-
- if (len) {
- /*
- * Try to grab one byte more from the queue to
- * make sure there are len bytes for real
- */
- buf = xmalloc(len + 1);
- if (!buf)
- goto err_buf;
-
- pr_debug("\tReading queue (%d bytes)\n", len);
- ret = recv(sk, buf, len + 1, MSG_PEEK | MSG_DONTWAIT);
- if (ret != len)
- goto err_recv;
- } else
- buf = NULL;
-
- *bufp = buf;
- return 0;
-
-err_sopt:
- pr_perror("\tsockopt failed");
-err_buf:
- return -1;
-
-err_recv:
- pr_perror("\trecv failed (%d, want %d, errno %d)", ret, len, errno);
- xfree(buf);
- goto err_buf;
-}
-
-static int tcp_stream_get_options(int sk, TcpStreamEntry *tse)
-{
- int ret;
- socklen_t auxl;
- struct tcp_info ti;
- int val;
-
- auxl = sizeof(ti);
- ret = getsockopt(sk, SOL_TCP, TCP_INFO, &ti, &auxl);
- if (ret < 0)
- goto err_sopt;
-
- auxl = sizeof(tse->mss_clamp);
- ret = getsockopt(sk, SOL_TCP, TCP_MAXSEG, &tse->mss_clamp, &auxl);
- if (ret < 0)
- goto err_sopt;
-
- tse->opt_mask = ti.tcpi_options;
- if (ti.tcpi_options & TCPI_OPT_WSCALE) {
- tse->snd_wscale = ti.tcpi_snd_wscale;
- tse->rcv_wscale = ti.tcpi_rcv_wscale;
- tse->has_rcv_wscale = true;
- }
-
- if (ti.tcpi_options & TCPI_OPT_TIMESTAMPS) {
- auxl = sizeof(val);
- ret = getsockopt(sk, SOL_TCP, TCP_TIMESTAMP, &val, &auxl);
- if (ret < 0)
- goto err_sopt;
-
- tse->has_timestamp = true;
- tse->timestamp = val;
- }
-
- pr_info("\toptions: mss_clamp %x wscale %x tstamp %d sack %d\n",
- (int)tse->mss_clamp,
- ti.tcpi_options & TCPI_OPT_WSCALE ? (int)tse->snd_wscale : -1,
- ti.tcpi_options & TCPI_OPT_TIMESTAMPS ? 1 : 0,
- ti.tcpi_options & TCPI_OPT_SACK ? 1 : 0);
-
- return 0;
-
-err_sopt:
- pr_perror("\tsockopt failed");
- return -1;
-}
-
-static int dump_tcp_conn_state(struct inet_sk_desc *sk)
-{
- int ret, aux;
- struct cr_img *img;
- TcpStreamEntry tse = TCP_STREAM_ENTRY__INIT;
- char *in_buf, *out_buf;
-
- /*
- * Read queue
- */
-
- pr_info("Reading inq for socket\n");
- tse.inq_len = sk->rqlen;
- ret = tcp_stream_get_queue(sk->rfd, TCP_RECV_QUEUE,
- &tse.inq_seq, tse.inq_len, &in_buf);
- if (ret < 0)
- goto err_in;
-
- /*
- * Write queue
- */
-
- pr_info("Reading outq for socket\n");
- tse.outq_len = sk->wqlen;
- tse.unsq_len = sk->uwqlen;
- tse.has_unsq_len = true;
- ret = tcp_stream_get_queue(sk->rfd, TCP_SEND_QUEUE,
- &tse.outq_seq, tse.outq_len, &out_buf);
- if (ret < 0)
- goto err_out;
-
- /*
- * Initial options
- */
-
- pr_info("Reading options for socket\n");
- ret = tcp_stream_get_options(sk->rfd, &tse);
- if (ret < 0)
- goto err_opt;
-
- /*
- * TCP socket options
- */
-
- if (dump_opt(sk->rfd, SOL_TCP, TCP_NODELAY, &aux))
- goto err_opt;
-
- if (aux) {
- tse.has_nodelay = true;
- tse.nodelay = true;
- }
-
- if (dump_opt(sk->rfd, SOL_TCP, TCP_CORK, &aux))
- goto err_opt;
-
- if (aux) {
- tse.has_cork = true;
- tse.cork = true;
- }
-
- /*
- * Push the stuff to image
- */
-
- img = open_image(CR_FD_TCP_STREAM, O_DUMP, sk->sd.ino);
- if (!img)
- goto err_img;
-
- ret = pb_write_one(img, &tse, PB_TCP_STREAM);
- if (ret < 0)
- goto err_iw;
-
- if (in_buf) {
- ret = write_img_buf(img, in_buf, tse.inq_len);
- if (ret < 0)
- goto err_iw;
- }
-
- if (out_buf) {
- ret = write_img_buf(img, out_buf, tse.outq_len);
- if (ret < 0)
- goto err_iw;
- }
-
- pr_info("Done\n");
-err_iw:
- close_image(img);
-err_img:
-err_opt:
- xfree(out_buf);
-err_out:
- xfree(in_buf);
-err_in:
- return ret;
-}
-
-int dump_one_tcp(int fd, struct inet_sk_desc *sk)
-{
- if (sk->state != TCP_ESTABLISHED)
- return 0;
-
- pr_info("Dumping TCP connection\n");
-
- if (tcp_repair_establised(fd, sk))
- return -1;
-
- if (dump_tcp_conn_state(sk))
- return -1;
-
- /*
- * Socket is left in repair mode, so that at the end it's just
- * closed and the connection is silently terminated
- */
- return 0;
-}
-
-static int set_tcp_queue_seq(int sk, int queue, u32 seq)
-{
- pr_debug("\tSetting %d queue seq to %u\n", queue, seq);
-
- if (setsockopt(sk, SOL_TCP, TCP_REPAIR_QUEUE, &queue, sizeof(queue)) < 0) {
- pr_perror("Can't set repair queue");
- return -1;
- }
-
- if (setsockopt(sk, SOL_TCP, TCP_QUEUE_SEQ, &seq, sizeof(seq)) < 0) {
- pr_perror("Can't set queue seq");
- return -1;
- }
-
- return 0;
-}
-
-static int restore_tcp_seqs(int sk, TcpStreamEntry *tse)
-{
- if (set_tcp_queue_seq(sk, TCP_RECV_QUEUE,
- tse->inq_seq - tse->inq_len))
- return -1;
- if (set_tcp_queue_seq(sk, TCP_SEND_QUEUE,
- tse->outq_seq - tse->outq_len))
- return -1;
-
- return 0;
-}
-
-static int __send_tcp_queue(int sk, int queue, u32 len, struct cr_img *img)
-{
- int ret, err = -1, max_chunk;
- int off;
- char *buf;
-
- buf = xmalloc(len);
- if (!buf)
- return -1;
-
- if (read_img_buf(img, buf, len) < 0)
- goto err;
-
- max_chunk = (queue == TCP_RECV_QUEUE ? kdat.tcp_max_rshare : len);
- off = 0;
- while (len) {
- int chunk = len;
-
- if (chunk > max_chunk)
- chunk = max_chunk;
-
- ret = send(sk, buf + off, chunk, 0);
- if (ret <= 0) {
- if ((queue == TCP_RECV_QUEUE) && (max_chunk > 1024) && (errno == ENOMEM)) {
- /*
- * When restoring recv queue in repair mode
- * kernel doesn't try hard and just allocates
- * a linear skb with the size we pass to the
- * system call. Thus, if the size is too big
- * for slab allocator, the send just fails
- * with ENOMEM. Try smaller chunk, hopefully
- * there's still enough memory in the system.
- */
- max_chunk >>= 1;
- continue;
- }
-
- pr_perror("Can't restore %d queue data (%d), want (%d:%d:%d)",
- queue, ret, chunk, len, max_chunk);
- goto err;
- }
- off += ret;
- len -= ret;
- }
-
- err = 0;
-err:
- xfree(buf);
-
- return err;
-}
-
-static int send_tcp_queue(int sk, int queue, u32 len, struct cr_img *img)
-{
- pr_debug("\tRestoring TCP %d queue data %u bytes\n", queue, len);
-
- if (setsockopt(sk, SOL_TCP, TCP_REPAIR_QUEUE, &queue, sizeof(queue)) < 0) {
- pr_perror("Can't set repair queue");
- return -1;
- }
-
- return __send_tcp_queue(sk, queue, len, img);
-}
-
-static int restore_tcp_queues(int sk, TcpStreamEntry *tse, struct cr_img *img, mutex_t *reuse_lock)
-{
- u32 len;
-
- if (restore_prepare_socket(sk))
- return -1;
-
- len = tse->inq_len;
- if (len && send_tcp_queue(sk, TCP_RECV_QUEUE, len, img))
- return -1;
-
- /*
- * All data in a write buffer can be divided on two parts sent
- * but not yet acknowledged data and unsent data.
- * The TCP stack must know which data have been sent, because
- * acknowledgment can be received for them. These data must be
- * restored in repair mode.
- */
- len = tse->outq_len - tse->unsq_len;
- if (len && send_tcp_queue(sk, TCP_SEND_QUEUE, len, img))
- return -1;
-
- /*
- * The second part of data have never been sent to outside, so
- * they can be restored without any tricks.
- */
- len = tse->unsq_len;
- mutex_lock(reuse_lock);
- tcp_repair_off(sk);
- if (len && __send_tcp_queue(sk, TCP_SEND_QUEUE, len, img)) {
- mutex_unlock(reuse_lock);
- return -1;
- }
- if (tcp_repair_on(sk)) {
- mutex_unlock(reuse_lock);
- return -1;
- }
- mutex_unlock(reuse_lock);
-
- return 0;
-}
-
-static int restore_tcp_opts(int sk, TcpStreamEntry *tse)
-{
- struct tcp_repair_opt opts[4];
- int onr = 0;
-
- pr_debug("\tRestoring TCP options\n");
-
- if (tse->opt_mask & TCPI_OPT_SACK) {
- pr_debug("\t\tWill turn SAK on\n");
- opts[onr].opt_code = TCPOPT_SACK_PERM;
- opts[onr].opt_val = 0;
- onr++;
- }
-
- if (tse->opt_mask & TCPI_OPT_WSCALE) {
- pr_debug("\t\tWill set snd_wscale to %u\n", tse->snd_wscale);
- pr_debug("\t\tWill set rcv_wscale to %u\n", tse->rcv_wscale);
- opts[onr].opt_code = TCPOPT_WINDOW;
- opts[onr].opt_val = tse->snd_wscale + (tse->rcv_wscale << 16);
- onr++;
- }
-
- if (tse->opt_mask & TCPI_OPT_TIMESTAMPS) {
- pr_debug("\t\tWill turn timestamps on\n");
- opts[onr].opt_code = TCPOPT_TIMESTAMP;
- opts[onr].opt_val = 0;
- onr++;
- }
-
- pr_debug("Will set mss clamp to %u\n", tse->mss_clamp);
- opts[onr].opt_code = TCPOPT_MAXSEG;
- opts[onr].opt_val = tse->mss_clamp;
- onr++;
-
- if (setsockopt(sk, SOL_TCP, TCP_REPAIR_OPTIONS,
- opts, onr * sizeof(struct tcp_repair_opt)) < 0) {
- pr_perror("Can't repair options");
- return -1;
- }
-
- if (tse->has_timestamp) {
- if (setsockopt(sk, SOL_TCP, TCP_TIMESTAMP,
- &tse->timestamp, sizeof(tse->timestamp)) < 0) {
- pr_perror("Can't set timestamp");
- return -1;
- }
- }
-
- return 0;
-}
-
-static int restore_tcp_conn_state(int sk, struct inet_sk_info *ii)
-{
- int aux;
- struct cr_img *img;
- TcpStreamEntry *tse;
-
- pr_info("Restoring TCP connection id %x ino %x\n", ii->ie->id, ii->ie->ino);
-
- img = open_image(CR_FD_TCP_STREAM, O_RSTR, ii->ie->ino);
- if (!img)
- goto err;
-
- if (pb_read_one(img, &tse, PB_TCP_STREAM) < 0)
- goto err_c;
-
- if (restore_tcp_seqs(sk, tse))
- goto err_c;
-
- if (inet_bind(sk, ii))
- goto err_c;
-
- if (inet_connect(sk, ii))
- goto err_c;
-
- if (restore_tcp_opts(sk, tse))
- goto err_c;
-
- if (restore_tcp_queues(sk, tse, img, inet_get_reuseaddr_lock(ii)))
- goto err_c;
-
- if (tse->has_nodelay && tse->nodelay) {
- aux = 1;
- if (restore_opt(sk, SOL_TCP, TCP_NODELAY, &aux))
- goto err_c;
- }
-
- if (tse->has_cork && tse->cork) {
- aux = 1;
- if (restore_opt(sk, SOL_TCP, TCP_CORK, &aux))
- goto err_c;
- }
-
- tcp_stream_entry__free_unpacked(tse, NULL);
- close_image(img);
- return 0;
-
-err_c:
- tcp_stream_entry__free_unpacked(tse, NULL);
- close_image(img);
-err:
- return -1;
-}
-
-unsigned long rst_tcp_socks_cpos;
-unsigned int rst_tcp_socks_nr = 0;
-
-int rst_tcp_socks_prep(void)
-{
- struct inet_sk_info *ii;
-
- rst_tcp_socks_cpos = rst_mem_align_cpos(RM_PRIVATE);
- list_for_each_entry(ii, &rst_tcp_repair_sockets, rlist) {
- struct rst_tcp_sock *rs;
-
- /*
- * rst_tcp_repair_sockets contains all sockets, so we need to
- * select sockets which restored in a current porcess.
- */
- if (ii->sk_fd == -1)
- continue;
-
- rs = rst_mem_alloc(sizeof(*rs), RM_PRIVATE);
- if (!rs)
- return -1;
-
- rs->sk = ii->sk_fd;
- rs->reuseaddr = ii->ie->opts->reuseaddr;
- rst_tcp_socks_nr++;
- }
-
- return 0;
-}
-
-int restore_one_tcp(int fd, struct inet_sk_info *ii)
-{
- pr_info("Restoring TCP connection\n");
-
- if (tcp_repair_on(fd))
- return -1;
-
- if (restore_tcp_conn_state(fd, ii))
- return -1;
-
- return 0;
-}
-
-void tcp_locked_conn_add(struct inet_sk_info *ii)
-{
- list_add_tail(&ii->rlist, &rst_tcp_repair_sockets);
- ii->sk_fd = -1;
-}
-
-void rst_unlock_tcp_connections(void)
-{
- struct inet_sk_info *ii;
-
- /* Network will be unlocked by network-unlock scripts */
- if (root_ns_mask & CLONE_NEWNET)
- return;
-
- list_for_each_entry(ii, &rst_tcp_repair_sockets, rlist)
- nf_unlock_connection_info(ii);
-}
-
-int check_tcp(void)
-{
- socklen_t optlen;
- int sk, ret;
- int val;
-
- sk = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP);
- if (sk < 0) {
- pr_perror("Can't create TCP socket :(");
- return -1;
- }
-
- ret = tcp_repair_on(sk);
- if (ret)
- goto out;
-
- optlen = sizeof(val);
- ret = getsockopt(sk, SOL_TCP, TCP_TIMESTAMP, &val, &optlen);
- if (ret)
- pr_perror("Can't get TCP_TIMESTAMP");
-
-out:
- close(sk);
-
- return ret;
-}
-
-void show_tcp_stream(struct cr_img *img, void *obj)
-{
- TcpStreamEntry *e = obj;
- if (opts.show_pages_content) {
- pr_msg("In-queue:");
- print_image_data(img, e->inq_len, 1);
- pr_msg("Out-queue:");
- print_image_data(img, e->outq_len, 1);
- }
-}
diff --git a/sk-unix.c b/sk-unix.c
deleted file mode 100644
index 608eb3eb3376..000000000000
--- a/sk-unix.c
+++ /dev/null
@@ -1,1435 +0,0 @@
-#include <sys/socket.h>
-#include <linux/netlink.h>
-#include <linux/rtnetlink.h>
-#include <unistd.h>
-#include <netinet/tcp.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <sys/un.h>
-#include <stdlib.h>
-#include <dlfcn.h>
-
-#include "asm/types.h"
-#include "libnetlink.h"
-#include "cr_options.h"
-#include "imgset.h"
-#include "unix_diag.h"
-#include "files.h"
-#include "file-ids.h"
-#include "image.h"
-#include "log.h"
-#include "util.h"
-#include "util-pie.h"
-#include "sockets.h"
-#include "sk-queue.h"
-#include "mount.h"
-#include "cr-service.h"
-#include "plugin.h"
-#include "namespaces.h"
-#include "pstree.h"
-
-#include "protobuf.h"
-#include "protobuf/sk-unix.pb-c.h"
-
-#undef LOG_PREFIX
-#define LOG_PREFIX "sk unix: "
-
-typedef struct {
- char *dir;
- unsigned int udiag_vfs_dev;
- unsigned int udiag_vfs_ino;
-} rel_name_desc_t;
-
-struct unix_sk_desc {
- struct socket_desc sd;
- unsigned int type;
- unsigned int state;
- unsigned int peer_ino;
- unsigned int rqlen;
- unsigned int wqlen;
- unsigned int namelen;
- char *name;
- rel_name_desc_t *rel_name;
- unsigned int nr_icons;
- unsigned int *icons;
- unsigned char shutdown;
-
- mode_t mode;
- uid_t uid;
- gid_t gid;
-
- struct list_head list;
-
- int fd;
- struct list_head peer_list;
- struct list_head peer_node;
-
- UnixSkEntry *ue;
-};
-
-static LIST_HEAD(unix_sockets);
-
-struct unix_sk_listen_icon {
- unsigned int peer_ino;
- struct unix_sk_desc *sk_desc;
- struct unix_sk_listen_icon *next;
-};
-
-struct unix_sk_exception {
- struct list_head unix_sk_list;
- ino_t unix_sk_ino;
-};
-
-#define SK_HASH_SIZE 32
-
-static struct unix_sk_listen_icon *unix_listen_icons[SK_HASH_SIZE];
-
-static struct unix_sk_listen_icon *lookup_unix_listen_icons(int peer_ino)
-{
- struct unix_sk_listen_icon *ic;
-
- for (ic = unix_listen_icons[peer_ino % SK_HASH_SIZE];
- ic; ic = ic->next)
- if (ic->peer_ino == peer_ino)
- return ic;
- return NULL;
-}
-
-static void show_one_unix(char *act, const struct unix_sk_desc *sk)
-{
- pr_debug("\t%s: ino %#x peer_ino %#x family %4d type %4d state %2d name %s\n",
- act, sk->sd.ino, sk->peer_ino, sk->sd.family, sk->type, sk->state, sk->name);
-
- if (sk->nr_icons) {
- int i;
-
- for (i = 0; i < sk->nr_icons; i++)
- pr_debug("\t\ticon: %4d\n", sk->icons[i]);
- }
-}
-
-static void show_one_unix_img(const char *act, const UnixSkEntry *e)
-{
- pr_info("\t%s: id %#x ino %#x peer %#x type %d state %d name %d bytes\n",
- act, e->id, e->ino, e->peer, e->type, e->state, (int)e->name.len);
-}
-
-static int can_dump_unix_sk(const struct unix_sk_desc *sk)
-{
- /*
- * The last case in this "if" is seqpacket socket,
- * that is connected to cr_service. We will dump
- * it properly below.
- */
- if (sk->type != SOCK_STREAM &&
- sk->type != SOCK_DGRAM &&
- sk->type != SOCK_SEQPACKET) {
- pr_err("Unsupported type (%d) on socket %x.\n"
- "Only stream/dgram/seqpacket are supported.\n",
- sk->type, sk->sd.ino);
- return 0;
- }
-
- switch (sk->state) {
- case TCP_LISTEN:
- case TCP_ESTABLISHED:
- case TCP_CLOSE:
- break;
- default:
- pr_err("Unknown state %d for unix socket %x\n",
- sk->state, sk->sd.ino);
- return 0;
- }
-
- return 1;
-}
-
-static bool unix_sk_exception_lookup_id(ino_t ino)
-{
- bool ret = false;
- struct unix_sk_exception *sk;
-
- list_for_each_entry(sk, &opts.ext_unixsk_ids, unix_sk_list) {
- if (sk->unix_sk_ino == ino) {
- pr_debug("Found ino %u in exception unix sk list\n", (unsigned int)ino);
- ret = true;
- break;
- }
- }
-
- return ret;
-}
-
-static int write_unix_entry(struct unix_sk_desc *sk)
-{
- int ret;
-
- ret = pb_write_one(img_from_set(glob_imgset, CR_FD_UNIXSK), sk->ue, PB_UNIX_SK);
-
- show_one_unix_img("Dumped", sk->ue);
-
- release_skopts(sk->ue->opts);
- xfree(sk->ue);
-
- sk->ue = NULL;
-
- return ret;
-}
-
-static int resolve_rel_name(struct unix_sk_desc *sk, const struct fd_parms *p)
-{
- rel_name_desc_t *rel_name = sk->rel_name;
- const char *dirs[] = { "cwd", "root" };
- struct pstree_item *task;
- int mntns_root, i;
- struct ns_id *ns;
-
- for_each_pstree_item(task) {
- if (task->pid.real == p->pid)
- break;
- }
- if (!task) {
- pr_err("Can't find task with pid %d\n", p->pid);
- return -ENOENT;
- }
-
- ns = lookup_ns_by_id(task->ids->mnt_ns_id, &mnt_ns_desc);
- if (!ns) {
- pr_err("Can't resolve mount namespace for pid %d\n", p->pid);
- return -ENOENT;
- }
-
- mntns_root = mntns_get_root_fd(ns);
- if (mntns_root < 0) {
- pr_err("Can't resolve fs root for pid %d\n", p->pid);
- return -ENOENT;
- }
-
- pr_debug("Resolving relative name %s for socket %x\n",
- sk->name, sk->sd.ino);
-
- for (i = 0; i < ARRAY_SIZE(dirs); i++) {
- char dir[PATH_MAX], path[PATH_MAX];
- struct stat st;
- int ret;
-
- snprintf(path, sizeof(path), "/proc/%d/%s", p->pid, dirs[i]);
- ret = readlink(path, dir, sizeof(dir));
- if (ret < 0 || (size_t)ret == sizeof(dir)) {
- pr_err("Can't readlink for %s\n", dirs[i]);
- return -1;
- }
- dir[ret] = 0;
-
- snprintf(path, sizeof(path), ".%s/%s", dir, sk->name);
- if (fstatat(mntns_root, path, &st, 0)) {
- if (errno == ENOENT)
- continue;
- goto err;
- }
-
- if ((st.st_ino == rel_name->udiag_vfs_ino) &&
- phys_stat_dev_match(st.st_dev, rel_name->udiag_vfs_dev, ns, &path[1])) {
- rel_name->dir = xstrdup(dir);
- if (!rel_name->dir)
- return -ENOMEM;
-
- pr_debug("Resolved relative socket name to dir %s\n", rel_name->dir);
- sk->mode = st.st_mode;
- sk->uid = st.st_uid;
- sk->gid = st.st_gid;
- return 0;
- }
- }
-
-err:
- pr_err("Can't resolve name for socket %#x\n", rel_name->udiag_vfs_ino);
- return -ENOENT;
-}
-
-static int dump_one_unix_fd(int lfd, u32 id, const struct fd_parms *p)
-{
- struct unix_sk_desc *sk, *peer;
- UnixSkEntry *ue;
- SkOptsEntry *skopts;
- FilePermsEntry *perms;
- FownEntry *fown;
-
- ue = xmalloc(sizeof(UnixSkEntry) +
- sizeof(SkOptsEntry) +
- sizeof(FilePermsEntry) +
- sizeof(FownEntry));
- if (ue == NULL)
- return -1;
-
- skopts = (void *) ue + sizeof(UnixSkEntry);
- perms = (void *) skopts + sizeof(SkOptsEntry);
- fown = (void *) perms + sizeof(FilePermsEntry);
-
- unix_sk_entry__init(ue);
- sk_opts_entry__init(skopts);
- file_perms_entry__init(perms);
-
- *fown = p->fown;
-
- sk = (struct unix_sk_desc *)lookup_socket(p->stat.st_ino, PF_UNIX, 0);
- if (IS_ERR_OR_NULL(sk)) {
- pr_err("Unix socket %#x not found\n", (int)p->stat.st_ino);
- goto err;
- }
-
- if (!can_dump_unix_sk(sk))
- goto err;
-
- BUG_ON(sk->sd.already_dumped);
-
- ue->name.len = (size_t)sk->namelen;
- ue->name.data = (void *)sk->name;
-
- ue->id = id;
- ue->ino = sk->sd.ino;
- ue->type = sk->type;
- ue->state = sk->state;
- ue->flags = p->flags;
- ue->backlog = sk->wqlen;
- ue->peer = sk->peer_ino;
- ue->fown = fown;
- ue->opts = skopts;
- ue->uflags = 0;
-
- if (sk->rel_name) {
- if (resolve_rel_name(sk, p))
- goto err;
- ue->name_dir = sk->rel_name->dir;
- }
-
- /*
- * Check if this socket is connected to criu service.
- * Dump it like closed one and mark it for restore.
- */
- if (unlikely(ue->peer == service_sk_ino)) {
- ue->state = TCP_CLOSE;
- ue->peer = 0;
- ue->uflags |= USK_SERVICE;
- }
-
- if (sk->namelen && *sk->name) {
- ue->file_perms = perms;
-
- perms->mode = sk->mode;
- perms->uid = userns_uid(sk->uid);
- perms->gid = userns_gid(sk->gid);
- }
-
- sk_encode_shutdown(ue, sk->shutdown);
-
- if (ue->peer) {
- peer = (struct unix_sk_desc *)lookup_socket(ue->peer, PF_UNIX, 0);
- if (IS_ERR_OR_NULL(peer)) {
- pr_err("Unix socket %#x without peer %#x\n",
- ue->ino, ue->peer);
- goto err;
- }
-
- /*
- * Peer should have us as peer or have a name by which
- * we can access one.
- */
- if (peer->peer_ino != ue->ino) {
- if (!peer->name) {
- pr_err("Unix socket %#x with unreachable peer %#x (%#x/%s)\n",
- ue->ino, ue->peer, peer->peer_ino, peer->name);
- goto err;
- }
- }
-
- /*
- * It can be external socket, so we defer dumping
- * until all sockets the program owns are processed.
- */
- if (!peer->sd.already_dumped) {
- if (list_empty(&peer->list)) {
- show_one_unix("Add a peer", peer);
- list_add_tail(&peer->list, &unix_sockets);
- }
-
- list_add(&sk->peer_node, &peer->peer_list);
- sk->fd = dup(lfd);
- if (sk->fd < 0) {
- pr_perror("Unable to dup(%d)", lfd);
- goto err;
- }
- }
-
- if ((ue->type != SOCK_DGRAM) && (
- ((ue->shutdown == SK_SHUTDOWN__READ) &&
- (peer->shutdown != SK_SHUTDOWN__WRITE)) ||
- ((ue->shutdown == SK_SHUTDOWN__WRITE) &&
- (peer->shutdown != SK_SHUTDOWN__READ)) ||
- ((ue->shutdown == SK_SHUTDOWN__BOTH) &&
- (peer->shutdown != SK_SHUTDOWN__BOTH)) )) {
- /*
- * On restore we assume, that stream pairs must
- * be shut down from one end only
- */
- pr_err("Shutdown mismatch %u:%d -> %u:%d\n",
- ue->ino, ue->shutdown, peer->sd.ino, peer->shutdown);
- goto err;
- }
- } else if (ue->state == TCP_ESTABLISHED) {
- const struct unix_sk_listen_icon *e;
-
- e = lookup_unix_listen_icons(ue->ino);
- if (!e) {
- /*
- * ESTABLISHED socket without peer and without
- * anyone waiting for it should be semi-closed
- * connection.
- */
-
- if (ue->shutdown == SK_SHUTDOWN__BOTH) {
- pr_info("Dumping semi-closed connection\n");
- goto dump;
- }
-
- pr_err("Dangling connection %#x\n", ue->ino);
- goto err;
- }
-
- /*
- * If this is in-flight connection we need to figure
- * out where to connect it on restore. Thus, tune up peer
- * id by searching an existing listening socket.
- *
- * Note the socket name will be found at restore stage,
- * not now, just to reduce size of dump files.
- */
-
- /* e->sk_desc is _never_ NULL */
- if (e->sk_desc->state != TCP_LISTEN) {
- pr_err("In-flight connection on "
- "non-listening socket %d\n", ue->ino);
- goto err;
- }
-
- ue->peer = e->sk_desc->sd.ino;
-
- pr_debug("\t\tFixed inflight socket %#x peer %#x)\n",
- ue->ino, ue->peer);
- }
-dump:
- if (dump_socket_opts(lfd, skopts))
- goto err;
-
- /*
- * If a stream listening socket has non-zero rqueue, this
- * means there are in-flight connections waiting to get
- * accept()-ed. We handle them separately with the "icons"
- * (i stands for in-flight, cons -- for connections) things.
- */
- if (sk->rqlen != 0 && !(sk->type == SOCK_STREAM &&
- sk->state == TCP_LISTEN))
- if (dump_sk_queue(lfd, id))
- goto err;
-
- pr_info("Dumping unix socket at %d\n", p->fd);
- show_one_unix("Dumping", sk);
-
- sk->ue = ue;
- /*
- * Postpone writing the entry if a peer isn't found yet.
- * It's required, because we may need to modify the entry.
- * For example, if a socket is external and is dumped by
- * a callback, the USK_CALLBACK flag must be set.
- */
- if (list_empty(&sk->peer_node) && write_unix_entry(sk))
- return -1;
-
- list_del_init(&sk->list);
- sk->sd.already_dumped = 1;
-
- while (!list_empty(&sk->peer_list)) {
- struct unix_sk_desc *psk;
- psk = list_first_entry(&sk->peer_list, struct unix_sk_desc, peer_node);
- close_safe(&psk->fd);
- list_del_init(&psk->peer_node);
-
- if (write_unix_entry(psk))
- return -1;
- }
-
- return 0;
-
-err:
- release_skopts(skopts);
- xfree(ue);
- return -1;
-}
-
-const struct fdtype_ops unix_dump_ops = {
- .type = FD_TYPES__UNIXSK,
- .dump = dump_one_unix_fd,
-};
-
-/*
- * Returns: < 0 on error, 0 if OK, 1 to skip the socket
- */
-static int unix_process_name(struct unix_sk_desc *d, const struct unix_diag_msg *m, struct rtattr **tb)
-{
- int len, ret;
- char *name;
-
- len = RTA_PAYLOAD(tb[UNIX_DIAG_NAME]);
- name = xmalloc(len + 1);
- if (!name)
- return -ENOMEM;
-
- memcpy(name, RTA_DATA(tb[UNIX_DIAG_NAME]), len);
- name[len] = '\0';
-
- if (name[0] != '\0') {
- struct unix_diag_vfs *uv;
- bool drop_path = false;
- char rpath[PATH_MAX];
- struct ns_id *ns;
- struct stat st;
- int mntns_root;
-
- if (!tb[UNIX_DIAG_VFS]) {
- pr_err("Bound socket w/o inode %#x\n", m->udiag_ino);
- goto skip;
- }
-
- ns = lookup_ns_by_id(root_item->ids->mnt_ns_id, &mnt_ns_desc);
- if (!ns) {
- ret = -ENOENT;
- goto out;
- }
-
- mntns_root = mntns_get_root_fd(ns);
- if (mntns_root < 0) {
- ret = -ENOENT;
- goto out;
- }
-
- uv = RTA_DATA(tb[UNIX_DIAG_VFS]);
- if (name[0] != '/') {
- /*
- * Relative names are be resolved later at first
- * dump attempt.
- */
- rel_name_desc_t *rel_name = xzalloc(sizeof(*rel_name));
- if (!rel_name) {
- ret = -ENOMEM;
- goto out;
- }
- rel_name->udiag_vfs_dev = uv->udiag_vfs_dev;
- rel_name->udiag_vfs_ino = uv->udiag_vfs_ino;
-
- d->rel_name = rel_name;
- goto postprone;
- }
-
- snprintf(rpath, sizeof(rpath), ".%s", name);
- if (fstatat(mntns_root, rpath, &st, 0)) {
- if (errno != ENOENT) {
- pr_warn("Can't stat socket %#x(%s), skipping: %m (err %d)\n",
- m->udiag_ino, rpath, errno);
- goto skip;
- }
-
- pr_info("unix: Dropping path %s for unlinked sk %#x\n",
- name, m->udiag_ino);
- drop_path = true;
- } else if ((st.st_ino != uv->udiag_vfs_ino) ||
- !phys_stat_dev_match(st.st_dev, uv->udiag_vfs_dev, ns, name)) {
- pr_info("unix: Dropping path %s for unlinked bound "
- "sk %#x.%#x real %#x.%#x\n",
- name, (int)st.st_dev, (int)st.st_ino,
- (int)uv->udiag_vfs_dev, (int)uv->udiag_vfs_ino);
- drop_path = true;
- }
-
- if (drop_path) {
- /*
- * When a socket is bound to unlinked file, we
- * just drop his name, since no one will access
- * it via one.
- */
- xfree(name);
- len = 0;
- name = NULL;
- }
-
- d->mode = st.st_mode;
- d->uid = st.st_uid;
- d->gid = st.st_gid;
- }
-
-postprone:
- d->namelen = len;
- d->name = name;
- return 0;
-
-out:
- xfree(name);
- return ret;
-skip:
- ret = 1;
- goto out;
-}
-
-static int unix_collect_one(const struct unix_diag_msg *m,
- struct rtattr **tb)
-{
- struct unix_sk_desc *d;
- int ret = 0;
-
- d = xzalloc(sizeof(*d));
- if (!d)
- return -1;
-
- d->type = m->udiag_type;
- d->state = m->udiag_state;
- INIT_LIST_HEAD(&d->list);
-
- INIT_LIST_HEAD(&d->peer_list);
- INIT_LIST_HEAD(&d->peer_node);
- d->fd = -1;
-
- if (tb[UNIX_DIAG_SHUTDOWN])
- d->shutdown = *(u8 *)RTA_DATA(tb[UNIX_DIAG_SHUTDOWN]);
- else
- pr_err_once("No socket shutdown info\n");
-
- if (tb[UNIX_DIAG_PEER])
- d->peer_ino = *(int *)RTA_DATA(tb[UNIX_DIAG_PEER]);
-
- if (tb[UNIX_DIAG_NAME]) {
- ret = unix_process_name(d, m, tb);
- if (ret < 0)
- goto err;
- else if (ret == 1)
- goto skip;
- BUG_ON(ret != 0);
- }
-
- if (tb[UNIX_DIAG_ICONS]) {
- int len = RTA_PAYLOAD(tb[UNIX_DIAG_ICONS]);
- int i;
-
- d->icons = xmalloc(len);
- if (!d->icons)
- goto err;
-
- memcpy(d->icons, RTA_DATA(tb[UNIX_DIAG_ICONS]), len);
- d->nr_icons = len / sizeof(u32);
-
- /*
- * Remember these sockets, we will need them
- * to fix up in-flight sockets peers.
- */
- for (i = 0; i < d->nr_icons; i++) {
- struct unix_sk_listen_icon *e, **chain;
- int n;
-
- e = xzalloc(sizeof(*e));
- if (!e)
- goto err;
-
- n = d->icons[i];
- chain = &unix_listen_icons[n % SK_HASH_SIZE];
- e->next = *chain;
- *chain = e;
-
- pr_debug("\t\tCollected icon %d\n", d->icons[i]);
-
- e->peer_ino = n;
- e->sk_desc = d;
- }
- }
-
- if (tb[UNIX_DIAG_RQLEN]) {
- struct unix_diag_rqlen *rq;
-
- rq = (struct unix_diag_rqlen *)RTA_DATA(tb[UNIX_DIAG_RQLEN]);
- d->rqlen = rq->udiag_rqueue;
- d->wqlen = rq->udiag_wqueue;
- }
-
- sk_collect_one(m->udiag_ino, AF_UNIX, &d->sd);
- show_one_unix("Collected", d);
-
- return 0;
-err:
- ret = -1;
-skip:
- xfree(d->icons);
- xfree(d->name);
- xfree(d);
- return ret;
-}
-
-int unix_receive_one(struct nlmsghdr *h, void *arg)
-{
- struct unix_diag_msg *m = NLMSG_DATA(h);
- struct rtattr *tb[UNIX_DIAG_MAX+1];
-
- parse_rtattr(tb, UNIX_DIAG_MAX, (struct rtattr *)(m + 1),
- h->nlmsg_len - NLMSG_LENGTH(sizeof(*m)));
-
- return unix_collect_one(m, tb);
-}
-
-static int dump_external_sockets(struct unix_sk_desc *peer)
-{
- struct unix_sk_desc *sk;
- int ret;
-
- while (!list_empty(&peer->peer_list)) {
- sk = list_first_entry(&peer->peer_list, struct unix_sk_desc, peer_node);
-
- ret = run_plugins(DUMP_UNIX_SK, sk->fd, sk->sd.ino);
- if (ret == -ENOTSUP) {
- if (!opts.ext_unix_sk) {
- show_one_unix("Runaway socket", peer);
- pr_err("External socket is used. "
- "Consider using --" USK_EXT_PARAM " option.\n");
- return -1;
- }
-
- if (unix_sk_exception_lookup_id(sk->sd.ino)) {
- pr_debug("found exception for unix name-less external socket.\n");
- } else {
- if (peer->type != SOCK_DGRAM) {
- show_one_unix("Ext stream not supported", peer);
- pr_err("Can't dump half of stream unix connection.\n");
- return -1;
- }
-
- if (!peer->name) {
- show_one_unix("Ext dgram w/o name", peer);
- pr_err("Can't dump name-less external socket.\n");
- pr_err("%d\n", sk->fd);
- return -1;
- }
- }
- } else if (ret < 0)
- return -1;
- else
- sk->ue->uflags |= USK_CALLBACK;
-
- if (write_unix_entry(sk))
- return -1;
- close_safe(&sk->fd);
- list_del_init(&sk->peer_node);
- }
-
- return 0;
-}
-
-int fix_external_unix_sockets(void)
-{
- struct unix_sk_desc *sk;
-
- pr_debug("Dumping external sockets\n");
-
- list_for_each_entry(sk, &unix_sockets, list) {
- UnixSkEntry e = UNIX_SK_ENTRY__INIT;
- FownEntry fown = FOWN_ENTRY__INIT;
- SkOptsEntry skopts = SK_OPTS_ENTRY__INIT;
-
- show_one_unix("Dumping extern", sk);
-
- BUG_ON(sk->sd.already_dumped);
-
- fd_id_generate_special(NULL, &e.id);
- e.ino = sk->sd.ino;
- e.type = SOCK_DGRAM;
- e.state = TCP_LISTEN;
- e.name.data = (void *)sk->name;
- e.name.len = (size_t)sk->namelen;
- e.uflags = USK_EXTERN;
- e.peer = 0;
- e.fown = &fown;
- e.opts = &skopts;
-
- if (pb_write_one(img_from_set(glob_imgset, CR_FD_UNIXSK), &e, PB_UNIX_SK))
- goto err;
-
- show_one_unix_img("Dumped extern", &e);
-
- if (dump_external_sockets(sk))
- goto err;
- }
-
- return 0;
-err:
- return -1;
-}
-
-struct unix_sk_info {
- UnixSkEntry *ue;
- struct list_head list;
- char *name;
- char *name_dir;
- unsigned flags;
- struct unix_sk_info *peer;
- struct file_desc d;
-
- /*
- * Futex to signal when the socket is prepared. In particular, we
- * signal after bind()ing the socket if it is not in TCP_LISTEN, or
- * after listen() if the socket is in TCP_LISTEN.
- */
- futex_t prepared;
-
- /*
- * For DGRAM sockets with queues, we should only restore the queue
- * once although it may be open by more than one tid. This is the peer
- * that should do the queueing.
- */
- u32 queuer;
-};
-
-#define USK_PAIR_MASTER 0x1
-#define USK_PAIR_SLAVE 0x2
-
-static struct unix_sk_info *find_unix_sk_by_ino(int ino)
-{
- struct unix_sk_info *ui;
-
- list_for_each_entry(ui, &unix_sockets, list) {
- if (ui->ue->ino == ino)
- return ui;
- }
-
- return NULL;
-}
-
-static int shutdown_unix_sk(int sk, struct unix_sk_info *ui)
-{
- int how;
- UnixSkEntry *ue = ui->ue;
-
- if (!ue->has_shutdown || ue->shutdown == SK_SHUTDOWN__NONE)
- return 0;
-
- how = sk_decode_shutdown(ue->shutdown);
- if (shutdown(sk, how)) {
- pr_perror("Can't shutdown unix socket");
- return -1;
- }
-
- pr_debug("Socket %#x is shut down %d\n", ue->ino, how);
- return 0;
-}
-
-static void revert_unix_sk_cwd(int *prev_cwd_fd)
-{
- if (prev_cwd_fd && *prev_cwd_fd >= 0) {
- if (fchdir(*prev_cwd_fd))
- pr_perror("Can't revert working dir");
- else
- pr_debug("Reverted working dir\n");
- close(*prev_cwd_fd);
- *prev_cwd_fd = -1;
- }
-}
-
-static int prep_unix_sk_cwd(struct unix_sk_info *ui, int *prev_cwd_fd)
-{
- if (ui->name_dir) {
- *prev_cwd_fd = open(".", O_RDONLY);
- if (*prev_cwd_fd < 0) {
- pr_err("Can't open current dir\n");
- return -1;
- }
- if (chdir(ui->name_dir)) {
- pr_perror("Can't change working dir %s",
- ui->name_dir);
- close(*prev_cwd_fd);
- *prev_cwd_fd = -1;
- return -1;
- }
- pr_debug("Change working dir to %s\n", ui->name_dir);
- } else
- *prev_cwd_fd = -1;
- return 0;
-}
-
-static int post_open_unix_sk(struct file_desc *d, int fd)
-{
- struct unix_sk_info *ui;
- struct unix_sk_info *peer;
- struct sockaddr_un addr;
- int cwd_fd = -1;
-
- ui = container_of(d, struct unix_sk_info, d);
- if (ui->flags & (USK_PAIR_MASTER | USK_PAIR_SLAVE))
- return 0;
-
- peer = ui->peer;
-
- if (peer == NULL)
- return 0;
-
- if (ui->ue->uflags & USK_CALLBACK)
- return 0;
-
- /* Skip external sockets */
- if (!list_empty(&peer->d.fd_info_head))
- futex_wait_while(&peer->prepared, 0);
-
- if (ui->ue->uflags & USK_INHERIT)
- return 0;
-
- memset(&addr, 0, sizeof(addr));
- addr.sun_family = AF_UNIX;
- memcpy(&addr.sun_path, peer->name, peer->ue->name.len);
-
- pr_info("\tConnect %#x to %#x\n", ui->ue->ino, peer->ue->ino);
-
- if (prep_unix_sk_cwd(peer, &cwd_fd))
- return -1;
-
- if (connect(fd, (struct sockaddr *)&addr,
- sizeof(addr.sun_family) +
- peer->ue->name.len) < 0) {
- revert_unix_sk_cwd(&cwd_fd);
- pr_perror("Can't connect %#x socket", ui->ue->ino);
- return -1;
- }
-
- revert_unix_sk_cwd(&cwd_fd);
-
- if (peer->queuer == ui->ue->ino && restore_sk_queue(fd, peer->ue->id))
- return -1;
-
- if (rst_file_params(fd, ui->ue->fown, ui->ue->flags))
- return -1;
-
- if (restore_socket_opts(fd, ui->ue->opts))
- return -1;
-
- if (shutdown_unix_sk(fd, ui))
- return -1;
-
- return 0;
-}
-
-static int bind_unix_sk(int sk, struct unix_sk_info *ui)
-{
- struct sockaddr_un addr;
- int cwd_fd = -1;
- int ret = -1;
-
- if ((ui->ue->type == SOCK_STREAM) && (ui->ue->state == TCP_ESTABLISHED)) {
- /*
- * FIXME this can be done, but for doing this properly we
- * need to bind socket to its name, then rename one to
- * some temporary unique one and after all the sockets are
- * restored we should walk those temp names and rename
- * some of them back to real ones.
- */
- ret = 0;
- goto done;
- }
-
- memset(&addr, 0, sizeof(addr));
- addr.sun_family = AF_UNIX;
- memcpy(&addr.sun_path, ui->name, ui->ue->name.len);
-
- if (prep_unix_sk_cwd(ui, &cwd_fd))
- return -1;
-
- if (bind(sk, (struct sockaddr *)&addr,
- sizeof(addr.sun_family) + ui->ue->name.len)) {
- pr_perror("Can't bind socket");
- goto done;
- }
-
- if (ui->ue->name.len && *ui->name && ui->ue->file_perms) {
- FilePermsEntry *perms = ui->ue->file_perms;
- char fname[PATH_MAX];
-
- if (ui->ue->name.len >= sizeof(fname)) {
- pr_err("The file name is too long\n");
- goto done;
- }
-
- memcpy(fname, ui->name, ui->ue->name.len);
- fname[ui->ue->name.len] = '\0';
-
- if (fchownat(AT_FDCWD, fname, perms->uid, perms->gid, 0) == -1) {
- pr_perror("Unable to change file owner and group");
- goto done;
- }
-
- if (fchmodat(AT_FDCWD, fname, perms->mode, 0) == -1) {
- pr_perror("Unable to change file mode bits");
- goto done;
- }
- }
-
- if (ui->ue->state != TCP_LISTEN)
- futex_set_and_wake(&ui->prepared, 1);
-
- ret = 0;
-done:
- revert_unix_sk_cwd(&cwd_fd);
- return ret;
-}
-
-static int unixsk_should_open_transport(FdinfoEntry *fe,
- struct file_desc *d)
-{
- struct unix_sk_info *ui;
-
- ui = container_of(d, struct unix_sk_info, d);
- return ui->flags & USK_PAIR_SLAVE;
-}
-
-static int open_unixsk_pair_master(struct unix_sk_info *ui)
-{
- int sk[2], tsk;
- struct unix_sk_info *peer = ui->peer;
- struct fdinfo_list_entry *fle;
-
- pr_info("Opening pair master (id %#x ino %#x peer %#x)\n",
- ui->ue->id, ui->ue->ino, ui->ue->peer);
-
- if (socketpair(PF_UNIX, ui->ue->type, 0, sk) < 0) {
- pr_perror("Can't make socketpair");
- return -1;
- }
-
- if (restore_sk_queue(sk[0], peer->ue->id))
- return -1;
- if (restore_sk_queue(sk[1], ui->ue->id))
- return -1;
-
- if (bind_unix_sk(sk[0], ui))
- return -1;
-
- if (rst_file_params(sk[0], ui->ue->fown, ui->ue->flags))
- return -1;
-
- if (restore_socket_opts(sk[0], ui->ue->opts))
- return -1;
-
- if (shutdown_unix_sk(sk[0], ui))
- return -1;
-
- tsk = socket(PF_UNIX, SOCK_DGRAM, 0);
- if (tsk < 0) {
- pr_perror("Can't make transport socket");
- return -1;
- }
-
- fle = file_master(&peer->d);
- if (send_fd_to_peer(sk[1], fle, tsk)) {
- pr_err("Can't send pair slave\n");
- return -1;
- }
-
- close(tsk);
- close(sk[1]);
-
- return sk[0];
-}
-
-static int open_unixsk_pair_slave(struct unix_sk_info *ui)
-{
- struct fdinfo_list_entry *fle;
- int sk;
-
- fle = file_master(&ui->d);
-
- pr_info("Opening pair slave (id %#x ino %#x peer %#x) on %d\n",
- ui->ue->id, ui->ue->ino, ui->ue->peer, fle->fe->fd);
-
- sk = recv_fd(fle->fe->fd);
- if (sk < 0) {
- pr_err("Can't recv pair slave\n");
- return -1;
- }
- close(fle->fe->fd);
-
- if (bind_unix_sk(sk, ui))
- return -1;
-
- if (rst_file_params(sk, ui->ue->fown, ui->ue->flags))
- return -1;
-
- if (restore_socket_opts(sk, ui->ue->opts))
- return -1;
-
- if (ui->ue->type == SOCK_DGRAM)
- /*
- * Stream socket's "slave" end will be shut down
- * together with master
- */
- if (shutdown_unix_sk(sk, ui))
- return -1;
-
- return sk;
-}
-
-static int open_unixsk_standalone(struct unix_sk_info *ui)
-{
- int sk;
-
- pr_info("Opening standalone socket (id %#x ino %#x peer %#x)\n",
- ui->ue->id, ui->ue->ino, ui->ue->peer);
-
- /*
- * Check if this socket was connected to criu service.
- * If so, put response, that dumping and restoring
- * was successful.
- */
- if (ui->ue->uflags & USK_SERVICE) {
- int sks[2];
-
- if (socketpair(PF_UNIX, ui->ue->type, 0, sks)) {
- pr_perror("Can't create socketpair");
- return -1;
- }
-
- if (send_criu_dump_resp(sks[1], true, true) == -1)
- return -1;
-
- close(sks[1]);
- sk = sks[0];
- } else if ((ui->ue->state == TCP_ESTABLISHED) && !ui->ue->peer) {
- int ret, sks[2];
-
- if (ui->ue->type != SOCK_STREAM) {
- pr_err("Non-stream socket %x in established state\n",
- ui->ue->ino);
- return -1;
- }
-
- if (ui->ue->shutdown != SK_SHUTDOWN__BOTH) {
- pr_err("Wrong shutdown/peer state for %x\n",
- ui->ue->ino);
- return -1;
- }
-
- ret = socketpair(PF_UNIX, ui->ue->type, 0, sks);
- if (ret < 0) {
- pr_perror("Can't create socketpair");
- return -1;
- }
-
- /*
- * Restore queue at the one end,
- * before closing the second one.
- */
- if (restore_sk_queue(sks[1], ui->ue->id)) {
- pr_perror("Can't restore socket queue");
- return -1;
- }
-
- close(sks[1]);
- sk = sks[0];
- } else if (ui->ue->type == SOCK_DGRAM && !ui->queuer) {
- struct sockaddr_un addr;
- int sks[2];
-
- if (socketpair(PF_UNIX, ui->ue->type, 0, sks) < 0) {
- pr_perror("Can't create socketpair");
- return -1;
- }
-
- sk = sks[0];
- addr.sun_family = AF_UNSPEC;
-
- /*
- * socketpair() assigns sks[1] as a peer of sks[0]
- * (and vice versa). But in this case (not zero peer)
- * it's impossible for other sockets to connect
- * to sks[0] (see unix_dgram_connect()->unix_may_send()).
- * The below is hack: we use that connect with AF_UNSPEC
- * clears socket's peer.
- */
- if (connect(sk, &addr, sizeof(addr.sun_family))) {
- pr_perror("Can't clear socket's peer");
- return -1;
- }
-
- /*
- * This must be after the connect() hack, because
- * connect() flushes receive queue.
- */
- if (restore_sk_queue(sks[1], ui->ue->id)) {
- pr_perror("Can't restore socket queue");
- return -1;
- }
- close(sks[1]);
- } else {
- if (ui->ue->uflags & USK_CALLBACK) {
- sk = run_plugins(RESTORE_UNIX_SK, ui->ue->ino);
- if (sk >= 0)
- goto out;
- }
-
- /*
- * Connect to external sockets requires
- * special option to be passed.
- */
- if (ui->peer && (ui->peer->ue->uflags & USK_EXTERN) &&
- !(opts.ext_unix_sk)) {
- pr_err("External socket found in image. "
- "Consider using the --" USK_EXT_PARAM
- "option to allow restoring it.\n");
- return -1;
- }
-
-
- sk = socket(PF_UNIX, ui->ue->type, 0);
- if (sk < 0) {
- pr_perror("Can't make unix socket");
- return -1;
- }
- }
-
- if (bind_unix_sk(sk, ui))
- return -1;
-
- if (ui->ue->state == TCP_LISTEN) {
- pr_info("\tPutting %#x into listen state\n", ui->ue->ino);
- if (listen(sk, ui->ue->backlog) < 0) {
- pr_perror("Can't make usk listen");
- return -1;
- }
- futex_set_and_wake(&ui->prepared, 1);
- }
-out:
- if (rst_file_params(sk, ui->ue->fown, ui->ue->flags))
- return -1;
-
- if (restore_socket_opts(sk, ui->ue->opts))
- return -1;
-
- return sk;
-}
-
-static int open_unix_sk(struct file_desc *d)
-{
- struct unix_sk_info *ui;
-
- ui = container_of(d, struct unix_sk_info, d);
-
- int unixsk_fd = -1;
-
- if (inherited_fd(d, &unixsk_fd)) {
- ui->ue->uflags |= USK_INHERIT;
- return unixsk_fd;
- } else if (ui->flags & USK_PAIR_MASTER)
- return open_unixsk_pair_master(ui);
- else if (ui->flags & USK_PAIR_SLAVE)
- return open_unixsk_pair_slave(ui);
- else
- return open_unixsk_standalone(ui);
-}
-
-static char *socket_d_name(struct file_desc *d, char *buf, size_t s)
-{
- struct unix_sk_info *ui;
-
- ui = container_of(d, struct unix_sk_info, d);
-
- if (snprintf(buf, s, "socket:[%d]", ui->ue->ino) >= s) {
- pr_err("Not enough room for unixsk %d identifier string\n",
- ui->ue->ino);
- return NULL;
- }
-
- return buf;
-}
-
-static struct file_desc_ops unix_desc_ops = {
- .type = FD_TYPES__UNIXSK,
- .open = open_unix_sk,
- .post_open = post_open_unix_sk,
- .want_transport = unixsk_should_open_transport,
- .name = socket_d_name,
-};
-
-/*
- * Make FS clean from sockets we're about to
- * restore. See for how we bind them for details
- */
-static void unlink_stale(struct unix_sk_info *ui)
-{
- int ret, cwd_fd;
-
- if (ui->name[0] == '\0' || (ui->ue->uflags & USK_EXTERN))
- return;
-
- if (prep_unix_sk_cwd(ui, &cwd_fd))
- return;
-
- ret = unlinkat(AT_FDCWD, ui->name, 0) ? -1 : 0;
- if (ret < 0) {
- pr_warn("Can't unlink stale socket %#x peer %#x (name %s dir %s)\n",
- ui->ue->ino, ui->ue->peer,
- ui->name ? (ui->name[0] ? ui->name : &ui->name[1]) : "-",
- ui->name_dir ? ui->name_dir : "-");
- }
- revert_unix_sk_cwd(&cwd_fd);
-}
-
-static int collect_one_unixsk(void *o, ProtobufCMessage *base)
-{
- struct unix_sk_info *ui = o;
-
- ui->ue = pb_msg(base, UnixSkEntry);
- ui->name_dir = (void *)ui->ue->name_dir;
-
- if (ui->ue->name.len) {
- if (ui->ue->name.len > UNIX_PATH_MAX) {
- pr_err("Bad unix name len %d\n", (int)ui->ue->name.len);
- return -1;
- }
-
- ui->name = (void *)ui->ue->name.data;
-
- unlink_stale(ui);
- } else
- ui->name = NULL;
-
- futex_init(&ui->prepared);
- ui->queuer = 0;
- ui->peer = NULL;
- ui->flags = 0;
- pr_info(" `- Got %#x peer %#x (name %s dir %s)\n",
- ui->ue->ino, ui->ue->peer,
- ui->name ? (ui->name[0] ? ui->name : &ui->name[1]) : "-",
- ui->name_dir ? ui->name_dir : "-");
- list_add_tail(&ui->list, &unix_sockets);
- return file_desc_add(&ui->d, ui->ue->id, &unix_desc_ops);
-}
-
-struct collect_image_info unix_sk_cinfo = {
- .fd_type = CR_FD_UNIXSK,
- .pb_type = PB_UNIX_SK,
- .priv_size = sizeof(struct unix_sk_info),
- .collect = collect_one_unixsk,
- .flags = COLLECT_SHARED,
-};
-
-int collect_unix_sockets(void)
-{
- return read_sk_queues();
-}
-
-int resolve_unix_peers(void)
-{
- struct unix_sk_info *ui, *peer;
- struct fdinfo_list_entry *fle, *fle_peer;
-
- list_for_each_entry(ui, &unix_sockets, list) {
- if (ui->peer)
- continue;
- if (!ui->ue->peer)
- continue;
-
- peer = find_unix_sk_by_ino(ui->ue->peer);
-
- if (!peer) {
- pr_err("FATAL: Peer %#x unresolved for %#x\n",
- ui->ue->peer, ui->ue->ino);
- return -1;
- }
-
- ui->peer = peer;
- if (!peer->queuer)
- peer->queuer = ui->ue->ino;
- if (ui == peer)
- /* socket connected to self %) */
- continue;
- if (peer->ue->peer != ui->ue->ino)
- continue;
-
- /* socketpair or interconnected sockets */
- peer->peer = ui;
-
- /*
- * Select who will restore the pair. Check is identical to
- * the one in pipes.c and makes sure tasks wait for each other
- * in pids sorting order (ascending).
- */
-
- fle = file_master(&ui->d);
- fle_peer = file_master(&peer->d);
-
- if (fdinfo_rst_prio(fle, fle_peer)) {
- ui->flags |= USK_PAIR_MASTER;
- peer->flags |= USK_PAIR_SLAVE;
- } else {
- peer->flags |= USK_PAIR_MASTER;
- ui->flags |= USK_PAIR_SLAVE;
- }
- }
-
- pr_info("Unix sockets:\n");
- list_for_each_entry(ui, &unix_sockets, list) {
- struct fdinfo_list_entry *fle;
-
- pr_info("\t%#x -> %#x (%#x) flags %#x\n", ui->ue->ino, ui->ue->peer,
- ui->peer ? ui->peer->ue->ino : 0, ui->flags);
- list_for_each_entry(fle, &ui->d.fd_info_head, desc_list)
- pr_info("\t\tfd %d in pid %d\n",
- fle->fe->fd, fle->pid);
-
- }
-
- return 0;
-}
-
-int unix_sk_id_add(ino_t ino)
-{
- struct unix_sk_exception *unix_sk;
-
- /* TODO: validate inode here? */
-
- unix_sk = xmalloc(sizeof *unix_sk);
- if (unix_sk == NULL)
- return -1;
- unix_sk->unix_sk_ino = ino;
- list_add_tail(&unix_sk->unix_sk_list, &opts.ext_unixsk_ids);
-
- return 0;
-}
-
-int unix_sk_ids_parse(char *optarg)
-{
- /*
- * parsing option of the following form: --ext-unix-sk=<inode value>,<inode
- * value>... or short form -x<inode>,<inode>...
- */
-
- char *iter = optarg;
-
- while (*iter != '\0') {
- if (*iter == ',')
- iter++;
- else {
- ino_t ino = (ino_t)strtoul(iter, &iter, 10);
-
- if (0 == ino) {
- pr_err("Can't parse unix socket inode from optarg: %s\n", optarg);
- return -1;
- }
- if (unix_sk_id_add(ino) < 0) {
- pr_err("Can't add unix socket inode in list: %s\n", optarg);
- return -1;
- }
- }
- }
-
- return 0;
-}
-
diff --git a/sockets.c b/sockets.c
deleted file mode 100644
index d8d09aae2d15..000000000000
--- a/sockets.c
+++ /dev/null
@@ -1,731 +0,0 @@
-#include <unistd.h>
-#include <sys/socket.h>
-#include <linux/netlink.h>
-#include <linux/rtnetlink.h>
-#include <netinet/tcp.h>
-#include <errno.h>
-#include <linux/if.h>
-#include <linux/filter.h>
-#include <string.h>
-#include <netinet/in.h>
-
-#include "libnetlink.h"
-#include "sockets.h"
-#include "unix_diag.h"
-#include "inet_diag.h"
-#include "packet_diag.h"
-#include "netlink_diag.h"
-#include "files.h"
-#include "util-pie.h"
-#include "sk-packet.h"
-#include "namespaces.h"
-#include "net.h"
-#include "fs-magic.h"
-
-#ifndef SOCK_DIAG_BY_FAMILY
-#define SOCK_DIAG_BY_FAMILY 20
-#endif
-
-#define SK_HASH_SIZE 32
-
-#ifndef SO_GET_FILTER
-#define SO_GET_FILTER SO_ATTACH_FILTER
-#endif
-
-struct sock_diag_greq {
- u8 family;
- u8 protocol;
-};
-
-struct sock_diag_req {
- struct nlmsghdr hdr;
- union {
- struct unix_diag_req u;
- struct inet_diag_req_v2 i;
- struct packet_diag_req p;
- struct netlink_diag_req n;
- struct sock_diag_greq g;
- } r;
-};
-
-enum socket_cl_bits
-{
- NETLINK_CL_BIT,
- INET_TCP_CL_BIT,
- INET_UDP_CL_BIT,
- INET_UDPLITE_CL_BIT,
- INET6_TCP_CL_BIT,
- INET6_UDP_CL_BIT,
- INET6_UDPLITE_CL_BIT,
- UNIX_CL_BIT,
- PACKET_CL_BIT,
- _MAX_CL_BIT,
-};
-
-#define MAX_CL_BIT (_MAX_CL_BIT - 1)
-
-static DECLARE_BITMAP(socket_cl_bits, MAX_CL_BIT);
-
-static inline
-enum socket_cl_bits get_collect_bit_nr(unsigned int family, unsigned int proto)
-{
- if (family == AF_NETLINK)
- return NETLINK_CL_BIT;
- if (family == AF_UNIX)
- return UNIX_CL_BIT;
- if (family == AF_PACKET)
- return PACKET_CL_BIT;
- if (family == AF_INET) {
- if (proto == IPPROTO_TCP)
- return INET_TCP_CL_BIT;
- if (proto == IPPROTO_UDP)
- return INET_UDP_CL_BIT;
- if (proto == IPPROTO_UDPLITE)
- return INET_UDPLITE_CL_BIT;
- }
- if (family == AF_INET6) {
- if (proto == IPPROTO_TCP)
- return INET6_TCP_CL_BIT;
- if (proto == IPPROTO_UDP)
- return INET6_UDP_CL_BIT;
- if (proto == IPPROTO_UDPLITE)
- return INET6_UDPLITE_CL_BIT;
- }
-
- pr_err("Unknown pair family %d proto %d\n", family, proto);
- BUG();
- return -1;
-}
-
-static void set_collect_bit(unsigned int family, unsigned int proto)
-{
- enum socket_cl_bits nr;
-
- nr = get_collect_bit_nr(family, proto);
- set_bit(nr, socket_cl_bits);
-}
-
-bool socket_test_collect_bit(unsigned int family, unsigned int proto)
-{
- enum socket_cl_bits nr;
-
- nr = get_collect_bit_nr(family, proto);
- return test_bit(nr, socket_cl_bits) != 0;
-}
-
-static int probe_recv_one(struct nlmsghdr *h, void *arg)
-{
- pr_err("PROBE RECEIVED\n");
- return -1;
-}
-
-static int probe_err(int err, void *arg)
-{
- int expected_err = *(int *)arg;
-
- if (err == expected_err)
- return 0;
-
- pr_err("Diag module missing (%d)\n", err);
- return err;
-}
-
-static inline void probe_diag(int nl, struct sock_diag_req *req, int expected_err)
-{
- do_rtnl_req(nl, req, req->hdr.nlmsg_len, probe_recv_one, probe_err, &expected_err);
-}
-
-void preload_socket_modules()
-{
- int nl;
- struct sock_diag_req req;
-
- /*
- * If the task to dump (e.g. an LXC container) has any netlink
- * KOBJECT_UEVENT socket open and the _diag modules aren't
- * loaded is dumped, criu will freeze the task and then the
- * kernel will send it messages on the socket, and then we will
- * fail to dump because the socket has pending data. The Real
- * Solution is to dump this pending data, but we just make sure
- * modules are there beforehand for now so that the first dump
- * doesn't fail.
- */
-
- nl = socket(PF_NETLINK, SOCK_RAW, NETLINK_SOCK_DIAG);
- if (nl < 0)
- return;
-
- pr_info("Probing sock diag modules\n");
-
- memset(&req, 0, sizeof(req));
- req.hdr.nlmsg_type = SOCK_DIAG_BY_FAMILY;
- req.hdr.nlmsg_seq = CR_NLMSG_SEQ;
-
- /*
- * Probe UNIX, netlink and packet diag-s by feeding
- * to the kernel request that is shorter than they
- * expect, byt still containing the family to make
- * sure the family handler is there. The family-level
- * diag module would report EINVAL in this case.
- */
-
- req.hdr.nlmsg_len = sizeof(req.hdr) + sizeof(req.r.g);
- req.hdr.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST;
-
- req.r.g.family = AF_UNIX;
- probe_diag(nl, &req, -EINVAL);
-
- req.r.g.family = AF_PACKET;
- probe_diag(nl, &req, -EINVAL);
-
- req.r.g.family = AF_NETLINK;
- probe_diag(nl, &req, -EINVAL);
-
- /*
- * TCP and UDP(LITE) diags do not support such trick, only
- * inet_diag module can be probed like that. For the protocol
- * level ones it's OK to request for exact non-existing socket
- * and check for ENOENT being reported back as error.
- */
-
- req.hdr.nlmsg_len = sizeof(req.hdr) + sizeof(req.r.i);
- req.hdr.nlmsg_flags = NLM_F_REQUEST;
- req.r.i.sdiag_family = AF_INET;
-
- req.r.i.sdiag_protocol = IPPROTO_TCP;
- probe_diag(nl, &req, -ENOENT);
-
- req.r.i.sdiag_protocol = IPPROTO_UDP; /* UDLITE is merged with UDP */
- probe_diag(nl, &req, -ENOENT);
-
- close(nl);
- pr_info("Done probing\n");
-}
-
-static int dump_bound_dev(int sk, SkOptsEntry *soe)
-{
- int ret;
- char dev[IFNAMSIZ];
- socklen_t len = sizeof(dev);
-
- ret = getsockopt(sk, SOL_SOCKET, SO_BINDTODEVICE, &dev, &len);
- if (ret) {
- pr_perror("Can't get bound dev");
- return ret;
- }
-
- if (len == 0)
- return 0;
-
- pr_debug("\tDumping %s bound dev for sk\n", dev);
- soe->so_bound_dev = xmalloc(len);
- if (soe->so_bound_dev == NULL)
- return -1;
- strcpy(soe->so_bound_dev, dev);
- return 0;
-}
-
-static int restore_bound_dev(int sk, SkOptsEntry *soe)
-{
- char *n = soe->so_bound_dev;
-
- if (!n)
- return 0;
-
- pr_debug("\tBinding socket to %s dev\n", n);
- return do_restore_opt(sk, SOL_SOCKET, SO_BINDTODEVICE, n, strlen(n));
-}
-
-/*
- * Protobuf handles le/be himself, but the sock_filter is not just u64,
- * it's a structure and we have to preserve the fields order to be able
- * to move socket image across architectures.
- */
-
-static void encode_filter(struct sock_filter *f, u64 *img, int n)
-{
- int i;
-
- BUILD_BUG_ON(sizeof(*f) != sizeof(*img));
-
- for (i = 0; i < n; i++)
- img[i] = ((u64)f[i].code << 48) |
- ((u64)f[i].jt << 40) |
- ((u64)f[i].jf << 32) |
- ((u64)f[i].k << 0);
-}
-
-static void decode_filter(u64 *img, struct sock_filter *f, int n)
-{
- int i;
-
- for (i = 0; i < n; i++) {
- f[i].code = img[i] >> 48;
- f[i].jt = img[i] >> 40;
- f[i].jf = img[i] >> 32;
- f[i].k = img[i] >> 0;
- }
-}
-
-static int dump_socket_filter(int sk, SkOptsEntry *soe)
-{
- socklen_t len = 0;
- int ret;
- struct sock_filter *flt;
-
- ret = getsockopt(sk, SOL_SOCKET, SO_GET_FILTER, NULL, &len);
- if (ret) {
- pr_perror("Can't get socket filter len");
- return ret;
- }
-
- if (!len) {
- pr_info("No filter for socket\n");
- return 0;
- }
-
- flt = xmalloc(len * sizeof(*flt));
- if (!flt)
- return -1;
-
- ret = getsockopt(sk, SOL_SOCKET, SO_GET_FILTER, flt, &len);
- if (ret) {
- pr_perror("Can't get socket filter");
- xfree(flt);
- return ret;
- }
-
- soe->so_filter = xmalloc(len * sizeof(*soe->so_filter));
- if (!soe->so_filter) {
- xfree(flt);
- return -1;
- }
-
- encode_filter(flt, soe->so_filter, len);
- soe->n_so_filter = len;
- xfree(flt);
- return 0;
-}
-
-static int restore_socket_filter(int sk, SkOptsEntry *soe)
-{
- int ret;
- struct sock_fprog sfp;
-
- if (!soe->n_so_filter)
- return 0;
-
- pr_info("Restoring socket filter\n");
- sfp.len = soe->n_so_filter;
- sfp.filter = xmalloc(soe->n_so_filter * sfp.len);
- if (!sfp.filter)
- return -1;
-
- decode_filter(soe->so_filter, sfp.filter, sfp.len);
- ret = restore_opt(sk, SOL_SOCKET, SO_ATTACH_FILTER, &sfp);
- xfree(sfp.filter);
-
- return ret;
-}
-
-static struct socket_desc *sockets[SK_HASH_SIZE];
-
-struct socket_desc *lookup_socket(int ino, int family, int proto)
-{
- struct socket_desc *sd;
-
- if (!socket_test_collect_bit(family, proto)) {
- pr_err("Sockets (family %d, proto %d) are not collected\n",
- family, proto);
- return ERR_PTR(-EINVAL);
- }
-
- pr_debug("\tSearching for socket %x (family %d.%d)\n", ino, family, proto);
- for (sd = sockets[ino % SK_HASH_SIZE]; sd; sd = sd->next)
- if (sd->ino == ino) {
- BUG_ON(sd->family != family);
- return sd;
- }
-
- return NULL;
-}
-
-int sk_collect_one(int ino, int family, struct socket_desc *d)
-{
- struct socket_desc **chain;
-
- d->ino = ino;
- d->family = family;
- d->already_dumped = 0;
-
- chain = &sockets[ino % SK_HASH_SIZE];
- d->next = *chain;
- *chain = d;
-
- return 0;
-}
-
-int do_restore_opt(int sk, int level, int name, void *val, int len)
-{
- if (setsockopt(sk, level, name, val, len) < 0) {
- pr_perror("Can't set %d:%d (len %d)", level, name, len);
- return -1;
- }
-
- return 0;
-}
-
-static int sk_setbufs(void *arg, int fd, pid_t pid)
-{
- u32 *buf = (u32 *)arg;
-
- if (restore_opt(fd, SOL_SOCKET, SO_SNDBUFFORCE, &buf[0]))
- return -1;
- if (restore_opt(fd, SOL_SOCKET, SO_RCVBUFFORCE, &buf[1]))
- return -1;
-
- return 0;
-}
-
-/*
- * Set sizes of buffers to maximum and prevent blocking
- * Caller of this fn should call other socket restoring
- * routines to drop the non-blocking and set proper send
- * and receive buffers.
- */
-int restore_prepare_socket(int sk)
-{
- int flags;
- /* In kernel a bufsize has type int and a value is doubled. */
- u32 maxbuf[2] = { INT_MAX / 2, INT_MAX / 2 };
-
- if (userns_call(sk_setbufs, 0, maxbuf, sizeof(maxbuf), sk))
- return -1;
-
- /* Prevent blocking on restore */
- flags = fcntl(sk, F_GETFL, 0);
- if (flags == -1) {
- pr_perror("Unable to get flags for %d", sk);
- return -1;
- }
- if (fcntl(sk, F_SETFL, flags | O_NONBLOCK) ) {
- pr_perror("Unable to set O_NONBLOCK for %d", sk);
- return -1;
- }
-
- return 0;
-}
-
-int restore_socket_opts(int sk, SkOptsEntry *soe)
-{
- int ret = 0, val;
- struct timeval tv;
- /* In kernel a bufsize value is doubled. */
- u32 bufs[2] = { soe->so_sndbuf / 2, soe->so_rcvbuf / 2};
-
- pr_info("%d restore sndbuf %d rcv buf %d\n", sk, soe->so_sndbuf, soe->so_rcvbuf);
-
- /* setsockopt() multiplies the input values by 2 */
- ret |= userns_call(sk_setbufs, UNS_ASYNC, bufs, sizeof(bufs), sk);
-
- if (soe->has_so_priority) {
- pr_debug("\trestore priority %d for socket\n", soe->so_priority);
- ret |= restore_opt(sk, SOL_SOCKET, SO_PRIORITY, &soe->so_priority);
- }
- if (soe->has_so_rcvlowat) {
- pr_debug("\trestore rcvlowat %d for socket\n", soe->so_rcvlowat);
- ret |= restore_opt(sk, SOL_SOCKET, SO_RCVLOWAT, &soe->so_rcvlowat);
- }
- if (soe->has_so_mark) {
- pr_debug("\trestore mark %d for socket\n", soe->so_mark);
- ret |= restore_opt(sk, SOL_SOCKET, SO_MARK, &soe->so_mark);
- }
- if (soe->has_so_passcred && soe->so_passcred) {
- val = 1;
- pr_debug("\tset passcred for socket\n");
- ret |= restore_opt(sk, SOL_SOCKET, SO_PASSCRED, &val);
- }
- if (soe->has_so_passsec && soe->so_passsec) {
- val = 1;
- pr_debug("\tset passsec for socket\n");
- ret |= restore_opt(sk, SOL_SOCKET, SO_PASSSEC, &val);
- }
- if (soe->has_so_dontroute && soe->so_dontroute) {
- val = 1;
- pr_debug("\tset dontroute for socket\n");
- ret |= restore_opt(sk, SOL_SOCKET, SO_DONTROUTE, &val);
- }
- if (soe->has_so_no_check && soe->so_no_check) {
- val = 1;
- pr_debug("\tset no_check for socket\n");
- ret |= restore_opt(sk, SOL_SOCKET, SO_NO_CHECK, &val);
- }
-
- tv.tv_sec = soe->so_snd_tmo_sec;
- tv.tv_usec = soe->so_snd_tmo_usec;
- ret |= restore_opt(sk, SOL_SOCKET, SO_SNDTIMEO, &tv);
-
- tv.tv_sec = soe->so_rcv_tmo_sec;
- tv.tv_usec = soe->so_rcv_tmo_usec;
- ret |= restore_opt(sk, SOL_SOCKET, SO_RCVTIMEO, &tv);
-
- ret |= restore_bound_dev(sk, soe);
- ret |= restore_socket_filter(sk, soe);
-
- /* The restore of SO_REUSEADDR depends on type of socket */
-
- return ret;
-}
-
-int do_dump_opt(int sk, int level, int name, void *val, int len)
-{
- socklen_t aux = len;
-
- if (getsockopt(sk, level, name, val, &aux) < 0) {
- pr_perror("Can't get %d:%d opt", level, name);
- return -1;
- }
-
- if (aux != len) {
- pr_err("Len mismatch on %d:%d : %d, want %d\n",
- level, name, aux, len);
- return -1;
- }
-
- return 0;
-}
-
-int dump_socket_opts(int sk, SkOptsEntry *soe)
-{
- int ret = 0, val;
- struct timeval tv;
-
- ret |= dump_opt(sk, SOL_SOCKET, SO_SNDBUF, &soe->so_sndbuf);
- ret |= dump_opt(sk, SOL_SOCKET, SO_RCVBUF, &soe->so_rcvbuf);
- soe->has_so_priority = true;
- ret |= dump_opt(sk, SOL_SOCKET, SO_PRIORITY, &soe->so_priority);
- soe->has_so_rcvlowat = true;
- ret |= dump_opt(sk, SOL_SOCKET, SO_RCVLOWAT, &soe->so_rcvlowat);
- soe->has_so_mark = true;
- ret |= dump_opt(sk, SOL_SOCKET, SO_MARK, &soe->so_mark);
-
- ret |= dump_opt(sk, SOL_SOCKET, SO_SNDTIMEO, &tv);
- soe->so_snd_tmo_sec = tv.tv_sec;
- soe->so_snd_tmo_usec = tv.tv_usec;
-
- ret |= dump_opt(sk, SOL_SOCKET, SO_RCVTIMEO, &tv);
- soe->so_rcv_tmo_sec = tv.tv_sec;
- soe->so_rcv_tmo_usec = tv.tv_usec;
-
- ret |= dump_opt(sk, SOL_SOCKET, SO_REUSEADDR, &val);
- soe->reuseaddr = val ? true : false;
- soe->has_reuseaddr = true;
-
- ret |= dump_opt(sk, SOL_SOCKET, SO_PASSCRED, &val);
- soe->has_so_passcred = true;
- soe->so_passcred = val ? true : false;
-
- ret |= dump_opt(sk, SOL_SOCKET, SO_PASSSEC, &val);
- soe->has_so_passsec = true;
- soe->so_passsec = val ? true : false;
-
- ret |= dump_opt(sk, SOL_SOCKET, SO_DONTROUTE, &val);
- soe->has_so_dontroute = true;
- soe->so_dontroute = val ? true : false;
-
- ret |= dump_opt(sk, SOL_SOCKET, SO_NO_CHECK, &val);
- soe->has_so_no_check = true;
- soe->so_no_check = val ? true : false;
-
- ret |= dump_bound_dev(sk, soe);
- ret |= dump_socket_filter(sk, soe);
-
- return ret;
-}
-
-void release_skopts(SkOptsEntry *soe)
-{
- xfree(soe->so_filter);
- xfree(soe->so_bound_dev);
-}
-
-int dump_socket(struct fd_parms *p, int lfd, struct cr_img *img)
-{
- int family;
- const struct fdtype_ops *ops;
-
- if (dump_opt(lfd, SOL_SOCKET, SO_DOMAIN, &family))
- return -1;
-
- switch (family) {
- case AF_UNIX:
- ops = &unix_dump_ops;
- break;
- case AF_INET:
- ops = &inet_dump_ops;
- break;
- case AF_INET6:
- ops = &inet6_dump_ops;
- break;
- case AF_PACKET:
- ops = &packet_dump_ops;
- break;
- case AF_NETLINK:
- ops = &netlink_dump_ops;
- break;
- default:
- pr_err("BUG! Unknown socket collected (family %d)\n", family);
- return -1;
- }
-
- return do_dump_gen_file(p, lfd, ops, img);
-}
-
-static int inet_receive_one(struct nlmsghdr *h, void *arg)
-{
- struct inet_diag_req_v2 *i = arg;
- int type;
-
- switch (i->sdiag_protocol) {
- case IPPROTO_TCP:
- type = SOCK_STREAM;
- break;
- case IPPROTO_UDP:
- case IPPROTO_UDPLITE:
- type = SOCK_DGRAM;
- break;
- default:
- BUG_ON(1);
- return -1;
- }
-
- return inet_collect_one(h, i->sdiag_family, type);
-}
-
-static int do_collect_req(int nl, struct sock_diag_req *req, int size,
- int (*receive_callback)(struct nlmsghdr *h, void *), void *arg)
-{
- int tmp;
-
- tmp = do_rtnl_req(nl, req, size, receive_callback, NULL, arg);
-
- if (tmp == 0)
- set_collect_bit(req->r.n.sdiag_family, req->r.n.sdiag_protocol);
-
- return tmp;
-}
-
-int collect_sockets(struct ns_id *ns)
-{
- int err = 0, tmp;
- int nl = ns->net.nlsk;
- struct sock_diag_req req;
-
- memset(&req, 0, sizeof(req));
- req.hdr.nlmsg_len = sizeof(req);
- req.hdr.nlmsg_type = SOCK_DIAG_BY_FAMILY;
- req.hdr.nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST;
- req.hdr.nlmsg_seq = CR_NLMSG_SEQ;
-
- /* Collect UNIX sockets */
- req.r.u.sdiag_family = AF_UNIX;
- req.r.u.udiag_states = -1; /* All */
- req.r.u.udiag_show = UDIAG_SHOW_NAME | UDIAG_SHOW_VFS |
- UDIAG_SHOW_PEER | UDIAG_SHOW_ICONS |
- UDIAG_SHOW_RQLEN;
- tmp = do_collect_req(nl, &req, sizeof(req), unix_receive_one, NULL);
- if (tmp)
- err = tmp;
-
- /* Collect IPv4 TCP sockets */
- req.r.i.sdiag_family = AF_INET;
- req.r.i.sdiag_protocol = IPPROTO_TCP;
- req.r.i.idiag_ext = 0;
- /* Only listening and established sockets supported yet */
- req.r.i.idiag_states = (1 << TCP_LISTEN) | (1 << TCP_ESTABLISHED);
- tmp = do_collect_req(nl, &req, sizeof(req), inet_receive_one, &req.r.i);
- if (tmp)
- err = tmp;
-
- /* Collect IPv4 UDP sockets */
- req.r.i.sdiag_family = AF_INET;
- req.r.i.sdiag_protocol = IPPROTO_UDP;
- req.r.i.idiag_ext = 0;
- req.r.i.idiag_states = -1; /* All */
- tmp = do_collect_req(nl, &req, sizeof(req), inet_receive_one, &req.r.i);
- if (tmp)
- err = tmp;
-
- /* Collect IPv4 UDP-lite sockets */
- req.r.i.sdiag_family = AF_INET;
- req.r.i.sdiag_protocol = IPPROTO_UDPLITE;
- req.r.i.idiag_ext = 0;
- req.r.i.idiag_states = -1; /* All */
- tmp = do_collect_req(nl, &req, sizeof(req), inet_receive_one, &req.r.i);
- if (tmp)
- err = tmp;
-
- /* Collect IPv6 TCP sockets */
- req.r.i.sdiag_family = AF_INET6;
- req.r.i.sdiag_protocol = IPPROTO_TCP;
- req.r.i.idiag_ext = 0;
- /* Only listening sockets supported yet */
- req.r.i.idiag_states = (1 << TCP_LISTEN) | (1 << TCP_ESTABLISHED);
- tmp = do_collect_req(nl, &req, sizeof(req), inet_receive_one, &req.r.i);
- if (tmp)
- err = tmp;
-
- /* Collect IPv6 UDP sockets */
- req.r.i.sdiag_family = AF_INET6;
- req.r.i.sdiag_protocol = IPPROTO_UDP;
- req.r.i.idiag_ext = 0;
- req.r.i.idiag_states = -1; /* All */
- tmp = do_collect_req(nl, &req, sizeof(req), inet_receive_one, &req.r.i);
- if (tmp)
- err = tmp;
-
- /* Collect IPv6 UDP-lite sockets */
- req.r.i.sdiag_family = AF_INET6;
- req.r.i.sdiag_protocol = IPPROTO_UDPLITE;
- req.r.i.idiag_ext = 0;
- req.r.i.idiag_states = -1; /* All */
- tmp = do_collect_req(nl, &req, sizeof(req), inet_receive_one, &req.r.i);
- if (tmp)
- err = tmp;
-
- req.r.p.sdiag_family = AF_PACKET;
- req.r.p.sdiag_protocol = 0;
- req.r.p.pdiag_show = PACKET_SHOW_INFO | PACKET_SHOW_MCLIST |
- PACKET_SHOW_FANOUT | PACKET_SHOW_RING_CFG;
- tmp = do_collect_req(nl, &req, sizeof(req), packet_receive_one, NULL);
- if (tmp) {
- pr_warn("The current kernel doesn't support packet_diag\n");
- if (ns->ns_pid == 0 || tmp != -ENOENT) /* Fedora 19 */
- err = tmp;
- }
-
- req.r.n.sdiag_family = AF_NETLINK;
- req.r.n.sdiag_protocol = NDIAG_PROTO_ALL;
- req.r.n.ndiag_show = NDIAG_SHOW_GROUPS;
- tmp = do_collect_req(nl, &req, sizeof(req), netlink_receive_one, NULL);
- if (tmp) {
- pr_warn("The current kernel doesn't support netlink_diag\n");
- if (ns->ns_pid == 0 || tmp != -ENOENT) /* Fedora 19 */
- err = tmp;
- }
-
- /* don't need anymore */
- close(nl);
- ns->net.nlsk = -1;
-
- if (err && (ns->type == NS_CRIU)) {
- /*
- * If netns isn't dumped, criu will fail only
- * if an unsupported socket will be really dumped.
- */
- pr_info("Uncollected sockets! Will probably fail later.\n");
- err = 0;
- }
-
- return err;
-}
diff --git a/stats.c b/stats.c
deleted file mode 100644
index 2a80bb31bc0a..000000000000
--- a/stats.c
+++ /dev/null
@@ -1,157 +0,0 @@
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/time.h>
-#include "asm/atomic.h"
-#include "protobuf.h"
-#include "stats.h"
-#include "image.h"
-#include "protobuf/stats.pb-c.h"
-
-struct timing {
- struct timeval start;
- struct timeval total;
-};
-
-struct dump_stats {
- struct timing timings[DUMP_TIME_NR_STATS];
- unsigned long counts[DUMP_CNT_NR_STATS];
-};
-
-struct restore_stats {
- struct timing timings[RESTORE_TIME_NS_STATS];
- atomic_t counts[RESTORE_CNT_NR_STATS];
-};
-
-struct dump_stats *dstats;
-struct restore_stats *rstats;
-
-void cnt_add(int c, unsigned long val)
-{
- if (dstats != NULL) {
- BUG_ON(c >= DUMP_CNT_NR_STATS);
- dstats->counts[c] += val;
- } else if (rstats != NULL) {
- BUG_ON(c >= RESTORE_CNT_NR_STATS);
- atomic_add(val, &rstats->counts[c]);
- } else
- BUG();
-}
-
-static void timeval_accumulate(const struct timeval *from, const struct timeval *to,
- struct timeval *res)
-{
- suseconds_t usec;
-
- res->tv_sec += to->tv_sec - from->tv_sec;
- usec = to->tv_usec;
- if (usec < from->tv_usec) {
- usec += USEC_PER_SEC;
- res->tv_sec -= 1;
- }
- res->tv_usec += usec - from->tv_usec;
- if (res->tv_usec > USEC_PER_SEC) {
- res->tv_usec -= USEC_PER_SEC;
- res->tv_sec += 1;
- }
-}
-
-static struct timing *get_timing(int t)
-{
- if (dstats != NULL) {
- BUG_ON(t >= DUMP_TIME_NR_STATS);
- return &dstats->timings[t];
- } else if (rstats != NULL) {
- /*
- * FIXME -- this does _NOT_ work when called
- * from different tasks.
- */
- BUG_ON(t >= RESTORE_TIME_NS_STATS);
- return &rstats->timings[t];
- }
-
- BUG();
- return NULL;
-}
-
-void timing_start(int t)
-{
- struct timing *tm;
-
- tm = get_timing(t);
- gettimeofday(&tm->start, NULL);
-}
-
-void timing_stop(int t)
-{
- struct timing *tm;
- struct timeval now;
-
- tm = get_timing(t);
- gettimeofday(&now, NULL);
- timeval_accumulate(&tm->start, &now, &tm->total);
-}
-
-static void encode_time(int t, u_int32_t *to)
-{
- struct timing *tm;
-
- tm = get_timing(t);
- *to = tm->total.tv_sec * USEC_PER_SEC + tm->total.tv_usec;
-}
-
-void write_stats(int what)
-{
- StatsEntry stats = STATS_ENTRY__INIT;
- DumpStatsEntry ds_entry = DUMP_STATS_ENTRY__INIT;
- RestoreStatsEntry rs_entry = RESTORE_STATS_ENTRY__INIT;
- char *name;
- struct cr_img *img;
-
- pr_info("Writing stats\n");
- if (what == DUMP_STATS) {
- stats.dump = &ds_entry;
-
- encode_time(TIME_FREEZING, &ds_entry.freezing_time);
- encode_time(TIME_FROZEN, &ds_entry.frozen_time);
- encode_time(TIME_MEMDUMP, &ds_entry.memdump_time);
- encode_time(TIME_MEMWRITE, &ds_entry.memwrite_time);
- ds_entry.has_irmap_resolve = true;
- encode_time(TIME_IRMAP_RESOLVE, &ds_entry.irmap_resolve);
-
- ds_entry.pages_scanned = dstats->counts[CNT_PAGES_SCANNED];
- ds_entry.pages_skipped_parent = dstats->counts[CNT_PAGES_SKIPPED_PARENT];
- ds_entry.pages_written = dstats->counts[CNT_PAGES_WRITTEN];
-
- name = "dump";
- } else if (what == RESTORE_STATS) {
- stats.restore = &rs_entry;
-
- rs_entry.pages_compared = atomic_read(&rstats->counts[CNT_PAGES_COMPARED]);
- rs_entry.pages_skipped_cow = atomic_read(&rstats->counts[CNT_PAGES_SKIPPED_COW]);
- rs_entry.has_pages_restored = true;
- rs_entry.pages_restored = atomic_read(&rstats->counts[CNT_PAGES_RESTORED]);
-
- encode_time(TIME_FORK, &rs_entry.forking_time);
- encode_time(TIME_RESTORE, &rs_entry.restore_time);
-
- name = "restore";
- } else
- return;
-
- img = open_image_at(AT_FDCWD, CR_FD_STATS, O_DUMP, name);
- if (img) {
- pb_write_one(img, &stats, PB_STATS);
- close_image(img);
- }
-}
-
-int init_stats(int what)
-{
- if (what == DUMP_STATS) {
- dstats = xzalloc(sizeof(*dstats));
- return dstats ? 0 : -1;
- }
-
- rstats = shmalloc(sizeof(struct restore_stats));
- return rstats ? 0 : -1;
-}
diff --git a/string.c b/string.c
deleted file mode 100644
index 543c642912c6..000000000000
--- a/string.c
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Adopted from linux kernel
- */
-#include <sys/types.h>
-#include <string.h>
-
-#include "string.h"
-
-#ifndef CONFIG_HAS_STRLCPY
-/**
- * strlcpy - Copy a %NUL terminated string into a sized buffer
- * @dest: Where to copy the string to
- * @src: Where to copy the string from
- * @size: size of destination buffer
- *
- * Compatible with *BSD: the result is always a valid
- * NUL-terminated string that fits in the buffer (unless,
- * of course, the buffer size is zero). It does not pad
- * out the result like strncpy() does.
- */
-size_t strlcpy(char *dest, const char *src, size_t size)
-{
- size_t ret = strlen(src);
-
- if (size) {
- size_t len = (ret >= size) ? size - 1 : ret;
- memcpy(dest, src, len);
- dest[len] = '\0';
- }
- return ret;
-}
-#endif
-
-#ifndef CONFIG_HAS_STRLCAT
-/**
- * strlcat - Append a length-limited, %NUL-terminated string to another
- * @dest: The string to be appended to
- * @src: The string to append to it
- * @count: The size of the destination buffer.
- */
-size_t strlcat(char *dest, const char *src, size_t count)
-{
- size_t dsize = strlen(dest);
- size_t len = strlen(src);
- size_t res = dsize + len;
-
- /*
- * It's assumed that @dsize strictly
- * less than count. Otherwise it's
- * a bug. But we left it to a caller.
- */
- dest += dsize;
- count -= dsize;
- if (len >= count)
- len = count-1;
- memcpy(dest, src, len);
- dest[len] = 0;
- return res;
-}
-#endif
diff --git a/sysctl.c b/sysctl.c
deleted file mode 100644
index 21ae4cef19e8..000000000000
--- a/sysctl.c
+++ /dev/null
@@ -1,467 +0,0 @@
-#include <unistd.h>
-#include <fcntl.h>
-#include <ctype.h>
-#include <string.h>
-#include <stdlib.h>
-#include <sys/types.h>
-#include <sys/wait.h>
-#include <sched.h>
-
-#include "asm/types.h"
-#include "namespaces.h"
-#include "sysctl.h"
-#include "util.h"
-
-/* These are the namespaces we know how to restore in various ways.
- */
-#define KNOWN_NS_MASK (CLONE_NEWUTS | CLONE_NEWNET | CLONE_NEWIPC)
-
-struct sysctl_userns_req {
- int op;
- unsigned int ns;
- size_t nr_req;
- struct sysctl_req *reqs;
-};
-
-#define __SYSCTL_OP(__ret, __fd, __req, __type, __nr, __op) \
-do { \
- if (__op == CTL_READ) \
- __ret = sysctl_read_##__type(__fd, __req, \
- (__type *)(__req)->arg, \
- __nr); \
- else if (__op == CTL_WRITE) \
- __ret = sysctl_write_##__type(__fd, __req, \
- (__type *)(__req)->arg, \
- __nr); \
- else \
- __ret = -1; \
-} while (0)
-
-#define GEN_SYSCTL_READ_FUNC(__type, __conv) \
-static int sysctl_read_##__type(int fd, \
- struct sysctl_req *req, \
- __type *arg, \
- int nr) \
-{ \
- char buf[1024] = {0}; \
- int i, ret = -1; \
- char *p = buf; \
- \
- ret = read(fd, buf, sizeof(buf)); \
- if (ret < 0) { \
- pr_perror("Can't read %s", req->name); \
- ret = -1; \
- goto err; \
- } \
- \
- for (i = 0; i < nr && p < buf + sizeof(buf); p++, i++) \
- ((__type *)arg)[i] = __conv(p, &p, 10); \
- \
- if (i != nr) { \
- pr_err("Not enough params for %s (%d != %d)\n", \
- req->name, i, nr); \
- goto err; \
- } \
- \
- ret = 0; \
- \
-err: \
- return ret; \
-}
-
-#define GEN_SYSCTL_WRITE_FUNC(__type, __fmt) \
-static int sysctl_write_##__type(int fd, \
- struct sysctl_req *req, \
- __type *arg, \
- int nr) \
-{ \
- char buf[1024]; \
- int i, ret = -1; \
- int off = 0; \
- \
- for (i = 0; i < nr && off < sizeof(buf) - 1; i++) { \
- snprintf(&buf[off], sizeof(buf) - off, __fmt, arg[i]); \
- off += strlen(&buf[off]); \
- } \
- \
- if (i != nr) { \
- pr_err("Not enough space for %s (%d != %d)\n", \
- req->name, i, nr); \
- goto err; \
- } \
- \
- /* trailing spaces in format */ \
- while (off > 0 && isspace(buf[off - 1])) \
- off--; \
- buf[off + 0] = '\n'; \
- ret = write(fd, buf, off + 1); \
- if (ret < 0) { \
- pr_perror("Can't write %s", req->name); \
- ret = -1; \
- goto err; \
- } \
- \
- ret = 0; \
-err: \
- return ret; \
-}
-
-GEN_SYSCTL_READ_FUNC(u32, strtoul);
-GEN_SYSCTL_READ_FUNC(u64, strtoull);
-GEN_SYSCTL_READ_FUNC(s32, strtol);
-
-GEN_SYSCTL_WRITE_FUNC(u32, "%u ");
-GEN_SYSCTL_WRITE_FUNC(u64, "%"PRIu64" ");
-GEN_SYSCTL_WRITE_FUNC(s32, "%d ");
-
-static int
-sysctl_write_char(int fd, struct sysctl_req *req, char *arg, int nr)
-{
- pr_debug("%s nr %d\n", req->name, nr);
- if (dprintf(fd, "%s\n", arg) < 0)
- return -1;
-
- return 0;
-}
-
-static int
-sysctl_read_char(int fd, struct sysctl_req *req, char *arg, int nr)
-{
- int ret = -1;
-
- pr_debug("%s nr %d\n", req->name, nr);
- ret = read(fd, arg, nr);
- if (ret < 0) {
- pr_perror("Can't read %s", req->name);
- goto err;
- }
- ret = 0;
-
-err:
- return ret;
-}
-
-static int sysctl_userns_arg_size(int type)
-{
- switch(CTL_TYPE(type)) {
- case __CTL_U32A:
- return sizeof(u32) * CTL_LEN(type);
- case CTL_U32:
- return sizeof(u32);
- case CTL_32:
- return sizeof(s32);
- case __CTL_U64A:
- return sizeof(u64) * CTL_LEN(type);
- case CTL_U64:
- return sizeof(u64);
- case __CTL_STR:
- return sizeof(char) * CTL_LEN(type) + 1;
- default:
- pr_err("unknown arg type %d\n", type);
-
- /* Ensure overflow to cause an error */
- return MAX_UNSFD_MSG_SIZE;
- }
-}
-
-static int do_sysctl_op(int fd, struct sysctl_req *req, int op)
-{
- int ret = -1, nr = 1;
-
- switch (CTL_TYPE(req->type)) {
- case __CTL_U32A:
- nr = CTL_LEN(req->type);
- /* fallthrough */
- case CTL_U32:
- __SYSCTL_OP(ret, fd, req, u32, nr, op);
- break;
- case CTL_32:
- __SYSCTL_OP(ret, fd, req, s32, nr, op);
- break;
- case __CTL_U64A:
- nr = CTL_LEN(req->type);
- /* fallthrough */
- case CTL_U64:
- __SYSCTL_OP(ret, fd, req, u64, nr, op);
- break;
- case __CTL_STR:
- nr = CTL_LEN(req->type);
- __SYSCTL_OP(ret, fd, req, char, nr, op);
- break;
- }
-
- return ret;
-}
-
-static int __userns_sysctl_op(void *arg, int proc_fd, pid_t pid)
-{
- int fd, ret = -1, dir, i, status, *fds = NULL;
- struct sysctl_userns_req *userns_req = arg;
- int op = userns_req->op;
- struct sysctl_req *req, **reqs = NULL;
- sigset_t blockmask, oldmask;
- pid_t worker;
-
- // fix up the pointer
- req = userns_req->reqs = (struct sysctl_req *) &userns_req[1];
-
- /* For files in the IPC/UTS namespaces, restoring is more complicated
- * than for net. Unprivileged users cannot even open these files, so
- * they must be opened by usernsd. However, the value in the kernel is
- * changed for the IPC/UTS namespace that write()s to the open sysctl
- * file (not who opened it). So, we must set the value from inside the
- * usernsd caller's namespace. We:
- *
- * 1. unsd opens the sysctl files
- * 2. forks a task
- * 3. setns()es to the UTS/IPC namespace of the caller
- * 4. write()s to the files and exits
- */
- dir = open("/proc/sys", O_RDONLY, O_DIRECTORY);
- if (dir < 0) {
- pr_perror("Can't open sysctl dir");
- return -1;
- }
-
- fds = xmalloc(sizeof(int) * userns_req->nr_req);
- if (!fds)
- goto out;
-
- reqs = xmalloc(sizeof(struct sysctl_req) * userns_req->nr_req);
- if (!reqs)
- goto out;
-
- memset(fds, -1, sizeof(int) * userns_req->nr_req);
-
- for (i = 0; i < userns_req->nr_req; i++) {
- int arg_len = sysctl_userns_arg_size(req->type);
- int name_len = strlen((char *) &req[1]) + 1;
- int total_len = sizeof(*req) + arg_len + name_len;
- int flags;
-
- /* fix up the pointers */
- req->name = (char *) &req[1];
- req->arg = req->name + name_len;
-
- if (((char *) req) + total_len >= ((char *) userns_req) + MAX_UNSFD_MSG_SIZE) {
- pr_err("bad sysctl req %s, too big: %d\n", req->name, total_len);
- goto out;
- }
-
- if (op == CTL_READ)
- flags = O_RDONLY;
- else
- flags = O_WRONLY;
-
- fd = openat(dir, req->name, flags);
- if (fd < 0) {
- if (errno == ENOENT && (req->flags & CTL_FLAGS_OPTIONAL))
- continue;
- pr_perror("Can't open sysctl %s", req->name);
- goto out;
- }
-
- /* save a pointer to the req, so we don't need to recompute its
- * location
- */
- reqs[i] = req;
- fds[i] = fd;
-
- req = (struct sysctl_req *) (((char *) req) + total_len);
- }
-
- /*
- * Don't let the sigchld_handler() mess with us
- * calling waitpid() on the exited worker. The
- * same is done in cr_system().
- */
-
- sigemptyset(&blockmask);
- sigaddset(&blockmask, SIGCHLD);
- sigprocmask(SIG_BLOCK, &blockmask, &oldmask);
-
- worker = fork();
- if (worker < 0)
- goto out;
-
- if (!worker) {
- int nsfd;
- const char *nsname = ns_to_string(userns_req->ns);
-
- BUG_ON(!nsname);
- nsfd = openat(proc_fd, nsname, O_RDONLY);
- if (nsfd < 0) {
- pr_perror("failed to open pid %d's ns %s", pid, nsname);
- exit(1);
- }
-
- if (setns(nsfd, 0) < 0) {
- pr_perror("failed to setns to %d's ns %s", pid, nsname);
- exit(1);
- }
-
- close(nsfd);
-
- for (i = 0; i < userns_req->nr_req; i++) {
- if (do_sysctl_op(fds[i], reqs[i], op) < 0)
- exit(1);
- }
-
- exit(0);
- }
-
- if (waitpid(worker, &status, 0) != worker) {
- pr_perror("worker didn't die?");
- kill(worker, SIGKILL);
- goto out;
- }
- sigprocmask(SIG_SETMASK, &oldmask, NULL);
-
- if (!WIFEXITED(status) || WEXITSTATUS(status)) {
- pr_err("worker failed: %d\n", status);
- goto out;
- }
-
- ret = 0;
-
-out:
- if (fds) {
- for (i = 0; i < userns_req->nr_req; i++) {
- if (fds[i] < 0)
- break;
- close_safe(&fds[i]);
- }
-
- xfree(fds);
- }
-
- if (reqs)
- xfree(reqs);
-
- close_safe(&dir);
-
- return ret;
-}
-
-static int __nonuserns_sysctl_op(struct sysctl_req *req, size_t nr_req, int op)
-{
- int dir, ret, exit_code = -1;;
-
- dir = open("/proc/sys", O_RDONLY, O_DIRECTORY);
- if (dir < 0) {
- pr_perror("Can't open sysctl dir");
- return -1;
- }
-
- while (nr_req--) {
- int fd, flags;
-
- if (op == CTL_READ)
- flags = O_RDONLY;
- else
- flags = O_WRONLY;
-
- fd = openat(dir, req->name, flags);
- if (fd < 0) {
- if (errno == ENOENT && (req->flags & CTL_FLAGS_OPTIONAL)) {
- req++;
- continue;
- }
- pr_perror("Can't open sysctl %s", req->name);
- goto out;
- }
-
- ret = do_sysctl_op(fd, req, op);
- if (ret)
- goto out;
- close(fd);
- req++;
- }
-
- exit_code = 0;
-out:
- close(dir);
- return exit_code;
-}
-
-int sysctl_op(struct sysctl_req *req, size_t nr_req, int op, unsigned int ns)
-{
- int i, fd, ret;
- struct sysctl_userns_req *userns_req;
- struct sysctl_req *cur;
-
- if (nr_req == 0)
- return 0;
-
- if (ns & ~KNOWN_NS_MASK) {
- pr_err("don't know how to restore some namespaces in %u\n", ns);
- return -1;
- }
-
- /* The way sysctl files behave on open/write depends on the namespace
- * they correspond to. If we don't want to interact with something in a
- * namespace (e.g. kernel/cap_last_cap is global), we can do this from
- * the current process. Similarly, if we're accessing net namespaces,
- * we can just do the operation from our current process, since
- * anything with CAP_NET_ADMIN can write to the net/ sysctls, and we
- * still have that even when restoring in a user ns.
- *
- * For IPC/UTS, we restore them as described above.
- *
- * For read operations, we need to copy the values back to return.
- * Fortunately, we only do read on dump (or global reads on restore),
- * so we can do those in process as well.
- */
- if (!ns || ns & CLONE_NEWNET || op == CTL_READ)
- return __nonuserns_sysctl_op(req, nr_req, op);
-
- /*
- * In order to avoid lots of opening of /proc/sys for each struct sysctl_req,
- * we encode each array of sysctl_reqs into one contiguous region of memory so
- * it can be passed via userns_call if necessary. It looks like this:
- *
- * struct sysctl_userns_req struct sysctl_req name arg
- * ---------------------------------------------------------------------------
- * | op | nr_req | reqs | <fields> | name | arg | "the name" | "the arg" ...
- * ---------------------------------------------------------------------------
- * |____^ |______|__^ ^
- * |_______________|
- */
- userns_req = alloca(MAX_UNSFD_MSG_SIZE);
- userns_req->op = op;
- userns_req->nr_req = nr_req;
- userns_req->ns = ns;
- userns_req->reqs = (struct sysctl_req *) (&userns_req[1]);
-
- cur = userns_req->reqs;
- for (i = 0; i < nr_req; i++) {
- int arg_len = sysctl_userns_arg_size(req[i].type);
- int name_len = strlen(req[i].name) + 1;
- int total_len = sizeof(*cur) + arg_len + name_len;
-
- if (((char *) cur) + total_len >= ((char *) userns_req) + MAX_UNSFD_MSG_SIZE) {
- pr_err("sysctl msg %s too big: %d\n", req[i].name, total_len);
- return -1;
- }
-
- /* copy over the non-pointer fields */
- cur->type = req[i].type;
- cur->flags = req[i].flags;
-
- cur->name = (char *) &cur[1];
- strcpy(cur->name, req[i].name);
-
- cur->arg = cur->name + name_len;
- memcpy(cur->arg, req[i].arg, arg_len);
-
- cur = (struct sysctl_req *) (((char *) cur) + total_len);
- }
-
- fd = open_proc(PROC_SELF, "ns");
- if (fd < 0)
- return -1;
-
- ret = userns_call(__userns_sysctl_op, 0, userns_req, MAX_UNSFD_MSG_SIZE, fd);
- close(fd);
- return ret;
-}
diff --git a/sysfs_parse.c b/sysfs_parse.c
deleted file mode 100644
index 6497d53497e9..000000000000
--- a/sysfs_parse.c
+++ /dev/null
@@ -1,325 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-#include <string.h>
-#include <ctype.h>
-#include <errno.h>
-#include <sys/types.h>
-#include <dirent.h>
-#include <sys/stat.h>
-
-#include "cr_options.h"
-#include "criu-log.h"
-#include "xmalloc.h"
-#include "files.h"
-#include "proc_parse.h"
-#include "util.h"
-#include "sysfs_parse.h"
-#include "namespaces.h"
-
-/*
- * Currently, there are two kernel problems dealing with AUFS
- * filesystems. Until these problems are fixed in the kernel,
- * we have AUFS support in CRIU to handle the following issues:
- *
- * 1) /proc/<pid>/mountinfo: The problem is that for AUFS the root field
- * of the root entry is missing the pathname (it's only /). For example:
- *
- * 90 61 0:33 / / rw,relatime - aufs none rw,si=4476a910a24617e6
- *
- * To handle this issue, the user has to specify the root of the AUFS
- * filesystem with the --root command line option.
- *
- * 2) /proc/<pid>/map_files: The symlinks are absolute pathnames of the
- * corresponding *physical* files in the branch they exist. For example,
- * for a Docker container using AUFS, a symlink would look like:
- * 400000-489000 -> /var/lib/docker/aufs/diff/<LAYER_ID>/bin/<cmd>
- *
- * Therefore, when we use the link file descriptor vm_file_fd in
- * dump_one_reg_file() to read the link, we get the file's physical
- * absolute pathname which does not exist relative to the root of the
- * mount namespace and even if we used its relative pathname, the dev:ino
- * values would be different from the physical file's dev:ino causing the
- * dump to fail.
- *
- * To handle this issue, we figure out the "correct" paths when parsing
- * map_files and save it for later use. See fixup_aufs_vma_fd() for
- * details.
- */
-
-struct ns_id *aufs_nsid;
-static char **aufs_branches;
-
-/*
- * Parse out and save the AUFS superblock info in the
- * given buffer.
- */
-static int parse_aufs_sbinfo(struct mount_info *mi, char *sbinfo, int len)
-{
- char *cp;
- int n;
-
- cp = strstr(mi->options, "si=");
- if (!cp) {
- pr_err("Cannot find sbinfo in option string %s\n", mi->options);
- return -1;
- }
-
- /* all ok, copy */
- if (len < 4) { /* 4 for "si_" */
- pr_err("Buffer of %d bytes too small for sbinfo\n", len);
- return -1;
- }
- strcpy(sbinfo, "si_");
- n = 3;
- sbinfo += n;
- cp += n;
- while (isxdigit(*cp) && n < len) {
- *sbinfo++ = *cp++;
- n++;
- }
- if (n >= len) {
- pr_err("Sbinfo in options string %s too long\n", mi->options);
- return -1;
- }
- *sbinfo = '\0';
- return 0;
-}
-
-/*
- * If the specified path is in a branch, replace it
- * with pathname from root.
- */
-static int fixup_aufs_path(char *path, int size)
-{
- char rpath[PATH_MAX];
- int n;
- int blen;
-
- if (aufs_branches == NULL) {
- pr_err("No aufs branches to search for %s\n", path);
- return -1;
- }
-
- for (n = 0; aufs_branches[n] != NULL; n++) {
- blen = strlen(aufs_branches[n]);
- if (!strncmp(path, aufs_branches[n], blen))
- break;
- }
-
- if (aufs_branches[n] == NULL)
- return 0; /* not in a branch */
-
- n = snprintf(rpath, PATH_MAX, "%s", &path[blen]);
- if (n >= min(PATH_MAX, size)) {
- pr_err("Not enough space to replace %s\n", path);
- return -1;
- }
-
- pr_debug("Replacing %s with %s\n", path, rpath);
- strcpy(path, rpath);
- return n;
-}
-
-/*
- * Kernel stores patchnames to AUFS branches in the br<n> files in
- * the /sys/fs/aufs/si_<sbinfo> directory where <n> denotes a branch
- * number and <sbinfo> is a hexadecimal number in %lx format. For
- * example:
- *
- * $ cat /sys/fs/aufs/si_f598876b087ed883/br0
- * /path/to/branch0/directory=rw
- *
- * This function sets up an array of pointers to branch pathnames.
- */
-int parse_aufs_branches(struct mount_info *mi)
-{
- char path[AUFSBR_PATH_LEN];
- char *cp;
- int n;
- int ret;
- unsigned int br_num;
- unsigned int br_max;
- DIR *dp;
- FILE *fp;
- struct dirent *de;
-
- pr_info("Collecting AUFS branch pathnames ...\n");
-
- if (mi->nsid == 0) {
- pr_err("No nsid to parse its aufs branches\n");
- return -1;
- }
-
- if (mi->nsid == aufs_nsid) {
- pr_debug("Using cached aufs branch paths for nsid %p\n", aufs_nsid);
- return 0;
- }
-
- if (aufs_nsid)
- free_aufs_branches();
-
- strcpy(path, SYSFS_AUFS); /* /sys/fs/aufs/ */
- if (parse_aufs_sbinfo(mi, &path[sizeof SYSFS_AUFS - 1], SBINFO_LEN) < 0)
- return -1;
- if ((dp = opendir(path)) == NULL) {
- pr_perror("Cannot opendir %s", path);
- return -1;
- }
-
- /*
- * Find out how many branches we have.
- */
- br_max = 0;
- ret = 0;
- while (1) {
- errno = 0;
- if ((de = readdir(dp)) == NULL) {
- if (errno) {
- pr_perror("Cannot readdir %s", path);
- ret = -1;
- }
- break;
- }
-
- ret = sscanf(de->d_name, "br%d", &br_num);
- if (ret == 1 && br_num > br_max)
- br_max = br_num;
- }
- closedir(dp);
- if (ret == -1)
- return -1;
-
- /*
- * Default AUFS maximum is 127, so 1000 should be plenty.
- * If you increase the maximum to more than 3 digits,
- * make sure to change AUFSBR_PATH_LEN accordingly.
- */
- if (br_max > 999) {
- pr_err("Too many branches %d\n", br_max);
- return -1;
- }
-
- /*
- * Allocate an array of pointers to branch pathnames to be read.
- * Branches are indexed from 0 and we need a NULL pointer at the end.
- */
- aufs_branches = xzalloc((br_max + 2) * sizeof (char *));
- if (!aufs_branches)
- return -1;
-
- /*
- * Now read branch pathnames from the branch files.
- */
- n = strlen(path);
- for (br_num = 0; br_num <= br_max; br_num++) {
- fp = NULL;
-
- ret = snprintf(&path[n], sizeof path - n, "/br%d", br_num);
- if (ret >= sizeof path - n) {
- pr_err("Buffer overrun creating path for branch %d\n", br_num);
- goto err;
- }
-
- if ((fp = fopen(path, "r")) == NULL) {
- pr_perror("Cannot fopen %s", path);
- goto err;
- }
-
- if (fscanf(fp, "%ms=", &aufs_branches[br_num]) != 1 ||
- aufs_branches[br_num] == NULL) {
- pr_perror("Parse error reading %s", path);
- goto err;
- }
-
- /* chop off the trailing "=..." stuff */
- if ((cp = strchr(aufs_branches[br_num], '=')) == NULL) {
- pr_err("Bad format in branch pathname %s\n", aufs_branches[br_num]);
- goto err;
- }
- *cp = '\0';
-
- fclose(fp);
- /*
- * Log branch information for extenal utitilies that
- * want to recreate the process's AUFS filesystem
- * before calling criu restore.
- *
- * DO NOT CHANGE this format!
- */
- pr_info("%s : %s\n", path, aufs_branches[br_num]);
- }
-
- aufs_nsid = mi->nsid;
- return 0;
-
-err:
- if (fp)
- fclose(fp);
- free_aufs_branches();
- return -1;
-}
-
-/*
- * AUFS support to compensate for the kernel bug
- * exposing branch pathnames in map_files and providing
- * a wrong mnt_id value in /proc/<pid>/fdinfo/<fd>.
- *
- * If the link points inside a branch, save the
- * relative pathname from the root of the mount
- * namespace as well as the full pathname from
- * globl root (/) for later use in dump_filemap()
- * and parse_smaps().
- */
-int fixup_aufs_vma_fd(struct vma_area *vma)
-{
- char path[PATH_MAX];
- int len;
-
- path[0] = '.';
- len = read_fd_link(vma->vm_file_fd, &path[1], sizeof path - 1);
- if (len < 0)
- return -1;
-
- len = fixup_aufs_path(&path[1], sizeof path - 1);
- if (len <= 0)
- return len;
-
- vma->aufs_rpath = xmalloc(len + 2);
- if (!vma->aufs_rpath)
- return -1;
-
- strcpy(vma->aufs_rpath, path);
- if (opts.root) {
- /* skip ./ in path */
- vma->aufs_fpath = xsprintf("%s/%s", opts.root, &path[2]);
- if (!vma->aufs_fpath)
- return -1;
- }
- pr_debug("Saved AUFS paths %s and %s\n", vma->aufs_rpath, vma->aufs_fpath);
-
- if (stat(vma->aufs_fpath, vma->vmst) < 0) {
- pr_perror("Failed stat on map %"PRIx64" (%s)",
- vma->e->start, vma->aufs_fpath);
- return -1;
- }
-
- /* tell parse_smap() not to call get_fd_mntid() */
- vma->mnt_id = -1;
- return len;
-}
-
-void free_aufs_branches(void)
-{
- int n;
-
- if (aufs_branches) {
- for (n = 0; aufs_branches[n] != NULL; n++)
- xfree(aufs_branches[n]);
-
- xfree(aufs_branches);
- aufs_branches = NULL;
- }
-
- aufs_nsid = NULL;
-}
diff --git a/timerfd.c b/timerfd.c
deleted file mode 100644
index 019de69ef61b..000000000000
--- a/timerfd.c
+++ /dev/null
@@ -1,211 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <unistd.h>
-
-#include <sys/timerfd.h>
-#include <sys/ioctl.h>
-
-#include "protobuf.h"
-#include "protobuf/timerfd.pb-c.h"
-
-#include "proc_parse.h"
-#include "rst-malloc.h"
-#include "cr_options.h"
-#include "restorer.h"
-#include "timerfd.h"
-#include "pstree.h"
-#include "files.h"
-#include "imgset.h"
-#include "util.h"
-#include "log.h"
-#include "bug.h"
-
-#undef LOG_PREFIX
-#define LOG_PREFIX "timerfd: "
-
-struct timerfd_dump_arg {
- u32 id;
- const struct fd_parms *p;
-};
-
-struct timerfd_info {
- TimerfdEntry *tfe;
- struct file_desc d;
- int t_fd;
- struct list_head rlist;
-};
-
-static LIST_HEAD(rst_timerfds);
-
-unsigned long rst_timerfd_cpos;
-unsigned int rst_timerfd_nr = 0;
-
-int check_timerfd(void)
-{
- int fd, ret = -1;
-
- fd = timerfd_create(CLOCK_MONOTONIC, 0);
- if (fd < 0) {
- pr_perror("timerfd_create failed");
- return -1;
- } else {
- ret = ioctl(fd, TFD_IOC_SET_TICKS, NULL);
- if (ret < 0) {
- if (errno != EFAULT)
- pr_perror("No timerfd support for c/r");
- else
- ret = 0;
- }
- }
-
- close(fd);
- return ret;
-}
-
-int is_timerfd_link(char *link)
-{
- return is_anon_link_type(link, "[timerfd]");
-}
-
-static int dump_timerfd_entry(union fdinfo_entries *e, void *arg)
-{
- struct timerfd_dump_arg *da = arg;
- TimerfdEntry *tfy = &e->tfy;
-
- tfy->id = da->id;
- tfy->flags = da->p->flags;
- tfy->fown = (FownEntry *)&da->p->fown;
-
- pr_info("Dumping id %#x clockid %d it_value(%llu, %llu) it_interval(%llu, %llu)\n",
- tfy->id, tfy->clockid, (unsigned long long)tfy->vsec, (unsigned long long)tfy->vnsec,
- (unsigned long long)tfy->isec, (unsigned long long)tfy->insec);
-
- return pb_write_one(img_from_set(glob_imgset, CR_FD_TIMERFD), &e->tfy, PB_TIMERFD);
-}
-
-static int dump_one_timerfd(int lfd, u32 id, const struct fd_parms *p)
-{
- struct timerfd_dump_arg da = { .id = id, .p = p, };
- return parse_fdinfo(lfd, FD_TYPES__TIMERFD, dump_timerfd_entry, &da);
-}
-
-const struct fdtype_ops timerfd_dump_ops = {
- .type = FD_TYPES__TIMERFD,
- .dump = dump_one_timerfd,
-};
-
-/*
- * We need to restore timers at the very late stage in restorer
- * to eliminate the case when timer is expired but we have not
- * yet finished restore procedure and signal handlers are not
- * set up properly. We need to copy timers settings into restorer
- * area that's why post-open is used for.
- */
-static int timerfd_post_open(struct file_desc *d, int fd)
-{
- struct timerfd_info *info = container_of(d, struct timerfd_info, d);
-
- info->t_fd = fd;
- list_add_tail(&info->rlist, &rst_timerfds);
- return 0;
-}
-
-int rst_timerfd_prep(void)
-{
- struct timerfd_info *ti;
- struct restore_timerfd *t;
-
- rst_timerfd_cpos = rst_mem_align_cpos(RM_PRIVATE);
- list_for_each_entry(ti, &rst_timerfds, rlist) {
- TimerfdEntry *tfe = ti->tfe;
-
- t = rst_mem_alloc(sizeof(*t), RM_PRIVATE);
- if (!t)
- return -1;
-
- t->id = tfe->id;
- t->fd = ti->t_fd;
- t->clockid = tfe->clockid;
- t->ticks = (unsigned long)tfe->ticks;
- t->settime_flags = tfe->settime_flags;
- t->val.it_interval.tv_sec = (time_t)tfe->isec;
- t->val.it_interval.tv_nsec = (long)tfe->insec;
- t->val.it_value.tv_sec = (time_t)tfe->vsec;
- t->val.it_value.tv_nsec = (long)tfe->vnsec;
-
- rst_timerfd_nr++;
- }
-
- return 0;
-}
-
-static int timerfd_open(struct file_desc *d)
-{
- struct timerfd_info *info;
- TimerfdEntry *tfe;
- int tmp = -1;
-
- info = container_of(d, struct timerfd_info, d);
- tfe = info->tfe;
- pr_info("Creating timerfd id %#x clockid %d settime_flags %x ticks %llu "
- "it_value(%llu, %llu) it_interval(%llu, %llu)\n",
- tfe->id, tfe->clockid, tfe->settime_flags, (unsigned long long)tfe->ticks,
- (unsigned long long)tfe->vsec, (unsigned long long)tfe->vnsec,
- (unsigned long long)tfe->isec, (unsigned long long)tfe->insec);
-
- tmp = timerfd_create(tfe->clockid, 0);
- if (tmp < 0) {
- pr_perror("Can't create for %#x", tfe->id);
- return -1;
- }
-
- if (rst_file_params(tmp, tfe->fown, tfe->flags)) {
- pr_perror("Can't restore params for %#x", tfe->id);
- goto err_close;
- }
-
- return tmp;
-
-err_close:
- close_safe(&tmp);
- return -1;
-}
-
-static struct file_desc_ops timerfd_desc_ops = {
- .type = FD_TYPES__TIMERFD,
- .open = timerfd_open,
- .post_open = timerfd_post_open,
-};
-
-static int verify_timerfd(TimerfdEntry *tfe)
-{
- if (tfe->clockid != CLOCK_REALTIME &&
- tfe->clockid != CLOCK_MONOTONIC) {
- pr_err("Unknown clock type %d for %#x\n", tfe->clockid, tfe->id);
- return -1;
- }
-
- return 0;
-}
-
-static int collect_one_timerfd(void *o, ProtobufCMessage *msg)
-{
- struct timerfd_info *info = o;
-
- info->tfe = pb_msg(msg, TimerfdEntry);
- if (verify_timerfd(info->tfe)) {
- pr_err("Verification failed for %#x\n", info->tfe->id);
- return -1;
- }
-
- info->t_fd = -1;
-
- return file_desc_add(&info->d, info->tfe->id, &timerfd_desc_ops);
-}
-
-struct collect_image_info timerfd_cinfo = {
- .fd_type = CR_FD_TIMERFD,
- .pb_type = PB_TIMERFD,
- .priv_size = sizeof(struct timerfd_info),
- .collect = collect_one_timerfd,
-};
diff --git a/tty.c b/tty.c
deleted file mode 100644
index ef82583ddfd6..000000000000
--- a/tty.c
+++ /dev/null
@@ -1,1712 +0,0 @@
-#include <unistd.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <errno.h>
-#include <string.h>
-#include <limits.h>
-#include <fcntl.h>
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <sys/mman.h>
-#include <sys/ioctl.h>
-#include <termios.h>
-#include <linux/major.h>
-
-#include "compiler.h"
-#include "asm/types.h"
-
-#include "files.h"
-#include "cr_options.h"
-#include "imgset.h"
-#include "servicefd.h"
-#include "image.h"
-#include "util.h"
-#include "log.h"
-#include "list.h"
-#include "util-pie.h"
-#include "proc_parse.h"
-#include "file-ids.h"
-#include "files-reg.h"
-#include "namespaces.h"
-
-#include "protobuf.h"
-#include "protobuf/tty.pb-c.h"
-
-#include "parasite-syscall.h"
-#include "parasite.h"
-
-#include "pstree.h"
-#include "tty.h"
-
-/*
- * Here are some notes about overall TTY c/r design. At moment
- * we support unix98 ptys only. Supporting legacy BSD terminals
- * is impossible without help from the kernel side -- the indices
- * of such terminals are not reported anywhere in the kernel so that
- * we can't figure out active pairs.
- *
- * Usually the PTYs represent a pair of links -- master peer and slave
- * peer. Master peer must be opened before slave. Internally, when kernel
- * creates master peer it also generates a slave interface in a form of
- * /dev/pts/N, where N is that named pty "index". Master/slave connection
- * unambiguously identified by this index.
- *
- * Still, one master can carry multiple slaves -- for example a user opens
- * one master via /dev/ptmx and appropriate /dev/pts/N in sequence.
- * The result will be the following
- *
- * master
- * `- slave 1
- * `- slave 2
- *
- * both slave will have same master index but different file descriptors.
- * Still inside the kernel pty parameters are same for both slaves. Thus
- * only one slave parameters should be restored, there is no need to carry
- * all parameters for every slave peer we've found.
- *
- * Note the /dev/pts/ is rather convenient agreement and internally the
- * kernel doesn't care where exactly the inodes of ptys are laying --
- * it depends on "devpts" mount point path.
- */
-
-#undef LOG_PREFIX
-#define LOG_PREFIX "tty: "
-
-struct tty_info_entry {
- struct list_head list;
- TtyInfoEntry *tie;
-};
-
-struct tty_info {
- struct list_head list;
- struct file_desc d;
-
- struct file_desc *reg_d;
-
- TtyFileEntry *tfe;
- TtyInfoEntry *tie;
-
- struct list_head sibling;
- struct tty_driver *driver;
-
- bool create;
- bool inherit;
-
- struct tty_info *ctl_tty;
-};
-
-struct tty_dump_info {
- struct list_head list;
-
- u32 id;
- pid_t sid;
- pid_t pgrp;
- int fd;
- struct tty_driver *driver;
-};
-
-static LIST_HEAD(all_tty_info_entries);
-static LIST_HEAD(all_ttys);
-
-/*
- * Usually an application has not that many ttys opened.
- * If this won't be enough in future we simply need to
- * change tracking mechanism to some more extendable.
- *
- * This particular bitmap requires 256 bytes of memory.
- * Pretty acceptable trade off in a sake of simplicity.
- */
-
-#define MAX_TTYS 1024
-
-/*
- * Custom indices should be even numbers just in case if we
- * need odds for pair numbering someday.
- */
-
-#define MAX_PTY_INDEX 1000
-#define CONSOLE_INDEX 1002
-#define VT_INDEX 1004
-#define CTTY_INDEX 1006
-#define INDEX_ERR (MAX_TTYS + 1)
-
-static DECLARE_BITMAP(tty_bitmap, (MAX_TTYS << 1));
-static DECLARE_BITMAP(tty_active_pairs, (MAX_TTYS << 1));
-
-struct tty_driver {
- short type;
- short subtype;
- char *name;
- int index;
- int (*fd_get_index)(int fd, const struct fd_parms *p);
- int (*img_get_index)(struct tty_info *ti);
- int (*open)(struct tty_info *ti);
-};
-
-#define TTY_SUBTYPE_MASTER 0x0001
-#define TTY_SUBTYPE_SLAVE 0x0002
-
-static int ptm_fd_get_index(int fd, const struct fd_parms *p)
-{
- int index;
-
- if (ioctl(fd, TIOCGPTN, &index)) {
- pr_perror("Can't obtain ptmx index");
- return INDEX_ERR;
- }
-
- if (index > MAX_PTY_INDEX) {
- pr_err("Index %d on ptmx is too big\n", index);
- return INDEX_ERR;
- }
-
- return index;
-}
-
-static int pty_get_index(struct tty_info *ti)
-{
- return ti->tie->pty->index;
-}
-
-static int pty_open_ptmx(struct tty_info *info);
-
-static struct tty_driver ptm_driver = {
- .type = TTY_TYPE__PTY,
- .subtype = TTY_SUBTYPE_MASTER,
- .name = "ptmx",
- .fd_get_index = ptm_fd_get_index,
- .img_get_index = pty_get_index,
- .open = pty_open_ptmx,
-};
-
-static int open_simple_tty(struct tty_info *info);
-
-static struct tty_driver console_driver = {
- .type = TTY_TYPE__CONSOLE,
- .name = "console",
- .index = CONSOLE_INDEX,
- .open = open_simple_tty,
-};
-
-static struct tty_driver ctty_driver = {
- .type = TTY_TYPE__CTTY,
- .name = "ctty",
- .index = CTTY_INDEX,
- .open = open_simple_tty,
-};
-
-static struct tty_driver vt_driver = {
- .type = TTY_TYPE__VT,
- .name = "vt",
- .index = VT_INDEX,
- .open = open_simple_tty,
-};
-
-static int open_ext_tty(struct tty_info *info);
-static struct tty_driver ext_driver = {
- .type = TTY_TYPE__EXT_TTY,
- .name = "ext",
- .open = open_ext_tty,
-};
-
-static int pts_fd_get_index(int fd, const struct fd_parms *p)
-{
- int index;
- const struct fd_link *link = p->link;
- char *pos = strrchr(link->name, '/');
-
- if (!pos || pos == (link->name + link->len - 1)) {
- pr_err("Unexpected format on path %s\n", link->name + 1);
- return INDEX_ERR;
- }
-
- index = atoi(pos + 1);
- if (index > MAX_PTY_INDEX) {
- pr_err("Index %d on pts is too big\n", index);
- return INDEX_ERR;
- }
-
- return index;
-}
-
-static struct tty_driver pts_driver = {
- .type = TTY_TYPE__PTY,
- .subtype = TTY_SUBTYPE_SLAVE,
- .name = "pts",
- .fd_get_index = pts_fd_get_index,
- .img_get_index = pty_get_index,
- .open = pty_open_ptmx,
-};
-
-struct tty_driver *get_tty_driver(dev_t rdev, dev_t dev)
-{
- int major, minor;
- char id[42];
-
- snprintf(id, sizeof(id), "tty[%"PRIx64":%"PRIx64"]", rdev, dev);
- if (external_lookup_id(id) || inherit_fd_lookup_id(id) >= 0)
- return &ext_driver;
-
- major = major(rdev);
- minor = minor(rdev);
-
- switch (major) {
- case TTYAUX_MAJOR:
- if (minor == 2)
- return &ptm_driver;
- else if (minor == 1)
- return &console_driver;
- else if (minor == 0)
- return &ctty_driver;
- break;
- case TTY_MAJOR:
- if (minor > MIN_NR_CONSOLES && minor < MAX_NR_CONSOLES)
- /*
- * Minors [MIN_NR_CONSOLES; MAX_NR_CONSOLES] stand
- * for consoles (virtual terminals, VT in terms
- * of kernel).
- */
- return &vt_driver;
- case UNIX98_PTY_MASTER_MAJOR ... (UNIX98_PTY_MASTER_MAJOR + UNIX98_PTY_MAJOR_COUNT - 1):
- return &ptm_driver;
- case UNIX98_PTY_SLAVE_MAJOR:
- return &pts_driver;
- }
- return NULL;
-}
-
-static inline int is_pty(struct tty_driver *driver)
-{
- return driver->type == TTY_TYPE__PTY;
-}
-
-/*
- * /dev/ptmx is a shared resource between all tasks
- * so we need to serialize access to it.
- */
-static mutex_t *tty_mutex;
-
-static bool tty_is_master(struct tty_info *info);
-
-int prepare_shared_tty(void)
-{
- tty_mutex = shmalloc(sizeof(*tty_mutex));
- if (!tty_mutex) {
- pr_err("Can't create ptmx index mutex\n");
- return -1;
- }
-
- mutex_init(tty_mutex);
-
- return 0;
-}
-
-#define winsize_copy(d, s) \
- do { \
- ASSIGN_MEMBER((d), (s), ws_row); \
- ASSIGN_MEMBER((d), (s), ws_col); \
- ASSIGN_MEMBER((d), (s), ws_xpixel); \
- ASSIGN_MEMBER((d), (s), ws_ypixel); \
- } while (0)
-
-#define termios_copy(d, s) \
- do { \
- struct termios __t; \
- \
- memcpy((d)->c_cc, (s)->c_cc, \
- sizeof(__t.c_cc)); \
- \
- ASSIGN_MEMBER((d),(s), c_iflag); \
- ASSIGN_MEMBER((d),(s), c_oflag); \
- ASSIGN_MEMBER((d),(s), c_cflag); \
- ASSIGN_MEMBER((d),(s), c_lflag); \
- ASSIGN_MEMBER((d),(s), c_line); \
- } while (0)
-
-static int tty_gen_id(struct tty_driver *driver, int index)
-{
- return (index << 1) + (driver->subtype == TTY_SUBTYPE_MASTER);
-}
-
-static int tty_get_index(u32 id)
-{
- return id >> 1;
-}
-
-/* Make sure the active pairs do exist */
-int tty_verify_active_pairs(void)
-{
- unsigned long i, unpaired_slaves = 0;
-
- for_each_bit(i, tty_active_pairs) {
- if ((i % 2) == 0) {
- if (test_bit(i + 1, tty_active_pairs)) {
- i++;
- continue;
- }
-
- if (!opts.shell_job) {
- pr_err("Found slave peer index %d without "
- "correspond master peer\n",
- tty_get_index(i));
- return -1;
- }
-
- pr_debug("Unpaired slave %d\n", tty_get_index(i));
-
- if (++unpaired_slaves > 1) {
- pr_err("Only one slave external peer "
- "is allowed (index %d)\n",
- tty_get_index(i));
- return -1;
- }
- }
- }
-
- return 0;
-}
-
-static int tty_test_and_set(int bit, unsigned long *bitmap)
-{
- int ret;
-
- ret = test_bit(bit, bitmap);
- if (!ret)
- set_bit(bit, bitmap);
- return ret;
-}
-
-/*
- * Generate a regular file object in case if such is missed
- * in the image file, ie obsolete interface has been used on
- * checkpoint.
- */
-static struct file_desc *pty_alloc_reg(struct tty_info *info, bool add)
-{
- TtyFileEntry *tfe = info->tfe;
- const size_t namelen = 64;
- struct reg_file_info *r;
- static struct file_desc_ops noops = {};
-
- r = xzalloc(sizeof(*r) + sizeof(*r->rfe) + namelen);
- if (!r)
- return NULL;
-
- r->rfe = (void *)r + sizeof(*r);
- reg_file_entry__init(r->rfe);
-
- r->rfe->name = (void *)r + sizeof(*r) + sizeof(*r->rfe);
- if (tty_is_master(info))
- strcpy(r->rfe->name, "/dev/ptmx");
- else
- snprintf(r->rfe->name, namelen, "/dev/pts/%u",
- info->tie->pty->index);
-
- if (add)
- file_desc_add(&r->d, tfe->id, &noops);
- else
- file_desc_init(&r->d, tfe->id, &noops);
-
- r->rfe->id = tfe->id;
- r->rfe->flags = tfe->flags;
- r->rfe->fown = tfe->fown;
- r->path = &r->rfe->name[1];
-
- return &r->d;
-}
-
-/*
- * In case if we need to open a fake pty (for example
- * a master peer which were deleted at checkpoint moment,
- * or open a slave peer when restoring control terminal)
- * we need to create a new reg-file object taking @info
- * as a template. Here is a trick though: the @info might
- * represent master peer while we need to allocate a slave
- * one and the reverse. For such case taking path from the
- * @info as a template we generate that named 'inverted-path'.
- *
- * For example if the master peer was /dev/pts/ptmx with index 1,
- * the inverted path is /dev/pts/1, for inverted slaves it's simplier
- * we just add 'ptmx' postfix.
- */
-static struct reg_file_info *pty_alloc_fake_reg(struct tty_info *info, int subtype)
-{
- struct reg_file_info *new, *orig;
- struct file_desc *fake_desc;
-
- pr_debug("Allocating fake descriptor for %#x (reg_d %p)\n",
- info->tfe->id, info->reg_d);
-
- BUG_ON(!info->reg_d);
- BUG_ON(!is_pty(info->driver));
-
- fake_desc = pty_alloc_reg(info, false);
- if (!fake_desc)
- return NULL;
-
- orig = container_of(info->reg_d, struct reg_file_info, d);
- new = container_of(fake_desc, struct reg_file_info, d);
-
- if ((subtype == TTY_SUBTYPE_MASTER && tty_is_master(info)) ||
- (subtype == TTY_SUBTYPE_SLAVE && !tty_is_master(info))) {
- new->path = xstrdup(orig->path);
- new->rfe->name = &new->path[1];
- } else {
- char *pos = strrchr(orig->rfe->name, '/');
- size_t len = strlen(orig->rfe->name) + 1;
- size_t slash_at = pos - orig->rfe->name;
- char *inverted_path = xmalloc(len + 32);
-
- BUG_ON(!pos || !inverted_path);
-
- memcpy(inverted_path, orig->rfe->name, slash_at + 1);
- if (subtype == TTY_SUBTYPE_MASTER)
- strcat(inverted_path, "ptmx");
- else {
- if (slash_at >= 3 && strncmp(&inverted_path[slash_at - 3], "pts", 3))
- snprintf(&inverted_path[slash_at + 1], 10, "pts/%u",
- info->tie->pty->index);
- else
- snprintf(&inverted_path[slash_at + 1], 10, "%u",
- info->tie->pty->index);
- }
-
- new->rfe->name = inverted_path;
- new->path = &inverted_path[1];
- }
-
- return new;
-}
-
-#define pty_alloc_fake_master(info) pty_alloc_fake_reg(info, TTY_SUBTYPE_MASTER)
-#define pty_alloc_fake_slave(info) pty_alloc_fake_reg(info, TTY_SUBTYPE_SLAVE)
-
-static void pty_free_fake_reg(struct reg_file_info **r)
-{
- if (*r) {
- xfree((*r)->rfe->name);
- xfree((*r));
- *r = NULL;
- }
-}
-
-static int open_tty_reg(struct file_desc *reg_d, u32 flags)
-{
- /*
- * Never set as a control terminal automatically, all
- * ctty magic happens only in tty_set_sid().
- */
- flags |= O_NOCTTY;
- return open_path(reg_d, do_open_reg_noseek_flags, &flags);
-}
-
-static char *path_from_reg(struct file_desc *d)
-{
- struct reg_file_info *rfi = container_of(d, struct reg_file_info, d);
- return rfi->path;
-}
-
-static int pty_open_ptmx_index(struct file_desc *d, int index, int flags)
-{
- int fds[32], i, ret = -1, cur_idx;
-
- memset(fds, 0xff, sizeof(fds));
-
- mutex_lock(tty_mutex);
-
- for (i = 0; i < ARRAY_SIZE(fds); i++) {
- fds[i] = open_tty_reg(d, flags);
- if (fds[i] < 0) {
- pr_perror("Can't open %s", path_from_reg(d));
- break;
- }
-
- if (ioctl(fds[i], TIOCGPTN, &cur_idx)) {
- pr_perror("Can't obtain current index on %s",
- path_from_reg(d));
- break;
- }
-
- pr_debug("\t\tptmx opened with index %d\n", cur_idx);
-
- if (cur_idx == index) {
- pr_info("ptmx opened with index %d\n", cur_idx);
- ret = fds[i];
- fds[i] = -1;
- break;
- }
-
- /*
- * Maybe indices are already borrowed by
- * someone else, so no need to continue.
- */
- if (cur_idx < index && (index - cur_idx) < ARRAY_SIZE(fds))
- continue;
-
- pr_err("Unable to open %s with specified index %d\n",
- path_from_reg(d), index);
- break;
- }
-
- for (i = 0; i < ARRAY_SIZE(fds); i++) {
- if (fds[i] >= 0)
- close(fds[i]);
- }
-
- mutex_unlock(tty_mutex);
-
- return ret;
-}
-
-static int unlock_pty(int fd)
-{
- const int lock = 0;
-
- /*
- * Usually when ptmx opened it gets locked
- * by kernel and we need to unlock it to be
- * able to connect slave peer.
- */
- if (ioctl(fd, TIOCSPTLCK, &lock)) {
- pr_err("Unable to unlock pty device via y%d\n", fd);
- return -1;
- }
-
- return 0;
-}
-
-static int lock_pty(int fd)
-{
- const int lock = 1;
-
- if (ioctl(fd, TIOCSPTLCK, &lock)) {
- pr_err("Unable to lock pty device via %d\n", fd);
- return -1;
- }
-
- return 0;
-}
-
-static int tty_set_sid(int fd)
-{
- if (ioctl(fd, TIOCSCTTY, 1)) {
- pr_perror("Can't set sid on terminal fd %d", fd);
- return -1;
- }
-
- return 0;
-}
-
-static int tty_set_prgp(int fd, int group)
-{
- if (ioctl(fd, TIOCSPGRP, &group)) {
- pr_perror("Failed to set group %d on %d", group, fd);
- return -1;
- }
- return 0;
-}
-
-static int tty_restore_ctl_terminal(struct file_desc *d, int fd)
-{
- struct tty_info *info = container_of(d, struct tty_info, d);
- struct tty_driver *driver = info->driver;
- struct reg_file_info *fake = NULL;
- struct file_desc *slave_d;
- int slave = -1, ret = -1, index = -1;
-
- if (!is_service_fd(fd, CTL_TTY_OFF))
- return 0;
-
- if (driver->type == TTY_TYPE__EXT_TTY) {
- slave = -1;
- if (!inherited_fd(&info->d, &slave) && slave < 0)
- return -1;
- goto out;
- }
- if (driver->img_get_index)
- index = driver->img_get_index(info);
- else
- index = driver->index;
-
- if (is_pty(info->driver)) {
- fake = pty_alloc_fake_slave(info);
- if (!fake)
- goto err;
-
- slave_d = &fake->d;
- } else
- slave_d = info->reg_d;
-
- slave = open_tty_reg(slave_d, O_RDONLY);
- if (slave < 0) {
- pr_perror("Can't open %s", path_from_reg(slave_d));
- goto err;
- }
-
-out:
- pr_info("Restore session %d by %d tty (index %d)\n",
- info->tie->sid, (int)getpid(), index);
-
- ret = tty_set_sid(slave);
- if (!ret)
- ret = tty_set_prgp(slave, info->tie->pgrp);
-
- close(slave);
-err:
- pty_free_fake_reg(&fake);
- close(fd);
- return ret;
-}
-
-static bool tty_is_master(struct tty_info *info)
-{
- if (info->driver->subtype == TTY_SUBTYPE_MASTER)
- return true;
-
- switch (info->driver->type) {
- case TTY_TYPE__CONSOLE:
- case TTY_TYPE__CTTY:
- return true;
- case TTY_TYPE__VT:
- if (!opts.shell_job)
- return true;
- break;
- case TTY_TYPE__EXT_TTY:
- return true;
- }
-
- return false;
-}
-
-static bool tty_is_hung(struct tty_info *info)
-{
- return info->tie->termios == NULL;
-}
-
-static bool tty_has_active_pair(struct tty_info *info)
-{
- int d = tty_is_master(info) ? -1 : + 1;
-
- return test_bit(info->tfe->tty_info_id + d,
- tty_active_pairs);
-}
-
-static void tty_show_pty_info(char *prefix, struct tty_info *info)
-{
- int index = -1;
- struct tty_driver *driver = info->driver;
-
- if (driver->img_get_index)
- index = driver->img_get_index(info);
- else
- index = driver->index;
-
- pr_info("%s driver %s id %#x index %d (master %d sid %d pgrp %d inherit %d)\n",
- prefix, info->driver->name, info->tfe->id, index,
- tty_is_master(info), info->tie->sid, info->tie->pgrp, info->inherit);
-}
-
-struct tty_parms {
- int tty_id;
- unsigned has;
-#define HAS_TERMIOS_L 0x1
-#define HAS_TERMIOS 0x2
-#define HAS_WINS 0x4
- struct termios tl;
- struct termios t;
- struct winsize w;
-};
-
-static int do_restore_tty_parms(void *arg, int fd, pid_t pid)
-{
- struct tty_parms *p = arg;
-
- /*
- * Only locked termios need CAP_SYS_ADMIN, but we
- * restore them all here, since the regular tremios
- * restore is affected by locked and thus we would
- * have to do synchronous usernsd call which is not
- * nice.
- *
- * Window size is restored here as it might depend
- * on termios too. Just to be on the safe side.
- */
-
- if ((p->has & HAS_TERMIOS_L) &&
- ioctl(fd, TIOCSLCKTRMIOS, &p->tl) < 0)
- goto err;
-
- if ((p->has & HAS_TERMIOS) &&
- ioctl(fd, TCSETS, &p->t) < 0)
- goto err;
-
- if ((p->has & HAS_WINS) &&
- ioctl(fd, TIOCSWINSZ, &p->w) < 0)
- goto err;
-
- return 0;
-
-err:
- pr_perror("Can't set tty params on %d", p->tty_id);
- return -1;
-}
-
-static int restore_tty_params(int fd, struct tty_info *info)
-{
- struct tty_parms p;
-
- /*
- * It's important to zeroify termios
- * because it contain @c_cc array which
- * is bigger than TERMIOS_NCC. Same applies
- * to winsize usage, we can't guarantee the
- * structure taken from the system headers will
- * never be extended.
- */
-
- p.has = 0;
- p.tty_id = info->tfe->id;
-
- if (info->tie->termios_locked) {
- memzero(&p.tl, sizeof(p.tl));
- p.has |= HAS_TERMIOS_L;
- termios_copy(&p.tl, info->tie->termios_locked);
- }
-
- if (info->tie->termios) {
- memzero(&p.t, sizeof(p.t));
- p.has |= HAS_TERMIOS;
- termios_copy(&p.t, info->tie->termios);
- }
-
- if (info->tie->winsize) {
- memzero(&p.w, sizeof(p.w));
- p.has |= HAS_WINS;
- winsize_copy(&p.w, info->tie->winsize);
- }
-
- return userns_call(do_restore_tty_parms, UNS_ASYNC, &p, sizeof(p), fd);
-}
-
-static int pty_open_slaves(struct tty_info *info)
-{
- int sock = -1, fd = -1, ret = -1;
- struct fdinfo_list_entry *fle;
- struct tty_info *slave;
-
- sock = socket(PF_UNIX, SOCK_DGRAM, 0);
- if (sock < 0) {
- pr_perror("Can't create socket");
- goto err;
- }
-
- list_for_each_entry(slave, &info->sibling, sibling) {
- BUG_ON(tty_is_master(slave));
-
- fd = open_tty_reg(slave->reg_d, slave->tfe->flags);
- if (fd < 0) {
- pr_perror("Can't open slave %s", path_from_reg(slave->reg_d));
- goto err;
- }
-
- if (restore_tty_params(fd, slave))
- goto err;
-
- fle = file_master(&slave->d);
-
- pr_debug("send slave %#x fd %d connected on %s (pid %d)\n",
- slave->tfe->id, fd, path_from_reg(slave->reg_d), fle->pid);
-
- if (send_fd_to_peer(fd, fle, sock)) {
- pr_perror("Can't send file descriptor");
- goto err;
- }
-
- close(fd);
- fd = -1;
- }
- ret = 0;
-
-err:
- close_safe(&fd);
- close_safe(&sock);
- return ret;
-}
-
-static int receive_tty(struct tty_info *info)
-{
- struct fdinfo_list_entry *fle;
- int fd;
-
- fle = file_master(&info->d);
- pr_info("\tWaiting tty fd %d (pid %d)\n", fle->fe->fd, fle->pid);
-
- fd = recv_fd(fle->fe->fd);
- close(fle->fe->fd);
- if (fd < 0) {
- pr_err("Can't get fd %d\n", fd);
- return -1;
- }
-
- if (rst_file_params(fd, info->tfe->fown, info->tfe->flags))
- close_safe(&fd);
-
- return fd;
-}
-
-static int pty_open_unpaired_slave(struct file_desc *d, struct tty_info *slave)
-{
- struct reg_file_info *fake = NULL;
- int master = -1, ret = -1, fd = -1;
-
- /*
- * We may have 2 cases here: the slave either need to
- * be inherited, either it requires a fake master.
- */
-
- if (likely(slave->inherit)) {
- fd = dup(get_service_fd(SELF_STDIN_OFF));
- if (fd < 0) {
- pr_perror("Can't dup SELF_STDIN_OFF");
- return -1;
- }
- pr_info("Migrated slave peer %x -> to fd %d\n",
- slave->tfe->id, fd);
- } else {
- fake = pty_alloc_fake_master(slave);
- if (!fake)
- goto err;
- master = pty_open_ptmx_index(&fake->d, slave->tie->pty->index, O_RDONLY);
- if (master < 0) {
- pr_perror("Can't open fale %x (index %d)",
- slave->tfe->id, slave->tie->pty->index);
- goto err;
- }
-
- unlock_pty(master);
-
- fd = open_tty_reg(slave->reg_d, slave->tfe->flags);
- if (fd < 0) {
- pr_perror("Can't open slave %s", path_from_reg(slave->reg_d));
- goto err;
- }
-
- }
-
- if (restore_tty_params(fd, slave))
- goto err;
-
- /*
- * If tty is migrated we need to set its group
- * to the parent group, because signals on key
- * presses are delivered to a group of terminal.
- *
- * Note, at this point the group/session should
- * be already restored properly thus we can simply
- * use syscalls instead of lookup via process tree.
- */
- if (likely(slave->inherit)) {
- /*
- * The restoration procedure only works if we're
- * migrating not a session leader, otherwise it's
- * not allowed to restore a group and one better to
- * checkpoint complete process tree together with
- * the process which keeps the master peer.
- */
- if (root_item->sid != root_item->pid.virt) {
- pr_debug("Restore inherited group %d\n",
- getpgid(getppid()));
- if (tty_set_prgp(fd, getpgid(getppid())))
- goto err;
- }
- }
-
- if (pty_open_slaves(slave))
- goto err;
-
- ret = fd;
- fd = -1;
-err:
- close_safe(&master);
- close_safe(&fd);
- pty_free_fake_reg(&fake);
- return ret;
-}
-
-static int pty_open_ptmx(struct tty_info *info)
-{
- int master = -1;
-
- master = pty_open_ptmx_index(info->reg_d, info->tie->pty->index, info->tfe->flags);
- if (master < 0) {
- pr_perror("Can't open %x (index %d)",
- info->tfe->id, info->tie->pty->index);
- return -1;
- }
-
- unlock_pty(master);
-
- if (restore_tty_params(master, info))
- goto err;
-
- if (info->tie->packet_mode) {
- int packet_mode = 1;
-
- if (ioctl(master, TIOCPKT, &packet_mode) < 0) {
- pr_perror("Can't set packed mode on %x",
- info->tfe->id);
- goto err;
- }
- }
-
- if (pty_open_slaves(info))
- goto err;
-
- if (info->tie->locked)
- lock_pty(master);
-
- return master;
-err:
- close_safe(&master);
- return -1;
-}
-
-static int open_simple_tty(struct tty_info *info)
-{
- int fd = -1;
-
- fd = open_tty_reg(info->reg_d, info->tfe->flags);
- if (fd < 0) {
- pr_perror("Can't open %s %x",
- info->driver->name, info->tfe->id);
- return -1;
- }
-
- if (restore_tty_params(fd, info))
- goto err;
-
- return fd;
-err:
- close_safe(&fd);
- return -1;
-}
-
-static int open_ext_tty(struct tty_info *info)
-{
- int fd = -1;
-
- if (!inherited_fd(&info->d, &fd) && fd < 0)
- return -1;
-
- if (restore_tty_params(fd, info)) {
- close(fd);
- return -1;
- }
-
- return fd;
-}
-
-static int tty_open(struct file_desc *d)
-{
- struct tty_info *info = container_of(d, struct tty_info, d);
-
- tty_show_pty_info("open", info);
-
- if (!info->create)
- return receive_tty(info);
-
- if (is_pty(info->driver) && !tty_is_master(info))
- return pty_open_unpaired_slave(d, info);
-
- return info->driver->open(info);
-}
-
-static int tty_transport(FdinfoEntry *fe, struct file_desc *d)
-{
- struct tty_info *info = container_of(d, struct tty_info, d);
- return !info->create;
-}
-
-static void tty_collect_fd(struct file_desc *d, struct fdinfo_list_entry *fle,
- struct rst_info *ri)
-{
- struct tty_info *info = container_of(d, struct tty_info, d);
- struct list_head *tgt;
-
- /*
- * Unix98 pty slave peers requires the master peers being
- * opened before them. In turn, current ttys should be opened
- * after the slave peers so session must alread exist.
- */
-
- if (tty_is_master(info) && info->driver->type != TTY_TYPE__CTTY)
- tgt = &ri->fds;
- else if (info->driver->type == TTY_TYPE__CTTY)
- tgt = &ri->tty_ctty;
- else
- tgt = &ri->tty_slaves;
-
- list_add_tail(&fle->ps_list, tgt);
-}
-
-static char *tty_d_name(struct file_desc *d, char *buf, size_t s)
-{
- struct tty_info *info = container_of(d, struct tty_info, d);
-
- snprintf(buf, s, "tty[%x:%x]", info->tie->rdev, info->tie->dev);
-
- return buf;
-}
-
-static struct file_desc_ops tty_desc_ops = {
- .type = FD_TYPES__TTY,
- .open = tty_open,
- .post_open = tty_restore_ctl_terminal,
- .want_transport = tty_transport,
- .collect_fd = tty_collect_fd,
- .name = tty_d_name,
-};
-
-static struct pstree_item *find_first_sid(int sid)
-{
- struct pstree_item *item;
-
- for_each_pstree_item(item) {
- if (item->sid == sid)
- return item;
- }
-
- return NULL;
-}
-
-static int tty_find_restoring_task(struct tty_info *info)
-{
- struct pstree_item *item;
-
- /*
- * The overall scenario is the following (note
- * we might have corrupted image so don't believe
- * anything).
- *
- * SID is present on a peer
- * ------------------------
- *
- * - if it's master peer and we have as well a slave
- * peer then prefer restore controlling terminal
- * via slave peer
- *
- * - if it's master peer without slave, there must be
- * a SID leader who will be restoring the peer
- *
- * - if it's a slave peer and no session leader found
- * than we need an option to inherit terminal
- *
- * No SID present on a peer
- * ------------------------
- *
- * - if it's a master peer than we are in good shape
- * and continue in a normal way, we're the peer keepers
- *
- * - if it's a slave peer and no appropriate master peer
- * found we need an option to inherit terminal
- *
- * In any case if it's hungup peer, then we jump out
- * early since it will require fake master peer and
- * rather non-usable anyway.
- */
-
- if (tty_is_hung(info)) {
- pr_debug("Hungup terminal found id %x\n", info->tfe->id);
- return 0;
- }
-
- /*
- * Current tty should be skipped here: the
- * underlied _real_ pty (or anything else
- * driver in future) should restore the
- * session.
- */
- if (info->driver->type == TTY_TYPE__CTTY)
- return 0;
-
- if (info->tie->sid) {
- if (!tty_is_master(info)) {
- if (tty_has_active_pair(info))
- return 0;
- else
- goto shell_job;
- }
-
- /*
- * Restoring via leader only. All files
- * opened over same real tty get propagated
- * automatically by kernel itself.
- */
- if (info->ctl_tty != info)
- return 0;
-
- /*
- * Find out the task which is session leader
- * and it can restore the controlling terminal
- * for us.
- */
- item = find_first_sid(info->tie->sid);
- if (item && item->pid.virt == item->sid) {
- pr_info("Set a control terminal %x to %d\n",
- info->tfe->id, info->tie->sid);
- return prepare_ctl_tty(item->pid.virt,
- rsti(item),
- info->tfe->id);
- }
-
- goto notask;
- } else {
- if (tty_is_master(info))
- return 0;
- if (tty_has_active_pair(info))
- return 0;
- }
-
-shell_job:
- if (opts.shell_job) {
- pr_info("Inherit terminal for id %x\n", info->tfe->id);
- info->inherit = true;
- return 0;
- }
-
-notask:
- pr_err("No task found with sid %d\n", info->tie->sid);
- return -1;
-}
-
-static int tty_setup_orphan_slavery(void)
-{
- struct tty_info *info, *peer, *m;
-
- list_for_each_entry(info, &all_ttys, list) {
- struct fdinfo_list_entry *a, *b;
- bool has_leader = false;
-
- if (tty_is_master(info))
- continue;
-
- a = file_master(&info->d);
- m = info;
-
- list_for_each_entry(peer, &info->sibling, sibling) {
- if (tty_is_master(peer)) {
- has_leader = true;
- break;
- }
-
- /*
- * Same check as in pipes and files -- need to
- * order slave ends so that they do not dead lock
- * waiting for each other.
- */
- b = file_master(&peer->d);
- if (fdinfo_rst_prio(b, a)) {
- a = b;
- m = peer;
- }
- }
-
- if (!has_leader) {
- m->create = true;
- pr_debug("Found orphan slave fake leader (%#x)\n",
- m->tfe->id);
- }
- }
-
- return 0;
-}
-
-int tty_setup_slavery(void)
-{
- struct tty_info *info, *peer, *m;
-
- /*
- * The image may carry several terminals opened
- * belonging to the same session, so choose the
- * leader which gonna be setting up the controlling
- * terminal.
- */
- list_for_each_entry(info, &all_ttys, list) {
- if (!info->tie->sid || info->ctl_tty ||
- info->driver->type == TTY_TYPE__CTTY)
- continue;
-
- if (!tty_is_master(info))
- continue;
-
- info->ctl_tty = info;
- pr_debug("ctl tty leader %x\n", info->tfe->id);
- peer = info;
- list_for_each_entry_safe_continue(peer, m, &all_ttys, list) {
- if (!peer->tie->sid || peer->ctl_tty ||
- peer->driver->type == TTY_TYPE__CTTY)
- continue;
- if (peer->tie->sid == info->tie->sid) {
- pr_debug(" `- slave %x\n", peer->tfe->id);
- peer->ctl_tty = info;
- }
- }
- }
-
- list_for_each_entry(info, &all_ttys, list) {
- if (tty_find_restoring_task(info))
- return -1;
- if (!is_pty(info->driver))
- continue;
-
- peer = info;
- list_for_each_entry_safe_continue(peer, m, &all_ttys, list) {
- if (!is_pty(peer->driver))
- continue;
- if (peer->tie->pty->index != info->tie->pty->index)
- continue;
-
- if (tty_find_restoring_task(peer))
- return -1;
-
- list_add(&peer->sibling, &info->sibling);
- list_del(&peer->list);
- }
- }
-
- /*
- * Print out information about peers.
- */
- list_for_each_entry(info, &all_ttys, list) {
- tty_show_pty_info("head", info);
- list_for_each_entry(peer, &info->sibling, sibling)
- tty_show_pty_info(" `- sibling", peer);
- }
-
- return tty_setup_orphan_slavery();
-}
-
-static int verify_termios(u32 id, TermiosEntry *e)
-{
- if (e && e->n_c_cc < TERMIOS_NCC) {
- pr_err("pty ID %#x n_c_cc (%d) has wrong value\n",
- id, (int)e->n_c_cc);
- return -1;
- }
- return 0;
-}
-
-#define term_opts_missing_cmp(p, op) \
- (!(p)->tie->termios op \
- !(p)->tie->termios_locked op \
- !(p)->tie->winsize)
-
-#define term_opts_missing_any(p) \
- term_opts_missing_cmp(p, ||)
-
-#define term_opts_missing_all(p) \
- term_opts_missing_cmp(p, &&)
-
-static int verify_info(struct tty_info *info)
-{
- if (!info->driver) {
- pr_err("Unknown driver master peer %x\n", info->tfe->id);
- return -1;
- }
-
- /*
- * Master peer must have all parameters present,
- * while slave peer must have either all parameters present
- * or don't have them at all.
- */
- if (term_opts_missing_any(info)) {
- if (tty_is_master(info)) {
- pr_err("Corrupted master peer %x\n", info->tfe->id);
- return -1;
- } else if (!term_opts_missing_all(info)) {
- pr_err("Corrupted slave peer %x\n", info->tfe->id);
- return -1;
- }
- }
-
- if (verify_termios(info->tfe->id, info->tie->termios_locked) ||
- verify_termios(info->tfe->id, info->tie->termios))
- return -1;
-
- if (info->tie->termios && info->tfe->tty_info_id > (MAX_TTYS << 1))
- return -1;
-
- return 0;
-}
-
-static TtyInfoEntry *lookup_tty_info_entry(u32 id)
-{
- struct tty_info_entry *e;
-
- list_for_each_entry(e, &all_tty_info_entries, list) {
- if (e->tie->id == id)
- return e->tie;
- }
-
- return NULL;
-}
-
-static int collect_one_tty_info_entry(void *obj, ProtobufCMessage *msg)
-{
- struct tty_info_entry *info = obj;
-
- info->tie = pb_msg(msg, TtyInfoEntry);
-
- switch (info->tie->type) {
- case TTY_TYPE__PTY:
- if (!info->tie->pty) {
- pr_err("No PTY data found (id %x), corrupted image?\n",
- info->tie->id);
- return -1;
- }
- break;
- case TTY_TYPE__CTTY:
- case TTY_TYPE__CONSOLE:
- case TTY_TYPE__VT:
- case TTY_TYPE__EXT_TTY:
- if (info->tie->pty) {
- pr_err("PTY data found (id %x), corrupted image?\n",
- info->tie->id);
- return -1;
- }
- break;
- default:
- pr_err("Unexpected TTY type %d (id %x)\n",
- info->tie->type, info->tie->id);
- return -1;
- }
-
- INIT_LIST_HEAD(&info->list);
- list_add(&info->list, &all_tty_info_entries);
-
- return 0;
-}
-
-struct collect_image_info tty_info_cinfo = {
- .fd_type = CR_FD_TTY_INFO,
- .pb_type = PB_TTY_INFO,
- .priv_size = sizeof(struct tty_info_entry),
- .collect = collect_one_tty_info_entry,
-};
-
-static int collect_one_tty(void *obj, ProtobufCMessage *msg)
-{
- struct tty_info *info = obj;
-
- info->tfe = pb_msg(msg, TtyFileEntry);
-
- info->tie = lookup_tty_info_entry(info->tfe->tty_info_id);
- if (!info->tie) {
- pr_err("No tty-info-id %x found on id %x\n",
- info->tfe->tty_info_id, info->tfe->id);
- return -1;
- }
-
- INIT_LIST_HEAD(&info->sibling);
- info->driver = get_tty_driver(info->tie->rdev, info->tie->dev);
- if (info->driver == NULL) {
- pr_err("Unable to find a tty driver\n");
- return -1;
- }
- info->create = tty_is_master(info);
- info->inherit = false;
- info->ctl_tty = NULL;
-
- if (verify_info(info))
- return -1;
-
- /*
- * The image might have no reg file record in old CRIU, so
- * lets don't fail for a while. After a couple of releases
- * simply require the record to present.
- */
- info->reg_d = try_collect_special_file(info->tfe->id, 1);
- if (!info->reg_d) {
- if (is_pty(info->driver)) {
- info->reg_d = pty_alloc_reg(info, true);
- if (!info->reg_d) {
- pr_err("Can't generate new reg descriptor for id %#x\n",
- info->tfe->id);
- return -1;
- }
- } if (info->driver->type != TTY_TYPE__EXT_TTY) {
- pr_err("No reg_d descriptor for id %#x\n", info->tfe->id);
- return -1;
- }
- }
-
- /*
- * The tty peers which have no @termios are hung up,
- * so don't mark them as active, we create them with
- * faked master and they are rather a rudiment which
- * can't be used. Most likely they appear if a user has
- * dumped program when it was closing a peer.
- */
- if (is_pty(info->driver) && info->tie->termios)
- tty_test_and_set(info->tfe->tty_info_id, tty_active_pairs);
-
- pr_info("Collected tty ID %#x (%s)\n", info->tfe->id, info->driver->name);
-
- list_add(&info->list, &all_ttys);
- return file_desc_add(&info->d, info->tfe->id, &tty_desc_ops);
-}
-
-struct collect_image_info tty_cinfo = {
- .fd_type = CR_FD_TTY_FILES,
- .pb_type = PB_TTY_FILE,
- .priv_size = sizeof(struct tty_info),
- .collect = collect_one_tty,
-};
-
-/* Make sure the ttys we're dumping do belong our process tree */
-int dump_verify_tty_sids(void)
-{
- struct tty_dump_info *dinfo, *n;
- int ret = 0;
-
- /*
- * There might be a cases where we get sid/pgid on
- * slave peer. For example the application is running
- * with redirection and we're migrating shell job.
- *
- * # ./app < /dev/zero > /dev/zero &2>1
- *
- * Which produce a tree like
- * PID PPID PGID SID
- * root 23786 23784 23786 23786 pts/0 \_ -bash
- * root 24246 23786 24246 23786 pts/0 \_ ./app
- *
- * And the application goes background, then we dump
- * it from the same shell.
- *
- * In this case we simply zap sid/pgid and inherit
- * the peer from the current terminal on restore.
- */
- list_for_each_entry_safe(dinfo, n, &all_ttys, list) {
- if (!ret && dinfo->sid) {
- struct pstree_item *item = find_first_sid(dinfo->sid);
-
- if (!item || item->pid.virt != dinfo->sid) {
- if (!opts.shell_job) {
- pr_err("Found dangling tty with sid %d pgid %d (%s) on peer fd %d.\n",
- dinfo->sid, dinfo->pgrp,
- dinfo->driver->name, dinfo->fd);
- /*
- * First thing people do with criu is dump smth
- * run from shell. This is typical pitfall, warn
- * user about it explicitly.
- */
- pr_msg("Task attached to shell terminal. "
- "Consider using --" OPT_SHELL_JOB " option. "
- "More details on http://criu.org/Simple_loop\n");
- ret = -1;
- }
- }
- }
- xfree(dinfo);
- }
-
- return ret;
-}
-
-static int dump_tty_info(int lfd, u32 id, const struct fd_parms *p, struct tty_driver *driver, int index)
-{
- TtyInfoEntry info = TTY_INFO_ENTRY__INIT;
- TermiosEntry termios = TERMIOS_ENTRY__INIT;
- TermiosEntry termios_locked = TERMIOS_ENTRY__INIT;
- WinsizeEntry winsize = WINSIZE_ENTRY__INIT;
- TtyPtyEntry pty = TTY_PTY_ENTRY__INIT;
- struct parasite_tty_args *pti;
- struct tty_dump_info *dinfo;
-
- struct termios t;
- struct winsize w;
-
- int ret = -1;
-
- /*
- * Make sure the structures the system provides us
- * correlates well with protobuf templates.
- */
- BUILD_BUG_ON(ARRAY_SIZE(t.c_cc) < TERMIOS_NCC);
- BUILD_BUG_ON(sizeof(termios.c_cc) != sizeof(void *));
- BUILD_BUG_ON((sizeof(termios.c_cc) * TERMIOS_NCC) < sizeof(t.c_cc));
-
- pti = parasite_dump_tty(p->ctl, p->fd, driver->type);
- if (!pti)
- return -1;
-
- dinfo = xmalloc(sizeof(*dinfo));
- if (!dinfo)
- return -1;
-
- dinfo->id = id;
- dinfo->sid = pti->sid;
- dinfo->pgrp = pti->pgrp;
- dinfo->fd = p->fd;
- dinfo->driver = driver;
-
- list_add_tail(&dinfo->list, &all_ttys);
-
- info.id = id;
- info.sid = pti->sid;
- info.pgrp = pti->pgrp;
- info.rdev = p->stat.st_rdev;
- info.dev = p->stat.st_dev;
- info.has_dev = true;
- info.locked = pti->st_lock;
- info.exclusive = pti->st_excl;
- info.packet_mode = pti->st_pckt;
-
- info.type = driver->type;
- if (info.type == TTY_TYPE__PTY) {
- info.pty = &pty;
- pty.index = index;
- }
-
- /*
- * Nothing we can do on hanging up terminal,
- * just write out minimum information we can
- * gather.
- */
- if (pti->hangup)
- return pb_write_one(img_from_set(glob_imgset, CR_FD_TTY_INFO), &info, PB_TTY_INFO);
-
- /*
- * Now trace the paired/unpaired ttys. For example
- * the task might have slave peer assigned but no
- * master peer. Such "detached" master peers are
- * not yet supported by our tool and better to
- * inform a user about such situation.
- */
- if (is_pty(driver))
- tty_test_and_set(id, tty_active_pairs);
-
- info.termios = &termios;
- info.termios_locked = &termios_locked;
- info.winsize = &winsize;
-
- termios.n_c_cc = TERMIOS_NCC;
- termios.c_cc = xmalloc(pb_repeated_size(&termios, c_cc));
-
- termios_locked.n_c_cc = TERMIOS_NCC;
- termios_locked.c_cc = xmalloc(pb_repeated_size(&termios_locked, c_cc));
-
- if (!termios.c_cc || !termios_locked.c_cc)
- goto out;
-
- memzero(&t, sizeof(t));
- if (ioctl(lfd, TCGETS, &t) < 0) {
- pr_perror("Can't get tty params on %x", id);
- goto out;
- }
- termios_copy(&termios, &t);
-
- memzero(&t, sizeof(t));
- if (ioctl(lfd, TIOCGLCKTRMIOS, &t) < 0) {
- pr_perror("Can't get tty locked params on %x", id);
- goto out;
- }
- termios_copy(&termios_locked, &t);
-
- memzero(&w, sizeof(w));
- if (ioctl(lfd, TIOCGWINSZ, &w) < 0) {
- pr_perror("Can't get tty window params on %x", id);
- goto out;
- }
- winsize_copy(&winsize, &w);
-
- ret = pb_write_one(img_from_set(glob_imgset, CR_FD_TTY_INFO), &info, PB_TTY_INFO);
-out:
- xfree(termios.c_cc);
- xfree(termios_locked.c_cc);
- return ret;
-}
-
-static int dump_one_tty(int lfd, u32 id, const struct fd_parms *p)
-{
- TtyFileEntry e = TTY_FILE_ENTRY__INIT;
- int ret = 0, index = -1;
- struct tty_driver *driver;
-
- pr_info("Dumping tty %d with id %#x\n", lfd, id);
-
- driver = get_tty_driver(p->stat.st_rdev, p->stat.st_dev);
- if (driver->fd_get_index)
- index = driver->fd_get_index(lfd, p);
- else
- index = driver->index;
-
- if (index == INDEX_ERR) {
- pr_info("Can't obtain index on tty %d id %#x\n", lfd, id);
- return -1;
- }
-
- if (driver->type != TTY_TYPE__EXT_TTY && dump_one_reg_file(lfd, id, p))
- return -1;
-
- e.id = id;
- e.tty_info_id = tty_gen_id(driver, index);
- e.flags = p->flags;
- e.fown = (FownEntry *)&p->fown;
-
- /*
- * FIXME
- *
- * Figure out how to fetch data buffered in terminal.
- * For a while simply flush before dumping. Note
- * we don't check for errors here since it makes
- * no sense anyway, the buffered data is not handled
- * properly yet.
- *
- * Note as well that if we have only one peer here
- * the external end might be sending the data to us
- * again and again while kernel buffer is not full,
- * this might lead to endless SIGTTOU signal delivery
- * to the dumpee, ruining checkpoint procedure.
- *
- * So simply do not flush the line while we dump
- * parameters tty never was being a guaranteed delivery
- * transport anyway.
- */
-
- if (!tty_test_and_set(e.tty_info_id, tty_bitmap))
- ret = dump_tty_info(lfd, e.tty_info_id, p, driver, index);
-
- if (!ret)
- ret = pb_write_one(img_from_set(glob_imgset, CR_FD_TTY_FILES), &e, PB_TTY_FILE);
- return ret;
-}
-
-const struct fdtype_ops tty_dump_ops = {
- .type = FD_TYPES__TTY,
- .dump = dump_one_tty,
-};
-
-int tty_prep_fds(void)
-{
- if (!opts.shell_job)
- return 0;
-
- if (!isatty(STDIN_FILENO)) {
- pr_err("Standard stream is not a terminal, aborting\n");
- return -1;
- }
-
- if (install_service_fd(SELF_STDIN_OFF, STDIN_FILENO) < 0) {
- pr_perror("Can't dup stdin to SELF_STDIN_OFF");
- return -1;
- }
-
- return 0;
-}
-
-void tty_fini_fds(void)
-{
- close_service_fd(SELF_STDIN_OFF);
-}
diff --git a/tun.c b/tun.c
deleted file mode 100644
index dcee704e48b4..000000000000
--- a/tun.c
+++ /dev/null
@@ -1,494 +0,0 @@
-#include <unistd.h>
-#include <sys/socket.h>
-#include <linux/if.h>
-#include <linux/if_tun.h>
-#include <sys/ioctl.h>
-#include <sched.h>
-
-// MAO required on Centos 6 (linux-3.18.1 kernel)
-#include <linux/filter.h>
-
-#include "cr_options.h"
-#include "imgset.h"
-#include "protobuf.h"
-#include "cr-show.h"
-#include "string.h"
-#include "files.h"
-#include "files-reg.h"
-#include "tun.h"
-#include "net.h"
-#include "namespaces.h"
-
-#include "protobuf/tun.pb-c.h"
-
-#ifndef IFF_PERSIST
-#define IFF_PERSIST 0x0800
-#endif
-
-#ifndef IFF_NOFILTER
-#define IFF_NOFILTER 0x1000
-#endif
-
-#ifndef TUNSETQUEUE
-#define TUNSETQUEUE _IOW('T', 217, int)
-#define IFF_ATTACH_QUEUE 0x0200
-#define IFF_DETACH_QUEUE 0x0400
-#endif
-
-/*
- * Absense of the 1st ioctl means we cannot restore tun link. But
- * since the 2nd one appeared at the same time, we'll "check" this
- * by trying to dump filter and abort dump if it's not there.
- */
-
-#ifndef TUNSETIFINDEX
-#define TUNSETIFINDEX _IOW('T', 218, unsigned int)
-#endif
-
-#ifndef TUNGETFILTER
-#define TUNGETFILTER _IOR('T', 219, struct sock_fprog)
-#endif
-
-#define TUN_DEV_GEN_PATH "/dev/net/tun"
-
-int check_tun_cr(int no_tun_err)
-{
- int fd, idx = 13, ret;
-
- fd = open(TUN_DEV_GEN_PATH, O_RDWR);
- if (fd < 0) {
- pr_perror("Can't check tun support");
- return no_tun_err;
- }
-
- ret = ioctl(fd, TUNSETIFINDEX, &idx);
- if (ret < 0)
- pr_perror("No proper support for tun dump/restore");
-
- close(fd);
- return ret;
-}
-
-static LIST_HEAD(tun_links);
-
-struct tun_link {
- char name[IFNAMSIZ];
- struct list_head l;
- union {
- struct {
- unsigned flags;
- } rst;
-
- struct {
- unsigned sndbuf;
- unsigned vnethdr;
- } dmp;
- };
-};
-
-static int list_tun_link(NetDeviceEntry *nde)
-{
- struct tun_link *tl;
-
- tl = xmalloc(sizeof(*tl));
- if (!tl)
- return -1;
-
- strlcpy(tl->name, nde->name, sizeof(tl->name));
- /*
- * Keep tun-flags not only for persistency fixup (see
- * commend below), but also for TUNSETIFF -- we must
- * open the device with the same flags it should live
- * with (i.e. -- with which it was created.
- */
- tl->rst.flags = nde->tun->flags;
- list_add_tail(&tl->l, &tun_links);
- return 0;
-}
-
-static struct tun_link *find_tun_link(char *name)
-{
- struct tun_link *tl;
-
- list_for_each_entry(tl, &tun_links, l)
- if (!strcmp(tl->name, name))
- return tl;
-
- return NULL;
-}
-
-static struct tun_link *__dump_tun_link_fd(int fd, char *name, unsigned flags)
-{
- struct tun_link *tl;
- struct sock_fprog flt;
-
- tl = xmalloc(sizeof(*tl));
- if (!tl)
- goto err;
- strlcpy(tl->name, name, sizeof(tl->name));
-
- if (ioctl(fd, TUNGETVNETHDRSZ, &tl->dmp.vnethdr) < 0) {
- pr_perror("Can't dump vnethdr size for %s", name);
- goto err;
- }
-
- if (ioctl(fd, TUNGETSNDBUF, &tl->dmp.sndbuf) < 0) {
- pr_perror("Can't dump sndbuf for %s", name);
- goto err;
- }
-
- if (flags & IFF_TAP) {
- pr_debug("Checking filter for tap %s\n", name);
- if (ioctl(fd, TUNGETFILTER, &flt) < 0) {
- pr_perror("Can't get tun filter for %s", name);
- goto err;
- }
-
- /*
- * TUN filters are tricky -- the program itself is 'somewhere'
- * in the task's memory, so we can't get one for unattached
- * persistent device. The only way for doing it is opening the
- * device with IFF_NOFILTER and attaching some fake one :(
- */
-
- if (flt.len != 0) {
- pr_err("Can't dump %s with filter on-board\n", name);
- goto err;
- }
- } else if (!(flags & IFF_NOFILTER)) {
- pr_err("No info about %s filter, kernel is too old\n", name);
- goto err;
- }
-
- return tl;
-
-err:
- xfree(tl);
- return NULL;
-}
-
-static struct tun_link *dump_tun_link_fd(int fd, char *name, unsigned flags)
-{
- struct tun_link *tl;
-
- tl = find_tun_link(name);
- if (tl)
- return tl;
-
- tl = __dump_tun_link_fd(fd, name, flags);
- if (tl)
- /*
- * Keep this in list till links dumping code starts.
- * We can't let it dump all this stuff itself, since
- * multiple attaches to one tun device is limited and
- * we may not be able to it that late.
- *
- * For persistent detached devices the get_tun_link_fd
- * will attach to the device and get the needed stuff.
- */
- list_add(&tl->l, &tun_links);
-
- return tl;
-}
-
-static int open_tun_dev(char *name, unsigned int idx, unsigned flags)
-{
- int fd;
- struct ifreq ifr;
-
- fd = open(TUN_DEV_GEN_PATH, O_RDWR);
- if (fd < 0) {
- pr_perror("Can't open tun device");
- return -1;
- }
-
- if (idx) {
- pr_debug(" restoring %u for %s tun\n", idx, name);
- if (ioctl(fd, TUNSETIFINDEX, &idx) < 0) {
- pr_perror("Can't restore tun's index");
- goto err;
- }
- }
-
- memset(&ifr, 0, sizeof(ifr));
- strlcpy(ifr.ifr_name, name, sizeof(ifr.ifr_name));
- ifr.ifr_flags = flags;
-
- if (ioctl(fd, TUNSETIFF, &ifr)) {
- pr_perror("Can't create tun device");
- goto err;
- }
-
- return fd;
-
-err:
- close(fd);
- return -1;
-}
-
-static struct tun_link *get_tun_link_fd(char *name, unsigned flags)
-{
- struct tun_link *tl;
- int fd;
-
- tl = find_tun_link(name);
- if (tl)
- return tl;
-
- /*
- * If we haven't found this thing, then the
- * device we see via netlink exists w/o any fds
- * attached, i.e. -- it's persistent
- */
-
- if (!(flags & IFF_PERSIST)) {
- pr_err("No fd infor for non persistent tun device %s\n", name);
- return NULL;
- }
-
- /*
- * Kernel will try to attach filter (if it exists) to our memory,
- * avoid this.
- */
-
- flags |= IFF_NOFILTER;
-
- fd = open_tun_dev(name, 0, flags);
- if (fd < 0)
- return NULL;
-
- tl = __dump_tun_link_fd(fd, name, flags);
- close(fd);
-
- return tl;
-}
-
-static int dump_tunfile(int lfd, u32 id, const struct fd_parms *p)
-{
- int ret;
- struct cr_img *img;
- TunfileEntry tfe = TUNFILE_ENTRY__INIT;
- struct ifreq ifr;
-
- if (!(root_ns_mask & CLONE_NEWNET)) {
- pr_err("Net namespace is required to dump tun link\n");
- return -1;
- }
-
- if (dump_one_reg_file(lfd, id, p))
- return -1;
-
- pr_info("Dumping tun-file %d with id %#x\n", lfd, id);
-
- tfe.id = id;
- ret = ioctl(lfd, TUNGETIFF, &ifr);
- if (ret < 0) {
- if (errno != EBADFD) {
- pr_perror("Can't dump tun-file device");
- return -1;
- }
-
- /*
- * Otherwise this is just opened file with not yet attached
- * tun device. Go agead an write the respective entry.
- */
- } else {
- tfe.netdev = ifr.ifr_name;
- pr_info("`- attached to device %s (flags %x)\n", tfe.netdev, ifr.ifr_flags);
-
- if (ifr.ifr_flags & IFF_DETACH_QUEUE) {
- tfe.has_detached = true;
- tfe.detached = true;
- }
-
- if (dump_tun_link_fd(lfd, tfe.netdev, ifr.ifr_flags) == NULL)
- return -1;
- }
-
- img = img_from_set(glob_imgset, CR_FD_TUNFILE);
- return pb_write_one(img, &tfe, PB_TUNFILE);
-}
-
-const struct fdtype_ops tunfile_dump_ops = {
- .type = FD_TYPES__TUNF,
- .dump = dump_tunfile,
-};
-
-struct tunfile_info {
- struct file_desc d;
- TunfileEntry *tfe;
-};
-
-static int tunfile_open(struct file_desc *d)
-{
- int fd;
- struct tunfile_info *ti;
- struct ifreq ifr;
- struct tun_link *tl;
-
- ti = container_of(d, struct tunfile_info, d);
- fd = open_reg_by_id(ti->tfe->id);
- if (fd < 0)
- return -1;
-
- if (!ti->tfe->netdev)
- /* just-opened tun file */
- return fd;
-
- tl = find_tun_link(ti->tfe->netdev);
- if (!tl) {
- pr_err("No tun device for file %s\n", ti->tfe->netdev);
- goto err;
- }
-
- memset(&ifr, 0, sizeof(ifr));
- strlcpy(ifr.ifr_name, tl->name, sizeof(ifr.ifr_name));
- ifr.ifr_flags = tl->rst.flags;
-
- if (ioctl(fd, TUNSETIFF, &ifr) < 0) {
- pr_perror("Can't attach tunfile to device");
- goto err;
- }
-
- if (ti->tfe->has_detached && ti->tfe->detached) {
- pr_info("Detaching from %s queue\n", ti->tfe->netdev);
- ifr.ifr_flags = IFF_DETACH_QUEUE;
- if (ioctl(fd, TUNSETQUEUE, &ifr) < 0) {
- pr_perror("Can't detach queue");
- goto err;
- }
- }
-
- if (!(tl->rst.flags & IFF_PERSIST)) {
- pr_info("Dropping persistency for %s\n", tl->name);
- if (ioctl(fd, TUNSETPERSIST, 0) < 0) {
- pr_perror("Error dropping persistency");
- goto err;
- }
- }
-
- return fd;
-
-err:
- close(fd);
- return -1;
-}
-
-static struct file_desc_ops tunfile_desc_ops = {
- .type = FD_TYPES__TUNF,
- .open = tunfile_open,
-};
-
-static int collect_one_tunfile(void *o, ProtobufCMessage *base)
-{
- struct tunfile_info *ti = o;
-
- ti->tfe = pb_msg(base, TunfileEntry);
- file_desc_add(&ti->d, ti->tfe->id, &tunfile_desc_ops);
-
- pr_info("Collected %s tunfile\n", ti->tfe->netdev);
-
- return 0;
-}
-
-struct collect_image_info tunfile_cinfo = {
- .fd_type = CR_FD_TUNFILE,
- .pb_type = PB_TUNFILE,
- .priv_size = sizeof(struct tunfile_info),
- .collect = collect_one_tunfile,
-};
-
-int dump_tun_link(NetDeviceEntry *nde, struct cr_imgset *fds)
-{
- TunLinkEntry tle = TUN_LINK_ENTRY__INIT;
- char spath[64];
- char buf[64];
- int ret = 0;
- struct tun_link *tl;
-
- sprintf(spath, "class/net/%s/tun_flags", nde->name);
- ret |= read_ns_sys_file(spath, buf, sizeof(buf));
- tle.flags = strtol(buf, NULL, 0);
-
- sprintf(spath, "class/net/%s/owner", nde->name);
- ret |= read_ns_sys_file(spath, buf, sizeof(buf));
- tle.owner = strtol(buf, NULL, 10);
-
- sprintf(spath, "class/net/%s/group", nde->name);
- ret |= read_ns_sys_file(spath, buf, sizeof(buf));
- tle.group = strtol(buf, NULL, 10);
-
- if (ret < 0)
- return ret;
-
- tl = get_tun_link_fd(nde->name, tle.flags);
- if (!tl)
- return ret;
-
- tle.vnethdr = tl->dmp.vnethdr;
- tle.sndbuf = tl->dmp.sndbuf;
-
- nde->tun = &tle;
- return write_netdev_img(nde, fds);
-}
-
-int restore_one_tun(NetDeviceEntry *nde, int nlsk)
-{
- int fd, ret = -1, aux;
-
- if (!nde->tun) {
- pr_err("Corrupted TUN link entry %x\n", nde->ifindex);
- return -1;
- }
-
- pr_info("Restoring tun device %s\n", nde->name);
-
- fd = open_tun_dev(nde->name, nde->ifindex, nde->tun->flags);
- if (fd < 0)
- return -1;
-
- aux = nde->tun->owner;
- if ((aux != -1) && ioctl(fd, TUNSETOWNER, aux) < 0) {
- pr_perror("Can't set owner");
- goto out;
- }
-
- aux = nde->tun->group;
- if ((aux != -1) && ioctl(fd, TUNSETGROUP, aux) < 0) {
- pr_perror("Can't set group");
- goto out;
- }
-
- aux = nde->tun->sndbuf;
- if (ioctl(fd, TUNSETSNDBUF, &aux) < 0) {
- pr_perror("Can't set sndbuf");
- goto out;
- }
-
- aux = nde->tun->vnethdr;
- if (ioctl(fd, TUNSETVNETHDRSZ, &aux) < 0) {
- pr_perror("Can't set vnethdr");
- goto out;
- }
-
- /*
- * Set this device persistent anyway and schedule
- * the persistence drop if it should not be such.
- * The first _real_ opener will do it.
- */
-
- if (ioctl(fd, TUNSETPERSIST, 1)) {
- pr_perror("Can't make tun device persistent");
- goto out;
- }
-
- if (restore_link_parms(nde, nlsk)) {
- pr_err("Error restoring %s link params\n", nde->name);
- goto out;
- }
-
- ret = list_tun_link(nde);
-out:
- close(fd);
- return ret;
-}
diff --git a/util.c b/util.c
deleted file mode 100644
index 00c327396c8a..000000000000
--- a/util.c
+++ /dev/null
@@ -1,1002 +0,0 @@
-#define _XOPEN_SOURCE
-
-#include <stdio.h>
-#include <stdarg.h>
-#include <string.h>
-#include <errno.h>
-#include <stdbool.h>
-#include <limits.h>
-#include <signal.h>
-#include <unistd.h>
-#include <errno.h>
-#include <string.h>
-#include <dirent.h>
-#include <sys/sendfile.h>
-#include <fcntl.h>
-#include <poll.h>
-#include <sys/mount.h>
-#include <sys/param.h>
-#include <sys/types.h>
-#include <sys/ptrace.h>
-#include <sys/types.h>
-#include <sys/time.h>
-#include <sys/resource.h>
-#include <sys/stat.h>
-#include <sys/mman.h>
-#include <sys/vfs.h>
-#include <sys/ptrace.h>
-#include <sys/wait.h>
-#include <sys/resource.h>
-#include <sys/wait.h>
-#include <sys/socket.h>
-#include <netinet/in.h>
-#include <netinet/tcp.h>
-#include <sched.h>
-
-#include "compiler.h"
-#include "asm/types.h"
-#include "list.h"
-#include "util.h"
-#include "rst-malloc.h"
-#include "image.h"
-#include "vma.h"
-#include "mem.h"
-#include "namespaces.h"
-
-#include "cr_options.h"
-#include "servicefd.h"
-#include "cr-service.h"
-#include "files.h"
-
-#include "cr-errno.h"
-
-#define VMA_OPT_LEN 128
-
-/*
- * This function reallocates passed str pointer.
- * It means:
- * 1) passed pointer can be either NULL, or previously allocated by malloc.
- * 2) Passed pointer can' be reused. It's either freed in case of error or can
- * be changed.
- */
-static char *xvstrcat(char *str, const char *fmt, va_list args)
-{
- size_t offset = 0, delta;
- int ret;
- char *new;
- va_list tmp;
-
- if (str)
- offset = strlen(str);
- delta = strlen(fmt) * 2;
-
- do {
- ret = -ENOMEM;
- new = xrealloc(str, offset + delta);
- if (new) {
- va_copy(tmp, args);
- ret = vsnprintf(new + offset, delta, fmt, tmp);
- va_end(tmp);
- if (ret >= delta) {
- /* NOTE: vsnprintf returns the amount of bytes
- * to allocate. */
- delta = ret +1;
- str = new;
- ret = 0;
- }
- }
- } while (ret == 0);
-
- if (ret == -ENOMEM) {
- /* realloc failed. We must release former string */
- pr_err("Failed to allocate string\n");
- xfree(str);
- } else if (ret < 0) {
- /* vsnprintf failed */
- pr_err("Failed to print string\n");
- xfree(new);
- new = NULL;
- }
- return new;
-}
-
-char *xstrcat(char *str, const char *fmt, ...)
-{
- va_list args;
-
- va_start(args, fmt);
- str = xvstrcat(str, fmt, args);
- va_end(args);
-
- return str;
-}
-
-char *xsprintf(const char *fmt, ...)
-{
- va_list args;
- char *str;
-
- va_start(args, fmt);
- str = xvstrcat(NULL, fmt, args);
- va_end(args);
-
- return str;
-}
-
-static void vma_opt_str(const struct vma_area *v, char *opt)
-{
- int p = 0;
-
-#define opt2s(_o, _s) do { \
- if (v->e->status & _o) \
- p += sprintf(opt + p, _s " "); \
- } while (0)
-
- opt[p] = '\0';
- opt2s(VMA_AREA_REGULAR, "reg");
- opt2s(VMA_AREA_STACK, "stk");
- opt2s(VMA_AREA_VSYSCALL, "vsys");
- opt2s(VMA_AREA_VDSO, "vdso");
- opt2s(VMA_AREA_VVAR, "vvar");
- opt2s(VMA_AREA_HEAP, "heap");
- opt2s(VMA_FILE_PRIVATE, "fp");
- opt2s(VMA_FILE_SHARED, "fs");
- opt2s(VMA_ANON_SHARED, "as");
- opt2s(VMA_ANON_PRIVATE, "ap");
- opt2s(VMA_AREA_SYSVIPC, "sysv");
- opt2s(VMA_AREA_SOCKET, "sk");
-
-#undef opt2s
-}
-
-void pr_vma(unsigned int loglevel, const struct vma_area *vma_area)
-{
- char opt[VMA_OPT_LEN];
- memset(opt, 0, VMA_OPT_LEN);
-
- if (!vma_area)
- return;
-
- vma_opt_str(vma_area, opt);
- print_on_level(loglevel, "%#"PRIx64"-%#"PRIx64" (%"PRIi64"K) prot %#x flags %#x st %#x off %#"PRIx64" "
- "%s shmid: %#"PRIx64"\n",
- vma_area->e->start, vma_area->e->end,
- KBYTES(vma_area_len(vma_area)),
- vma_area->e->prot,
- vma_area->e->flags,
- vma_area->e->status,
- vma_area->e->pgoff,
- opt, vma_area->e->shmid);
-}
-
-int close_safe(int *fd)
-{
- int ret = 0;
-
- if (*fd > -1) {
- ret = close(*fd);
- if (!ret)
- *fd = -1;
- else
- pr_perror("Unable to close fd %d", *fd);
- }
-
- return ret;
-}
-
-int reopen_fd_as_safe(char *file, int line, int new_fd, int old_fd, bool allow_reuse_fd)
-{
- int tmp;
-
- if (old_fd != new_fd) {
- /* make sure we won't clash with an inherit fd */
- if (inherit_fd_resolve_clash(new_fd) < 0)
- return -1;
-
- if (!allow_reuse_fd) {
- if (fcntl(new_fd, F_GETFD) != -1 || errno != EBADF) {
- pr_err("fd %d already in use (called at %s:%d)\n",
- new_fd, file, line);
- return -1;
- }
- }
-
- tmp = dup2(old_fd, new_fd);
- if (tmp < 0) {
- pr_perror("Dup %d -> %d failed (called at %s:%d)",
- old_fd, new_fd, file, line);
- return tmp;
- }
-
- /* Just to have error message if failed */
- close_safe(&old_fd);
- }
-
- return 0;
-}
-
-int move_img_fd(int *img_fd, int want_fd)
-{
- if (*img_fd == want_fd) {
- int tmp;
-
- tmp = dup(*img_fd);
- if (tmp < 0) {
- pr_perror("Can't dup file");
- return -1;
- }
-
- close(*img_fd);
-
- *img_fd = tmp;
- }
-
- return 0;
-}
-
-/*
- * Cached opened /proc/$pid and /proc/self files.
- * Used for faster access to /proc/.../foo files
- * by using openat()-s
- */
-
-static pid_t open_proc_pid = PROC_NONE;
-static int open_proc_fd = -1;
-static pid_t open_proc_self_pid;
-static int open_proc_self_fd = -1;
-
-static inline void set_proc_self_fd(int fd)
-{
- if (open_proc_self_fd >= 0)
- close(open_proc_self_fd);
-
- open_proc_self_fd = fd;
- open_proc_self_pid = getpid();
-}
-
-static inline void set_proc_pid_fd(int pid, int fd)
-{
- if (open_proc_fd >= 0)
- close(open_proc_fd);
-
- open_proc_pid = pid;
- open_proc_fd = fd;
-}
-
-static inline int get_proc_fd(int pid)
-{
- if (pid == PROC_SELF) {
- if (open_proc_self_fd != -1 && open_proc_self_pid != getpid()) {
- close(open_proc_self_fd);
- open_proc_self_fd = -1;
- }
- return open_proc_self_fd;
- } else if (pid == open_proc_pid)
- return open_proc_fd;
- else
- return -1;
-}
-
-int close_pid_proc(void)
-{
- set_proc_self_fd(-1);
- set_proc_pid_fd(PROC_NONE, -1);
- return 0;
-}
-
-void close_proc()
-{
- close_pid_proc();
- close_service_fd(PROC_FD_OFF);
-}
-
-int set_proc_fd(int fd)
-{
- if (install_service_fd(PROC_FD_OFF, fd) < 0)
- return -1;
- return 0;
-}
-
-static int open_proc_sfd(char *path)
-{
- int fd, ret;
-
- close_proc();
- fd = open(path, O_DIRECTORY | O_PATH);
- if (fd == -1) {
- pr_perror("Can't open %s", path);
- return -1;
- }
-
- ret = install_service_fd(PROC_FD_OFF, fd);
- close(fd);
- if (ret < 0)
- return -1;
-
- return 0;
-}
-
-inline int open_pid_proc(pid_t pid)
-{
- char path[18];
- int fd;
- int dfd;
-
- fd = get_proc_fd(pid);
- if (fd >= 0)
- return fd;
-
- dfd = get_service_fd(PROC_FD_OFF);
- if (dfd < 0) {
- if (open_proc_sfd("/proc") < 0)
- return -1;
-
- dfd = get_service_fd(PROC_FD_OFF);
- }
-
- if (pid == PROC_GEN)
- /*
- * Don't cache it, close_pid_proc() would
- * close service descriptor otherwise.
- */
- return dfd;
-
- if (pid == PROC_SELF)
- snprintf(path, sizeof(path), "self");
- else
- snprintf(path, sizeof(path), "%d", pid);
-
- fd = openat(dfd, path, O_PATH);
- if (fd < 0) {
- pr_perror("Can't open %s", path);
- set_cr_errno(ESRCH);
- return -1;
- }
-
- if (pid == PROC_SELF)
- set_proc_self_fd(fd);
- else
- set_proc_pid_fd(pid, fd);
-
- return fd;
-}
-
-int do_open_proc(pid_t pid, int flags, const char *fmt, ...)
-{
- char path[128];
- va_list args;
- int dirfd;
-
- dirfd = open_pid_proc(pid);
- if (dirfd < 0)
- return -1;
-
- va_start(args, fmt);
- vsnprintf(path, sizeof(path), fmt, args);
- va_end(args);
-
- return openat(dirfd, path, flags);
-}
-
-static int service_fd_rlim_cur;
-static int service_fd_id = 0;
-
-int init_service_fd(void)
-{
- struct rlimit rlimit;
-
- /*
- * Service FDs are those that most likely won't
- * conflict with any 'real-life' ones
- */
-
- if (getrlimit(RLIMIT_NOFILE, &rlimit)) {
- pr_perror("Can't get rlimit");
- return -1;
- }
-
- service_fd_rlim_cur = (int)rlimit.rlim_cur;
- BUG_ON(service_fd_rlim_cur < SERVICE_FD_MAX);
-
- return 0;
-}
-
-static int __get_service_fd(enum sfd_type type, int service_fd_id)
-{
- return service_fd_rlim_cur - type - SERVICE_FD_MAX * service_fd_id;
-}
-
-static DECLARE_BITMAP(sfd_map, SERVICE_FD_MAX);
-
-int reserve_service_fd(enum sfd_type type)
-{
- int sfd = __get_service_fd(type, service_fd_id);
-
- BUG_ON((int)type <= SERVICE_FD_MIN || (int)type >= SERVICE_FD_MAX);
-
- set_bit(type, sfd_map);
- return sfd;
-}
-
-int install_service_fd(enum sfd_type type, int fd)
-{
- int sfd = __get_service_fd(type, service_fd_id);
-
- BUG_ON((int)type <= SERVICE_FD_MIN || (int)type >= SERVICE_FD_MAX);
-
- if (dup3(fd, sfd, O_CLOEXEC) != sfd) {
- pr_perror("Dup %d -> %d failed", fd, sfd);
- return -1;
- }
-
- set_bit(type, sfd_map);
- return sfd;
-}
-
-int get_service_fd(enum sfd_type type)
-{
- BUG_ON((int)type <= SERVICE_FD_MIN || (int)type >= SERVICE_FD_MAX);
-
- if (!test_bit(type, sfd_map))
- return -1;
-
- return __get_service_fd(type, service_fd_id);
-}
-
-int criu_get_image_dir(void)
-{
- return get_service_fd(IMG_FD_OFF);
-}
-
-int close_service_fd(enum sfd_type type)
-{
- int fd;
-
- fd = get_service_fd(type);
- if (fd < 0)
- return 0;
-
- if (close_safe(&fd))
- return -1;
-
- clear_bit(type, sfd_map);
- return 0;
-}
-
-int clone_service_fd(int id)
-{
- int ret = -1, i;
-
- if (service_fd_id == id)
- return 0;
-
- for (i = SERVICE_FD_MIN + 1; i < SERVICE_FD_MAX; i++) {
- int old = __get_service_fd(i, service_fd_id);
- int new = __get_service_fd(i, id);
-
- ret = dup2(old, new);
- if (ret == -1) {
- if (errno == EBADF)
- continue;
- pr_perror("Unable to clone %d->%d", old, new);
- }
- }
-
- service_fd_id = id;
- ret = 0;
-
- return ret;
-}
-
-bool is_any_service_fd(int fd)
-{
- return fd > __get_service_fd(SERVICE_FD_MAX, service_fd_id) &&
- fd < __get_service_fd(SERVICE_FD_MIN, service_fd_id);
-}
-
-bool is_service_fd(int fd, enum sfd_type type)
-{
- return fd == get_service_fd(type);
-}
-
-int copy_file(int fd_in, int fd_out, size_t bytes)
-{
- ssize_t written = 0;
- size_t chunk = bytes ? bytes : 4096;
-
- while (1) {
- ssize_t ret;
-
- ret = sendfile(fd_out, fd_in, NULL, chunk);
- if (ret < 0) {
- pr_perror("Can't send data to ghost file");
- return -1;
- }
-
- if (ret == 0) {
- if (bytes && (written != bytes)) {
- pr_err("Ghost file size mismatch %zu/%zu\n",
- written, bytes);
- return -1;
- }
- break;
- }
-
- written += ret;
- }
-
- return 0;
-}
-
-int read_fd_link(int lfd, char *buf, size_t size)
-{
- char t[32];
- ssize_t ret;
-
- snprintf(t, sizeof(t), "/proc/self/fd/%d", lfd);
- ret = readlink(t, buf, size);
- if (ret < 0) {
- pr_perror("Can't read link of fd %d", lfd);
- return -1;
- } else if ((size_t)ret >= size) {
- pr_err("Buffer for read link of fd %d is too small\n", lfd);
- return -1;
- }
- buf[ret] = 0;
-
- return ret;
-}
-
-int is_anon_link_type(char *link, char *type)
-{
- char aux[32];
-
- snprintf(aux, sizeof(aux), "anon_inode:%s", type);
- return !strcmp(link, aux);
-}
-
-void *shmalloc(size_t bytes)
-{
- return rst_mem_alloc(bytes, RM_SHARED);
-}
-
-/* Only last chunk can be released */
-void shfree_last(void *ptr)
-{
- rst_mem_free_last(RM_SHARED);
-}
-
-#define DUP_SAFE(fd, out) \
- ({ \
- int ret__; \
- ret__ = dup(fd); \
- if (ret__ == -1) { \
- pr_perror("dup(%d) failed", fd); \
- goto out; \
- } \
- ret__; \
- })
-
-/*
- * If "in" is negative, stdin will be closed.
- * If "out" or "err" are negative, a log file descriptor will be used.
- */
-int cr_system(int in, int out, int err, char *cmd, char *const argv[], unsigned flags)
-{
- return cr_system_userns(in, out, err, cmd, argv, flags, -1);
-}
-
-int cr_system_userns(int in, int out, int err, char *cmd,
- char *const argv[], unsigned flags, int userns_pid)
-{
- sigset_t blockmask, oldmask;
- int ret = -1, status;
- pid_t pid;
-
- sigemptyset(&blockmask);
- sigaddset(&blockmask, SIGCHLD);
- if (sigprocmask(SIG_BLOCK, &blockmask, &oldmask) == -1) {
- pr_perror("Can not set mask of blocked signals");
- return -1;
- }
-
- pid = fork();
- if (pid == -1) {
- pr_perror("fork() failed");
- goto out;
- } else if (pid == 0) {
- if (userns_pid > 0) {
- if (switch_ns(userns_pid, &user_ns_desc, NULL))
- goto out_chld;
- if (setuid(0) || setgid(0)) {
- pr_perror("Unable to set uid or gid");
- goto out_chld;
- }
- }
-
- if (out < 0)
- out = log_get_fd();
- if (err < 0)
- err = log_get_fd();
-
- /*
- * out, err, in should be a separate fds,
- * because reopen_fd_as() closes an old fd
- */
- if (err == out || err == in)
- err = DUP_SAFE(err, out_chld);
-
- if (out == in)
- out = DUP_SAFE(out, out_chld);
-
- if (move_img_fd(&out, STDIN_FILENO) ||
- move_img_fd(&err, STDIN_FILENO))
- goto out_chld;
-
- if (in < 0) {
- close(STDIN_FILENO);
- } else {
- if (reopen_fd_as_nocheck(STDIN_FILENO, in))
- goto out_chld;
- }
-
- if (move_img_fd(&err, STDOUT_FILENO))
- goto out_chld;
-
- if (reopen_fd_as_nocheck(STDOUT_FILENO, out))
- goto out_chld;
-
- if (reopen_fd_as_nocheck(STDERR_FILENO, err))
- goto out_chld;
-
- execvp(cmd, argv);
-
- pr_perror("exec failed");
-out_chld:
- _exit(1);
- }
-
- while (1) {
- ret = waitpid(pid, &status, 0);
- if (ret == -1) {
- pr_perror("waitpid() failed");
- goto out;
- }
-
- if (WIFEXITED(status)) {
- if (!(flags & CRS_CAN_FAIL) && WEXITSTATUS(status))
- pr_err("exited, status=%d\n", WEXITSTATUS(status));
- break;
- } else if (WIFSIGNALED(status)) {
- pr_err("killed by signal %d\n", WTERMSIG(status));
- break;
- } else if (WIFSTOPPED(status)) {
- pr_err("stopped by signal %d\n", WSTOPSIG(status));
- } else if (WIFCONTINUED(status)) {
- pr_err("continued\n");
- }
- }
-
- ret = status ? -1 : 0;
-out:
- if (sigprocmask(SIG_SETMASK, &oldmask, NULL) == -1) {
- pr_perror("Can not unset mask of blocked signals");
- BUG();
- }
-
- return ret;
-}
-
-int cr_daemon(int nochdir, int noclose, int *keep_fd, int close_fd)
-{
- int pid;
-
- pid = fork();
- if (pid < 0) {
- pr_perror("Can't fork");
- return -1;
- }
-
- if (pid > 0)
- return pid;
-
- setsid();
- if (!nochdir)
- if (chdir("/") == -1)
- pr_perror("Can't change directory");
- if (!noclose) {
- int fd;
-
- if (close_fd != -1)
- close(close_fd);
-
- if (*keep_fd != -1)
- *keep_fd = dup2(*keep_fd, 3);
-
- fd = open("/dev/null", O_RDWR);
- if (fd < 0) {
- pr_perror("Can't open /dev/null");
- return -1;
- }
- dup2(fd, 0);
- dup2(fd, 1);
- dup2(fd, 2);
- close(fd);
- }
-
- return 0;
-}
-
-int is_root_user()
-{
- if (geteuid() != 0) {
- pr_err("You need to be root to run this command\n");
- return 0;
- }
-
- return 1;
-}
-
-int is_empty_dir(int dirfd)
-{
- int ret = 0;
- DIR *fdir = NULL;
- struct dirent *de;
-
- fdir = fdopendir(dirfd);
- if (!fdir)
- return -1;
-
- while ((de = readdir(fdir))) {
- if (dir_dots(de))
- continue;
-
- goto out;
- }
-
- ret = 1;
-out:
- closedir(fdir);
- return ret;
-}
-
-int vaddr_to_pfn(unsigned long vaddr, u64 *pfn)
-{
- int fd, ret = -1;
- off_t off;
-
- fd = open_proc(getpid(), "pagemap");
- if (fd < 0)
- return -1;
-
- off = (vaddr / page_size()) * sizeof(u64);
- ret = pread(fd, pfn, sizeof(*pfn), off);
- if (ret != sizeof(*pfn)) {
- pr_perror("Can't read pme for pid %d", getpid());
- ret = -1;
- } else {
- *pfn &= PME_PFRAME_MASK;
- ret = 0;
- }
-
- close(fd);
- return ret;
-}
-
-/*
- * Note since VMA_AREA_NONE = 0 we can skip assignment
- * here and simply rely on xzalloc
- */
-struct vma_area *alloc_vma_area(void)
-{
- struct vma_area *p;
-
- p = xzalloc(sizeof(*p) + sizeof(VmaEntry));
- if (p) {
- p->e = (VmaEntry *)(p + 1);
- vma_entry__init(p->e);
- p->vm_file_fd = -1;
- p->e->fd = -1;
- }
-
- return p;
-}
-
-int mkdirpat(int fd, const char *path)
-{
- size_t i;
- char made_path[PATH_MAX], *pos;
-
- if (strlen(path) >= PATH_MAX) {
- pr_err("path %s is longer than PATH_MAX\n", path);
- return -1;
- }
-
- strcpy(made_path, path);
-
- i = 0;
- if (made_path[0] == '/')
- i++;
-
- for (; i < strlen(made_path); i++) {
- pos = strchr(made_path + i, '/');
- if (pos)
- *pos = '\0';
- if (mkdirat(fd, made_path, 0755) < 0 && errno != EEXIST) {
- pr_perror("couldn't mkdirpat directory %s", made_path);
- return -1;
- }
- if (pos) {
- *pos = '/';
- i = pos - made_path;
- } else
- break;
- }
-
- return 0;
-}
-
-bool is_path_prefix(const char *path, const char *prefix)
-{
- if (strstartswith(path, prefix)) {
- size_t len = strlen(prefix);
- switch (path[len]) {
- case '\0':
- case '/':
- return true;
- }
- }
-
- return false;
-}
-
-FILE *fopenat(int dirfd, char *path, char *cflags)
-{
- int tmp, flags = 0;
- char *iter;
-
- for (iter = cflags; *iter; iter++) {
- switch (*iter) {
- case 'r':
- flags |= O_RDONLY;
- break;
- case 'a':
- flags |= O_APPEND;
- break;
- case 'w':
- flags |= O_WRONLY | O_CREAT;
- break;
- case '+':
- flags = O_RDWR | O_CREAT;
- break;
- }
- }
-
- tmp = openat(dirfd, path, flags, S_IRUSR | S_IWUSR);
- if (tmp < 0)
- return NULL;
-
- return fdopen(tmp, cflags);
-}
-
-void split(char *str, char token, char ***out, int *n)
-{
- int i;
- char *cur;
-
- *n = 0;
- for (cur = str; cur != NULL; cur = strchr(cur, token)) {
- (*n)++;
- cur++;
- }
-
-
- *out = xmalloc((*n) * sizeof(char *));
- if (!*out) {
- *n = -1;
- return;
-
- }
-
- cur = str;
- i = 0;
- do {
- char *prev = cur;
- cur = strchr(cur, token);
-
- if (cur)
- *cur = '\0';
- (*out)[i] = xstrdup(prev);
- if (cur) {
- *cur = token;
- cur++;
- }
-
- if (!(*out)[i]) {
- int j;
- for (j = 0; j < i; j++)
- xfree((*out)[j]);
- xfree(*out);
- *out = NULL;
- *n = -1;
- return;
- }
-
- i++;
- } while(cur);
-}
-
-int fd_has_data(int lfd)
-{
- struct pollfd pfd = {lfd, POLLIN, 0};
- int ret;
-
- ret = poll(&pfd, 1, 0);
- if (ret < 0) {
- pr_perror("poll() failed");
- }
-
- return ret;
-}
-
-size_t read_into_buffer(int fd, char *buff, size_t size)
-{
- size_t n = 0;
- size_t curr = 0;
-
- while (1) {
- n = read(fd, buff + curr, size - curr);
- if (n < 1)
- return n;
- curr += n;
- if (curr == size)
- return size;
- }
-}
-
-int make_yard(char *path)
-{
- if (mount("none", path, "tmpfs", 0, NULL)) {
- pr_perror("Unable to mount tmpfs in %s", path);
- return -1;
- }
-
- if (mount("none", path, NULL, MS_PRIVATE, NULL)) {
- pr_perror("Unable to mark yard as private");
- return -1;
- }
-
- return 0;
-}
-
-const char *ns_to_string(unsigned int ns)
-{
- switch (ns) {
- case CLONE_NEWIPC:
- return "ipc";
- case CLONE_NEWNS:
- return "mnt";
- case CLONE_NEWNET:
- return "net";
- case CLONE_NEWPID:
- return "pid";
- case CLONE_NEWUSER:
- return "user";
- case CLONE_NEWUTS:
- return "uts";
- default:
- return NULL;
- }
-}
-
-void tcp_cork(int sk, bool on)
-{
- int val = on ? 1 : 0;
- setsockopt(sk, SOL_TCP, TCP_CORK, &val, sizeof(val));
-}
-
-void tcp_nodelay(int sk, bool on)
-{
- int val = on ? 1 : 0;
- setsockopt(sk, SOL_TCP, TCP_NODELAY, &val, sizeof(val));
-}
diff --git a/uts_ns.c b/uts_ns.c
deleted file mode 100644
index ed64d77ec467..000000000000
--- a/uts_ns.c
+++ /dev/null
@@ -1,71 +0,0 @@
-#include <unistd.h>
-#include <fcntl.h>
-#include <sys/utsname.h>
-#include <string.h>
-#include <sched.h>
-
-#include "util.h"
-#include "namespaces.h"
-#include "sysctl.h"
-#include "uts_ns.h"
-
-#include "protobuf.h"
-#include "protobuf/utsns.pb-c.h"
-
-int dump_uts_ns(int ns_id)
-{
- int ret;
- struct cr_img *img;
- struct utsname ubuf;
- UtsnsEntry ue = UTSNS_ENTRY__INIT;
-
- img = open_image(CR_FD_UTSNS, O_DUMP, ns_id);
- if (!img)
- return -1;
-
- ret = uname(&ubuf);
- if (ret < 0) {
- pr_perror("Error calling uname");
- goto err;
- }
-
- ue.nodename = ubuf.nodename;
- ue.domainname = ubuf.domainname;
-
- ret = pb_write_one(img, &ue, PB_UTSNS);
-err:
- close_image(img);
- return ret < 0 ? -1 : 0;
-}
-
-int prepare_utsns(int pid)
-{
- int ret;
- struct cr_img *img;
- UtsnsEntry *ue;
- struct sysctl_req req[] = {
- { "kernel/hostname" },
- { "kernel/domainname" },
- };
-
- img = open_image(CR_FD_UTSNS, O_RSTR, pid);
- if (!img)
- return -1;
-
- ret = pb_read_one(img, &ue, PB_UTSNS);
- if (ret < 0)
- goto out;
-
- req[0].arg = ue->nodename;
- req[0].type = CTL_STR(strlen(ue->nodename));
- req[1].arg = ue->domainname;
- req[1].type = CTL_STR(strlen(ue->domainname));
-
- ret = sysctl_op(req, ARRAY_SIZE(req), CTL_WRITE, CLONE_NEWUTS);
- utsns_entry__free_unpacked(ue, NULL);
-out:
- close_image(img);
- return ret;
-}
-
-struct ns_desc uts_ns_desc = NS_DESC_ENTRY(CLONE_NEWUTS, "uts");
diff --git a/vdso.c b/vdso.c
deleted file mode 100644
index bccf11cc11f8..000000000000
--- a/vdso.c
+++ /dev/null
@@ -1,320 +0,0 @@
-#include <stdlib.h>
-#include <stdio.h>
-#include <unistd.h>
-#include <string.h>
-#include <elf.h>
-#include <fcntl.h>
-
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/mman.h>
-
-#include "asm/types.h"
-#include "asm/parasite-syscall.h"
-
-#include "parasite-syscall.h"
-#include "parasite.h"
-#include "compiler.h"
-#include "kerndat.h"
-#include "vdso.h"
-#include "util.h"
-#include "log.h"
-#include "mem.h"
-#include "vma.h"
-
-#ifdef LOG_PREFIX
-# undef LOG_PREFIX
-#endif
-#define LOG_PREFIX "vdso: "
-
-struct vdso_symtable vdso_sym_rt = VDSO_SYMTABLE_INIT;
-u64 vdso_pfn = VDSO_BAD_PFN;
-/*
- * The VMAs list might have proxy vdso/vvar areas left
- * from previous dump/restore cycle so we need to detect
- * them and eliminated from the VMAs list, they will be
- * generated again on restore if needed.
- */
-int parasite_fixup_vdso(struct parasite_ctl *ctl, pid_t pid,
- struct vm_area_list *vma_area_list)
-{
- unsigned long proxy_vdso_addr = VDSO_BAD_ADDR;
- unsigned long proxy_vvar_addr = VVAR_BAD_ADDR;
- struct vma_area *proxy_vdso_marked = NULL;
- struct vma_area *proxy_vvar_marked = NULL;
- struct parasite_vdso_vma_entry *args;
- int fd = -1, ret, exit_code = -1;
- u64 pfn = VDSO_BAD_PFN;
- struct vma_area *vma;
- off_t off;
-
- args = parasite_args(ctl, struct parasite_vdso_vma_entry);
- if (kdat.pmap == PM_FULL) {
- BUG_ON(vdso_pfn == VDSO_BAD_PFN);
- fd = open_proc(pid, "pagemap");
- if (fd < 0)
- return -1;
- } else
- pr_info("Pagemap is unavailable, trying a slow way\n");
-
- list_for_each_entry(vma, &vma_area_list->h, list) {
- if (!vma_area_is(vma, VMA_AREA_REGULAR))
- continue;
-
- if (vma_area_is(vma, VMA_FILE_SHARED) ||
- vma_area_is(vma, VMA_FILE_PRIVATE))
- continue;
- /*
- * It might be possible VVAR area from marked
- * vDSO zone, we need to detect it earlier than
- * VDSO_PROT test because VVAR_PROT is a subset
- * of it but don't yield continue here,
- * sigh... what a mess.
- */
- BUILD_BUG_ON(!(VDSO_PROT & VVAR_PROT));
-
- if ((vma->e->prot & VVAR_PROT) == VVAR_PROT) {
- if (proxy_vvar_addr != VVAR_BAD_ADDR &&
- proxy_vvar_addr == vma->e->start) {
- BUG_ON(proxy_vvar_marked);
- proxy_vvar_marked = vma;
- continue;
- }
- }
-
- if ((vma->e->prot & VDSO_PROT) != VDSO_PROT)
- continue;
-
- if (vma->e->start > kdat.task_size)
- continue;
-
- if (vma->e->flags & MAP_GROWSDOWN)
- continue;
-
- /*
- * I need to poke every potentially marked vma,
- * otherwise if task never called for vdso functions
- * page frame number won't be reported.
- *
- * Moreover, if page frame numbers are not accessible
- * we have to scan the vma zone for vDSO elf structure
- * which gonna be a slow way.
- */
- args->start = vma->e->start;
- args->len = vma_area_len(vma);
- args->try_fill_symtable = (fd < 0) ? true : false;
- args->is_vdso = false;
-
- if (parasite_execute_daemon(PARASITE_CMD_CHECK_VDSO_MARK, ctl)) {
- pr_err("Parasite failed to poke for mark\n");
- goto err;
- }
-
- /*
- * Defer handling marked vdso until we walked over
- * all vmas and restore potentially remapped vDSO
- * area status.
- */
- if (unlikely(args->is_marked)) {
- if (proxy_vdso_marked) {
- pr_err("Ow! Second vdso mark detected!\n");
- goto err;
- }
- proxy_vdso_marked = vma;
- proxy_vdso_addr = args->proxy_vdso_addr;
- proxy_vvar_addr = args->proxy_vvar_addr;
- continue;
- }
-
- /*
- * If we have an access to pagemap we can handle vDSO
- * status early. Otherwise, in worst scenario, where
- * the dumpee has been remapping vdso on its own and
- * the kernel version is < 3.16, the vdso won't be
- * detected via procfs status so we have to parse
- * symbols in parasite code.
- */
- if (fd >= 0) {
- off = (vma->e->start / PAGE_SIZE) * sizeof(u64);
- ret = pread(fd, &pfn, sizeof(pfn), off);
- if (ret < 0 || ret != sizeof(pfn)) {
- pr_perror("Can't read pme for pid %d", pid);
- goto err;
- }
-
- pfn = PME_PFRAME(pfn);
- if (!pfn) {
- pr_err("Unexpected page fram number 0 for pid %d\n", pid);
- goto err;
- }
- }
-
- /*
- * Setup proper VMA status. Note starting with 3.16
- * the [vdso]/[vvar] marks are reported correctly
- * even when they are remapped into a new place,
- * but only since that particular version of the
- * kernel!
- */
- if ((pfn == vdso_pfn && pfn != VDSO_BAD_PFN) || args->is_vdso) {
- if (!vma_area_is(vma, VMA_AREA_VDSO)) {
- pr_debug("Restore vDSO status by pfn/symtable at %lx\n",
- (long)vma->e->start);
- vma->e->status |= VMA_AREA_VDSO;
- }
- } else {
- if (unlikely(vma_area_is(vma, VMA_AREA_VDSO))) {
- pr_debug("Drop mishinted vDSO status at %lx\n",
- (long)vma->e->start);
- vma->e->status &= ~VMA_AREA_VDSO;
- }
- }
- }
-
- /*
- * There is marked vdso, it means such vdso is autogenerated
- * and must be dropped from vma list.
- */
- if (proxy_vdso_marked) {
- pr_debug("vdso: Found marked at %lx (proxy vDSO at %lx VVAR at %lx)\n",
- (long)proxy_vdso_marked->e->start,
- (long)proxy_vdso_addr, (long)proxy_vvar_addr);
-
- /*
- * Don't forget to restore the proxy vdso/vvar status, since
- * it's unknown to the kernel.
- */
- list_for_each_entry(vma, &vma_area_list->h, list) {
- if (vma->e->start == proxy_vdso_addr) {
- vma->e->status |= VMA_AREA_REGULAR | VMA_AREA_VDSO;
- pr_debug("vdso: Restore proxy vDSO status at %lx\n",
- (long)vma->e->start);
- } else if (vma->e->start == proxy_vvar_addr) {
- vma->e->status |= VMA_AREA_REGULAR | VMA_AREA_VVAR;
- pr_debug("vdso: Restore proxy VVAR status at %lx\n",
- (long)vma->e->start);
- }
- }
-
- pr_debug("vdso: Droppping marked vdso at %lx\n",
- (long)proxy_vdso_marked->e->start);
- list_del(&proxy_vdso_marked->list);
- xfree(proxy_vdso_marked);
- vma_area_list->nr--;
-
- if (proxy_vvar_marked) {
- pr_debug("vdso: Droppping marked vvar at %lx\n",
- (long)proxy_vvar_marked->e->start);
- list_del(&proxy_vvar_marked->list);
- xfree(proxy_vvar_marked);
- vma_area_list->nr--;
- }
- }
- exit_code = 0;
-err:
- close_safe(&fd);
- return exit_code;
-}
-
-static int vdso_fill_self_symtable(struct vdso_symtable *s)
-{
- char buf[512];
- int ret, exit_code = -1;
- FILE *maps;
-
- *s = (struct vdso_symtable)VDSO_SYMTABLE_INIT;
-
- maps = fopen_proc(PROC_SELF, "maps");
- if (!maps) {
- pr_perror("Can't open self-vma");
- return -1;
- }
-
- while (fgets(buf, sizeof(buf), maps)) {
- unsigned long start, end;
- char *has_vdso, *has_vvar;
-
- has_vdso = strstr(buf, "[vdso]");
- if (!has_vdso)
- has_vvar = strstr(buf, "[vvar]");
- else
- has_vvar = NULL;
-
- if (!has_vdso && !has_vvar)
- continue;
-
- ret = sscanf(buf, "%lx-%lx", &start, &end);
- if (ret != 2) {
- pr_err("Can't find vDSO/VVAR bounds\n");
- goto err;
- }
-
- if (has_vdso) {
- if (s->vma_start != VDSO_BAD_ADDR) {
- pr_err("Got second vDSO entry\n");
- goto err;
- }
- s->vma_start = start;
- s->vma_end = end;
-
- ret = vdso_fill_symtable((void *)start, end - start, s);
- if (ret)
- goto err;
- } else {
- if (s->vvar_start != VVAR_BAD_ADDR) {
- pr_err("Got second VVAR entry\n");
- goto err;
- }
- s->vvar_start = start;
- s->vvar_end = end;
- }
- }
-
- /*
- * Validate its structure -- for new vDSO format the
- * structure must be like
- *
- * 7fff1f5fd000-7fff1f5fe000 r-xp 00000000 00:00 0 [vdso]
- * 7fff1f5fe000-7fff1f600000 r--p 00000000 00:00 0 [vvar]
- *
- * The areas may be in reverse order.
- *
- * 7fffc3502000-7fffc3504000 r--p 00000000 00:00 0 [vvar]
- * 7fffc3504000-7fffc3506000 r-xp 00000000 00:00 0 [vdso]
- *
- */
- if (s->vma_start != VDSO_BAD_ADDR) {
- if (s->vvar_start != VVAR_BAD_ADDR) {
- if (s->vma_end != s->vvar_start &&
- s->vvar_end != s->vma_start) {
- pr_err("Unexpected rt vDSO area bounds\n");
- goto err;
- }
- }
- } else {
- pr_err("Can't find rt vDSO\n");
- goto err;
- }
-
- pr_debug("rt [vdso] %lx-%lx [vvar] %lx-%lx\n",
- s->vma_start, s->vma_end,
- s->vvar_start, s->vvar_end);
-
- exit_code = 0;
-err:
- fclose(maps);
- return exit_code;
-}
-
-int vdso_init(void)
-{
- if (vdso_fill_self_symtable(&vdso_sym_rt))
- return -1;
-
- if (kdat.pmap != PM_FULL)
- pr_info("VDSO detection turned off\n");
- else if (vaddr_to_pfn(vdso_sym_rt.vma_start, &vdso_pfn))
- return -1;
-
- return 0;
-}
--
2.5.0
More information about the CRIU
mailing list