[CRIU] [PATCH 2/4] make: move pie files and build to separated directory
Kinsbursky Stanislav
skinsbursky at openvz.org
Fri Nov 2 11:39:59 EDT 2012
From: Stanislav Kinsbursky <skinsbursky at openvz.org>
Signed-off-by: Stanislav Kinsbursky <skinsbursky at openvz.org>
---
pie/Makefile | 92 ++++++
pie/gen-offsets.sh | 35 ++
pie/log-simple.c | 204 +++++++++++++
pie/parasite-head-x86-64.S | 24 ++
pie/parasite.c | 552 +++++++++++++++++++++++++++++++++++
pie/pie.lds.S | 25 ++
pie/restorer.c | 690 ++++++++++++++++++++++++++++++++++++++++++++
pie/util-net.c | 151 ++++++++++
8 files changed, 1773 insertions(+), 0 deletions(-)
create mode 100644 pie/Makefile
create mode 100644 pie/gen-offsets.sh
create mode 100644 pie/log-simple.c
create mode 100644 pie/parasite-head-x86-64.S
create mode 100644 pie/parasite.c
create mode 100644 pie/pie.lds.S
create mode 100644 pie/restorer.c
create mode 100644 pie/util-net.c
-------------- next part --------------
diff --git a/Makefile b/Makefile
index 0f457c8..cfc7b3b 100644
--- a/Makefile
+++ b/Makefile
@@ -12,7 +12,7 @@ OBJS += cr-dump.o
OBJS += cr-show.o
OBJS += cr-check.o
OBJS += util.o
-OBJS += util-net.o
+OBJS += pie/util-net.o
OBJS += sysctl.o
OBJS += ptrace.o
OBJS += kcmp-ids.o
@@ -45,13 +45,12 @@ OBJS += protobuf.o
OBJS += tty.o
PROTOBUF-LIB := protobuf/protobuf-lib.o
+SYSCALL-LIB := syscall/syscall-x86-64.o
DEPS := $(patsubst %.o,%.d,$(OBJS))
MAKEFLAGS += --no-print-directory
-include Makefile.pie
-
.PHONY: all zdtm test rebuild clean distclean tags cscope \
docs help pie protobuf syscall
@@ -59,7 +58,7 @@ all: pie
$(Q) $(MAKE) $(PROGRAM)
pie: protobuf syscall
- $(Q) $(MAKE) $(PIE-GEN)
+ $(Q) $(MAKE) -C pie/
protobuf:
$(Q) $(MAKE) -C protobuf/
@@ -101,7 +100,7 @@ rebuild:
$(Q) $(RM) -f ./protobuf/*.pb-c.h
$(Q) $(MAKE)
-clean: cleanpie
+clean:
$(E) " CLEAN"
$(Q) $(RM) -f ./*.o
$(Q) $(RM) -f ./*.d
@@ -113,6 +112,7 @@ clean: cleanpie
$(Q) $(RM) -rf ./test/dump/
$(Q) $(MAKE) -C protobuf/ clean
$(Q) $(MAKE) -C syscall/ clean
+ $(Q) $(MAKE) -C pie/ clean
$(Q) $(MAKE) -C test/zdtm cleandep
$(Q) $(MAKE) -C test/zdtm clean
$(Q) $(MAKE) -C test/zdtm cleanout
diff --git a/Makefile.inc b/Makefile.inc
index 717e1ca..fd12927 100644
--- a/Makefile.inc
+++ b/Makefile.inc
@@ -43,7 +43,7 @@ endif
no-deps-targets := tags cscope clean cleanout cleandep realclean
-CFLAGS += -I./include -fno-strict-aliasing
+CFLAGS += -I./include -I./pie -fno-strict-aliasing
LIBS += -lrt -lpthread -lprotobuf-c
@@ -69,5 +69,3 @@ WARNINGS += -Wall
CFLAGS += $(WARNINGS) $(DEFINES)
export CC ECHO MAKE CFLAGS LIBS ARCH DEFINES
-
-SYSCALL-LIB := syscall/syscall-x86-64.o
diff --git a/Makefile.pie b/Makefile.pie
deleted file mode 100644
index 99e3593..0000000
--- a/Makefile.pie
+++ /dev/null
@@ -1,84 +0,0 @@
-GEN-OFFSETS := gen-offsets.sh
-
-PASM-OBJS += parasite-head-x86-64.o
-PASM-SRC += $(patsubst %.o,%.S,$(PASM-OBJS))
-
-POBJS += parasite.o
-PSRCS += $(patsubst %.o,%.c,$(POBJS))
-
-PBLOB-NAME := parasite
-PBLOB-HDR := parasite-blob.h
-PBLOB-BIN := parasite.bin
-PBLOB-BIN-O := $(PBLOB-BIN).o
-
-ROBJS += restorer.o
-ROBJS += log-simple.o
-RSRCS += $(patsubst %.o,%.c,$(ROBJS))
-
-RBLOB-NAME := restorer
-RBLOB-HDR := restorer-blob.h
-RBLOB-BIN := restorer.bin
-RBLOB-BIN-O := $(RBLOB-BIN).o
-
-DEPS += $(patsubst %.o,%.d,$(POBJS))
-DEPS += $(patsubst %.o,%.d,$(ROBJS))
-
-PIELDS := pie.lds.S
-PIEFLAGS := -fpie -Wa,--noexecstack -fno-strict-aliasing
-ASMFLAGS := -D__ASSEMBLY__
-
-$(PASM-OBJS): $(PASM-SRC) $(SYSCALL-LIB)
- $(E) " CC " $@
- $(Q) $(CC) -c $(ASMFLAGS) $(CFLAGS) $(PIEFLAGS) $(patsubst %.o,%.S,$@) -o $@
-
-$(POBJS): $(PSRCS) $(PASM-OBJS) $(SYSCALL-LIB)
- $(E) " CC " $@
- $(Q) $(CC) -c $(CFLAGS) $(PIEFLAGS) $(patsubst %.o,%.c,$@) -o $@
-
-parasite-util-net.o: util-net.c $(SYSCALL-LIB)
- $(E) " CC " $@
- $(Q) $(CC) -c $(CFLAGS) $(PIEFLAGS) $< -o $@
-
-parasite-log.o: log-simple.c $(SYSCALL-LIB)
- $(E) " CC " $@
- $(Q) $(CC) -c $(CFLAGS) $(PIEFLAGS) $< -o $@
-
-POBJS += parasite-util-net.o parasite-log.o
-
-$(PBLOB-BIN-O): $(PIELDS) $(POBJS) $(PASM-OBJS)
- $(E) " GEN " $@
- $(Q) $(LD) --oformat=elf64-x86-64 -T $(PIELDS) -o $(PBLOB-BIN).o $(POBJS) $(PASM-OBJS) $(SYSCALL-LIB)
-
-$(PBLOB-BIN): $(PBLOB-BIN-O) $(PIELDS) $(POBJS) $(PASM-OBJS)
- $(E) " GEN " $@
- $(Q) $(LD) --oformat=binary -T $(PIELDS) -o $(PBLOB-BIN) $(POBJS) $(PASM-OBJS) $(SYSCALL-LIB)
-
-$(PBLOB-HDR): $(PBLOB-BIN) $(GEN-OFFSETS)
- $(E) " GEN " $@
- $(Q) $(SH) $(GEN-OFFSETS) $(PBLOB-NAME) > $@ || rm -f $@
-
-$(ROBJS): $(RSRCS) $(SYSCALL-LIB)
- $(E) " CC " $@
- $(Q) $(CC) -c $(CFLAGS) $(PIEFLAGS) $(patsubst %.o,%.c,$@) -o $@
-
-$(RBLOB-BIN-O): $(PIELDS) $(ROBJS)
- $(E) " GEN " $@
- $(Q) $(LD) --oformat=elf64-x86-64 -T $(PIELDS) -o $(RBLOB-BIN).o $(ROBJS) $(SYSCALL-LIB)
-
-$(RBLOB-BIN): $(RBLOB-BIN-O) $(PIELDS) $(ROBJS)
- $(E) " GEN " $@
- $(Q) $(LD) --oformat=binary -T $(PIELDS) -o $(RBLOB-BIN) $(ROBJS) $(SYSCALL-LIB)
-
-$(RBLOB-HDR): $(RBLOB-BIN) $(GEN-OFFSETS)
- $(E) " GEN " $@
- $(Q) $(SH) $(GEN-OFFSETS) $(RBLOB-NAME) > $@ || rm -f $@
-
-PIE-GEN := $(PBLOB-HDR) $(RBLOB-HDR)
-
-cleanpie:
- $(E) " CLEAN PIE"
- $(Q) $(RM) -f ./$(PBLOB-HDR)
- $(Q) $(RM) -f ./$(RBLOB-HDR)
- $(Q) $(RM) -f ./*.bin
- $(Q) $(RM) -f ./*.bin.o
-.PHONY: cleanpie
diff --git a/gen-offsets.sh b/gen-offsets.sh
deleted file mode 100644
index cc5c6fd..0000000
--- a/gen-offsets.sh
+++ /dev/null
@@ -1,35 +0,0 @@
-#!/bin/sh
-
-set -e
-set -u
-
-NAME=$1
-INC_GUARD=__${NAME}_h__
-PREFIX=${NAME}_blob_offset__
-BLOB=${NAME}_blob
-OBJNAME=${NAME}.bin.o
-BINARY=${NAME}.bin
-
-AWK_CMD='$2 ~ /^[tT]$/ { print "#define '$PREFIX'" $3 " 0x" $1; }'
-
-cat << EOF
-/* Autogenerated by $0, do not edit */
-#ifndef $INC_GUARD
-#define $INC_GUARD
-
-EOF
-
-nm $OBJNAME | grep "__export_" | tr . _ | awk "$AWK_CMD"
-
-cat << EOF
-
-static char $BLOB[] = {
-EOF
-
-hexdump -v -e '"\t" 8/1 "0x%02x, " "\n"' $BINARY
-
-cat << EOF
-};
-
-#endif /* $INC_GUARD */
-EOF
diff --git a/log-simple.c b/log-simple.c
deleted file mode 100644
index eb940c1..0000000
--- a/log-simple.c
+++ /dev/null
@@ -1,204 +0,0 @@
-#include <stdarg.h>
-#include "syscall.h"
-#include "log.h"
-#include "log-levels.h"
-
-static int logfd = -1;
-static int cur_loglevel = DEFAULT_LOGLEVEL;
-
-void log_set_fd(int fd)
-{
- sys_close(logfd);
- logfd = fd;
-}
-
-void log_set_loglevel(unsigned int level)
-{
- cur_loglevel = level;
-}
-
-static void print_string(const char *msg)
-{
- int size = 0;
- while (msg[size])
- size++;
- sys_write(logfd, msg, size);
-}
-
-int vprint_num(char *buf, int blen, int num, char **ps)
-{
- int neg = 0;
- char *s;
-
- s = &buf[blen - 1];
-
- if (num < 0) {
- neg = 1;
- num = -num;
- } else if (num == 0) {
- *s = '0';
- s--;
- goto done;
- }
-
- while (num > 0) {
- *s = (num % 10) + '0';
- s--;
- num /= 10;
- }
-
- if (neg) {
- *s = '-';
- s--;
- }
-done:
- s++;
- *ps = s;
- return blen - (s - buf);
-}
-
-static void print_num(int num)
-{
- char buf[11], *s;
- int len;
-
- len = vprint_num(buf, sizeof(buf), num, &s);
- sys_write(logfd, s, len);
-}
-
-static void print_num_l(long num)
-{
- int neg = 0;
- char buf[21], *s;
-
- s = &buf[20];
-
- if (num < 0) {
- neg = 1;
- num = -num;
- } else if (num == 0) {
- *s = '0';
- s--;
- goto done;
- }
-
- while (num > 0) {
- *s = (num % 10) + '0';
- s--;
- num /= 10;
- }
-
- if (neg) {
- *s = '-';
- s--;
- }
-done:
- s++;
- sys_write(logfd, s, sizeof(buf) - (s - buf));
-}
-
-static void hexdigit(unsigned int v, char *to, char **z)
-{
- *to = "0123456789abcdef"[v & 0xf];
- if (*to != '0')
- *z = to;
-}
-
-static void print_hex(unsigned int num)
-{
- char buf[10], *z = &buf[9];
-
- hexdigit(num >> 0, &buf[9], &z);
- hexdigit(num >> 4, &buf[8], &z);
- hexdigit(num >> 8, &buf[7], &z);
- hexdigit(num >> 12, &buf[6], &z);
- hexdigit(num >> 16, &buf[5], &z);
- hexdigit(num >> 20, &buf[4], &z);
- hexdigit(num >> 24, &buf[3], &z);
- hexdigit(num >> 28, &buf[2], &z);
- z -= 2;
- z[0] = '0';
- z[1] = 'x';
-
- sys_write(logfd, z, sizeof(buf) - (z - buf));
-}
-
-static void print_hex_l(unsigned long num)
-{
- char buf[18], *z = &buf[17];
-
- hexdigit(num >> 0, &buf[17], &z);
- hexdigit(num >> 4, &buf[16], &z);
- hexdigit(num >> 8, &buf[15], &z);
- hexdigit(num >> 12, &buf[14], &z);
- hexdigit(num >> 16, &buf[13], &z);
- hexdigit(num >> 20, &buf[12], &z);
- hexdigit(num >> 24, &buf[11], &z);
- hexdigit(num >> 28, &buf[10], &z);
-
- hexdigit(num >> 32, &buf[9], &z);
- hexdigit(num >> 36, &buf[8], &z);
- hexdigit(num >> 40, &buf[7], &z);
- hexdigit(num >> 44, &buf[6], &z);
- hexdigit(num >> 48, &buf[5], &z);
- hexdigit(num >> 52, &buf[4], &z);
- hexdigit(num >> 56, &buf[3], &z);
- hexdigit(num >> 60, &buf[2], &z);
-
- z -= 2;
- z[0] = '0';
- z[1] = 'x';
-
- sys_write(logfd, z, sizeof(buf) - (z - buf));
-}
-
-void print_on_level(unsigned int loglevel, const char *format, ...)
-{
- va_list args;
- const char *s = format, *p;
-
- if (loglevel > cur_loglevel)
- return;
-
- va_start(args, format);
- p = s;
- while (1) {
- int along = 0;
-
- if (*s != '\0' && *s != '%') {
- s++;
- continue;
- }
-
- sys_write(logfd, p, s - p);
- if (*s == '\0')
- break;
-
- s++;
- if (*s == 'l') {
- along = 1;
- s++;
- }
-
- switch (*s) {
- case 's':
- print_string(va_arg(args, char *));
- break;
- case 'd':
- if (along)
- print_num_l(va_arg(args, long));
- else
- print_num(va_arg(args, int));
- break;
- case 'x':
- if (along)
- print_hex_l(va_arg(args, long));
- else
- print_hex(va_arg(args, unsigned int));
- break;
- }
- s++;
- p = s;
- }
- va_end(args);
-}
diff --git a/parasite-head-x86-64.S b/parasite-head-x86-64.S
deleted file mode 100644
index a2c12dc..0000000
--- a/parasite-head-x86-64.S
+++ /dev/null
@@ -1,24 +0,0 @@
-#include "linkage.h"
-#include "parasite.h"
-
- .section .head.text, "ax"
-ENTRY(__export_parasite_head_start)
- leaq __export_parasite_stack(%rip), %rsp
- subq $16, %rsp
- andq $~15, %rsp
- pushq $0
- movq %rsp, %rbp
- movl __export_parasite_cmd(%rip), %edi
- leaq __export_parasite_args(%rip), %rsi
- call parasite_service
- int $0x03
- .align 8
-__export_parasite_cmd:
- .long 0
-__export_parasite_args:
- .long 0
- .space PARASITE_ARG_SIZE,0
- .space PARASITE_STACK_SIZE,0
-__export_parasite_stack:
- .long 0
-END(__export_parasite_head_start)
diff --git a/parasite.c b/parasite.c
deleted file mode 100644
index 9e916dd..0000000
--- a/parasite.c
+++ /dev/null
@@ -1,552 +0,0 @@
-#include <sys/mman.h>
-#include <errno.h>
-#include <signal.h>
-#include <linux/limits.h>
-#include <sys/mount.h>
-#include <stdarg.h>
-#include <sys/ioctl.h>
-
-#include "syscall.h"
-#include "parasite.h"
-#include "log.h"
-
-#include <string.h>
-
-#ifndef CONFIG_X86_64
-#error non-x86-64 mode not yet implemented
-#endif
-
-static void *brk_start, *brk_end, *brk_tail;
-
-static int tsock = -1;
-
-#define MAX_HEAP_SIZE (10 << 20) /* Hope 10MB will be enough... */
-
-static int brk_init(void)
-{
- unsigned long ret;
- /*
- * Map 10 MB. Hope this will be enough for unix skb's...
- */
- ret = sys_mmap(NULL, MAX_HEAP_SIZE,
- PROT_READ | PROT_WRITE,
- MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
- if (ret < 0)
- return -ENOMEM;
-
- brk_start = brk_tail = (void *)ret;
- brk_end = brk_start + MAX_HEAP_SIZE;
- return 0;
-}
-
-static void brk_fini(void)
-{
- sys_munmap(brk_start, MAX_HEAP_SIZE);
-}
-
-static void *brk_alloc(unsigned long bytes)
-{
- void *addr = NULL;
- if (brk_end >= (brk_tail + bytes)) {
- addr = brk_tail;
- brk_tail+= bytes;
- }
- return addr;
-}
-
-static void brk_free(unsigned long bytes)
-{
- if (brk_start >= (brk_tail - bytes))
- brk_tail -= bytes;
-}
-
-#define PME_PRESENT (1ULL << 63)
-#define PME_SWAP (1ULL << 62)
-#define PME_FILE (1ULL << 61)
-
-static inline int should_dump_page(VmaEntry *vmae, u64 pme)
-{
- if (vma_entry_is(vmae, VMA_AREA_VDSO))
- return 1;
- /*
- * Optimisation for private mapping pages, that haven't
- * yet being COW-ed
- */
- if (vma_entry_is(vmae, VMA_FILE_PRIVATE) && (pme & PME_FILE))
- return 0;
- if (pme & (PME_PRESENT | PME_SWAP))
- return 1;
-
- return 0;
-}
-
-static int fd_pages = -1;
-static int fd_pagemap = -1;
-
-static int dump_pages_init()
-{
- fd_pages = recv_fd(tsock);
- if (fd_pages < 0)
- return fd_pages;
-
- fd_pagemap = sys_open("/proc/self/pagemap", O_RDONLY, 0);
- if (fd_pagemap < 0) {
- pr_err("Can't open self pagemap");
- sys_close(fd_pages);
- return fd_pagemap;
- }
-
- return 0;
-}
-
-static int sys_write_safe(int fd, void *buf, int size)
-{
- int ret;
-
- ret = sys_write(fd, buf, size);
- if (ret < 0) {
- pr_err("sys_write failed\n");
- return ret;
- }
-
- if (ret != size) {
- pr_err("not all data was written\n");
- ret = -EIO;
- }
-
- return 0;
-}
-
-/*
- * This is the main page dumping routine, it's executed
- * inside a victim process space.
- */
-static int dump_pages(struct parasite_dump_pages_args *args)
-{
- unsigned long nrpages, pfn, length;
- unsigned long prot_old, prot_new;
- u64 *map, off;
- int ret = -1;
-
- args->nrpages_dumped = 0;
- args->nrpages_skipped = 0;
- prot_old = prot_new = 0;
-
- pfn = args->vma_entry.start / PAGE_SIZE;
- nrpages = (args->vma_entry.end - args->vma_entry.start) / PAGE_SIZE;
- args->nrpages_total = nrpages;
- length = nrpages * sizeof(*map);
-
- /*
- * Up to 10M of pagemap will handle 5G mapping.
- */
- map = brk_alloc(length);
- if (!map) {
- ret = -ENOMEM;
- goto err;
- }
-
- off = pfn * sizeof(*map);
- off = sys_lseek(fd_pagemap, off, SEEK_SET);
- if (off != pfn * sizeof(*map)) {
- pr_err("Can't seek pagemap");
- ret = off;
- goto err_free;
- }
-
- ret = sys_read(fd_pagemap, map, length);
- if (ret != length) {
- pr_err("Can't read self pagemap");
- goto err_free;
- }
-
- /*
- * Try to change page protection if needed so we would
- * be able to dump contents.
- */
- if (!(args->vma_entry.prot & PROT_READ)) {
- prot_old = (unsigned long)args->vma_entry.prot;
- prot_new = prot_old | PROT_READ;
- ret = sys_mprotect((void *)args->vma_entry.start,
- (unsigned long)vma_entry_len(&args->vma_entry),
- prot_new);
- if (ret) {
- pr_err("sys_mprotect failed\n");
- goto err_free;
- }
- }
-
- ret = 0;
- for (pfn = 0; pfn < nrpages; pfn++) {
- unsigned long vaddr;
-
- if (should_dump_page(&args->vma_entry, map[pfn])) {
- /*
- * That's the optimized write of
- * page_entry structure, see image.h
- */
- vaddr = (unsigned long)args->vma_entry.start + pfn * PAGE_SIZE;
-
- ret = sys_write_safe(fd_pages, &vaddr, sizeof(vaddr));
- if (ret)
- return ret;
- ret = sys_write_safe(fd_pages, (void *)vaddr, PAGE_SIZE);
- if (ret)
- return ret;
-
- args->nrpages_dumped++;
- } else if (map[pfn] & PME_PRESENT)
- args->nrpages_skipped++;
- }
-
- /*
- * Don't left pages readable if they were not.
- */
- if (prot_old != prot_new) {
- ret = sys_mprotect((void *)args->vma_entry.start,
- (unsigned long)vma_entry_len(&args->vma_entry),
- prot_old);
- if (ret) {
- pr_err("PANIC: Ouch! sys_mprotect failed on restore\n");
- goto err_free;
- }
- }
-
- ret = 0;
-err_free:
- brk_free(length);
-err:
- return ret;
-}
-
-static int dump_pages_fini(void)
-{
- int ret;
-
- ret = sys_close(fd_pagemap);
- ret |= sys_close(fd_pages);
-
- return ret;
-}
-
-static int dump_sigact(struct parasite_dump_sa_args *da)
-{
- int sig, ret = 0;
-
- for (sig = 1; sig < SIGMAX; sig++) {
- if (sig == SIGKILL || sig == SIGSTOP)
- continue;
-
- ret = sys_sigaction(sig, NULL, &da->sas[sig], sizeof(rt_sigset_t));
- if (ret < 0) {
- pr_err("sys_sigaction failed\n");
- break;
- }
- }
-
- return ret;
-}
-
-static int dump_itimers(struct parasite_dump_itimers_args *args)
-{
- int ret;
-
- ret = sys_getitimer(ITIMER_REAL, &args->real);
- if (!ret)
- ret = sys_getitimer(ITIMER_VIRTUAL, &args->virt);
- if (!ret)
- ret = sys_getitimer(ITIMER_PROF, &args->prof);
-
- if (ret)
- pr_err("getitimer failed\n");
-
- return ret;
-}
-
-static k_rtsigset_t old_blocked;
-static int reset_blocked = 0;
-
-static int dump_misc(struct parasite_dump_misc *args)
-{
- args->brk = sys_brk(0);
- args->blocked = old_blocked;
-
- args->pid = sys_getpid();
- args->sid = sys_getsid();
- args->pgid = sys_getpgid();
-
- return 0;
-}
-
-static int dump_creds(struct parasite_dump_creds *args)
-{
- int ret;
-
- args->secbits = sys_prctl(PR_GET_SECUREBITS, 0, 0, 0, 0);
-
- ret = sys_getgroups(0, NULL);
- if (ret < 0)
- goto grps_err;
-
- args->ngroups = ret;
- if (args->ngroups >= PARASITE_MAX_GROUPS) {
- pr_err("Too many groups in task %d\n", (int)args->ngroups);
- return -1;
- }
-
- ret = sys_getgroups(args->ngroups, args->groups);
- if (ret < 0)
- goto grps_err;
-
- if (ret != args->ngroups) {
- pr_err("Groups changed on the fly %d -> %d\n",
- args->ngroups, ret);
- return -1;
- }
-
- return 0;
-
-grps_err:
- pr_err("Error calling getgroups (%d)\n", ret);
- return -1;
-}
-
-static int dump_tid_info(struct parasite_dump_tid_info *args)
-{
- int ret;
-
- ret = sys_prctl(PR_GET_TID_ADDRESS, (unsigned long) &args->tid_addr, 0, 0, 0);
- if (ret)
- return ret;
-
- args->tid = sys_gettid();
-
- return 0;
-}
-
-static int drain_fds(struct parasite_drain_fd *args)
-{
- int ret;
-
- ret = send_fds(tsock, NULL, 0,
- args->fds, args->nr_fds, true);
- if (ret)
- pr_err("send_fds failed\n");
-
- return ret;
-}
-
-static int init(struct parasite_init_args *args)
-{
- k_rtsigset_t to_block;
- int ret;
-
- ret = brk_init();
- if (ret)
- return -ret;
-
- tsock = sys_socket(PF_UNIX, SOCK_DGRAM, 0);
- if (tsock < 0)
- return -tsock;
-
- ret = sys_bind(tsock, (struct sockaddr *) &args->p_addr, args->p_addr_len);
- if (ret < 0)
- return ret;
-
- ret = sys_connect(tsock, (struct sockaddr *)&args->h_addr, args->h_addr_len);
- if (ret < 0)
- return ret;
-
- ksigfillset(&to_block);
- ret = sys_sigprocmask(SIG_SETMASK, &to_block, &old_blocked, sizeof(k_rtsigset_t));
- if (ret < 0)
- reset_blocked = ret;
- else
- reset_blocked = 1;
-
- return ret;
-}
-
-static char proc_mountpoint[] = "proc.crtools";
-static int parasite_get_proc_fd()
-{
- int ret, fd = -1;
- char buf[2];
-
- ret = sys_readlink("/proc/self", buf, sizeof(buf));
- if (ret < 0 && ret != -ENOENT) {
- pr_err("Can't readlink /proc/self\n");
- return ret;
- }
-
- /* Fast path -- if /proc belongs to this pidns */
- if (ret == 1 && buf[0] == '1') {
- fd = sys_open("/proc", O_RDONLY, 0);
- goto out_send_fd;
- }
-
- if (sys_mkdir(proc_mountpoint, 0700)) {
- pr_err("Can't create a directory\n");
- return ret;
- }
-
- if (sys_mount("proc", proc_mountpoint, "proc", MS_MGC_VAL, NULL)) {
- pr_err("mount failed\n");
- ret = -1;
- goto out_rmdir;
- }
-
- fd = sys_open(proc_mountpoint, O_RDONLY, 0);
-
- if (sys_umount2(proc_mountpoint, MNT_DETACH)) {
- pr_err("Can't umount procfs\n");
- return -1;
- }
-
-out_rmdir:
- if (sys_rmdir(proc_mountpoint)) {
- pr_err("Can't remove directory\n");
- return -1;
- }
-
-out_send_fd:
- if (fd < 0)
- return fd;
- ret = send_fd(tsock, NULL, 0, fd);
- sys_close(fd);
- return ret;
-}
-
-static inline int tty_ioctl(int fd, int cmd, int *arg)
-{
- int ret;
-
- ret = sys_ioctl(fd, cmd, (unsigned long)arg);
- if (ret < 0) {
- if (ret != -ENOTTY)
- return -1;
- *arg = 0;
- }
- return 0;
-}
-
-static int parasite_dump_tty(struct parasite_tty_args *args)
-{
- int ret;
-
-#ifndef TIOCGPKT
-# define TIOCGPKT _IOR('T', 0x38, int)
-#endif
-
-#ifndef TIOCGPTLCK
-# define TIOCGPTLCK _IOR('T', 0x39, int)
-#endif
-
-#ifndef TIOCGEXCL
-# define TIOCGEXCL _IOR('T', 0x40, int)
-#endif
-
- ret = tty_ioctl(args->fd, TIOCGSID, &args->sid);
- if (ret < 0)
- goto err;
-
- ret = tty_ioctl(args->fd, TIOCGPGRP, &args->pgrp);
- if (ret < 0)
- goto err;
-
- ret = tty_ioctl(args->fd, TIOCGPKT, &args->st_pckt);
- if (ret < 0)
- goto err;
-
- ret = tty_ioctl(args->fd, TIOCGPTLCK, &args->st_lock);
- if (ret < 0)
- goto err;
-
- ret = tty_ioctl(args->fd, TIOCGEXCL, &args->st_excl);
- if (ret < 0)
- goto err;
-
- args->hangup = false;
- return 0;
-
-err:
- if (ret != -EIO) {
- pr_err("TTY: Can't get sid/pgrp\n");
- return -1;
- }
-
- /* kernel reports EIO for get ioctls on pair-less ptys */
- args->sid = 0;
- args->pgrp = 0;
- args->st_pckt = 0;
- args->st_lock = 0;
- args->st_excl = 0;
- args->hangup = true;
-
- return 0;
-}
-
-static int parasite_cfg_log(struct parasite_log_args *args)
-{
- int ret;
-
- ret = recv_fd(tsock);
- if (ret >= 0) {
- log_set_fd(ret);
- log_set_loglevel(args->log_level);
- ret = 0;
- }
-
- return ret;
-}
-
-static int fini(void)
-{
- if (reset_blocked == 1)
- sys_sigprocmask(SIG_SETMASK, &old_blocked, NULL, sizeof(k_rtsigset_t));
-
- log_set_fd(-1);
- sys_close(tsock);
- brk_fini();
-
- return 0;
-}
-
-int __used parasite_service(unsigned int cmd, void *args)
-{
- pr_info("Parasite cmd %d/%x process\n", cmd, cmd);
-
- switch (cmd) {
- case PARASITE_CMD_INIT:
- return init(args);
- case PARASITE_CMD_FINI:
- return fini();
- case PARASITE_CMD_CFG_LOG:
- return parasite_cfg_log(args);
- case PARASITE_CMD_DUMPPAGES_INIT:
- return dump_pages_init();
- case PARASITE_CMD_DUMPPAGES_FINI:
- return dump_pages_fini();
- case PARASITE_CMD_DUMPPAGES:
- return dump_pages(args);
- case PARASITE_CMD_DUMP_SIGACTS:
- return dump_sigact(args);
- case PARASITE_CMD_DUMP_ITIMERS:
- return dump_itimers(args);
- case PARASITE_CMD_DUMP_MISC:
- return dump_misc(args);
- case PARASITE_CMD_DUMP_CREDS:
- return dump_creds(args);
- case PARASITE_CMD_DUMP_TID_ADDR:
- return dump_tid_info(args);
- case PARASITE_CMD_DRAIN_FDS:
- return drain_fds(args);
- case PARASITE_CMD_GET_PROC_FD:
- return parasite_get_proc_fd();
- case PARASITE_CMD_DUMP_TTY:
- return parasite_dump_tty(args);
- }
-
- pr_err("Unknown command to parasite\n");
- return -EINVAL;
-}
diff --git a/pie.lds.S b/pie.lds.S
deleted file mode 100644
index 4055af7..0000000
--- a/pie.lds.S
+++ /dev/null
@@ -1,25 +0,0 @@
-OUTPUT_ARCH(i386:x86-64)
-
-SECTIONS
-{
- .crblob 0x0 : {
- *(.head.text)
- *(.text)
- . = ALIGN(32);
- *(.data*)
- . = ALIGN(32);
- *(.rodata*)
- . = ALIGN(32);
- *(.bss*)
- . = ALIGN(32);
- } =0x00000000
-
- /DISCARD/ : {
- *(.debug*)
- *(.comment*)
- *(.note*)
- *(.group*)
- *(.eh_frame*)
- *(*)
- }
-}
diff --git a/pie/Makefile b/pie/Makefile
new file mode 100644
index 0000000..119e837
--- /dev/null
+++ b/pie/Makefile
@@ -0,0 +1,92 @@
+SYSCALL-LIB := ../syscall/syscall-x86-64.o
+CFLAGS += -I../include/ -I../protobuf/ -I../syscall/
+
+include ../Makefile.inc
+
+GEN-OFFSETS := gen-offsets.sh
+
+PASM-OBJS += parasite-head-x86-64.o
+PASM-SRC += $(patsubst %.o,%.S,$(PASM-OBJS))
+
+POBJS += parasite.o
+PSRCS += $(patsubst %.o,%.c,$(POBJS))
+
+PBLOB-NAME := parasite
+PBLOB-HDR := parasite-blob.h
+PBLOB-BIN := parasite.bin
+PBLOB-BIN-O := $(PBLOB-BIN).o
+
+ROBJS += restorer.o
+ROBJS += log-simple.o
+RSRCS += $(patsubst %.o,%.c,$(ROBJS))
+
+RBLOB-NAME := restorer
+RBLOB-HDR := restorer-blob.h
+RBLOB-BIN := restorer.bin
+RBLOB-BIN-O := $(RBLOB-BIN).o
+
+DEPS += $(patsubst %.o,%.d,$(POBJS))
+DEPS += $(patsubst %.o,%.d,$(ROBJS))
+
+PIELDS := pie.lds.S
+PIEFLAGS := -fpie -Wa,--noexecstack -fno-strict-aliasing
+ASMFLAGS := -D__ASSEMBLY__
+
+.DEFAULT_GOAL := pie
+
+$(PASM-OBJS): $(PASM-SRC) $(SYSCALL-LIB)
+ $(E) " CC " $@
+ $(Q) $(CC) -c $(ASMFLAGS) $(CFLAGS) $(PIEFLAGS) $(patsubst %.o,%.S,$@) -o $@
+
+$(POBJS): $(PSRCS) $(PASM-OBJS) $(SYSCALL-LIB)
+ $(E) " CC " $@
+ $(Q) $(CC) -c $(CFLAGS) $(PIEFLAGS) $(patsubst %.o,%.c,$@) -o $@
+
+parasite-util-net.o: util-net.c $(SYSCALL-LIB)
+ $(E) " CC " $@
+ $(Q) $(CC) -c $(CFLAGS) $(PIEFLAGS) $< -o $@
+
+parasite-log.o: log-simple.c $(SYSCALL-LIB)
+ $(E) " CC " $@
+ $(Q) $(CC) -c $(CFLAGS) $(PIEFLAGS) $< -o $@
+
+POBJS += parasite-util-net.o parasite-log.o
+
+$(PBLOB-BIN-O): $(PIELDS) $(POBJS) $(PASM-OBJS)
+ $(E) " GEN " $@
+ $(Q) $(LD) --oformat=elf64-x86-64 -T $(PIELDS) -o $(PBLOB-BIN).o $(POBJS) $(PASM-OBJS) $(SYSCALL-LIB)
+
+$(PBLOB-BIN): $(PBLOB-BIN-O) $(PIELDS) $(POBJS) $(PASM-OBJS)
+ $(E) " GEN " $@
+ $(Q) $(LD) --oformat=binary -T $(PIELDS) -o $(PBLOB-BIN) $(POBJS) $(PASM-OBJS) $(SYSCALL-LIB)
+
+$(PBLOB-HDR): $(PBLOB-BIN) $(GEN-OFFSETS) # C header: __export_ offsets + blob bytes
+	$(E) " GEN " $@
+	$(Q) $(SH) $(GEN-OFFSETS) $(PBLOB-NAME) > $@ || { rm -f $@; exit 1; }
+
+$(ROBJS): $(RSRCS) $(SYSCALL-LIB)
+ $(E) " CC " $@
+ $(Q) $(CC) -c $(CFLAGS) $(PIEFLAGS) $(patsubst %.o,%.c,$@) -o $@
+
+$(RBLOB-BIN-O): $(PIELDS) $(ROBJS)
+ $(E) " GEN " $@
+ $(Q) $(LD) --oformat=elf64-x86-64 -T $(PIELDS) -o $(RBLOB-BIN).o $(ROBJS) $(SYSCALL-LIB)
+
+$(RBLOB-BIN): $(RBLOB-BIN-O) $(PIELDS) $(ROBJS)
+ $(E) " GEN " $@
+ $(Q) $(LD) --oformat=binary -T $(PIELDS) -o $(RBLOB-BIN) $(ROBJS) $(SYSCALL-LIB)
+
+$(RBLOB-HDR): $(RBLOB-BIN) $(GEN-OFFSETS) # C header: __export_ offsets + blob bytes
+	$(E) " GEN " $@
+	$(Q) $(SH) $(GEN-OFFSETS) $(RBLOB-NAME) > $@ || { rm -f $@; exit 1; }
+
+pie: $(PBLOB-HDR) $(RBLOB-HDR)
+
+clean:
+ $(E) " CLEAN PIE"
+ $(Q) $(RM) -f ./$(PBLOB-HDR)
+ $(Q) $(RM) -f ./$(RBLOB-HDR)
+ $(Q) $(RM) -f ./*.o
+ $(Q) $(RM) -f ./*.bin
+ $(Q) $(RM) -f ./*.bin.o
+.PHONY: clean pie
diff --git a/pie/gen-offsets.sh b/pie/gen-offsets.sh
new file mode 100644
index 0000000..cc5c6fd
--- /dev/null
+++ b/pie/gen-offsets.sh
@@ -0,0 +1,35 @@
+#!/bin/sh
+
+set -e
+set -u
+
+NAME=$1
+INC_GUARD=__${NAME}_h__
+PREFIX=${NAME}_blob_offset__
+BLOB=${NAME}_blob
+OBJNAME=${NAME}.bin.o
+BINARY=${NAME}.bin
+
+AWK_CMD='$2 ~ /^[tT]$/ { print "#define '$PREFIX'" $3 " 0x" $1; }'
+
+cat << EOF
+/* Autogenerated by $0, do not edit */
+#ifndef $INC_GUARD
+#define $INC_GUARD
+
+EOF
+
+nm $OBJNAME | grep "__export_" | tr . _ | awk "$AWK_CMD"
+
+cat << EOF
+
+static char $BLOB[] = {
+EOF
+
+hexdump -v -e '"\t" 8/1 "0x%02x, " "\n"' $BINARY
+
+cat << EOF
+};
+
+#endif /* $INC_GUARD */
+EOF
diff --git a/pie/log-simple.c b/pie/log-simple.c
new file mode 100644
index 0000000..eb940c1
--- /dev/null
+++ b/pie/log-simple.c
@@ -0,0 +1,204 @@
+#include <stdarg.h>
+#include "syscall.h"
+#include "log.h"
+#include "log-levels.h"
+
+static int logfd = -1;
+static int cur_loglevel = DEFAULT_LOGLEVEL;
+
+void log_set_fd(int fd)
+{
+ sys_close(logfd);
+ logfd = fd;
+}
+
+void log_set_loglevel(unsigned int level)
+{
+ cur_loglevel = level;
+}
+
+static void print_string(const char *msg)
+{
+ int size = 0;
+ while (msg[size])
+ size++;
+ sys_write(logfd, msg, size);
+}
+
+int vprint_num(char *buf, int blen, int num, char **ps) /* format num in decimal at the tail of buf */
+{
+ int neg = 0;
+ char *s;
+
+ s = &buf[blen - 1]; /* digits are written right-to-left, starting at the last byte */
+
+ if (num < 0) {
+ neg = 1;
+ num = -num; /* NOTE(review): overflows for INT_MIN, the loop below then emits no digits */
+ } else if (num == 0) {
+ *s = '0';
+ s--;
+ goto done;
+ }
+
+ while (num > 0) { /* NOTE(review): no check that s stays inside buf */
+ *s = (num % 10) + '0';
+ s--;
+ num /= 10;
+ }
+
+ if (neg) {
+ *s = '-';
+ s--;
+ }
+done:
+ s++; /* step back onto the first character written */
+ *ps = s; /* caller gets the start of the text; no NUL terminator is stored */
+ return blen - (s - buf); /* number of characters written */
+}
+
+static void print_num(int num)
+{
+ char buf[11], *s;
+ int len;
+
+ len = vprint_num(buf, sizeof(buf), num, &s);
+ sys_write(logfd, s, len);
+}
+
+static void print_num_l(long num)
+{
+ int neg = 0;
+ char buf[21], *s;
+
+ s = &buf[20];
+
+ if (num < 0) {
+ neg = 1;
+ num = -num;
+ } else if (num == 0) {
+ *s = '0';
+ s--;
+ goto done;
+ }
+
+ while (num > 0) {
+ *s = (num % 10) + '0';
+ s--;
+ num /= 10;
+ }
+
+ if (neg) {
+ *s = '-';
+ s--;
+ }
+done:
+ s++;
+ sys_write(logfd, s, sizeof(buf) - (s - buf));
+}
+
+static void hexdigit(unsigned int v, char *to, char **z) /* store the hex digit for the low 4 bits of v at *to */
+{
+ *to = "0123456789abcdef"[v & 0xf];
+ if (*to != '0')
+ *z = to; /* remember the position of the latest non-zero digit stored */
+}
+
+static void print_hex(unsigned int num)
+{
+ char buf[10], *z = &buf[9];
+
+ hexdigit(num >> 0, &buf[9], &z);
+ hexdigit(num >> 4, &buf[8], &z);
+ hexdigit(num >> 8, &buf[7], &z);
+ hexdigit(num >> 12, &buf[6], &z);
+ hexdigit(num >> 16, &buf[5], &z);
+ hexdigit(num >> 20, &buf[4], &z);
+ hexdigit(num >> 24, &buf[3], &z);
+ hexdigit(num >> 28, &buf[2], &z);
+ z -= 2;
+ z[0] = '0';
+ z[1] = 'x';
+
+ sys_write(logfd, z, sizeof(buf) - (z - buf));
+}
+
+static void print_hex_l(unsigned long num)
+{
+ char buf[18], *z = &buf[17];
+
+ hexdigit(num >> 0, &buf[17], &z);
+ hexdigit(num >> 4, &buf[16], &z);
+ hexdigit(num >> 8, &buf[15], &z);
+ hexdigit(num >> 12, &buf[14], &z);
+ hexdigit(num >> 16, &buf[13], &z);
+ hexdigit(num >> 20, &buf[12], &z);
+ hexdigit(num >> 24, &buf[11], &z);
+ hexdigit(num >> 28, &buf[10], &z);
+
+ hexdigit(num >> 32, &buf[9], &z);
+ hexdigit(num >> 36, &buf[8], &z);
+ hexdigit(num >> 40, &buf[7], &z);
+ hexdigit(num >> 44, &buf[6], &z);
+ hexdigit(num >> 48, &buf[5], &z);
+ hexdigit(num >> 52, &buf[4], &z);
+ hexdigit(num >> 56, &buf[3], &z);
+ hexdigit(num >> 60, &buf[2], &z);
+
+ z -= 2;
+ z[0] = '0';
+ z[1] = 'x';
+
+ sys_write(logfd, z, sizeof(buf) - (z - buf));
+}
+
+void print_on_level(unsigned int loglevel, const char *format, ...) /* minimal printf to logfd: %s %d %ld %x %lx */
+{
+ va_list args;
+ const char *s = format, *p;
+
+ if (loglevel > cur_loglevel) /* message is more verbose than the configured level */
+ return;
+
+ va_start(args, format);
+ p = s; /* p marks the start of the literal text not yet written */
+ while (1) {
+ int along = 0;
+
+ if (*s != '\0' && *s != '%') { /* scan forward to the next conversion or the end */
+ s++;
+ continue;
+ }
+
+ sys_write(logfd, p, s - p); /* flush the literal run [p, s) */
+ if (*s == '\0')
+ break;
+
+ s++;
+ if (*s == 'l') { /* 'l' modifier selects the long variants below */
+ along = 1;
+ s++;
+ }
+
+ switch (*s) { /* any other conversion character is skipped without consuming an argument */
+ case 's':
+ print_string(va_arg(args, char *));
+ break;
+ case 'd':
+ if (along)
+ print_num_l(va_arg(args, long));
+ else
+ print_num(va_arg(args, int));
+ break;
+ case 'x':
+ if (along)
+ print_hex_l(va_arg(args, long));
+ else
+ print_hex(va_arg(args, unsigned int));
+ break;
+ }
+ s++; /* NOTE(review): if format ends in '%' or "%l", this steps past the terminating NUL */
+ p = s;
+ }
+ va_end(args);
+}
diff --git a/pie/parasite-head-x86-64.S b/pie/parasite-head-x86-64.S
new file mode 100644
index 0000000..a2c12dc
--- /dev/null
+++ b/pie/parasite-head-x86-64.S
@@ -0,0 +1,24 @@
+#include "linkage.h"
+#include "parasite.h"
+
+ .section .head.text, "ax"
+ENTRY(__export_parasite_head_start)
+ leaq __export_parasite_stack(%rip), %rsp /* switch to the stack reserved inside the blob */
+ subq $16, %rsp
+ andq $~15, %rsp /* round %rsp down to a 16-byte boundary */
+ pushq $0
+ movq %rsp, %rbp
+ movl __export_parasite_cmd(%rip), %edi /* 1st argument: the command word */
+ leaq __export_parasite_args(%rip), %rsi /* 2nd argument: address of the argument area */
+ call parasite_service
+ int $0x03 /* breakpoint trap once parasite_service returns */
+ .align 8
+__export_parasite_cmd:
+ .long 0
+__export_parasite_args:
+ .long 0
+ .space PARASITE_ARG_SIZE,0
+ .space PARASITE_STACK_SIZE,0 /* stack area; the label that follows is the initial %rsp */
+__export_parasite_stack:
+ .long 0
+END(__export_parasite_head_start)
diff --git a/pie/parasite.c b/pie/parasite.c
new file mode 100644
index 0000000..9e916dd
--- /dev/null
+++ b/pie/parasite.c
@@ -0,0 +1,552 @@
+#include <sys/mman.h>
+#include <errno.h>
+#include <signal.h>
+#include <linux/limits.h>
+#include <sys/mount.h>
+#include <stdarg.h>
+#include <sys/ioctl.h>
+
+#include "syscall.h"
+#include "parasite.h"
+#include "log.h"
+
+#include <string.h>
+
+#ifndef CONFIG_X86_64
+#error non-x86-64 mode not yet implemented
+#endif
+
+/* Bounds of the private heap and the current allocation cursor */
+static void *brk_start, *brk_end, *brk_tail;
+
+static int tsock = -1;
+
+#define MAX_HEAP_SIZE (10 << 20) /* Hope 10MB will be enough... */
+
+/*
+ * Map the private MAX_HEAP_SIZE heap used by brk_alloc()/brk_free().
+ *
+ * Returns 0 on success or -ENOMEM if the mapping could not be created.
+ */
+static int brk_init(void)
+{
+	unsigned long ret;
+	/*
+	 *  Map 10 MB. Hope this will be enough for unix skb's...
+	 */
+	ret = sys_mmap(NULL, MAX_HEAP_SIZE,
+		       PROT_READ | PROT_WRITE,
+		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	/*
+	 * ret is unsigned, so a plain "ret < 0" can never be true; look
+	 * at it as a signed value to catch the error return.
+	 */
+	if ((long)ret < 0)
+		return -ENOMEM;
+
+	brk_start = brk_tail = (void *)ret;
+	brk_end = brk_start + MAX_HEAP_SIZE;
+	return 0;
+}
+
+/* Release the private heap: unmaps MAX_HEAP_SIZE bytes at brk_start. */
+static void brk_fini(void)
+{
+ sys_munmap(brk_start, MAX_HEAP_SIZE);
+}
+
+/*
+ * Carve @bytes off the private heap by bumping brk_tail.
+ *
+ * Returns the start of the new chunk, or NULL when the request
+ * would run past brk_end.
+ */
+static void *brk_alloc(unsigned long bytes)
+{
+	void *chunk = brk_tail;
+
+	if (brk_end < (brk_tail + bytes))
+		return NULL;
+
+	brk_tail += bytes;
+	return chunk;
+}
+
+/*
+ * Give the last @bytes of the private heap back by moving brk_tail
+ * down. A request larger than what is currently allocated is ignored
+ * so the cursor never drops below brk_start.
+ */
+static void brk_free(unsigned long bytes)
+{
+	/*
+	 * The previous test (brk_start >= brk_tail - bytes) was inverted:
+	 * it only released memory when that emptied (or underflowed)
+	 * the heap.
+	 */
+	if (bytes <= (unsigned long)(brk_tail - brk_start))
+		brk_tail -= bytes;
+}
+
+/* Flag bits tested in a /proc/self/pagemap entry */
+#define PME_PRESENT	(1ULL << 63)
+#define PME_SWAP	(1ULL << 62)
+#define PME_FILE	(1ULL << 61)
+
+/*
+ * Decide from the VMA type and the pagemap entry @pme whether the
+ * page has to be written to the image. Returns 1 to dump, 0 to skip.
+ */
+static inline int should_dump_page(VmaEntry *vmae, u64 pme)
+{
+	/* VDSO pages are always dumped */
+	if (vma_entry_is(vmae, VMA_AREA_VDSO))
+		return 1;
+
+	/*
+	 * Private file mapping whose page is still file-backed, i.e.
+	 * has not been COW-ed yet: nothing to save.
+	 */
+	if (vma_entry_is(vmae, VMA_FILE_PRIVATE) && (pme & PME_FILE))
+		return 0;
+
+	return (pme & (PME_PRESENT | PME_SWAP)) ? 1 : 0;
+}
+
+/* Image file received from the dumper and our own pagemap */
+static int fd_pages = -1;
+static int fd_pagemap = -1;
+
+/*
+ * Prepare for page dumping: receive the pages image descriptor over
+ * tsock and open /proc/self/pagemap.
+ *
+ * Returns 0 on success or the negative value from the failing step.
+ */
+static int dump_pages_init()
+{
+	fd_pages = recv_fd(tsock);
+	if (fd_pages < 0)
+		return fd_pages;
+
+	fd_pagemap = sys_open("/proc/self/pagemap", O_RDONLY, 0);
+	if (fd_pagemap < 0) {
+		pr_err("Can't open self pagemap\n");
+		sys_close(fd_pages);
+		/* Don't keep a stale descriptor number around */
+		fd_pages = -1;
+		return fd_pagemap;
+	}
+
+	return 0;
+}
+
+/*
+ * Write exactly @size bytes from @buf to @fd.
+ *
+ * Returns 0 when everything was written, the negative value from
+ * sys_write() on failure, or -EIO on a short write.
+ */
+static int sys_write_safe(int fd, void *buf, int size)
+{
+	int ret;
+
+	ret = sys_write(fd, buf, size);
+	if (ret < 0) {
+		pr_err("sys_write failed\n");
+		return ret;
+	}
+
+	if (ret != size) {
+		pr_err("not all data was written\n");
+		/* Previously computed but then dropped by "return 0" */
+		return -EIO;
+	}
+
+	return 0;
+}
+
+/*
+ * This is the main page dumping routine, it's executed
+ * inside a victim process space.
+ *
+ * Reads the pagemap entries covering args->vma_entry and, for every
+ * page should_dump_page() selects, writes the page address followed
+ * by the page contents to fd_pages. The nrpages_* counters in @args
+ * are filled in. Returns 0 on success, non-zero on failure.
+ */
+static int dump_pages(struct parasite_dump_pages_args *args)
+{
+	unsigned long nrpages, pfn, length;
+	unsigned long prot_old, prot_new;
+	u64 *map, off;
+	int ret = -1, err;
+
+	args->nrpages_dumped = 0;
+	args->nrpages_skipped = 0;
+	prot_old = prot_new = 0;
+
+	pfn = args->vma_entry.start / PAGE_SIZE;
+	nrpages	= (args->vma_entry.end - args->vma_entry.start) / PAGE_SIZE;
+	args->nrpages_total = nrpages;
+	length = nrpages * sizeof(*map);
+
+	/*
+	 * Up to 10M of pagemap will handle 5G mapping.
+	 */
+	map = brk_alloc(length);
+	if (!map) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	off = pfn * sizeof(*map);
+	off = sys_lseek(fd_pagemap, off, SEEK_SET);
+	if (off != pfn * sizeof(*map)) {
+		pr_err("Can't seek pagemap\n");
+		ret = ((long)off < 0) ? (int)off : -EIO;
+		goto err_free;
+	}
+
+	ret = sys_read(fd_pagemap, map, length);
+	if (ret != length) {
+		pr_err("Can't read self pagemap\n");
+		/* A short (or empty) read must not be reported as success */
+		if (ret >= 0)
+			ret = -EIO;
+		goto err_free;
+	}
+
+	/*
+	 * Try to change page protection if needed so we would
+	 * be able to dump contents.
+	 */
+	if (!(args->vma_entry.prot & PROT_READ)) {
+		prot_old = (unsigned long)args->vma_entry.prot;
+		prot_new = prot_old | PROT_READ;
+		ret = sys_mprotect((void *)args->vma_entry.start,
+				   (unsigned long)vma_entry_len(&args->vma_entry),
+				   prot_new);
+		if (ret) {
+			pr_err("sys_mprotect failed\n");
+			goto err_free;
+		}
+	}
+
+	ret = 0;
+	for (pfn = 0; pfn < nrpages; pfn++) {
+		unsigned long vaddr;
+
+		if (should_dump_page(&args->vma_entry, map[pfn])) {
+			/*
+			 * That's the optimized write of
+			 * page_entry structure, see image.h
+			 */
+			vaddr = (unsigned long)args->vma_entry.start + pfn * PAGE_SIZE;
+
+			/*
+			 * On failure leave the loop instead of returning,
+			 * so that protection is restored and the pagemap
+			 * buffer is released below.
+			 */
+			ret = sys_write_safe(fd_pages, &vaddr, sizeof(vaddr));
+			if (ret)
+				break;
+			ret = sys_write_safe(fd_pages, (void *)vaddr, PAGE_SIZE);
+			if (ret)
+				break;
+
+			args->nrpages_dumped++;
+		} else if (map[pfn] & PME_PRESENT)
+			args->nrpages_skipped++;
+	}
+
+	/*
+	 * Don't leave pages readable if they were not.
+	 */
+	if (prot_old != prot_new) {
+		err = sys_mprotect((void *)args->vma_entry.start,
+				   (unsigned long)vma_entry_len(&args->vma_entry),
+				   prot_old);
+		if (err) {
+			pr_err("PANIC: Ouch! sys_mprotect failed on restore\n");
+			/* Keep the first error if the dump already failed */
+			if (!ret)
+				ret = err;
+		}
+	}
+
+err_free:
+	brk_free(length);
+err:
+	return ret;
+}
+
+/*
+ * Close both descriptors opened by dump_pages_init().
+ * Returns the bitwise OR of the two close results (0 if both succeed).
+ */
+static int dump_pages_fini(void)
+{
+	int err_pagemap, err_pages;
+
+	err_pagemap = sys_close(fd_pagemap);
+	err_pages = sys_close(fd_pages);
+
+	return err_pagemap | err_pages;
+}
+
+/*
+ * Read the current action of every signal in [1, SIGMAX) except
+ * SIGKILL and SIGSTOP into da->sas[], indexed by signal number.
+ * Stops at the first failure and returns its (negative) result.
+ */
+static int dump_sigact(struct parasite_dump_sa_args *da)
+{
+	int signo, err = 0;
+
+	for (signo = 1; signo < SIGMAX; signo++) {
+		if (signo != SIGKILL && signo != SIGSTOP) {
+			err = sys_sigaction(signo, NULL, &da->sas[signo], sizeof(rt_sigset_t));
+			if (err < 0) {
+				pr_err("sys_sigaction failed\n");
+				break;
+			}
+		}
+	}
+
+	return err;
+}
+
+/*
+ * Fetch the REAL, VIRTUAL and PROF interval timers into @args, in
+ * that order, stopping at the first failure. Returns 0 or the result
+ * of the failing sys_getitimer() call.
+ */
+static int dump_itimers(struct parasite_dump_itimers_args *args)
+{
+	int err;
+
+	err = sys_getitimer(ITIMER_REAL, &args->real);
+	if (err)
+		goto fail;
+
+	err = sys_getitimer(ITIMER_VIRTUAL, &args->virt);
+	if (err)
+		goto fail;
+
+	err = sys_getitimer(ITIMER_PROF, &args->prof);
+	if (err)
+		goto fail;
+
+	return 0;
+
+fail:
+	pr_err("getitimer failed\n");
+	return err;
+}
+
+/* Signal mask saved by init(); reset_blocked == 1 means it is valid */
+static k_rtsigset_t old_blocked;
+static int reset_blocked = 0;
+
+/*
+ * Fill @args with the current program break, the signal mask saved
+ * in old_blocked and the pid/sid/pgid of this task. Always returns 0.
+ */
+static int dump_misc(struct parasite_dump_misc *args)
+{
+ args->brk = sys_brk(0);
+ args->blocked = old_blocked;
+
+ args->pid = sys_getpid();
+ args->sid = sys_getsid();
+ args->pgid = sys_getpgid();
+
+ return 0;
+}
+
+/*
+ * Collect securebits and the supplementary group list into @args.
+ *
+ * The group count is queried first, checked against
+ * PARASITE_MAX_GROUPS, and then the list itself is fetched; a count
+ * that differs between the two calls is treated as an error.
+ * Returns 0 on success, -1 otherwise.
+ */
+static int dump_creds(struct parasite_dump_creds *args)
+{
+	int nr;
+
+	args->secbits = sys_prctl(PR_GET_SECUREBITS, 0, 0, 0, 0);
+
+	nr = sys_getgroups(0, NULL);
+	if (nr >= 0) {
+		args->ngroups = nr;
+		if (args->ngroups >= PARASITE_MAX_GROUPS) {
+			pr_err("Too many groups in task %d\n", (int)args->ngroups);
+			return -1;
+		}
+
+		nr = sys_getgroups(args->ngroups, args->groups);
+		if (nr >= 0) {
+			if (nr != args->ngroups) {
+				pr_err("Groups changed on the fly %d -> %d\n",
+						args->ngroups, nr);
+				return -1;
+			}
+
+			return 0;
+		}
+	}
+
+	pr_err("Error calling getgroups (%d)\n", nr);
+	return -1;
+}
+
+/*
+ * Store the clear-tid address (PR_GET_TID_ADDRESS) and the thread id
+ * in @args. Returns 0, or the prctl result if that call fails.
+ */
+static int dump_tid_info(struct parasite_dump_tid_info *args)
+{
+	int err;
+
+	err = sys_prctl(PR_GET_TID_ADDRESS, (unsigned long) &args->tid_addr, 0, 0, 0);
+	if (!err)
+		args->tid = sys_gettid();
+
+	return err;
+}
+
+/*
+ * Send the descriptors listed in @args, together with their per-fd
+ * options, over tsock. Returns the send_fds() result.
+ */
+static int drain_fds(struct parasite_drain_fd *args)
+{
+	int err = send_fds(tsock, NULL, 0, args->fds, args->nr_fds, true);
+
+	if (err != 0)
+		pr_err("send_fds failed\n");
+
+	return err;
+}
+
+/*
+ * First command served: set up the heap, create the datagram socket
+ * bound to args->p_addr and connected to args->h_addr, then block
+ * all signals while remembering the previous mask in old_blocked.
+ *
+ * NOTE(review): heap and socket failures are returned negated (i.e.
+ * as positive values) while bind/connect/sigprocmask failures are
+ * returned as is -- confirm which sign the caller expects.
+ */
+static int init(struct parasite_init_args *args)
+{
+	k_rtsigset_t to_block;
+	int ret;
+
+	ret = brk_init();
+	if (ret)
+		return -ret;
+
+	tsock = sys_socket(PF_UNIX, SOCK_DGRAM, 0);
+	if (tsock < 0)
+		return -tsock;
+
+	ret = sys_bind(tsock, (struct sockaddr *) &args->p_addr, args->p_addr_len);
+	if (ret >= 0)
+		ret = sys_connect(tsock, (struct sockaddr *)&args->h_addr, args->h_addr_len);
+	if (ret < 0)
+		return ret;
+
+	ksigfillset(&to_block);
+	ret = sys_sigprocmask(SIG_SETMASK, &to_block, &old_blocked, sizeof(k_rtsigset_t));
+	/* fini() restores the mask only when this ends up being 1 */
+	reset_blocked = (ret < 0) ? ret : 1;
+
+	return ret;
+}
+
+static char proc_mountpoint[] = "proc.crtools";
+
+/*
+ * Open a directory descriptor on procfs and send it over tsock.
+ *
+ * If /proc/self reads back as "1", /proc is opened directly. Otherwise
+ * procfs is mounted on a temporary directory (proc_mountpoint), opened,
+ * lazily unmounted and the directory removed again.
+ *
+ * Returns the send_fd() result on success and a negative value on
+ * failure. The opened descriptor is closed on every path.
+ */
+static int parasite_get_proc_fd()
+{
+	int ret, fd = -1;
+	char buf[2];
+
+	ret = sys_readlink("/proc/self", buf, sizeof(buf));
+	if (ret < 0 && ret != -ENOENT) {
+		pr_err("Can't readlink /proc/self\n");
+		return ret;
+	}
+
+	/* Fast path -- if /proc belongs to this pidns */
+	if (ret == 1 && buf[0] == '1') {
+		fd = sys_open("/proc", O_RDONLY, 0);
+		goto out_send_fd;
+	}
+
+	if (sys_mkdir(proc_mountpoint, 0700)) {
+		pr_err("Can't create a directory\n");
+		/* Used to return the unrelated readlink result here */
+		return -1;
+	}
+
+	if (sys_mount("proc", proc_mountpoint, "proc", MS_MGC_VAL, NULL)) {
+		pr_err("mount failed\n");
+		goto out_rmdir;
+	}
+
+	fd = sys_open(proc_mountpoint, O_RDONLY, 0);
+
+	if (sys_umount2(proc_mountpoint, MNT_DETACH)) {
+		pr_err("Can't umount procfs\n");
+		goto out_close;
+	}
+
+out_rmdir:
+	if (sys_rmdir(proc_mountpoint)) {
+		pr_err("Can't remove directory\n");
+		goto out_close;
+	}
+
+out_send_fd:
+	if (fd < 0)
+		return fd;
+	ret = send_fd(tsock, NULL, 0, fd);
+	sys_close(fd);
+	return ret;
+
+out_close:
+	/* Don't leak the procfs descriptor into the victim on failure */
+	if (fd >= 0)
+		sys_close(fd);
+	return -1;
+}
+
+/*
+ * Issue a "get" ioctl on a tty descriptor, storing the value in @arg.
+ *
+ * -ENOTTY is not treated as an error: *arg is zeroed and 0 returned.
+ * Any other failure is returned as the negative value from
+ * sys_ioctl(), so callers can tell -EIO apart from the rest.
+ */
+static inline int tty_ioctl(int fd, int cmd, int *arg)
+{
+	int ret;
+
+	ret = sys_ioctl(fd, cmd, (unsigned long)arg);
+	if (ret < 0) {
+		/*
+		 * Propagate the real error code: returning a bare -1
+		 * made the caller's -EIO (hangup) check unreachable.
+		 */
+		if (ret != -ENOTTY)
+			return ret;
+		*arg = 0;
+	}
+	return 0;
+}
+
+/*
+ * Query session id, foreground group, packet mode, pty lock and
+ * exclusive mode of the tty at args->fd, in that order, stopping at
+ * the first failing query.
+ *
+ * A failure equal to -EIO is reported as a hung-up terminal: all
+ * fields are zeroed, args->hangup is set and 0 is returned. Any other
+ * failure yields -1.
+ */
+static int parasite_dump_tty(struct parasite_tty_args *args)
+{
+	int ret;
+
+#ifndef TIOCGPKT
+# define TIOCGPKT	_IOR('T', 0x38, int)
+#endif
+
+#ifndef TIOCGPTLCK
+# define TIOCGPTLCK	_IOR('T', 0x39, int)
+#endif
+
+#ifndef TIOCGEXCL
+# define TIOCGEXCL	_IOR('T', 0x40, int)
+#endif
+
+	ret = tty_ioctl(args->fd, TIOCGSID, &args->sid);
+	if (ret >= 0)
+		ret = tty_ioctl(args->fd, TIOCGPGRP, &args->pgrp);
+	if (ret >= 0)
+		ret = tty_ioctl(args->fd, TIOCGPKT, &args->st_pckt);
+	if (ret >= 0)
+		ret = tty_ioctl(args->fd, TIOCGPTLCK, &args->st_lock);
+	if (ret >= 0)
+		ret = tty_ioctl(args->fd, TIOCGEXCL, &args->st_excl);
+
+	if (ret >= 0) {
+		args->hangup = false;
+		return 0;
+	}
+
+	if (ret != -EIO) {
+		pr_err("TTY: Can't get sid/pgrp\n");
+		return -1;
+	}
+
+	/* kernel reports EIO for get ioctls on pair-less ptys */
+	args->sid = 0;
+	args->pgrp = 0;
+	args->st_pckt = 0;
+	args->st_lock = 0;
+	args->st_excl = 0;
+	args->hangup = true;
+
+	return 0;
+}
+
+/*
+ * Receive the log descriptor over tsock and switch logging to it
+ * using the level from @args. Returns 0 on success or the negative
+ * recv_fd() result.
+ */
+static int parasite_cfg_log(struct parasite_log_args *args)
+{
+	int fd = recv_fd(tsock);
+
+	if (fd < 0)
+		return fd;
+
+	log_set_fd(fd);
+	log_set_loglevel(args->log_level);
+	return 0;
+}
+
+/*
+ * Undo what init() set up: restore the saved signal mask (only if
+ * init() managed to save it), detach the log descriptor, close the
+ * socket and release the heap. Always returns 0.
+ */
+static int fini(void)
+{
+ if (reset_blocked == 1)
+ sys_sigprocmask(SIG_SETMASK, &old_blocked, NULL, sizeof(k_rtsigset_t));
+
+ log_set_fd(-1);
+ sys_close(tsock);
+ brk_fini();
+
+ return 0;
+}
+
+/*
+ * Command dispatcher called from the parasite head: runs the handler
+ * matching @cmd, handing it @args where the handler takes arguments,
+ * and returns the handler's result. An unrecognized command is logged
+ * and answered with -EINVAL.
+ */
+int __used parasite_service(unsigned int cmd, void *args)
+{
+	pr_info("Parasite cmd %d/%x process\n", cmd, cmd);
+
+	switch (cmd) {
+	/* Session setup and teardown */
+	case PARASITE_CMD_INIT:
+		return init(args);
+	case PARASITE_CMD_FINI:
+		return fini();
+	case PARASITE_CMD_CFG_LOG:
+		return parasite_cfg_log(args);
+
+	/* Memory contents */
+	case PARASITE_CMD_DUMPPAGES_INIT:
+		return dump_pages_init();
+	case PARASITE_CMD_DUMPPAGES_FINI:
+		return dump_pages_fini();
+	case PARASITE_CMD_DUMPPAGES:
+		return dump_pages(args);
+
+	/* Task state */
+	case PARASITE_CMD_DUMP_SIGACTS:
+		return dump_sigact(args);
+	case PARASITE_CMD_DUMP_ITIMERS:
+		return dump_itimers(args);
+	case PARASITE_CMD_DUMP_MISC:
+		return dump_misc(args);
+	case PARASITE_CMD_DUMP_CREDS:
+		return dump_creds(args);
+	case PARASITE_CMD_DUMP_TID_ADDR:
+		return dump_tid_info(args);
+
+	/* Descriptors */
+	case PARASITE_CMD_DRAIN_FDS:
+		return drain_fds(args);
+	case PARASITE_CMD_GET_PROC_FD:
+		return parasite_get_proc_fd();
+	case PARASITE_CMD_DUMP_TTY:
+		return parasite_dump_tty(args);
+
+	default:
+		pr_err("Unknown command to parasite\n");
+		return -EINVAL;
+	}
+}
diff --git a/pie/pie.lds.S b/pie/pie.lds.S
new file mode 100644
index 0000000..4055af7
--- /dev/null
+++ b/pie/pie.lds.S
@@ -0,0 +1,25 @@
+/*
+ * Linker script for the position-independent blobs: everything that
+ * is kept goes into a single .crblob output section starting at
+ * address 0; all remaining input sections are discarded.
+ */
+OUTPUT_ARCH(i386:x86-64)
+
+SECTIONS
+{
+ .crblob 0x0 : {
+ /* .head.text goes first, ahead of the regular code */
+ *(.head.text)
+ *(.text)
+ /* Each following group starts on a 32-byte boundary */
+ . = ALIGN(32);
+ *(.data*)
+ . = ALIGN(32);
+ *(.rodata*)
+ . = ALIGN(32);
+ *(.bss*)
+ . = ALIGN(32);
+ } =0x00000000
+
+ /* Gaps above are zero-filled; nothing below reaches the output */
+ /DISCARD/ : {
+ *(.debug*)
+ *(.comment*)
+ *(.note*)
+ *(.group*)
+ *(.eh_frame*)
+ *(*)
+ }
+}
diff --git a/pie/restorer.c b/pie/restorer.c
new file mode 100644
index 0000000..2d9a82c
--- /dev/null
+++ b/pie/restorer.c
@@ -0,0 +1,690 @@
+#define CR_NOGLIBC
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+#include <sys/time.h>
+#include <sys/shm.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sched.h>
+#include <sys/resource.h>
+
+#include "compiler.h"
+#include "types.h"
+#include "syscall.h"
+#include "log.h"
+#include "util.h"
+#include "image.h"
+#include "sk-inet.h"
+
+#include "crtools.h"
+#include "lock.h"
+#include "restorer.h"
+
+#include "creds.pb-c.h"
+
+/*
+ * sys_prctl() wrapper that logs a non-zero result together with the
+ * line of the call site. Evaluates to the sys_prctl() return value;
+ * the fifth prctl argument is always 0.
+ */
+#define sys_prctl_safe(opcode, val1, val2, val3) \
+ ({ \
+ long __ret = sys_prctl(opcode, val1, val2, val3, 0); \
+ if (__ret) \
+ pr_err("prctl failed @%d with %ld\n", __LINE__, __ret);\
+ __ret; \
+ })
+
+static struct task_entries *task_entries;
+
+/*
+ * SIGCHLD handler installed while restoring: logs which child went
+ * away and how, aborts the nr_in_progress futex so waiters wake up,
+ * then stops and terminates this task. It never returns.
+ */
+static void sigchld_handler(int signal, siginfo_t *siginfo, void *data)
+{
+	char *r;
+
+	/*
+	 * si_code holds one of the CLD_* values, it is not a bit mask:
+	 * testing it with '&' reported e.g. CLD_DUMPED as "exited".
+	 */
+	if (siginfo->si_code == CLD_EXITED)
+		r = " exited, status=";
+	else if (siginfo->si_code == CLD_KILLED ||
+		 siginfo->si_code == CLD_DUMPED)
+		r = " killed by signal ";
+	else
+		r = "disappeared with ";
+
+	pr_info("Task %d %s %d\n", siginfo->si_pid, r, siginfo->si_status);
+
+	futex_abort_and_wake(&task_entries->nr_in_progress);
+	/* sa_restorer may be unmapped, so we can't go back to userspace */
+	sys_kill(sys_getpid(), SIGSTOP);
+	sys_exit_group(1);
+}
+
+/*
+ * Apply the credentials from @ce to the calling task: uids/gids,
+ * securebits, capability bounding set and the effective / permitted /
+ * inheritable capability sets. Results of the individual syscalls
+ * are not checked.
+ */
+static void restore_creds(CredsEntry *ce)
+{
+	int b, i;
+	struct cap_header hdr;
+	struct cap_data data[_LINUX_CAPABILITY_U32S_3];
+
+	/*
+	 * We're still root here and thus can do it without failures.
+	 */
+
+	/*
+	 * First -- set the SECURE_NO_SETUID_FIXUP bit not to
+	 * lose caps bits when changing xids.
+	 */
+
+	sys_prctl(PR_SET_SECUREBITS, 1 << SECURE_NO_SETUID_FIXUP, 0, 0, 0);
+
+	/*
+	 * Second -- restore xids. Since we still have the CAP_SETUID
+	 * capability nothing should fail. But call the setfsXid last
+	 * to override the setresXid settings.
+	 */
+
+	sys_setresuid(ce->uid, ce->euid, ce->suid);
+	sys_setfsuid(ce->fsuid);
+	sys_setresgid(ce->gid, ce->egid, ce->sgid);
+	sys_setfsgid(ce->fsgid);
+
+	/*
+	 * Third -- restore securebits. We don't need them in any
+	 * special state any longer.
+	 */
+
+	sys_prctl(PR_SET_SECUREBITS, ce->secbits, 0, 0, 0);
+
+	/*
+	 * Fourth -- trim bset. This can only be done while
+	 * having the CAP_SETPCAP capability.
+	 */
+
+	for (b = 0; b < CR_CAP_SIZE; b++) {
+		for (i = 0; i < 32; i++) {
+			/*
+			 * Unsigned mask: shifting a signed 1 by 31 is
+			 * undefined behaviour.
+			 */
+			if (ce->cap_bnd[b] & (1U << i))
+				/* already set */
+				continue;
+
+			sys_prctl(PR_CAPBSET_DROP, i + b * 32, 0, 0, 0);
+		}
+	}
+
+	/*
+	 * Fifth -- restore caps. Nothing but cap bits are changed
+	 * at this stage, so just do it.
+	 */
+
+	hdr.version = _LINUX_CAPABILITY_VERSION_3;
+	hdr.pid = 0;
+
+	BUILD_BUG_ON(_LINUX_CAPABILITY_U32S_3 != CR_CAP_SIZE);
+
+	for (i = 0; i < CR_CAP_SIZE; i++) {
+		data[i].eff = ce->cap_eff[i];
+		data[i].prm = ce->cap_prm[i];
+		data[i].inh = ce->cap_inh[i];
+	}
+
+	sys_capset(&hdr, data);
+}
+
+/*
+ * Re-apply nice value, scheduling policy and priority from @p.
+ * Nothing is done for the SCHED_OTHER / nice 0 combination.
+ */
+static void restore_sched_info(struct rst_sched_param *p)
+{
+	struct sched_param sp;
+
+	if (p->policy != SCHED_OTHER || p->nice != 0) {
+		pr_info("Restoring scheduler params %d.%d.%d\n",
+				p->policy, p->nice, p->prio);
+
+		sys_setpriority(PRIO_PROCESS, 0, p->nice);
+		sp.sched_priority = p->prio;
+		sys_sched_setscheduler(0, p->policy, &sp);
+	}
+}
+
+/*
+ * Copy the saved general purpose registers from @r into the machine
+ * context of sigframe @f and set the FS/GS base addresses through
+ * arch_prctl. Returns 0 on success, -1 if either base can't be set.
+ */
+static int restore_gpregs(struct rt_sigframe *f, UserX86RegsEntry *r)
+{
+	unsigned long fs_base, gs_base;
+	long err;
+
+#define CPREG1(d)	f->uc.uc_mcontext.d = r->d
+#define CPREG2(d, s)	f->uc.uc_mcontext.d = r->s
+
+	/* Fields named the same on both sides */
+	CPREG1(r8);
+	CPREG1(r9);
+	CPREG1(r10);
+	CPREG1(r11);
+	CPREG1(r12);
+	CPREG1(r13);
+	CPREG1(r14);
+	CPREG1(r15);
+	CPREG1(cs);
+	CPREG1(gs);
+	CPREG1(fs);
+
+	/* Fields named differently in the sigframe and in the image */
+	CPREG2(rdi, di);
+	CPREG2(rsi, si);
+	CPREG2(rbp, bp);
+	CPREG2(rbx, bx);
+	CPREG2(rdx, dx);
+	CPREG2(rax, ax);
+	CPREG2(rcx, cx);
+	CPREG2(rsp, sp);
+	CPREG2(rip, ip);
+	CPREG2(eflags, flags);
+
+	fs_base = r->fs_base;
+	err = sys_arch_prctl(ARCH_SET_FS, fs_base);
+	if (err) {
+		pr_info("SET_FS fail %ld\n", err);
+		return -1;
+	}
+
+	gs_base = r->gs_base;
+	err = sys_arch_prctl(ARCH_SET_GS, gs_base);
+	if (err) {
+		pr_info("SET_GS fail %ld\n", err);
+		return -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Per-thread state shared by task and thread restore: clear-tid
+ * address, robust futex list (when present), scheduler parameters
+ * and registers. Returns 0 on success, -1 on failure.
+ */
+static int restore_thread_common(struct rt_sigframe *sigframe,
+		struct thread_restore_args *args)
+{
+	sys_set_tid_address((int *)args->clear_tid_addr);
+
+	if (args->has_futex &&
+	    sys_set_robust_list((void *)args->futex_rla, args->futex_rla_len)) {
+		pr_err("Robust list err\n");
+		return -1;
+	}
+
+	restore_sched_info(&args->sp);
+
+	return restore_gpregs(sigframe, &args->gpregs);
+}
+
+/*
+ * Threads restoration via sigreturn. Note it's locked
+ * routine and calls for unlock at the end.
+ *
+ * On success the final rt_sigreturn switches to the restored context
+ * and this function does not return; every failure ends in
+ * sys_exit_group(1).
+ */
+long __export_restore_thread(struct thread_restore_args *args)
+{
+ struct rt_sigframe *rt_sigframe;
+ unsigned long new_sp;
+ int my_pid = sys_gettid();
+
+ /* The thread must have been created with the id recorded for it */
+ if (my_pid != args->pid) {
+ pr_err("Thread pid mismatch %d/%d\n", my_pid, args->pid);
+ goto core_restore_end;
+ }
+
+ rt_sigframe = (void *)args->mem_zone.rt_sigframe + 8;
+
+ if (restore_thread_common(rt_sigframe, args))
+ goto core_restore_end;
+
+ /* Releases the lock taken by the task restorer before clone */
+ mutex_unlock(&args->ta->rst_lock);
+
+ restore_creds(&args->ta->creds);
+
+ futex_dec_and_wake(&task_entries->nr_in_progress);
+
+ pr_info("%ld: Restored\n", sys_gettid());
+
+ futex_wait_while(&task_entries->start, CR_STATE_RESTORE);
+ futex_dec_and_wake(&task_entries->nr_in_progress);
+
+ /* Point the stack at the prepared frame and issue rt_sigreturn */
+ new_sp = (long)rt_sigframe + 8;
+ asm volatile(
+ "movq %0, %%rax \n"
+ "movq %%rax, %%rsp \n"
+ "movl $"__stringify(__NR_rt_sigreturn)", %%eax \n"
+ "syscall \n"
+ :
+ : "r"(new_sp)
+ : "rax","rsp","memory");
+core_restore_end:
+ pr_err("Restorer abnormal termination for %ld\n", sys_getpid());
+ sys_exit_group(1);
+ return -1;
+}
+
+/*
+ * Point the task's exe link at args->fd_exe_link via
+ * PR_SET_MM_EXE_FILE and close that descriptor. The prctl result is
+ * only logged; the function always returns 0.
+ */
+static long restore_self_exe_late(struct task_restore_core_args *args)
+{
+	int exe_fd;
+
+	pr_info("Restoring EXE link\n");
+
+	exe_fd = args->fd_exe_link;
+	sys_prctl_safe(PR_SET_MM, PR_SET_MM_EXE_FILE, exe_fd, 0);
+	sys_close(exe_fd);
+
+	/* FIXME Once kernel side stabilized -- fix error reporting */
+	return 0;
+}
+
+/*
+ * Re-create one VMA at its original address.
+ *
+ * SysV IPC areas are attached with shmat; everything else is mapped
+ * with MAP_FIXED, and the backing descriptor (if any) is closed
+ * afterwards. Returns whatever sys_shmat()/sys_mmap() returned.
+ */
+static u64 restore_mapping(const VmaEntry *vma_entry)
+{
+	int prot, flags;
+	u64 addr;
+
+	if (vma_entry_is(vma_entry, VMA_AREA_SYSVIPC)) {
+		int shmflg = (vma_entry->prot & PROT_WRITE) ? 0 : SHM_RDONLY;
+
+		return sys_shmat(vma_entry->fd, (void *)vma_entry->start,
+				 shmflg);
+	}
+
+	prot = vma_entry->prot;
+	flags = vma_entry->flags | MAP_FIXED;
+
+	/*
+	 * Shared anonymous mappings are opened through map_files/, so
+	 * MAP_ANONYMOUS has to go for the kernel to honour the fd.
+	 */
+	if (vma_entry_is(vma_entry, VMA_ANON_SHARED) && (vma_entry->fd != -1UL))
+		flags &= ~MAP_ANONYMOUS;
+
+	/*
+	 * A MAP_SHARED file mapping is already up to date; anything
+	 * else is mapped writable because page contents are going to
+	 * be restored into it.
+	 */
+	if (vma_entry->fd == -1 || !(vma_entry->flags & MAP_SHARED))
+		prot |= PROT_WRITE;
+
+	addr = sys_mmap((void *)vma_entry->start,
+			vma_entry_len(vma_entry),
+			prot, flags,
+			vma_entry->fd,
+			vma_entry->pgoff);
+
+	if (vma_entry->fd != -1)
+		sys_close(vma_entry->fd);
+
+	return addr;
+}
+
+/*
+ * Turn TCP repair mode off on every socket in @arr (the list ends at
+ * the first negative entry) and unmap the @size bytes holding the
+ * array. A zero @size means there is nothing to do.
+ */
+static void rst_tcp_socks_all(int *arr, int size)
+{
+	int *sk;
+
+	if (!size)
+		return;
+
+	for (sk = arr; *sk >= 0; sk++)
+		tcp_repair_off(*sk);
+
+	sys_munmap(arr, size);
+}
+
+/*
+ * The main routine to restore task via sigreturn.
+ * This one is very special, we never return there
+ * but use sigreturn facility to restore core registers
+ * and jump execution to some predefined ip read from
+ * core file.
+ *
+ * Steps, in order: install the SIGCHLD handler, unmap the current
+ * regular VMAs, map the target VMAs and fill them from fd_pages,
+ * fix protections and madvise bits, tune mm fields via prctl, restore
+ * the exe link, prepare the sigframe, clone the other threads,
+ * restore creds and itimers, and finally call rt_sigreturn.
+ */
+long __export_restore_task(struct task_restore_core_args *args)
+{
+ long ret = -1;
+ VmaEntry *vma_entry;
+ u64 va;
+
+ struct rt_sigframe *rt_sigframe;
+ unsigned long new_sp;
+ pid_t my_pid = sys_getpid();
+ rt_sigaction_t act;
+
+ task_entries = args->task_entries;
+ /* Keep the current SIGCHLD action, only swap the handler */
+ sys_sigaction(SIGCHLD, NULL, &act, sizeof(rt_sigset_t));
+ act.rt_sa_handler = sigchld_handler;
+ sys_sigaction(SIGCHLD, &act, NULL, sizeof(rt_sigset_t));
+
+ log_set_fd(args->logfd);
+ log_set_loglevel(args->loglevel);
+
+ pr_info("Switched to the restorer %d\n", my_pid);
+
+ /* Drop the regular mappings listed in self_vmas (zero start ends the list) */
+ for (vma_entry = args->self_vmas; vma_entry->start != 0; vma_entry++) {
+ if (!vma_entry_is(vma_entry, VMA_AREA_REGULAR))
+ continue;
+
+ if (sys_munmap((void *)vma_entry->start, vma_entry_len(vma_entry))) {
+ pr_err("Munmap fail for %lx\n", vma_entry->start);
+ goto core_restore_end;
+ }
+ }
+
+ /* The list itself, terminating entry included, is no longer needed */
+ sys_munmap(args->self_vmas,
+ ((void *)(vma_entry + 1) - ((void *)args->self_vmas)));
+
+ /*
+ * OK, lets try to map new one.
+ */
+ for (vma_entry = args->tgt_vmas; vma_entry->start != 0; vma_entry++) {
+ if (!vma_entry_is(vma_entry, VMA_AREA_REGULAR))
+ continue;
+
+ va = restore_mapping(vma_entry);
+
+ if (va != vma_entry->start) {
+ pr_err("Can't restore %lx mapping with %lx\n", vma_entry->start, va);
+ goto core_restore_end;
+ }
+ }
+
+ /*
+ * Read page contents.
+ * Each record is an address followed by PAGE_SIZE bytes of data;
+ * a zero-length read ends the stream.
+ */
+ while (1) {
+ ret = sys_read(args->fd_pages, &va, sizeof(va));
+ if (!ret)
+ break;
+
+ if (ret != sizeof(va)) {
+ pr_err("Bad mapping page size %ld\n", ret);
+ goto core_restore_end;
+ }
+
+ ret = sys_read(args->fd_pages, (void *)va, PAGE_SIZE);
+ if (ret != PAGE_SIZE) {
+ pr_err("Can'r read mapping page %ld\n", ret);
+ goto core_restore_end;
+ }
+ }
+
+ sys_close(args->fd_pages);
+
+ /*
+ * Walk though all VMAs again to drop PROT_WRITE
+ * if it was not there.
+ */
+ for (vma_entry = args->tgt_vmas; vma_entry->start != 0; vma_entry++) {
+ if (!(vma_entry_is(vma_entry, VMA_AREA_REGULAR)))
+ continue;
+
+ if (vma_entry_is(vma_entry, VMA_ANON_SHARED)) {
+ struct shmem_info *entry;
+
+ entry = find_shmem(args->shmems,
+ vma_entry->shmid);
+ if (entry && entry->pid == my_pid &&
+ entry->start == vma_entry->start)
+ futex_set_and_wake(&entry->lock, 1);
+ }
+
+ if (vma_entry->prot & PROT_WRITE)
+ continue;
+
+ sys_mprotect((void *)vma_entry->start,
+ vma_entry_len(vma_entry),
+ vma_entry->prot);
+ }
+
+ /*
+ * Finally restore madvise() bits
+ * (bit i set in ->madv means advice value i is applied)
+ */
+ for (vma_entry = args->tgt_vmas; vma_entry->start != 0; vma_entry++) {
+ unsigned long i;
+
+ if (!vma_entry->has_madv || !vma_entry->madv)
+ continue;
+ for (i = 0; i < sizeof(vma_entry->madv) * 8; i++) {
+ if (vma_entry->madv & (1ul << i)) {
+ ret = sys_madvise(vma_entry->start,
+ vma_entry_len(vma_entry),
+ i);
+ if (ret) {
+ pr_err("madvise(%lx, %ld, %ld) "
+ "failed with %ld\n",
+ vma_entry->start,
+ vma_entry_len(vma_entry),
+ i, ret);
+ goto core_restore_end;
+ }
+ }
+ }
+ }
+
+ sys_munmap(args->tgt_vmas,
+ ((void *)(vma_entry + 1) - ((void *)args->tgt_vmas)));
+
+ ret = sys_munmap(args->shmems, SHMEMS_SIZE);
+ if (ret < 0) {
+ pr_err("Can't unmap shmem %ld\n", ret);
+ goto core_restore_end;
+ }
+
+ /*
+ * Tune up the task fields.
+ * The results are OR-ed together, so a single failing call is
+ * enough to abort the restore.
+ */
+ ret |= sys_prctl_safe(PR_SET_NAME, (long)args->comm, 0, 0);
+
+ ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_CODE, (long)args->mm.mm_start_code, 0);
+ ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_END_CODE, (long)args->mm.mm_end_code, 0);
+ ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_DATA, (long)args->mm.mm_start_data, 0);
+ ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_END_DATA, (long)args->mm.mm_end_data, 0);
+ ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_STACK, (long)args->mm.mm_start_stack, 0);
+ ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_BRK, (long)args->mm.mm_start_brk, 0);
+ ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_BRK, (long)args->mm.mm_brk, 0);
+ ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ARG_START, (long)args->mm.mm_arg_start, 0);
+ ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ARG_END, (long)args->mm.mm_arg_end, 0);
+ ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ENV_START, (long)args->mm.mm_env_start, 0);
+ ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ENV_END, (long)args->mm.mm_env_end, 0);
+ ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_AUXV, (long)args->mm_saved_auxv, args->mm_saved_auxv_size);
+ if (ret)
+ goto core_restore_end;
+
+ /*
+ * Because of requirements applied from kernel side
+ * we need to restore /proc/pid/exe symlink late,
+ * after old existing VMAs are superseded with
+ * new ones from image file.
+ */
+ ret = restore_self_exe_late(args);
+ if (ret)
+ goto core_restore_end;
+
+ /*
+ * We need to prepare a valid sigframe here, so
+ * after sigreturn the kernel will pick up the
+ * registers from the frame, set them up and
+ * finally pass execution to the new IP.
+ */
+ rt_sigframe = (void *)args->t.mem_zone.rt_sigframe + 8;
+
+ if (restore_thread_common(rt_sigframe, &args->t))
+ goto core_restore_end;
+
+ /*
+ * Blocked signals.
+ */
+ rt_sigframe->uc.uc_sigmask.sig[0] = args->blk_sigset;
+
+ /*
+ * Threads restoration. This requires some more comments. This
+ * restorer routine and thread restorer routine has the following
+ * memory map, prepared by a caller code.
+ *
+ * | <-- low addresses high addresses --> |
+ * +-------------------------------------------------------+-----------------------+
+ * | this proc body | own stack | heap | rt_sigframe space | thread restore zone |
+ * +-------------------------------------------------------+-----------------------+
+ *
+ * where each thread restore zone is the following
+ *
+ * | <-- low addresses high addresses --> |
+ * +--------------------------------------------------------------------------+
+ * | thread restore proc | thread1 stack | thread1 heap | thread1 rt_sigframe |
+ * +--------------------------------------------------------------------------+
+ */
+
+ if (args->nr_threads > 1) {
+ struct thread_restore_args *thread_args = args->thread_args;
+ long clone_flags = CLONE_VM | CLONE_FILES | CLONE_SIGHAND |
+ CLONE_THREAD | CLONE_SYSVSEM;
+ long last_pid_len;
+ long parent_tid;
+ int i, fd;
+
+ fd = sys_open(LAST_PID_PATH, O_RDWR, LAST_PID_PERM);
+ if (fd < 0) {
+ pr_err("Can't open last_pid %d\n", fd);
+ goto core_restore_end;
+ }
+
+ /* Held across all clones, released after the loop */
+ ret = sys_flock(fd, LOCK_EX);
+ if (ret) {
+ pr_err("Can't lock last_pid %d\n", fd);
+ goto core_restore_end;
+ }
+
+ for (i = 0; i < args->nr_threads; i++) {
+ char last_pid_buf[16], *s;
+
+ /* skip self */
+ if (thread_args[i].pid == args->t.pid)
+ continue;
+
+ /* Unlocked by the new thread in __export_restore_thread() */
+ mutex_lock(&args->rst_lock);
+
+ new_sp =
+ RESTORE_ALIGN_STACK((long)thread_args[i].mem_zone.stack,
+ sizeof(thread_args[i].mem_zone.stack));
+
+ /* Write "wanted pid - 1" to the last_pid file before cloning */
+ last_pid_len = vprint_num(last_pid_buf, sizeof(last_pid_buf), thread_args[i].pid - 1, &s);
+ ret = sys_write(fd, s, last_pid_len);
+ if (ret < 0) {
+ pr_err("Can't set last_pid %ld/%s\n", ret, last_pid_buf);
+ goto core_restore_end;
+ }
+
+ /*
+ * To achieve functionality like libc's clone()
+ * we need a pure assembly here, because clone()'ed
+ * thread will run with own stack and we must not
+ * have any additional instructions... oh, dear...
+ */
+ asm volatile(
+ "clone_emul: \n"
+ "movq %2, %%rsi \n"
+ "subq $16, %%rsi \n"
+ "movq %6, %%rdi \n"
+ "movq %%rdi, 8(%%rsi) \n"
+ "movq %5, %%rdi \n"
+ "movq %%rdi, 0(%%rsi) \n"
+ "movq %1, %%rdi \n"
+ "movq %3, %%rdx \n"
+ "movq %4, %%r10 \n"
+ "movl $"__stringify(__NR_clone)", %%eax \n"
+ "syscall \n"
+
+ "testq %%rax,%%rax \n"
+ "jz thread_run \n"
+
+ "movq %%rax, %0 \n"
+ "jmp clone_end \n"
+
+ "thread_run: \n" /* new stack here */
+ "xorq %%rbp, %%rbp \n" /* clear ABI frame pointer */
+ "popq %%rax \n" /* clone_restore_fn -- restore_thread */
+ "popq %%rdi \n" /* arguments */
+ "callq *%%rax \n"
+
+ "clone_end: \n"
+ : "=r"(ret)
+ : "g"(clone_flags),
+ "g"(new_sp),
+ "g"(&parent_tid),
+ "g"(&thread_args[i].pid),
+ "g"(args->clone_restore_fn),
+ "g"(&thread_args[i])
+ : "rax", "rdi", "rsi", "rdx", "r10", "memory");
+ }
+
+ ret = sys_flock(fd, LOCK_UN);
+ if (ret) {
+ pr_err("Can't unlock last_pid %ld\n", ret);
+ goto core_restore_end;
+ }
+
+ sys_close(fd);
+ }
+
+ /*
+ * Writing to last-pid is CAP_SYS_ADMIN protected, thus restore
+ * creds _after_ all threads creation.
+ */
+
+ restore_creds(&args->creds);
+
+ futex_dec_and_wake(&args->task_entries->nr_in_progress);
+
+ pr_info("%ld: Restored\n", sys_getpid());
+
+ futex_wait_while(&args->task_entries->start, CR_STATE_RESTORE);
+
+ /* Put back the SIGCHLD action supplied in the arguments */
+ sys_sigaction(SIGCHLD, &args->sigchld_act, NULL, sizeof(rt_sigset_t));
+
+ futex_dec_and_wake(&args->task_entries->nr_in_progress);
+
+ futex_wait_while(&args->task_entries->start, CR_STATE_RESTORE_SIGCHLD);
+
+ rst_tcp_socks_all(args->rst_tcp_socks, args->rst_tcp_socks_size);
+
+ log_set_fd(-1);
+
+ /*
+ * The code that prepared the itimers makes sure the
+ * code below doesn't fail due to bad timing values.
+ */
+
+#define itimer_armed(args, i) \
+ (args->itimers[i].it_interval.tv_sec || \
+ args->itimers[i].it_interval.tv_usec)
+
+ if (itimer_armed(args, 0))
+ sys_setitimer(ITIMER_REAL, &args->itimers[0], NULL);
+ if (itimer_armed(args, 1))
+ sys_setitimer(ITIMER_VIRTUAL, &args->itimers[1], NULL);
+ if (itimer_armed(args, 2))
+ sys_setitimer(ITIMER_PROF, &args->itimers[2], NULL);
+
+ ret = sys_munmap(args->task_entries, TASK_ENTRIES_SIZE);
+ if (ret < 0) {
+ /* Logging is already off: encode line and error into ret */
+ ret = ((long)__LINE__ << 32) | -ret;
+ goto core_restore_failed;
+ }
+
+ /*
+ * Sigframe stack.
+ */
+ new_sp = (long)rt_sigframe + 8;
+
+ /*
+ * Prepare the stack and call for sigreturn,
+ * pure assembly since we don't need any additional
+ * code insns from gcc.
+ */
+ asm volatile(
+ "movq %0, %%rax \n"
+ "movq %%rax, %%rsp \n"
+ "movl $"__stringify(__NR_rt_sigreturn)", %%eax \n"
+ "syscall \n"
+ :
+ : "r"(new_sp)
+ : "rax","rsp","memory");
+
+core_restore_end:
+ pr_err("Restorer fail %ld\n", sys_getpid());
+ sys_exit_group(1);
+ return -1;
+
+core_restore_failed:
+ /*
+ * Loads the encoded error into %rsp, then reads from absolute
+ * address 0 and jumps through the value.
+ * NOTE(review): presumably a deliberate crash that leaves the
+ * error visible in %rsp -- confirm.
+ */
+ asm volatile(
+ "movq %0, %%rsp \n"
+ "movq 0, %%rax \n"
+ "jmp *%%rax \n"
+ :
+ : "r"(ret)
+ : "memory");
+ return ret;
+}
diff --git a/pie/util-net.c b/pie/util-net.c
new file mode 100644
index 0000000..0feae03
--- /dev/null
+++ b/pie/util-net.c
@@ -0,0 +1,151 @@
+#include <sys/socket.h>
+#include <sys/un.h>
+
+#include <errno.h>
+
+#include "compiler.h"
+#include "memcpy_64.h"
+#include "types.h"
+#include "syscall.h"
+
+#include "util-net.h"
+
+/*
+ * Resize the control message of an already initialized @fdset so
+ * that it carries exactly @nr_fds descriptors.
+ */
+static void scm_fdset_init_chunk(struct scm_fdset *fdset, int nr_fds)
+{
+ struct cmsghdr *cmsg;
+
+ /* Must be set first: CMSG_FIRSTHDR() looks at msg_controllen */
+ fdset->hdr.msg_controllen = CMSG_LEN(sizeof(int) * nr_fds);
+
+ cmsg = CMSG_FIRSTHDR(&fdset->hdr);
+ cmsg->cmsg_len = fdset->hdr.msg_controllen;
+}
+
+/*
+ * Prepare @fdset for passing descriptors with SCM_RIGHTS.
+ *
+ * The message is addressed to @saddr/@saddr_len, its payload is the
+ * opts array (all of it if @with_flags, a single byte otherwise) and
+ * its control buffer is sized for CR_SCM_MAX_FD descriptors.
+ * Returns a pointer to the descriptor slots inside the control
+ * message.
+ */
+static int *scm_fdset_init(struct scm_fdset *fdset, struct sockaddr_un *saddr,
+		int saddr_len, bool with_flags)
+{
+	struct cmsghdr *cmsg;
+
+	BUILD_BUG_ON(CR_SCM_MAX_FD > SCM_MAX_FD);
+	BUILD_BUG_ON(sizeof(fdset->msg_buf) < (CMSG_SPACE(sizeof(int) * CR_SCM_MAX_FD)));
+
+	/* Destination address */
+	fdset->hdr.msg_name = (struct sockaddr *)saddr;
+	fdset->hdr.msg_namelen = saddr_len;
+
+	/* Regular payload: the per-fd options */
+	fdset->iov.iov_base = fdset->opts;
+	if (with_flags)
+		fdset->iov.iov_len = sizeof(fdset->opts);
+	else
+		fdset->iov.iov_len = 1;
+	fdset->hdr.msg_iov = &fdset->iov;
+	fdset->hdr.msg_iovlen = 1;
+
+	/* Ancillary payload: the descriptors themselves */
+	fdset->hdr.msg_control = &fdset->msg_buf;
+	fdset->hdr.msg_controllen = CMSG_LEN(sizeof(int) * CR_SCM_MAX_FD);
+
+	cmsg = CMSG_FIRSTHDR(&fdset->hdr);
+	cmsg->cmsg_len = fdset->hdr.msg_controllen;
+	cmsg->cmsg_level = SOL_SOCKET;
+	cmsg->cmsg_type = SCM_RIGHTS;
+
+	return (int *)CMSG_DATA(cmsg);
+}
+
+/*
+ * Fill @p with the close-on-exec flags and, when an owner is set,
+ * the owner pid, type and uids of descriptor @fd.
+ * Returns 0 on success, -1 if any fcntl fails.
+ */
+static int send_fds_fill_opts(int fd, struct fd_opts *p)
+{
+	struct f_owner_ex owner_ex;
+	u32 v[2];
+	int flags;
+
+	flags = sys_fcntl(fd, F_GETFD, 0);
+	if (flags < 0)
+		return -1;
+
+	p->flags = (char)flags;
+
+	if (sys_fcntl(fd, F_GETOWN_EX, (long)&owner_ex))
+		return -1;
+
+	/*
+	 * Simple case -- nothing is changed.
+	 */
+	if (owner_ex.pid == 0) {
+		p->fown.pid = 0;
+		return 0;
+	}
+
+	if (sys_fcntl(fd, F_GETOWNER_UIDS, (long)&v))
+		return -1;
+
+	p->fown.uid = v[0];
+	p->fown.euid = v[1];
+	p->fown.pid_type = owner_ex.type;
+	p->fown.pid = owner_ex.pid;
+	return 0;
+}
+
+/*
+ * Send @nr_fds descriptors from @fds over @sock to @saddr/@len, in
+ * chunks of at most CR_SCM_MAX_FD. With @with_flags the per-fd
+ * options of each descriptor travel along as regular payload.
+ *
+ * Returns 0 on success, -1 if a descriptor can't be queried or
+ * nothing was sent, or the negative sys_sendmsg() result.
+ */
+int send_fds(int sock, struct sockaddr_un *saddr, int len,
+		int *fds, int nr_fds, bool with_flags)
+{
+	struct scm_fdset fdset;
+	int *cmsg_data;
+	int i, min_fd, ret;
+
+	cmsg_data = scm_fdset_init(&fdset, saddr, len, with_flags);
+	for (i = 0; i < nr_fds; i += min_fd) {
+		min_fd = min(CR_SCM_MAX_FD, nr_fds - i);
+		scm_fdset_init_chunk(&fdset, min_fd);
+		builtin_memcpy(cmsg_data, &fds[i], sizeof(int) * min_fd);
+
+		if (with_flags) {
+			int j;
+
+			for (j = 0; j < min_fd; j++)
+				if (send_fds_fill_opts(fds[i + j], fdset.opts + j))
+					return -1;
+		}
+
+		ret = sys_sendmsg(sock, &fdset.hdr, 0);
+		if (ret <= 0)
+			return ret ? : -1;
+	}
+
+	return 0;
+}
+
+/*
+ * Receive @nr_fds descriptors from @sock into @fds, one message per
+ * chunk of at most CR_SCM_MAX_FD. When @opts is not NULL the per-fd
+ * options sent along are copied into it as well.
+ *
+ * Returns 0 on success. Failures: the negative sys_recvmsg() result
+ * (or -1 for an empty message), -EINVAL for a missing or non
+ * SCM_RIGHTS control message, -ENFILE when the control data was
+ * truncated, -1 when a message carried no descriptors.
+ */
+int recv_fds(int sock, int *fds, int nr_fds, struct fd_opts *opts)
+{
+	struct scm_fdset fdset;
+	struct cmsghdr *cmsg;
+	int *cmsg_data;
+	int ret;
+	int i, min_fd;
+
+	cmsg_data = scm_fdset_init(&fdset, NULL, 0, opts != NULL);
+
+	i = 0;
+	while (i < nr_fds) {
+		min_fd = min(CR_SCM_MAX_FD, nr_fds - i);
+		scm_fdset_init_chunk(&fdset, min_fd);
+
+		ret = sys_recvmsg(sock, &fdset.hdr, 0);
+		if (ret <= 0)
+			return ret ? : -1;
+
+		cmsg = CMSG_FIRSTHDR(&fdset.hdr);
+		if (!cmsg || cmsg->cmsg_type != SCM_RIGHTS)
+			return -EINVAL;
+		if (fdset.hdr.msg_flags & MSG_CTRUNC)
+			return -ENFILE;
+
+		/* From here on min_fd is what was actually delivered */
+		min_fd = (cmsg->cmsg_len - sizeof(struct cmsghdr)) / sizeof(int);
+		/*
+		 * In case the kernel screwed the recipient, most probably
+		 * the caller stack frame will be overwritten, just scream
+		 * and exit.
+		 *
+		 * FIXME Need to sanitize util.h to be able to include it
+		 * into files which do not have glibc and a couple of
+		 * sys_write_ helpers. Meanwhile opencoded BUG_ON here.
+		 */
+		if (unlikely(min_fd > CR_SCM_MAX_FD))
+			*(volatile unsigned long *)NULL = 0xdead0000 + __LINE__;
+		if (unlikely(min_fd <= 0))
+			return -1;
+
+		builtin_memcpy(&fds[i], cmsg_data, sizeof(int) * min_fd);
+		if (opts)
+			builtin_memcpy(opts + i, fdset.opts, sizeof(struct fd_opts) * min_fd);
+
+		i += min_fd;
+	}
+
+	return 0;
+}
+
diff --git a/restorer.c b/restorer.c
deleted file mode 100644
index 369adb9..0000000
--- a/restorer.c
+++ /dev/null
@@ -1,690 +0,0 @@
-#define CR_NOGLIBC
-#include <stdio.h>
-#include <stdlib.h>
-
-#include <sys/types.h>
-#include <sys/mman.h>
-#include <sys/stat.h>
-#include <sys/wait.h>
-#include <sys/time.h>
-#include <sys/shm.h>
-#include <fcntl.h>
-#include <unistd.h>
-#include <sched.h>
-#include <sys/resource.h>
-
-#include "compiler.h"
-#include "types.h"
-#include "syscall.h"
-#include "log.h"
-#include "util.h"
-#include "image.h"
-#include "sk-inet.h"
-
-#include "crtools.h"
-#include "lock.h"
-#include "restorer.h"
-
-#include "protobuf/creds.pb-c.h"
-
-#define sys_prctl_safe(opcode, val1, val2, val3) \
- ({ \
- long __ret = sys_prctl(opcode, val1, val2, val3, 0); \
- if (__ret) \
- pr_err("prctl failed @%d with %ld\n", __LINE__, __ret);\
- __ret; \
- })
-
-static struct task_entries *task_entries;
-
-static void sigchld_handler(int signal, siginfo_t *siginfo, void *data)
-{
- char *r;
-
- if (siginfo->si_code & CLD_EXITED)
- r = " exited, status=";
- else if (siginfo->si_code & CLD_KILLED)
- r = " killed by signal ";
- else
- r = "disappeared with ";
-
- pr_info("Task %d %s %d\n", siginfo->si_pid, r, siginfo->si_status);
-
- futex_abort_and_wake(&task_entries->nr_in_progress);
- /* sa_restorer may be unmaped, so we can't go back to userspace*/
- sys_kill(sys_getpid(), SIGSTOP);
- sys_exit_group(1);
-}
-
-static void restore_creds(CredsEntry *ce)
-{
- int b, i;
- struct cap_header hdr;
- struct cap_data data[_LINUX_CAPABILITY_U32S_3];
-
- /*
- * We're still root here and thus can do it without failures.
- */
-
- /*
- * First -- set the SECURE_NO_SETUID_FIXUP bit not to
- * lose caps bits when changing xids.
- */
-
- sys_prctl(PR_SET_SECUREBITS, 1 << SECURE_NO_SETUID_FIXUP, 0, 0, 0);
-
- /*
- * Second -- restore xids. Since we still have the CAP_SETUID
- * capability nothing should fail. But call the setfsXid last
- * to override the setresXid settings.
- */
-
- sys_setresuid(ce->uid, ce->euid, ce->suid);
- sys_setfsuid(ce->fsuid);
- sys_setresgid(ce->gid, ce->egid, ce->sgid);
- sys_setfsgid(ce->fsgid);
-
- /*
- * Third -- restore securebits. We don't need them in any
- * special state any longer.
- */
-
- sys_prctl(PR_SET_SECUREBITS, ce->secbits, 0, 0, 0);
-
- /*
- * Fourth -- trim bset. This can only be done while
- * having the CAP_SETPCAP capablity.
- */
-
- for (b = 0; b < CR_CAP_SIZE; b++) {
- for (i = 0; i < 32; i++) {
- if (ce->cap_bnd[b] & (1 << i))
- /* already set */
- continue;
-
- sys_prctl(PR_CAPBSET_DROP, i + b * 32, 0, 0, 0);
- }
- }
-
- /*
- * Fifth -- restore caps. Nothing but cap bits are changed
- * at this stage, so just do it.
- */
-
- hdr.version = _LINUX_CAPABILITY_VERSION_3;
- hdr.pid = 0;
-
- BUILD_BUG_ON(_LINUX_CAPABILITY_U32S_3 != CR_CAP_SIZE);
-
- for (i = 0; i < CR_CAP_SIZE; i++) {
- data[i].eff = ce->cap_eff[i];
- data[i].prm = ce->cap_prm[i];
- data[i].inh = ce->cap_inh[i];
- }
-
- sys_capset(&hdr, data);
-}
-
-static void restore_sched_info(struct rst_sched_param *p)
-{
- struct sched_param parm;
-
- if ((p->policy == SCHED_OTHER) && (p->nice == 0))
- return;
-
- pr_info("Restoring scheduler params %d.%d.%d\n",
- p->policy, p->nice, p->prio);
-
- sys_setpriority(PRIO_PROCESS, 0, p->nice);
- parm.sched_priority = p->prio;
- sys_sched_setscheduler(0, p->policy, &parm);
-}
-
-static int restore_gpregs(struct rt_sigframe *f, UserX86RegsEntry *r)
-{
- long ret;
- unsigned long fsgs_base;
-
-#define CPREG1(d) f->uc.uc_mcontext.d = r->d
-#define CPREG2(d, s) f->uc.uc_mcontext.d = r->s
-
- CPREG1(r8);
- CPREG1(r9);
- CPREG1(r10);
- CPREG1(r11);
- CPREG1(r12);
- CPREG1(r13);
- CPREG1(r14);
- CPREG1(r15);
- CPREG2(rdi, di);
- CPREG2(rsi, si);
- CPREG2(rbp, bp);
- CPREG2(rbx, bx);
- CPREG2(rdx, dx);
- CPREG2(rax, ax);
- CPREG2(rcx, cx);
- CPREG2(rsp, sp);
- CPREG2(rip, ip);
- CPREG2(eflags, flags);
- CPREG1(cs);
- CPREG1(gs);
- CPREG1(fs);
-
- fsgs_base = r->fs_base;
- ret = sys_arch_prctl(ARCH_SET_FS, fsgs_base);
- if (ret) {
- pr_info("SET_FS fail %ld\n", ret);
- return -1;
- }
-
- fsgs_base = r->gs_base;
- ret = sys_arch_prctl(ARCH_SET_GS, fsgs_base);
- if (ret) {
- pr_info("SET_GS fail %ld\n", ret);
- return -1;
- }
-
- return 0;
-}
-
-static int restore_thread_common(struct rt_sigframe *sigframe,
- struct thread_restore_args *args)
-{
- sys_set_tid_address((int *)args->clear_tid_addr);
-
- if (args->has_futex) {
- if (sys_set_robust_list((void *)args->futex_rla, args->futex_rla_len)) {
- pr_err("Robust list err\n");
- return -1;
- }
- }
-
- restore_sched_info(&args->sp);
-
- return restore_gpregs(sigframe, &args->gpregs);
-}
-
-/*
- * Threads restoration via sigreturn. Note it's locked
- * routine and calls for unlock at the end.
- */
-long __export_restore_thread(struct thread_restore_args *args)
-{
- struct rt_sigframe *rt_sigframe;
- unsigned long new_sp;
- int my_pid = sys_gettid();
-
- if (my_pid != args->pid) {
- pr_err("Thread pid mismatch %d/%d\n", my_pid, args->pid);
- goto core_restore_end;
- }
-
- rt_sigframe = (void *)args->mem_zone.rt_sigframe + 8;
-
- if (restore_thread_common(rt_sigframe, args))
- goto core_restore_end;
-
- mutex_unlock(&args->ta->rst_lock);
-
- restore_creds(&args->ta->creds);
-
- futex_dec_and_wake(&task_entries->nr_in_progress);
-
- pr_info("%ld: Restored\n", sys_gettid());
-
- futex_wait_while(&task_entries->start, CR_STATE_RESTORE);
- futex_dec_and_wake(&task_entries->nr_in_progress);
-
- new_sp = (long)rt_sigframe + 8;
- asm volatile(
- "movq %0, %%rax \n"
- "movq %%rax, %%rsp \n"
- "movl $"__stringify(__NR_rt_sigreturn)", %%eax \n"
- "syscall \n"
- :
- : "r"(new_sp)
- : "rax","rsp","memory");
-core_restore_end:
- pr_err("Restorer abnormal termination for %ld\n", sys_getpid());
- sys_exit_group(1);
- return -1;
-}
-
-static long restore_self_exe_late(struct task_restore_core_args *args)
-{
- int fd = args->fd_exe_link;
-
- pr_info("Restoring EXE link\n");
- sys_prctl_safe(PR_SET_MM, PR_SET_MM_EXE_FILE, fd, 0);
- sys_close(fd);
-
- /* FIXME Once kernel side stabilized -- fix error reporting */
- return 0;
-}
-
-static u64 restore_mapping(const VmaEntry *vma_entry)
-{
- int prot = vma_entry->prot;
- int flags = vma_entry->flags | MAP_FIXED;
- u64 addr;
-
- if (vma_entry_is(vma_entry, VMA_AREA_SYSVIPC))
- return sys_shmat(vma_entry->fd, (void *)vma_entry->start,
- (vma_entry->prot & PROT_WRITE) ? 0 : SHM_RDONLY);
-
- /*
- * Restore or shared mappings are tricky, since
- * we open anonymous mapping via map_files/
- * MAP_ANONYMOUS should be eliminated so fd would
- * be taken into account by a kernel.
- */
- if (vma_entry_is(vma_entry, VMA_ANON_SHARED) && (vma_entry->fd != -1UL))
- flags &= ~MAP_ANONYMOUS;
-
- /* A mapping of file with MAP_SHARED is up to date */
- if (vma_entry->fd == -1 || !(vma_entry->flags & MAP_SHARED))
- prot |= PROT_WRITE;
-
- /*
- * Should map memory here. Note we map them as
- * writable since we're going to restore page
- * contents.
- */
- addr = sys_mmap((void *)vma_entry->start,
- vma_entry_len(vma_entry),
- prot, flags,
- vma_entry->fd,
- vma_entry->pgoff);
-
- if (vma_entry->fd != -1)
- sys_close(vma_entry->fd);
-
- return addr;
-}
-
-static void rst_tcp_socks_all(int *arr, int size)
-{
- int i;
-
- if (size == 0)
- return;
-
- for (i =0; arr[i] >= 0; i++)
- tcp_repair_off(arr[i]);
-
- sys_munmap(arr, size);
-}
-
-/*
- * The main routine to restore task via sigreturn.
- * This one is very special, we never return there
- * but use sigreturn facility to restore core registers
- * and jump execution to some predefined ip read from
- * core file.
- */
-long __export_restore_task(struct task_restore_core_args *args)
-{
- long ret = -1;
- VmaEntry *vma_entry;
- u64 va;
-
- struct rt_sigframe *rt_sigframe;
- unsigned long new_sp;
- pid_t my_pid = sys_getpid();
- rt_sigaction_t act;
-
- task_entries = args->task_entries;
- sys_sigaction(SIGCHLD, NULL, &act, sizeof(rt_sigset_t));
- act.rt_sa_handler = sigchld_handler;
- sys_sigaction(SIGCHLD, &act, NULL, sizeof(rt_sigset_t));
-
- log_set_fd(args->logfd);
- log_set_loglevel(args->loglevel);
-
- pr_info("Switched to the restorer %d\n", my_pid);
-
- for (vma_entry = args->self_vmas; vma_entry->start != 0; vma_entry++) {
- if (!vma_entry_is(vma_entry, VMA_AREA_REGULAR))
- continue;
-
- if (sys_munmap((void *)vma_entry->start, vma_entry_len(vma_entry))) {
- pr_err("Munmap fail for %lx\n", vma_entry->start);
- goto core_restore_end;
- }
- }
-
- sys_munmap(args->self_vmas,
- ((void *)(vma_entry + 1) - ((void *)args->self_vmas)));
-
- /*
- * OK, lets try to map new one.
- */
- for (vma_entry = args->tgt_vmas; vma_entry->start != 0; vma_entry++) {
- if (!vma_entry_is(vma_entry, VMA_AREA_REGULAR))
- continue;
-
- va = restore_mapping(vma_entry);
-
- if (va != vma_entry->start) {
- pr_err("Can't restore %lx mapping with %lx\n", vma_entry->start, va);
- goto core_restore_end;
- }
- }
-
- /*
- * Read page contents.
- */
- while (1) {
- ret = sys_read(args->fd_pages, &va, sizeof(va));
- if (!ret)
- break;
-
- if (ret != sizeof(va)) {
- pr_err("Bad mapping page size %ld\n", ret);
- goto core_restore_end;
- }
-
- ret = sys_read(args->fd_pages, (void *)va, PAGE_SIZE);
- if (ret != PAGE_SIZE) {
- pr_err("Can'r read mapping page %ld\n", ret);
- goto core_restore_end;
- }
- }
-
- sys_close(args->fd_pages);
-
- /*
- * Walk though all VMAs again to drop PROT_WRITE
- * if it was not there.
- */
- for (vma_entry = args->tgt_vmas; vma_entry->start != 0; vma_entry++) {
- if (!(vma_entry_is(vma_entry, VMA_AREA_REGULAR)))
- continue;
-
- if (vma_entry_is(vma_entry, VMA_ANON_SHARED)) {
- struct shmem_info *entry;
-
- entry = find_shmem(args->shmems,
- vma_entry->shmid);
- if (entry && entry->pid == my_pid &&
- entry->start == vma_entry->start)
- futex_set_and_wake(&entry->lock, 1);
- }
-
- if (vma_entry->prot & PROT_WRITE)
- continue;
-
- sys_mprotect((void *)vma_entry->start,
- vma_entry_len(vma_entry),
- vma_entry->prot);
- }
-
- /*
- * Finally restore madivse() bits
- */
- for (vma_entry = args->tgt_vmas; vma_entry->start != 0; vma_entry++) {
- unsigned long i;
-
- if (!vma_entry->has_madv || !vma_entry->madv)
- continue;
- for (i = 0; i < sizeof(vma_entry->madv) * 8; i++) {
- if (vma_entry->madv & (1ul << i)) {
- ret = sys_madvise(vma_entry->start,
- vma_entry_len(vma_entry),
- i);
- if (ret) {
- pr_err("madvise(%lx, %ld, %ld) "
- "failed with %ld\n",
- vma_entry->start,
- vma_entry_len(vma_entry),
- i, ret);
- goto core_restore_end;
- }
- }
- }
- }
-
- sys_munmap(args->tgt_vmas,
- ((void *)(vma_entry + 1) - ((void *)args->tgt_vmas)));
-
- ret = sys_munmap(args->shmems, SHMEMS_SIZE);
- if (ret < 0) {
- pr_err("Can't unmap shmem %ld\n", ret);
- goto core_restore_end;
- }
-
- /*
- * Tune up the task fields.
- */
- ret |= sys_prctl_safe(PR_SET_NAME, (long)args->comm, 0, 0);
-
- ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_CODE, (long)args->mm.mm_start_code, 0);
- ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_END_CODE, (long)args->mm.mm_end_code, 0);
- ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_DATA, (long)args->mm.mm_start_data, 0);
- ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_END_DATA, (long)args->mm.mm_end_data, 0);
- ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_STACK, (long)args->mm.mm_start_stack, 0);
- ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_START_BRK, (long)args->mm.mm_start_brk, 0);
- ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_BRK, (long)args->mm.mm_brk, 0);
- ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ARG_START, (long)args->mm.mm_arg_start, 0);
- ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ARG_END, (long)args->mm.mm_arg_end, 0);
- ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ENV_START, (long)args->mm.mm_env_start, 0);
- ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_ENV_END, (long)args->mm.mm_env_end, 0);
- ret |= sys_prctl_safe(PR_SET_MM, PR_SET_MM_AUXV, (long)args->mm_saved_auxv, args->mm_saved_auxv_size);
- if (ret)
- goto core_restore_end;
-
- /*
- * Because of requirements applied from kernel side
- * we need to restore /proc/pid/exe symlink late,
- * after old existing VMAs are superseded with
- * new ones from image file.
- */
- ret = restore_self_exe_late(args);
- if (ret)
- goto core_restore_end;
-
- /*
- * We need to prepare a valid sigframe here, so
- * after sigreturn the kernel will pick up the
- * registers from the frame, set them up and
- * finally pass execution to the new IP.
- */
- rt_sigframe = (void *)args->t.mem_zone.rt_sigframe + 8;
-
- if (restore_thread_common(rt_sigframe, &args->t))
- goto core_restore_end;
-
- /*
- * Blocked signals.
- */
- rt_sigframe->uc.uc_sigmask.sig[0] = args->blk_sigset;
-
- /*
- * Threads restoration. This requires some more comments. This
- * restorer routine and thread restorer routine has the following
- * memory map, prepared by a caller code.
- *
- * | <-- low addresses high addresses --> |
- * +-------------------------------------------------------+-----------------------+
- * | this proc body | own stack | heap | rt_sigframe space | thread restore zone |
- * +-------------------------------------------------------+-----------------------+
- *
- * where each thread restore zone is the following
- *
- * | <-- low addresses high addresses --> |
- * +--------------------------------------------------------------------------+
- * | thread restore proc | thread1 stack | thread1 heap | thread1 rt_sigframe |
- * +--------------------------------------------------------------------------+
- */
-
- if (args->nr_threads > 1) {
- struct thread_restore_args *thread_args = args->thread_args;
- long clone_flags = CLONE_VM | CLONE_FILES | CLONE_SIGHAND |
- CLONE_THREAD | CLONE_SYSVSEM;
- long last_pid_len;
- long parent_tid;
- int i, fd;
-
- fd = sys_open(LAST_PID_PATH, O_RDWR, LAST_PID_PERM);
- if (fd < 0) {
- pr_err("Can't open last_pid %d\n", fd);
- goto core_restore_end;
- }
-
- ret = sys_flock(fd, LOCK_EX);
- if (ret) {
- pr_err("Can't lock last_pid %d\n", fd);
- goto core_restore_end;
- }
-
- for (i = 0; i < args->nr_threads; i++) {
- char last_pid_buf[16], *s;
-
- /* skip self */
- if (thread_args[i].pid == args->t.pid)
- continue;
-
- mutex_lock(&args->rst_lock);
-
- new_sp =
- RESTORE_ALIGN_STACK((long)thread_args[i].mem_zone.stack,
- sizeof(thread_args[i].mem_zone.stack));
-
- last_pid_len = vprint_num(last_pid_buf, sizeof(last_pid_buf), thread_args[i].pid - 1, &s);
- ret = sys_write(fd, s, last_pid_len);
- if (ret < 0) {
- pr_err("Can't set last_pid %ld/%s\n", ret, last_pid_buf);
- goto core_restore_end;
- }
-
- /*
- * To achieve functionality like libc's clone()
- * we need a pure assembly here, because clone()'ed
- * thread will run with own stack and we must not
- * have any additional instructions... oh, dear...
- */
- asm volatile(
- "clone_emul: \n"
- "movq %2, %%rsi \n"
- "subq $16, %%rsi \n"
- "movq %6, %%rdi \n"
- "movq %%rdi, 8(%%rsi) \n"
- "movq %5, %%rdi \n"
- "movq %%rdi, 0(%%rsi) \n"
- "movq %1, %%rdi \n"
- "movq %3, %%rdx \n"
- "movq %4, %%r10 \n"
- "movl $"__stringify(__NR_clone)", %%eax \n"
- "syscall \n"
-
- "testq %%rax,%%rax \n"
- "jz thread_run \n"
-
- "movq %%rax, %0 \n"
- "jmp clone_end \n"
-
- "thread_run: \n" /* new stack here */
- "xorq %%rbp, %%rbp \n" /* clear ABI frame pointer */
- "popq %%rax \n" /* clone_restore_fn -- restore_thread */
- "popq %%rdi \n" /* arguments */
- "callq *%%rax \n"
-
- "clone_end: \n"
- : "=r"(ret)
- : "g"(clone_flags),
- "g"(new_sp),
- "g"(&parent_tid),
- "g"(&thread_args[i].pid),
- "g"(args->clone_restore_fn),
- "g"(&thread_args[i])
- : "rax", "rdi", "rsi", "rdx", "r10", "memory");
- }
-
- ret = sys_flock(fd, LOCK_UN);
- if (ret) {
- pr_err("Can't unlock last_pid %ld\n", ret);
- goto core_restore_end;
- }
-
- sys_close(fd);
- }
-
- /*
- * Writing to last-pid is CAP_SYS_ADMIN protected, thus restore
- * creds _after_ all threads creation.
- */
-
- restore_creds(&args->creds);
-
- futex_dec_and_wake(&args->task_entries->nr_in_progress);
-
- pr_info("%ld: Restored\n", sys_getpid());
-
- futex_wait_while(&args->task_entries->start, CR_STATE_RESTORE);
-
- sys_sigaction(SIGCHLD, &args->sigchld_act, NULL, sizeof(rt_sigset_t));
-
- futex_dec_and_wake(&args->task_entries->nr_in_progress);
-
- futex_wait_while(&args->task_entries->start, CR_STATE_RESTORE_SIGCHLD);
-
- rst_tcp_socks_all(args->rst_tcp_socks, args->rst_tcp_socks_size);
-
- log_set_fd(-1);
-
- /*
- * The code that prepared the itimers makes shure the
- * code below doesn't fail due to bad timing values.
- */
-
-#define itimer_armed(args, i) \
- (args->itimers[i].it_interval.tv_sec || \
- args->itimers[i].it_interval.tv_usec)
-
- if (itimer_armed(args, 0))
- sys_setitimer(ITIMER_REAL, &args->itimers[0], NULL);
- if (itimer_armed(args, 1))
- sys_setitimer(ITIMER_VIRTUAL, &args->itimers[1], NULL);
- if (itimer_armed(args, 2))
- sys_setitimer(ITIMER_PROF, &args->itimers[2], NULL);
-
- ret = sys_munmap(args->task_entries, TASK_ENTRIES_SIZE);
- if (ret < 0) {
- ret = ((long)__LINE__ << 32) | -ret;
- goto core_restore_failed;
- }
-
- /*
- * Sigframe stack.
- */
- new_sp = (long)rt_sigframe + 8;
-
- /*
- * Prepare the stack and call for sigreturn,
- * pure assembly since we don't need any additional
- * code insns from gcc.
- */
- asm volatile(
- "movq %0, %%rax \n"
- "movq %%rax, %%rsp \n"
- "movl $"__stringify(__NR_rt_sigreturn)", %%eax \n"
- "syscall \n"
- :
- : "r"(new_sp)
- : "rax","rsp","memory");
-
-core_restore_end:
- pr_err("Restorer fail %ld\n", sys_getpid());
- sys_exit_group(1);
- return -1;
-
-core_restore_failed:
- asm volatile(
- "movq %0, %%rsp \n"
- "movq 0, %%rax \n"
- "jmp *%%rax \n"
- :
- : "r"(ret)
- : "memory");
- return ret;
-}
diff --git a/util-net.c b/util-net.c
deleted file mode 100644
index 0feae03..0000000
--- a/util-net.c
+++ /dev/null
@@ -1,151 +0,0 @@
-#include <sys/socket.h>
-#include <sys/un.h>
-
-#include <errno.h>
-
-#include "compiler.h"
-#include "memcpy_64.h"
-#include "types.h"
-#include "syscall.h"
-
-#include "util-net.h"
-
-static void scm_fdset_init_chunk(struct scm_fdset *fdset, int nr_fds)
-{
- struct cmsghdr *cmsg;
-
- fdset->hdr.msg_controllen = CMSG_LEN(sizeof(int) * nr_fds);
-
- cmsg = CMSG_FIRSTHDR(&fdset->hdr);
- cmsg->cmsg_len = fdset->hdr.msg_controllen;
-}
-
-static int *scm_fdset_init(struct scm_fdset *fdset, struct sockaddr_un *saddr,
- int saddr_len, bool with_flags)
-{
- struct cmsghdr *cmsg;
-
- BUILD_BUG_ON(CR_SCM_MAX_FD > SCM_MAX_FD);
- BUILD_BUG_ON(sizeof(fdset->msg_buf) < (CMSG_SPACE(sizeof(int) * CR_SCM_MAX_FD)));
-
- fdset->iov.iov_base = fdset->opts;
- fdset->iov.iov_len = with_flags ? sizeof(fdset->opts) : 1;
-
- fdset->hdr.msg_iov = &fdset->iov;
- fdset->hdr.msg_iovlen = 1;
- fdset->hdr.msg_name = (struct sockaddr *)saddr;
- fdset->hdr.msg_namelen = saddr_len;
-
- fdset->hdr.msg_control = &fdset->msg_buf;
- fdset->hdr.msg_controllen = CMSG_LEN(sizeof(int) * CR_SCM_MAX_FD);
-
- cmsg = CMSG_FIRSTHDR(&fdset->hdr);
- cmsg->cmsg_len = fdset->hdr.msg_controllen;
- cmsg->cmsg_level = SOL_SOCKET;
- cmsg->cmsg_type = SCM_RIGHTS;
-
- return (int *)CMSG_DATA(cmsg);
-}
-
-int send_fds(int sock, struct sockaddr_un *saddr, int len,
- int *fds, int nr_fds, bool with_flags)
-{
- struct scm_fdset fdset;
- int *cmsg_data;
- int i, min_fd, ret;
-
- cmsg_data = scm_fdset_init(&fdset, saddr, len, with_flags);
- for (i = 0; i < nr_fds; i += min_fd) {
- min_fd = min(CR_SCM_MAX_FD, nr_fds - i);
- scm_fdset_init_chunk(&fdset, min_fd);
- builtin_memcpy(cmsg_data, &fds[i], sizeof(int) * min_fd);
-
- if (with_flags) {
- int j;
-
- for (j = 0; j < min_fd; j++) {
- int flags, fd = fds[i + j];
- struct fd_opts *p = fdset.opts + j;
- struct f_owner_ex owner_ex;
- u32 v[2];
-
- flags = sys_fcntl(fd, F_GETFD, 0);
- if (flags < 0)
- return -1;
-
- p->flags = (char)flags;
-
- if (sys_fcntl(fd, F_GETOWN_EX, (long)&owner_ex))
- return -1;
-
- /*
- * Simple case -- nothing is changed.
- */
- if (owner_ex.pid == 0) {
- p->fown.pid = 0;
- continue;
- }
-
- if (sys_fcntl(fd, F_GETOWNER_UIDS, (long)&v))
- return -1;
-
- p->fown.uid = v[0];
- p->fown.euid = v[1];
- p->fown.pid_type = owner_ex.type;
- p->fown.pid = owner_ex.pid;
- }
- }
-
- ret = sys_sendmsg(sock, &fdset.hdr, 0);
- if (ret <= 0)
- return ret ? : -1;
- }
-
- return 0;
-}
-
-int recv_fds(int sock, int *fds, int nr_fds, struct fd_opts *opts)
-{
- struct scm_fdset fdset;
- struct cmsghdr *cmsg;
- int *cmsg_data;
- int ret;
- int i, min_fd;
-
- cmsg_data = scm_fdset_init(&fdset, NULL, 0, opts != NULL);
- for (i = 0; i < nr_fds; i += min_fd) {
- min_fd = min(CR_SCM_MAX_FD, nr_fds - i);
- scm_fdset_init_chunk(&fdset, min_fd);
-
- ret = sys_recvmsg(sock, &fdset.hdr, 0);
- if (ret <= 0)
- return ret ? : -1;
-
- cmsg = CMSG_FIRSTHDR(&fdset.hdr);
- if (!cmsg || cmsg->cmsg_type != SCM_RIGHTS)
- return -EINVAL;
- if (fdset.hdr.msg_flags & MSG_CTRUNC)
- return -ENFILE;
-
- min_fd = (cmsg->cmsg_len - sizeof(struct cmsghdr)) / sizeof(int);
- /*
- * In case if kernel screwed the recepient, most probably
- * the caller stack frame will be overwriten, just scream
- * and exit.
- *
- * FIXME Need to sanitize util.h to be able to include it
- * into files which do not have glibc and a couple of
- * sys_write_ helpers. Meawhile opencoded BUG_ON here.
- */
- if (unlikely(min_fd > CR_SCM_MAX_FD))
- *(volatile unsigned long *)NULL = 0xdead0000 + __LINE__;
- if (unlikely(min_fd <= 0))
- return -1;
- builtin_memcpy(&fds[i], cmsg_data, sizeof(int) * min_fd);
- if (opts)
- builtin_memcpy(opts + i, fdset.opts, sizeof(struct fd_opts) * min_fd);
- }
-
- return 0;
-}
-
More information about the CRIU
mailing list