[Libct] [PATCH v2 2/6] VZ containers: implemented kill and wait operations
Alexander Burluka
aburluka at parallels.com
Wed Nov 12 04:52:16 PST 2014
Implementation
---
src/Makefile | 1 +
src/include/readelf.h | 14 ++
src/include/vzsyscalls.h | 37 ++++
src/namespaces.c | 11 +-
src/readelf.c | 67 +++++++
src/vz.c | 473 +++++++++++++++++++++++++++++++++++++++++++++-
6 files changed, 597 insertions(+), 6 deletions(-)
create mode 100644 src/include/readelf.h
create mode 100644 src/include/vzsyscalls.h
create mode 100644 src/readelf.c
diff --git a/src/Makefile b/src/Makefile
index 6ef8016..ee98e2d 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -17,6 +17,7 @@ obj-y += devnodes.o
obj-y += route.o
obj-y += process.o
obj-y += vz.o
+obj-y += readelf.o
cflags-y += -fPIC -Wa,--noexecstack -fno-stack-protector
cflags-so += -rdynamic
diff --git a/src/include/readelf.h b/src/include/readelf.h
new file mode 100644
index 0000000..1e6ff32
--- /dev/null
+++ b/src/include/readelf.h
@@ -0,0 +1,14 @@
+/*
+ * Copyright (c) 1999-2010, Parallels, Inc. All rights reserved.
+ *
+ */
+
+#ifndef _READELF_H_
+#define _READELF_H_
+
+/*
+ * ELF class of a binary as reported by get_arch_from_elf(). The numeric
+ * values equal the class byte (ident[4]) of the ELF header, which
+ * get_arch_from_elf() switches on directly.
+ */
+enum {elf_none = 0,
+ elf_32 = 1,
+ elf_64 = 2};
+/*
+ * Reads the ELF header of the regular file `file` and returns elf_32 or
+ * elf_64 according to its class byte, elf_none for any other class value,
+ * or -1 if the file cannot be stat'ed/opened/read, is not a regular file,
+ * or does not start with the ELF magic.
+ */
+int get_arch_from_elf(const char *file);
+
+#endif
diff --git a/src/include/vzsyscalls.h b/src/include/vzsyscalls.h
new file mode 100644
index 0000000..e954277
--- /dev/null
+++ b/src/include/vzsyscalls.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 1999-2010, Parallels, Inc. All rights reserved.
+ *
+ */
+
+#ifndef _VZSYSCALLS_H_
+#define _VZSYSCALLS_H_
+
+#include <sys/syscall.h>
+
+/*
+ * Per-architecture syscall numbers used by the VZ backend.
+ *
+ * setluid/setublimit are always taken from this table. ioprio_set and setns
+ * are only fallbacks: if <sys/syscall.h> already provides them, the system
+ * definition wins, so a mismatch can never cause a macro redefinition.
+ *
+ * setns is listed for every supported architecture because namespaces.c
+ * calls it through syscall(__NR_setns, ...) unconditionally. sparc has its
+ * own setns number and must not share the i386 one.
+ */
+#if defined(__ia64__)
+#define __NR_setluid 1506
+#define __NR_setublimit 1507
+#define VZ_FALLBACK_NR_ioprio_set 1274
+#define VZ_FALLBACK_NR_setns 1330
+#elif defined(__x86_64__)
+#define __NR_setluid 501
+#define __NR_setublimit 502
+#define VZ_FALLBACK_NR_ioprio_set 251
+#define VZ_FALLBACK_NR_setns 308
+#elif defined(__powerpc__)
+#define __NR_setluid 411
+#define __NR_setublimit 412
+#define VZ_FALLBACK_NR_ioprio_set 273
+#define VZ_FALLBACK_NR_setns 350
+#elif defined(__i386__)
+#define __NR_setluid 511
+#define __NR_setublimit 512
+#define VZ_FALLBACK_NR_ioprio_set 289
+#define VZ_FALLBACK_NR_setns 346
+#elif defined(__sparc__)
+#define __NR_setluid 511
+#define __NR_setublimit 512
+#define VZ_FALLBACK_NR_ioprio_set 196
+#define VZ_FALLBACK_NR_setns 337
+#else
+#error "no syscall for this arch"
+#endif
+
+#ifndef __NR_ioprio_set
+#define __NR_ioprio_set VZ_FALLBACK_NR_ioprio_set
+#endif
+#ifndef __NR_setns
+#define __NR_setns VZ_FALLBACK_NR_setns
+#endif
+
+#endif
diff --git a/src/namespaces.c b/src/namespaces.c
index 035ad99..f4ed112 100644
--- a/src/namespaces.c
+++ b/src/namespaces.c
@@ -4,6 +4,7 @@
#include <unistd.h>
#include "namespaces.h"
+#include "vzsyscalls.h"
struct ns_desc pid_ns = {
.name = "pid",
@@ -39,6 +40,12 @@ struct ns_desc *namespaces[] = {
NULL
};
+/*
+ * setns(2) through the raw syscall, because libc on old systems has no
+ * setns() wrapper.
+ *
+ * The helper has a private name: on a libc that does declare setns()
+ * (NOTE(review): depends on which headers this file pulls in), a static
+ * function called setns would be rejected as "static declaration follows
+ * non-static declaration". The macro below keeps the existing setns(fd, type)
+ * call sites in this file working unchanged.
+ *
+ * Returns 0 on success, -1 with errno set on failure.
+ */
+static int libct_setns(int fd, int nstype)
+{
+	return (int) syscall(__NR_setns, fd, nstype);
+}
+#define setns(fd, nstype) libct_setns((fd), (nstype))
+
+
int switch_ns(int pid, struct ns_desc *nd, int *rst)
{
char buf[32];
@@ -57,9 +64,7 @@ int switch_ns(int pid, struct ns_desc *nd, int *rst)
goto err_rst;
}
-#ifndef VZ
ret = setns(nsfd, nd->cflag);
-#endif
if (ret < 0)
goto err_set;
@@ -77,8 +82,6 @@ err_ns:
void restore_ns(int rst, struct ns_desc *nd)
{
-#ifndef VZ
setns(rst, nd->cflag);
-#endif
close(rst);
}
diff --git a/src/readelf.c b/src/readelf.c
new file mode 100644
index 0000000..3e2e32f
--- /dev/null
+++ b/src/readelf.c
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 1999-2010, Parallels, Inc. All rights reserved.
+ *
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <ctype.h>
+#include <unistd.h>
+
+#include "readelf.h"
+
+#define EI_NIDENT 16
+#define ELFMAG "\177ELF"
+#define OLFMAG "\177OLF"
+
+/*
+ * Leading fields of an ELF file header: exactly the bytes that
+ * get_arch_from_elf() reads from the start of the file.
+ */
+struct elf_hdr_s {
+ uint8_t ident[EI_NIDENT]; /* magic in bytes 0-3, class byte at index 4 */
+ uint16_t type; /* not examined by the code in this file */
+ uint16_t machine; /* not examined by the code in this file */
+};
+
+/*
+ * Test the first four bytes of `buf` against the accepted magics.
+ * Returns 0 when they equal ELFMAG or OLFMAG, -1 otherwise.
+ */
+static inline int check_elf_magic(const uint8_t *buf)
+{
+	int is_elf = memcmp(buf, ELFMAG, 4) == 0;
+	int is_olf = memcmp(buf, OLFMAG, 4) == 0;
+
+	return (is_elf || is_olf) ? 0 : -1;
+}
+
+/*
+ * Determine the ELF class of `file`.
+ *
+ * Returns elf_32 or elf_64 according to the class byte (ident[4]) of the ELF
+ * header, elf_none for any other class value, and -1 when the file cannot be
+ * stat'ed, is not a regular file, cannot be opened, is too short to hold the
+ * header, fails to read, or does not carry the ELF/OLF magic.
+ */
+int get_arch_from_elf(const char *file)
+{
+	int fd;
+	ssize_t nbytes;
+	struct stat st;
+	struct elf_hdr_s elf_hdr;
+
+	if (stat(file, &st))
+		return -1;
+	if (!S_ISREG(st.st_mode))
+		return -1;
+	fd = open(file, O_RDONLY);
+	if (fd < 0)
+		return -1;
+	nbytes = read(fd, &elf_hdr, sizeof(elf_hdr));
+	close(fd);
+	/*
+	 * Keep the comparison signed. Comparing an int against sizeof()
+	 * converts -1 (read error) to a huge unsigned value, which would slip
+	 * past a "less than" check and leave elf_hdr uninitialised.
+	 */
+	if (nbytes != (ssize_t) sizeof(elf_hdr))
+		return -1;
+	if (check_elf_magic(elf_hdr.ident))
+		return -1;
+
+	switch (elf_hdr.ident[4]) {
+	case elf_32:
+		return elf_32;
+	case elf_64:
+		return elf_64;
+	default:
+		return elf_none;
+	}
+}
diff --git a/src/vz.c b/src/vz.c
index 8b56a73..51b15ae 100644
--- a/src/vz.c
+++ b/src/vz.c
@@ -29,11 +29,26 @@
#include "ct.h"
#include "xmalloc.h"
#include "fs.h"
+#include "vzsyscalls.h"
+#include "readelf.h"
#include "cgroups.h"
#include "net.h"
#include "util.h"
+#define MAX_SHTD_TM 120
#define VZCTLDEV "/dev/vzctl"
+#define ENVRETRY 3
+#define STR_SIZE 512
+#define LINUX_REBOOT_MAGIC1 0xfee1dead
+#define LINUX_REBOOT_MAGIC2 672274793
+#define LINUX_REBOOT_CMD_POWER_OFF 0x4321FEDC
+
+/* Ways of stopping a container; consumed by real_env_stop(). */
+typedef enum {
+ M_HALT, /* exec "halt", falling back to "init 0" */
+ M_REBOOT, /* exec "reboot" */
+ M_KILL, /* power off through the reboot syscall */
+ M_KILL_FORCE, /* NOTE(review): no case for this in real_env_stop() - confirm intended use */
+} stop_mode_e;
static int __vzctlfd = -1;
@@ -65,6 +80,43 @@ int get_vzctlfd(void)
return __vzctlfd;
}
+/*
+ * Write the string `val` to the file `var` (a sysctl path such as
+ * /proc/sys/...).
+ *
+ * Returns 0 if the whole string was written, -LCTERR_BADARG if either
+ * argument is NULL, and -1 if the file cannot be opened or the write is
+ * short or fails.
+ */
+static int configure_sysctl(const char *var, const char *val)
+{
+	int sysctl_fd, expected, written;
+
+	if (var == NULL || val == NULL)
+		return -LCTERR_BADARG;
+
+	sysctl_fd = open(var, O_WRONLY);
+	if (sysctl_fd == -1)
+		return -1;
+
+	expected = strlen(val);
+	written = write(sysctl_fd, val, strlen(val));
+	close(sysctl_fd);
+
+	if (written != expected)
+		return -1;
+	return 0;
+}
+
+
+/*
+ * OR `mask` into the calling process's personality.
+ * personality(0xffffffff) is used to fetch the current value first.
+ * Returns 0 on success, -1 if the new personality could not be set.
+ */
+static int set_personality(unsigned long mask)
+{
+	unsigned long wanted = personality(0xffffffff) | mask;
+
+	return personality(wanted) == -1 ? -1 : 0;
+}
+
+
+/*
+ * On x86_64 builds, switch to PER_LINUX32 when /sbin/init (as seen by the
+ * caller) is a 32-bit ELF binary. On other architectures this does nothing.
+ * Returns 0 when nothing had to be done, else set_personality()'s result.
+ */
+static int set_personality32(void)
+{
+	int rc = 0;
+
+#ifdef __x86_64__
+	if (get_arch_from_elf("/sbin/init") == elf_32)
+		rc = set_personality(PER_LINUX32);
+#endif
+	return rc;
+}
+
+
static void vz_ct_destroy(ct_handler_t h)
{
struct container *ct = cth2ct(h);
@@ -78,6 +130,242 @@ static void vz_ct_destroy(ct_handler_t h)
xfree(ct);
}
+/*
+ * Probe whether container `veid` is running, using VZCTL_ENV_CREATE with the
+ * VE_TEST flag. An EBUSY answer is retried up to ENVRETRY more times with a
+ * 50 ms pause before each retry.
+ *
+ * Returns 1 if the ioctl succeeds, 0 if it fails with ESRCH or ENOTTY, and
+ * -1 (after logging) on any other failure.
+ */
+static int env_is_run(unsigned veid)
+{
+	struct vzctl_env_create probe;
+	int rc, attempt;
+
+	memset(&probe, 0, sizeof(probe));
+	probe.veid = veid;
+	probe.flags = VE_TEST;
+
+	for (attempt = 0; ; attempt++) {
+		if (attempt)
+			usleep(50000);
+		rc = ioctl(get_vzctlfd(), VZCTL_ENV_CREATE, &probe);
+		if (rc >= 0 || errno != EBUSY || attempt >= ENVRETRY)
+			break;
+	}
+
+	if (rc >= 0)
+		return 1;
+	if (errno == ESRCH || errno == ENOTTY)
+		return 0;
+
+	pr_perror("unable to get Container state");
+	return -1;
+}
+
+
+/*
+ * Fetch the host-side pids of all processes in container `veid` via
+ * VZCTL_GET_VEPIDS.
+ *
+ * The kernel fills ve.pid with [pid:vpid] pairs and ve.num is the capacity in
+ * pairs (the growth path below allocates size * 2 * sizeof(pid_t) bytes for
+ * ve.num = size). The ioctl's return value is the number of pairs present;
+ * if it exceeds the capacity the buffer is enlarged and the call repeated.
+ *
+ * Returns the number of pids (> 0) and stores a malloc()ed array in *pid,
+ * which the caller must free(). Returns the ioctl's own result (<= 0) or -1
+ * on allocation failure; *pid is left untouched in those cases.
+ */
+static int env_get_pids_ioctl(unsigned veid, pid_t **pid)
+{
+	struct vzlist_vepidctl ve;
+	int i, ret, size;
+	pid_t buf[4096 * 2];
+	pid_t *tmp;
+
+	ve.veid = veid;
+	/*
+	 * Capacity in pairs, not bytes: sizeof(buf) / 2 would advertise four
+	 * times the room the stack buffer has and let the kernel write past
+	 * its end.
+	 */
+	ve.num = sizeof(buf) / (2 * sizeof(buf[0]));
+	ve.pid = buf;
+	while (1) {
+		ret = ioctl(get_vzctlfd(), VZCTL_GET_VEPIDS, &ve);
+		if (ret <= 0) {
+			goto err;
+		} else if (ret <= ve.num)
+			break;
+		/* Leave headroom for processes created before the retry. */
+		size = ret + 20;
+		if (ve.pid == buf)
+			tmp = malloc(size * (2 * sizeof(pid_t)));
+		else
+			tmp = realloc(ve.pid, size * (2 * sizeof(pid_t)));
+		if (tmp == NULL) {
+			ret = -1;
+			goto err;
+		}
+		ve.num = size;
+		ve.pid = tmp;
+	}
+	*pid = malloc(ret * sizeof(pid_t));
+	if (*pid == NULL) {
+		ret = -1;
+		goto err;
+	}
+	/* Copy pid from [pid:vpid] pair */
+	for (i = 0; i < ret; i++)
+		(*pid)[i] = ve.pid[2*i];
+err:
+	if (ve.pid != buf)
+		free(ve.pid);
+	return ret;
+}
+
+
+/*
+ * Set the I/O bandwidth limit of container `veid` via VZCTL_SET_IOLIMIT.
+ * speed is `limit`, burst is three times that, latency is 10*1000.
+ *
+ * Returns 0 on success, -LCTERR_BADARG for a negative limit,
+ * -LCTERR_BADCTSTATE if the kernel reports ESRCH (container not running),
+ * -LCTERR_OPNOTSUPP if it reports ENOTTY (feature missing), and -1 on any
+ * other ioctl failure. The error codes mirror vzctl2_set_iopslimit().
+ */
+static int vzctl2_set_iolimit(unsigned veid, int limit)
+{
+	int ret;
+	struct iolimit_state io;
+
+	if (limit < 0)
+		return -LCTERR_BADARG;
+
+	io.id = veid;
+	io.speed = limit;
+	io.burst = limit * 3;
+	io.latency = 10*1000;
+	pr_info("Set up iolimit: %d", limit);
+	ret = ioctl(get_vzctlfd(), VZCTL_SET_IOLIMIT, &io);
+	if (ret) {
+		if (errno == ESRCH) {
+			pr_err("Container is not running");
+			return -LCTERR_BADCTSTATE;
+		}
+		else if (errno == ENOTTY) {
+			pr_warn("iolimit feature is not supported by the kernel; "
+				"iolimit configuration is skipped");
+			return -LCTERR_OPNOTSUPP;
+		}
+		pr_perror("Unable to set iolimit");
+		return -1;
+	}
+	return 0;
+}
+
+
+/*
+ * SIGKILL every process of container `veid` from the host side, then poll
+ * env_is_run() every 500 ms, MAX_SHTD_TM / 2 times at most, until it
+ * returns 0.
+ *
+ * Returns 0 once env_is_run() returns 0, -1 if the pid list cannot be
+ * obtained or polling runs out.
+ */
+static int env_kill(unsigned veid)
+{
+	pid_t *victims = NULL;
+	int count, idx, tries;
+
+	count = env_get_pids_ioctl(veid, &victims);
+	if (count < 0)
+		return -1;
+
+	/* Kill all Container processes from VE0 */
+	for (idx = 0; idx < count; idx++)
+		kill(victims[idx], SIGKILL);
+	free(victims);
+
+	/* Wait for real Container shutdown */
+	for (tries = MAX_SHTD_TM / 2; tries > 0; tries--) {
+		if (!env_is_run(veid))
+			return 0;
+		usleep(500000);
+	}
+	return -1;
+}
+
+
+/*
+ * Block until child `pid` terminates, restarting waitpid() on EINTR.
+ *
+ * If the child exited normally: with a non-NULL `retcode` the exit status is
+ * stored there and 0 is returned; with a NULL `retcode` the exit status
+ * itself is returned. If the child was killed by a signal the signal is
+ * logged (plus a timeout message when `timeout` is non-zero) and -1 is
+ * returned. A waitpid() failure other than EINTR also yields -1.
+ */
+static int env_wait(int pid, int timeout, int *retcode)
+{
+	int status;
+
+	for (;;) {
+		if (waitpid(pid, &status, 0) != -1)
+			break;
+		if (errno != EINTR) {
+			pr_perror("Error in waitpid(%d)", pid);
+			return -1;
+		}
+	}
+
+	if (WIFEXITED(status)) {
+		int code = WEXITSTATUS(status);
+
+		if (retcode == NULL)
+			return code;
+		*retcode = code;
+		return 0;
+	}
+
+	if (WIFSIGNALED(status)) {
+		pr_info("Got signal %d", WTERMSIG(status));
+		if (timeout)
+			pr_err("Timeout while waiting");
+	}
+	return -1;
+}
+
+
+/*
+ * execve() with a fixed search path.
+ *
+ * A `path` containing '/' is executed as is. Otherwise each directory of the
+ * built-in list is tried in order; the search continues only while execve()
+ * fails with ENOENT.
+ *
+ * Only returns on failure, with -1 and errno set. A candidate path that does
+ * not fit in FILENAME_MAX bytes fails with ENAMETOOLONG instead of
+ * overflowing the buffer.
+ */
+static int execvep(const char *path, char *const argv[], char *const envp[])
+{
+	static const char search_path[] =
+		"/bin:/usr/bin:/sbin:/usr/sbin:/usr/local/bin";
+	const char *dir;
+
+	if (strchr(path, '/'))
+		return execve(path, argv, envp);
+
+	for (dir = search_path; *dir != '\0'; ) {
+		char candidate[FILENAME_MAX];
+		size_t dir_len = strcspn(dir, ":");
+		int n;
+
+		/* An empty list element means "use the bare name". */
+		if (dir_len > 0)
+			n = snprintf(candidate, sizeof(candidate), "%.*s/%s",
+				     (int) dir_len, dir, path);
+		else
+			n = snprintf(candidate, sizeof(candidate), "%s", path);
+		if (n < 0 || (size_t) n >= sizeof(candidate)) {
+			errno = ENAMETOOLONG;
+			return -1;
+		}
+
+		execve(candidate, argv, envp);
+
+		if (errno != ENOENT)
+			return -1;
+
+		dir += dir_len;
+		if (*dir == ':')
+			dir++;
+	}
+	return -1;
+}
+
+
+/*
+ * Enter the container's root directory and reset signal state.
+ *
+ * Changes directory to `root`, chroot()s into it, starts a new session
+ * (failure is only logged), unblocks all signals and restores the default
+ * disposition for every signal.
+ *
+ * NOTE(review): chroot(root) runs after chdir(root), so a relative `root`
+ * would be resolved against the new working directory - callers are
+ * presumably expected to pass an absolute path.
+ *
+ * Returns 0 on success, -1 if `root` is NULL or chdir()/chroot() fails.
+ */
+static int vzctl_chroot(const char *root)
+{
+	int i;
+	sigset_t sigset;
+	struct sigaction act;
+
+	if (root == NULL)
+		return -1;
+
+	if (chdir(root)) {
+		pr_perror("unable to change dir to %s", root);
+		return -1;
+	}
+	if (chroot(root)) {
+		pr_perror("chroot %s failed", root);
+		return -1;
+	}
+	if (setsid() == -1)
+		pr_perror("setsid()");
+
+	sigemptyset(&sigset);
+	sigprocmask(SIG_SETMASK, &sigset, NULL);
+	sigemptyset(&act.sa_mask);
+	act.sa_handler = SIG_DFL;
+	act.sa_flags = 0;
+	/*
+	 * NSIG is one more than the highest signal number, so stop before it.
+	 * Results are deliberately ignored: SIGKILL and SIGSTOP cannot be
+	 * changed and always fail.
+	 */
+	for (i = 1; i < NSIG; ++i)
+		sigaction(i, &act, NULL);
+	return 0;
+}
+
+
+/*
+ * Issue VZCTL_ENV_CREATE for container `veid` with the given `flags`,
+ * retrying an EBUSY answer up to ENVRETRY more times with a 50 ms pause
+ * before each retry.
+ *
+ * On x86_64, after a successful call with VE_ENTER set, the personality is
+ * switched for 32-bit containers through set_personality32().
+ *
+ * Returns the result of the last ioctl() (negative on failure).
+ */
+static int vzctl_env_create_ioctl(unsigned veid, int flags)
+{
+	struct vzctl_env_create req;
+	int rc, attempt;
+
+	memset(&req, 0, sizeof(req));
+	req.veid = veid;
+	req.flags = flags;
+
+	for (attempt = 0; ; attempt++) {
+		if (attempt)
+			usleep(50000);
+		rc = ioctl(get_vzctlfd(), VZCTL_ENV_CREATE, &req);
+		if (rc >= 0 || errno != EBUSY || attempt >= ENVRETRY)
+			break;
+	}
+#ifdef __x86_64__
+	/* Set personality PER_LINUX32 for i386 based VEs */
+	if (rc >= 0 && (flags & VE_ENTER))
+		set_personality32();
+#endif
+	return rc;
+}
+
+
static int vz_spawn_cb(ct_handler_t h, ct_process_desc_t p, int (*cb)(void *), void *arg)
{
pr_err("Spawn with callback is not supported");
@@ -111,6 +399,187 @@ static int vz_set_option(ct_handler_t h, int opt, void *args)
return ret;
}
+/*
+ * .kill operation for VZ containers.
+ *
+ * The container name is parsed as the numeric VE id. If the container has its
+ * own pid namespace (CLONE_NEWPID in nsmask), SIGKILL is sent to its root
+ * pid; otherwise every process of the VE is killed through env_kill().
+ *
+ * Returns 0 on success, -LCTERR_BADARG for a NULL handle (same guard as
+ * vz_ct_wait()), -LCTERR_NOTFOUND if the name is not a number,
+ * -LCTERR_BADCTSTATE if the container is not running, or -1 if the kill
+ * itself fails.
+ */
+static int vz_ct_kill(ct_handler_t h)
+{
+	struct container *ct;
+	unsigned int veid;
+
+	if (!h)
+		return -LCTERR_BADARG;
+
+	ct = cth2ct(h);
+
+	if (parse_uint(ct->name, &veid) == -1)
+		return -LCTERR_NOTFOUND;
+
+	if (ct->state != CT_RUNNING)
+		return -LCTERR_BADCTSTATE;
+	if (ct->nsmask & CLONE_NEWPID)
+		return kill(ct->root_pid, SIGKILL);
+	return env_kill(veid); /* for VZ containers CT_KILLABLE option is ignored */
+}
+
+
+/*
+ * Poll env_is_run() every 500 ms, `timeout` * 2 times at most, until
+ * container `veid` reaches `state`: CT_RUNNING matches a result of 1,
+ * CT_STOPPED a result of 0. Any other `state` never matches.
+ *
+ * Returns 0 when the state is reached, -1 when polling runs out.
+ */
+static int wait_env_state(unsigned int veid, int state, unsigned int timeout)
+{
+	unsigned int polls = timeout * 2;
+	unsigned int n;
+
+	for (n = 0; n < polls; n++) {
+		int running = env_is_run(veid);
+
+		if (state == CT_RUNNING && running == 1)
+			return 0;
+		if (state == CT_STOPPED && running == 0)
+			return 0;
+		usleep(500000);
+	}
+	return -1;
+}
+
+
+/*
+ * Set the IOPS limit of container `veid` via VZCTL_SET_IOPSLIMIT.
+ * speed is `limit`, burst is three times that, latency is 10*1000.
+ *
+ * Returns 0 on success, -LCTERR_BADARG for a negative limit,
+ * -LCTERR_BADCTSTATE on ESRCH, -LCTERR_OPNOTSUPP on ENOTTY, and -1 on any
+ * other ioctl failure.
+ */
+static int vzctl2_set_iopslimit(unsigned veid, int limit)
+{
+	struct iolimit_state req;
+
+	if (limit < 0)
+		return -LCTERR_BADARG;
+
+	req.id = veid;
+	req.speed = limit;
+	req.burst = limit * 3;
+	req.latency = 10*1000;
+	pr_info("Set up iopslimit: %d", limit);
+
+	if (ioctl(get_vzctlfd(), VZCTL_SET_IOPSLIMIT, &req) == 0)
+		return 0;
+
+	switch (errno) {
+	case ESRCH:
+		pr_err("Container is not running");
+		return -LCTERR_BADCTSTATE;
+	case ENOTTY:
+		pr_warn("iopslimit feature is not supported"
+			" by the kernel; iopslimit configuration is skipped");
+		return -LCTERR_OPNOTSUPP;
+	default:
+		pr_perror("Unable to set iopslimit");
+		return -1;
+	}
+}
+
+
+/*
+ * Perform the actual shutdown for `stop_mode`; vz_ct_wait() calls this in a
+ * child after a successful VE_ENTER ioctl.
+ *
+ * stdin/stdout/stderr are pointed at /dev/null (or closed if it cannot be
+ * opened) and fsync is switched off through its sysctl. M_HALT then execs
+ * "halt", falling back to "init 0"; M_REBOOT execs "reboot"; M_KILL issues
+ * the reboot syscall with the power-off command.
+ *
+ * Returns the syscall result for M_KILL. Otherwise it only returns, with -1,
+ * when every exec failed or the mode is not handled.
+ */
+static int real_env_stop(int stop_mode)
+{
+	char *halt_argv[] = {"halt", NULL};
+	char *init_argv[] = {"init", "0", NULL};
+	char *reboot_argv[] = {"reboot", NULL};
+	int null_fd = open("/dev/null", O_RDWR);
+
+	if (null_fd == -1) {
+		close(0);
+		close(1);
+		close(2);
+	} else {
+		dup2(null_fd, 0);
+		dup2(null_fd, 1);
+		dup2(null_fd, 2);
+		close(null_fd);
+	}
+
+	/* Disable fsync. The fsync will be done by umount() */
+	configure_sysctl("/proc/sys/fs/fsync-enable", "0");
+
+	if (stop_mode == M_HALT) {
+		execvep(halt_argv[0], halt_argv, NULL);
+		execvep(init_argv[0], init_argv, NULL);
+	} else if (stop_mode == M_REBOOT) {
+		execvep(reboot_argv[0], reboot_argv, NULL);
+	} else if (stop_mode == M_KILL) {
+		return syscall(__NR_reboot, LINUX_REBOOT_MAGIC1,
+			       LINUX_REBOOT_MAGIC2,
+			       LINUX_REBOOT_CMD_POWER_OFF, NULL);
+	}
+	return -1;
+}
+
+
+/*
+ * .wait operation for VZ containers: stop the container and wait until it
+ * is gone.
+ *
+ * A helper child sets the luid to the VE id, chroots into the container
+ * root, then forks a grandchild that enters the VE and runs a graceful halt.
+ * If the VE is still running after MAX_SHTD_TM seconds, the I/O limits are
+ * lifted and a second grandchild powers the VE off. If the VE still runs
+ * after the helper exits, or the helper could not be forked, all VE
+ * processes are killed from the host through env_kill().
+ *
+ * Returns 0 once the container is stopped, -LCTERR_BADARG for a NULL handle,
+ * -LCTERR_NOTFOUND if the container name is not a numeric VE id,
+ * -LCTERR_BADCTSTATE if it is not running, and -1 if even the forced kill
+ * timed out.
+ */
+static int vz_ct_wait(ct_handler_t h)
+{
+	struct container *ct = NULL;
+	unsigned int veid;
+	int pid, child_pid, ret = 0;
+
+	if (!h)
+		return -LCTERR_BADARG;
+
+	ct = cth2ct(h);
+
+	/*
+	 * The VE id is the container name, as in vz_ct_kill(). Without this
+	 * every call below would address VE (unsigned)-1.
+	 */
+	if (parse_uint(ct->name, &veid) == -1)
+		return -LCTERR_NOTFOUND;
+
+	if (ct->state != CT_RUNNING)
+		return -LCTERR_BADCTSTATE;
+
+	if ((child_pid = fork()) < 0) {
+		pr_perror("Unable to stop Container, fork failed");
+		goto kill_force;
+	} else if (child_pid == 0) {
+		struct sigaction act;
+
+		/* Ignore SIGCHLD so the helper never has to wait for its children. */
+		sigemptyset(&act.sa_mask);
+		act.sa_handler = SIG_IGN;
+		act.sa_flags = SA_NOCLDSTOP;
+		sigaction(SIGCHLD, &act, NULL);
+
+		ret = syscall(__NR_setluid, veid);
+		if (ret)
+			_exit(ret);
+
+		ret = vzctl_chroot(ct->root_path);
+		if (ret)
+			_exit(ret);
+
+		pr_info("Stopping the Container ...");
+		if ((pid = fork()) < 0) {
+			pr_perror("Unable to stop Container, fork failed");
+			_exit(1);
+		} else if (pid == 0) {
+			ret = vzctl_env_create_ioctl(veid, VE_ENTER);
+			if (ret >= 0)
+				ret = real_env_stop(M_HALT);
+			_exit(ret);
+		}
+
+		if (wait_env_state(veid, CT_STOPPED, MAX_SHTD_TM) == 0)
+			_exit(0);
+
+		pr_info("Forcibly stop the Container...");
+		/* Lift the I/O limits (best effort) before the forced stop. */
+		vzctl2_set_iolimit(veid, 0);
+		vzctl2_set_iopslimit(veid, 0);
+
+		if ((pid = fork()) < 0) {
+			pr_perror("Unable to stop Container, fork failed");
+			_exit(1);
+		} else if (pid == 0) {
+			ret = vzctl_env_create_ioctl(veid, VE_ENTER);
+			if (ret >= 0)
+				ret = real_env_stop(M_KILL);
+			_exit(ret);
+		}
+		if (wait_env_state(veid, CT_STOPPED, MAX_SHTD_TM) == 0)
+			_exit(0);
+
+		_exit(1);
+	}
+	/* The helper's exit status is not used; the VE state is re-checked. */
+	env_wait(child_pid, 0, NULL);
+	if (!env_is_run(veid)) {
+		pr_info("Container was stopped");
+		return 0;
+	}
+
+kill_force:
+	pr_info("Forcibly kill the Container...");
+	if (env_kill(veid)) {
+		pr_err("Unable to stop Container: operation timed out");
+		return -1;
+	}
+
+	return 0;
+}
+
+
static int vz_uname(ct_handler_t h, char *host, char *dom)
{
struct container *ct = NULL;
@@ -196,8 +665,8 @@ static const struct container_ops vz_ct_ops = {
.spawn_execve = NULL,
.enter_cb = NULL,
.enter_execve = NULL,
- .kill = NULL,
- .wait = NULL,
+ .kill = vz_ct_kill,
+ .wait = vz_ct_wait,
.destroy = vz_ct_destroy,
.detach = vz_ct_destroy,
.set_nsmask = vz_set_nsmask,
--
1.7.1
More information about the Libct
mailing list