[CRIU] [PATCH] criu: dump and restore cpu affinity of each thread
Alexander Mihalicyn
alexander at mihalicyn.com
Thu Nov 26 12:45:41 MSK 2020
Hi!
Thanks for your contribution to the CRIU project!
Could you please post your patchset as a pull request on GitHub:
https://github.com/checkpoint-restore/criu/pulls ?
Regards,
Alex
On Thu, Nov 26, 2020 at 12:32 PM Sang Yan <sangyan at huawei.com> wrote:
>
> CRIU should dump and restore the CPU affinity of each
> thread and process.
>
> Add a thread_cpuallow_entry field to thread_core_entry
> to save the CPU affinity info.
>
> Restore it after the threads are restored but before they resume running.
>
> Signed-off-by: Sang Yan <sangyan at huawei.com>
> ---
> compel/arch/arm/plugins/std/syscalls/syscall.def | 1 +
> .../ppc64/plugins/std/syscalls/syscall-ppc64.tbl | 1 +
> .../s390/plugins/std/syscalls/syscall-s390.tbl | 1 +
> .../arch/x86/plugins/std/syscalls/syscall_32.tbl | 1 +
> .../arch/x86/plugins/std/syscalls/syscall_64.tbl | 1 +
> criu/cr-dump.c | 14 +++++++++++
> criu/cr-restore.c | 22 ++++++++++++++++
> criu/include/restorer.h | 3 +++
> criu/pie/restorer.c | 29 ++++++++++++++++++++++
> criu/pstree.c | 7 ++++++
> images/core.proto | 5 ++++
> 11 files changed, 85 insertions(+)
>
> diff --git a/compel/arch/arm/plugins/std/syscalls/syscall.def b/compel/arch/arm/plugins/std/syscalls/syscall.def
> index f7ebc85..1c70388 100644
> --- a/compel/arch/arm/plugins/std/syscalls/syscall.def
> +++ b/compel/arch/arm/plugins/std/syscalls/syscall.def
> @@ -116,3 +116,4 @@ fsopen 430 430 (char *fsname, unsigned int flags)
> fsconfig 431 431 (int fd, unsigned int cmd, const char *key, const char *value, int aux)
> fsmount 432 432 (int fd, unsigned int flags, unsigned int attr_flags)
> clone3 435 435 (struct clone_args *uargs, size_t size)
> +sched_setaffinity 122 241 (int fd, size_t cpusetsize, const cpu_set_t *mask)
> diff --git a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl
> index 1afaf1e..460daf8 100644
> --- a/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl
> +++ b/compel/arch/ppc64/plugins/std/syscalls/syscall-ppc64.tbl
> @@ -112,3 +112,4 @@ __NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags)
> __NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux)
> __NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags)
> __NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size)
> +__NR_sched_setaffinity 222 sys_sched_setaffinity (int fd, size_t cpusetsize, const cpu_set_t *mask)
> diff --git a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl
> index ae6fdb5..c0bba39 100644
> --- a/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl
> +++ b/compel/arch/s390/plugins/std/syscalls/syscall-s390.tbl
> @@ -112,3 +112,4 @@ __NR_fsopen 430 sys_fsopen (char *fsname, unsigned int flags)
> __NR_fsconfig 431 sys_fsconfig (int fd, unsigned int cmd, const char *key, const char *value, int aux)
> __NR_fsmount 432 sys_fsmount (int fd, unsigned int flags, unsigned int attr_flags)
> __NR_clone3 435 sys_clone3 (struct clone_args *uargs, size_t size)
> +__NR_sched_setaffinity 239 sys_sched_setaffinity (int fd, size_t cpusetsize, const cpu_set_t *mask)
> diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl b/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl
> index 7a48711..29c13e3 100644
> --- a/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl
> +++ b/compel/arch/x86/plugins/std/syscalls/syscall_32.tbl
> @@ -63,6 +63,7 @@ __NR_mincore 218 sys_mincore (void *addr, unsigned long size, unsigned char *
> __NR_madvise 219 sys_madvise (unsigned long start, size_t len, int behavior)
> __NR_gettid 224 sys_gettid (void)
> __NR_futex 240 sys_futex (uint32_t *uaddr, int op, uint32_t val, struct timespec *utime, uint32_t *uaddr2, uint32_t val3)
> +__NR_sched_setaffinity 241 sys_sched_setaffinity (int fd, size_t cpusetsize, const cpu_set_t *mask)
> __NR_set_thread_area 243 sys_set_thread_area (user_desc_t *info)
> __NR_get_thread_area 244 sys_get_thread_area (user_desc_t *info)
> __NR_io_setup 245 sys_io_setup (unsigned nr_reqs, aio_context_t *ctx32p)
> diff --git a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl
> index 6667c07..74f5482 100644
> --- a/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl
> +++ b/compel/arch/x86/plugins/std/syscalls/syscall_64.tbl
> @@ -73,6 +73,7 @@ __NR_mount 165 sys_mount (char *dev_nmae, char *dir_name, char *type, unsign
> __NR_umount2 166 sys_umount2 (char *name, int flags)
> __NR_gettid 186 sys_gettid (void)
> __NR_futex 202 sys_futex (uint32_t *uaddr, int op, uint32_t val, struct timespec *utime, uint32_t *uaddr2, uint32_t val3)
> +__NR_sched_setaffinity 203 sys_sched_setaffinity (int fd, size_t cpusetsize, const cpu_set_t *mask)
> __NR_set_thread_area 205 sys_set_thread_area (user_desc_t *info)
> __NR_io_setup 206 sys_io_setup (unsigned nr_events, aio_context_t *ctx)
> __NR_io_getevents 208 sys_io_getevents (aio_context_t ctx, long min_nr, long nr, struct io_event *evs, struct timespec *tmo)
> diff --git a/criu/cr-dump.c b/criu/cr-dump.c
> index 193a49c..6ffd526 100644
> --- a/criu/cr-dump.c
> +++ b/criu/cr-dump.c
> @@ -140,6 +140,7 @@ static int dump_sched_info(int pid, ThreadCoreEntry *tc)
> {
> int ret;
> struct sched_param sp;
> + cpu_set_t cpumask;
>
> BUILD_BUG_ON(SCHED_OTHER != 0); /* default in proto message */
>
> @@ -185,6 +186,19 @@ static int dump_sched_info(int pid, ThreadCoreEntry *tc)
> tc->has_sched_nice = true;
> tc->sched_nice = ret;
>
> + pr_info("\tdumping cpu_allowed for %d\n", pid);
> + ret = syscall(__NR_sched_getaffinity, pid, sizeof(cpumask), &cpumask);
> + if (ret < 0) {
> + pr_perror("Can't get sched affinity for %d", pid);
> + return -1;
> + }
> + memcpy(tc->cpu_allowed->cpumask, &cpumask, __CPU_SETSIZE);
> + pr_info("\t 0x%lx, 0x%lx, 0x%lx, 0x%lx\n",
> + tc->cpu_allowed->cpumask[3],
> + tc->cpu_allowed->cpumask[2],
> + tc->cpu_allowed->cpumask[1],
> + tc->cpu_allowed->cpumask[0]);
> +
> return 0;
> }
>
> diff --git a/criu/cr-restore.c b/criu/cr-restore.c
> index 8af2e29..375eb54 100644
> --- a/criu/cr-restore.c
> +++ b/criu/cr-restore.c
> @@ -118,6 +118,7 @@ static int prepare_restorer_blob(void);
> static int prepare_rlimits(int pid, struct task_restore_args *, CoreEntry *core);
> static int prepare_posix_timers(int pid, struct task_restore_args *ta, CoreEntry *core);
> static int prepare_signals(int pid, struct task_restore_args *, CoreEntry *core);
> +static int prepare_alloweds(int pid, struct task_restore_args *ta, CoreEntry *leader_core);
>
> /*
> * Architectures can overwrite this function to restore registers that are not
> @@ -922,6 +923,9 @@ static int restore_one_alive_task(int pid, CoreEntry *core)
> if (prepare_signals(pid, ta, core))
> return -1;
>
> + if (prepare_alloweds(pid, ta, core))
> + return -1;
> +
> if (prepare_posix_timers(pid, ta, core))
> return -1;
>
> @@ -3225,6 +3229,23 @@ out:
> return ret;
> }
>
> +static int prepare_alloweds(int pid, struct task_restore_args *ta, CoreEntry *leader_core)
> +{
> + int i;
> + cpu_set_t *cpumaks;
> +
> + ta->cpualloweds = (cpu_set_t *)rst_mem_align_cpos(RM_PRIVATE);
> +
> + for (i = 0; i < current->nr_threads; i++) {
> + cpumaks = rst_mem_alloc(sizeof(cpu_set_t), RM_PRIVATE);
> + if (!cpumaks)
> + return -1;
> +
> + memcpy(cpumaks, current->core[i]->thread_core->cpu_allowed->cpumask, sizeof(cpu_set_t));
> + }
> + return 0;
> +}
> +
> extern void __gcov_flush(void) __attribute__((weak));
> void __gcov_flush(void) {}
>
> @@ -3684,6 +3705,7 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
> RST_MEM_FIXUP_PPTR(task_args->timerfd);
> RST_MEM_FIXUP_PPTR(task_args->posix_timers);
> RST_MEM_FIXUP_PPTR(task_args->siginfo);
> + RST_MEM_FIXUP_PPTR(task_args->cpualloweds);
> RST_MEM_FIXUP_PPTR(task_args->rlims);
> RST_MEM_FIXUP_PPTR(task_args->helpers);
> RST_MEM_FIXUP_PPTR(task_args->zombies);
> diff --git a/criu/include/restorer.h b/criu/include/restorer.h
> index dfb4e6b..67df9f5 100644
> --- a/criu/include/restorer.h
> +++ b/criu/include/restorer.h
> @@ -1,6 +1,7 @@
> #ifndef __CR_RESTORER_H__
> #define __CR_RESTORER_H__
>
> +#include <sched.h>
> #include <signal.h>
> #include <limits.h>
> #include <sys/resource.h>
> @@ -162,6 +163,8 @@ struct task_restore_args {
> siginfo_t *siginfo;
> unsigned int siginfo_n;
>
> + cpu_set_t *cpualloweds;
> +
> struct rst_tcp_sock *tcp_socks;
> unsigned int tcp_socks_n;
>
> diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
> index b3d7e2b..833b6bb 100644
> --- a/criu/pie/restorer.c
> +++ b/criu/pie/restorer.c
> @@ -432,6 +432,31 @@ static int restore_signals(siginfo_t *ptr, int nr, bool group)
> return 0;
> }
>
> +static int restore_cpuallowed(struct task_restore_args *args)
> +{
> + int i;
> + int pid;
> + int ret;
> + cpu_set_t *cpumask;
> +
> + for (i = 0; i < args->nr_threads; i++) {
> + pid = args->thread_args[i].pid;
> + cpumask = &args->cpualloweds[i];
> + pr_info("Restoring %d cpu_allowed %lx, %lx, %lx, %lx\n", pid,
> + cpumask->__bits[3],
> + cpumask->__bits[2],
> + cpumask->__bits[1],
> + cpumask->__bits[0]);
> + ret = sys_sched_setaffinity(pid, sizeof(cpu_set_t), cpumask);
> + if (ret) {
> + pr_err("\t Restore %d cpumask failed.\n", pid);
> + return ret;
> + }
> + }
> +
> + return 0;
> +}
> +
> static int restore_seccomp_filter(pid_t tid, struct thread_restore_args *args)
> {
> unsigned int flags = args->seccomp_force_tsync ? SECCOMP_FILTER_FLAG_TSYNC : 0;
> @@ -1900,6 +1925,10 @@ long __export_restore_task(struct task_restore_args *args)
> if (ret)
> goto core_restore_end;
>
> + ret = restore_cpuallowed(args);
> + if (ret)
> + goto core_restore_end;
> +
> restore_finish_stage(task_entries_local, CR_STATE_RESTORE_SIGCHLD);
>
> rst_tcp_socks_all(args);
> diff --git a/criu/pstree.c b/criu/pstree.c
> index f1513dc..d338377 100644
> --- a/criu/pstree.c
> +++ b/criu/pstree.c
> @@ -58,11 +58,13 @@ CoreEntry *core_entry_alloc(int th, int tsk)
> CredsEntry *ce = NULL;
>
> sz += sizeof(ThreadCoreEntry) + sizeof(ThreadSasEntry) + sizeof(CredsEntry);
> + sz += sizeof(ThreadCpuallowEntry);
>
> sz += CR_CAP_SIZE * sizeof(ce->cap_inh[0]);
> sz += CR_CAP_SIZE * sizeof(ce->cap_prm[0]);
> sz += CR_CAP_SIZE * sizeof(ce->cap_eff[0]);
> sz += CR_CAP_SIZE * sizeof(ce->cap_bnd[0]);
> + sz += __CPU_SETSIZE;
> /*
> * @groups are dynamic and allocated
> * on demand.
> @@ -127,6 +129,11 @@ CoreEntry *core_entry_alloc(int th, int tsk)
> ce->cap_eff = xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_eff[0]));
> ce->cap_bnd = xptr_pull_s(&m, CR_CAP_SIZE * sizeof(ce->cap_bnd[0]));
>
> + core->thread_core->cpu_allowed = xptr_pull(&m, ThreadCpuallowEntry);
> + thread_cpuallow_entry__init(core->thread_core->cpu_allowed);
> + core->thread_core->cpu_allowed->n_cpumask = __CPU_SETSIZE / sizeof(uint64_t);
> + core->thread_core->cpu_allowed->cpumask = xptr_pull_s(&m, __CPU_SETSIZE);
> +
> if (arch_alloc_thread_info(core)) {
> xfree(core);
> core = NULL;
> diff --git a/images/core.proto b/images/core.proto
> index 9e9e393..d9788fd 100644
> --- a/images/core.proto
> +++ b/images/core.proto
> @@ -81,6 +81,10 @@ message thread_sas_entry {
> required uint32 ss_flags = 3;
> }
>
> +message thread_cpuallow_entry {
> + repeated uint64 cpumask = 1;
> +}
> +
> message thread_core_entry {
> required uint64 futex_rla = 1;
> required uint32 futex_rla_len = 2;
> @@ -99,6 +103,7 @@ message thread_core_entry {
>
> optional string comm = 13;
> optional uint64 blk_sigset_extended = 14;
> + required thread_cpuallow_entry cpu_allowed = 15;
> }
>
> message task_rlimits_entry {
> --
> 2.9.5
>
> _______________________________________________
> CRIU mailing list
> CRIU at openvz.org
> https://lists.openvz.org/mailman/listinfo/criu
More information about the CRIU mailing list