[Devel] [VZ8 PATCH] ve/mmap: protect from unsecure library load from CT image
Pavel Tikhomirov
ptikhomirov at virtuozzo.com
Tue Jun 1 11:32:25 MSK 2021
On 31.05.2021 19:55, Valeriy Vdovin wrote:
> Current kernel version wards a priviledged task from running unsecure
> binaries via exec or uselib syscalls. The condition of unsecurity of
> the subject binary is decided via a function 've_trusted_exec', which
> performs a series of sub-checks to see if the binary is located inside
> of a container image. This is considered unsafe and is only allowed in
> case if admin explicitly allows to run executable code from that image
> or the system as a whole. The problem is that dynamic librarie also
> have another way aside from uselib to link the library to a running
> process. Simple userspace linker would simply read out elf sections
> and load all the loadable sections from the file to the process address
> space via mmap syscall. If the secion is of executable type the linker
> will mmap this section with PROC_EXEC flag. This way it is possible
> to load potetialy malicious library into a priviledged task and there
> is no mechanism to stop it.
> This can happen if the priviledged task has entered a mount namespace,
> that belongs to some container and call some function that is not linked
> into the process already. Another possible way to link to a container
> library is to provide LD_LIBRARY_PATH=/vz/root/VEID/lib64 env var to
> a starting process. In both cases any lazy-linked function will trigger
> library loading at the moment the function from that library gets called
> for the first time.
> To stop the code from a container library to get executed in the host-
> level process, let's add ve_trusted_exec to the mmap code under
> condition that mmap has been called in PROT_EXEC non-anonymous mode.
>
> Signed-off-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
> ---
> fs/exec.c | 44 +++----------------------------
> include/linux/ve.h | 2 ++
> kernel/ve/ve.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++
> mm/mmap.c | 5 ++++
> 4 files changed, 74 insertions(+), 41 deletions(-)
>
> diff --git a/fs/exec.c b/fs/exec.c
> index 61195aee6cfb..1201cd64b366 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -63,6 +63,7 @@
> #include <linux/compat.h>
> #include <linux/vmalloc.h>
> #include <linux/sysctl.h>
> +#include <linux/ve.h>
>
> #include <linux/uaccess.h>
> #include <asm/mmu_context.h>
> @@ -112,45 +113,6 @@ bool path_noexec(const struct path *path)
> (path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC);
> }
>
> -/* Send signal only 3 times a day so that coredumps don't overflow the disk */
> -#define SIGSEGV_RATELIMIT_INTERVAL (24 * 60 * 60 * HZ)
> -#define SIGSEGV_RATELIMIT_BURST 3
> -
> -/*
> - * We don't want a VE0-privileged user intentionally or by mistake
> - * to execute files of container, these files are untrusted.
> - */
> -bool ve_exec_trusted(struct file *file, struct filename *name)
> -{
> - struct block_device *bdev = file->f_inode->i_sb->s_bdev;
> - bool exec_from_ct = !ve_is_super(get_exec_env());
> - bool file_on_ct_mount = !ve_is_super(
> - real_mount(file->f_path.mnt)->ve_owner);
> - bool file_on_trusted_disk = true;
> - static DEFINE_RATELIMIT_STATE(sigsegv_rs, SIGSEGV_RATELIMIT_INTERVAL,
> - SIGSEGV_RATELIMIT_BURST);
> -
> - if (trusted_exec)
> - return true;
> -
> - if (bdev && bdev->bd_disk)
> - file_on_trusted_disk = bdev->bd_disk->vz_trusted_exec;
> -
> - if (exec_from_ct)
> - return true;
> -
> - if (file_on_trusted_disk && !file_on_ct_mount)
> - return true;
> -
> - if (!__ratelimit(&sigsegv_rs))
> - return false;
> -
> - WARN(1, "VE0's %s tried to execute untrusted file %s from VEX\n",
> - current->comm, name->name);
> - force_sigsegv(SIGSEGV, current);
> - return false;
> -}
> -
> #ifdef CONFIG_USELIB
> /*
> * Note that a shared library must be both readable and executable due to
> @@ -187,7 +149,7 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
> if (path_noexec(&file->f_path))
> goto exit;
>
> - if (!ve_exec_trusted(file, tmp))
> + if (!ve_check_trusted_exec(file, tmp))
> goto exit;
>
> putname(tmp);
> @@ -910,7 +872,7 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags)
> if (path_noexec(&file->f_path))
> goto exit;
>
> - if (!ve_exec_trusted(file, name))
> + if (!ve_check_trusted_exec(file, name))
> goto exit;
>
> err = deny_write_access(file);
> diff --git a/include/linux/ve.h b/include/linux/ve.h
> index 4ca1f122a413..a24202fbb3a3 100644
> --- a/include/linux/ve.h
> +++ b/include/linux/ve.h
> @@ -159,6 +159,8 @@ extern void put_ve(struct ve_struct *ve);
>
> void ve_stop_ns(struct pid_namespace *ns);
> void ve_exit_ns(struct pid_namespace *ns);
> +bool ve_check_trusted_exec(struct file *file, struct filename *name);
> +bool ve_check_trusted_mmap(struct file *file, int prot);
>
> static inline struct ve_struct *css_to_ve(struct cgroup_subsys_state *css)
> {
> diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
> index 5bf187705f2a..39144e9059f0 100644
> --- a/kernel/ve/ve.c
> +++ b/kernel/ve/ve.c
> @@ -1644,6 +1644,70 @@ int vz_security_protocol_check(struct net *net, int protocol)
> }
> EXPORT_SYMBOL_GPL(vz_security_protocol_check);
>
> +static bool ve_check_trusted_file(struct file *file)
> +{
> + struct block_device *bdev = file->f_inode->i_sb->s_bdev;
> + bool exec_from_ct = !ve_is_super(get_exec_env());
> + bool file_on_ct_mount = !ve_is_super(
> + real_mount(file->f_path.mnt)->ve_owner);
> + bool file_on_trusted_disk = true;
> +
> + if (trusted_exec)
> + return true;
> +
> + if (bdev && bdev->bd_disk)
> + file_on_trusted_disk = bdev->bd_disk->vz_trusted_exec;
> +
> + if (exec_from_ct)
> + return true;
While on it, can we reorder ifs the way file_on_trusted_disk is set and
check would be one after another, without exec_from_ct if in between ?
> +
> + if (file_on_trusted_disk && !file_on_ct_mount)
> + return true;
> + return false;
> +}
> +
> +
> +/* Send signal only 3 times a day so that coredumps don't overflow the disk */
> +#define SIGSEGV_RATELIMIT_INTERVAL (24 * 60 * 60 * HZ)
> +#define SIGSEGV_RATELIMIT_BURST 3
> +
> +bool ve_check_trusted_mmap(struct file *file, int prot)
> +{
> + static DEFINE_RATELIMIT_STATE(sigsegv_rs,
> + SIGSEGV_RATELIMIT_INTERVAL, SIGSEGV_RATELIMIT_BURST);
> +
> + if ((prot & PROT_EXEC) && !ve_check_exec_trusted(file)) {
ve_check_exec_trusted() looks missing. Probably missprint and
ve_check_trusted_file() was meant?
> + if (!__ratelimit(&sigsegv_rs))
> + return false;
> +
> + WARN(1, "VE0 %s tried to map untrusted code from VEX\n",
> + current->comm);
Can we at least print name of file file->f_path->dentry->d_name.name (if
available) ?
> + force_sigsegv(SIGSEGV, current);
> + return false;
> + }
> + return true;
> +}
> +
> +/*
> + * We don't want a VE0-privileged user intentionally or by mistake
> + * to execute files of container, these files are untrusted.
> + */
> +bool ve_check_trusted_exec(struct file *file, struct filename *name) > +{
> + static DEFINE_RATELIMIT_STATE(sigsegv_rs, SIGSEGV_RATELIMIT_INTERVAL,
> + SIGSEGV_RATELIMIT_BURST);
> + if (ve_check_trusted_file(file))
> + return true;
> +
> + if (!__ratelimit(&sigsegv_rs))
> + return false;
> +
> + WARN(1, "VE0's %s tried to execute untrusted file %s from VEX\n",
> + current->comm, name->name);
> + force_sigsegv(SIGSEGV, current);
> + return false;
> +}
> +
> #ifdef CONFIG_CGROUP_SCHED
> int cpu_cgroup_proc_stat(struct cgroup_subsys_state *cpu_css,
> struct cgroup_subsys_state *cpuacct_css,
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 2d736d66df95..ad6b1f646460 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -46,6 +46,7 @@
> #include <linux/pkeys.h>
> #include <linux/oom.h>
> #include <linux/sched/mm.h>
> +#include <linux/ve.h>
>
> #include <linux/uaccess.h>
> #include <asm/cacheflush.h>
> @@ -1547,6 +1548,10 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
> file = fget(fd);
> if (!file)
> return -EBADF;
> +
> + if (ve_check_trusted_mmap(file, prot))
> + return -EBADF;
Do we need to handle case of mm/nommu.c:ksys_mmap_pgoff() too ?
Probably it would be better if we move the check to vm_mmap_pgoff() ?
> +
> if (is_file_hugepages(file))
> len = ALIGN(len, huge_page_size(hstate_file(file)));
> retval = -EINVAL;
>
--
Best regards, Tikhomirov Pavel
Software Developer, Virtuozzo.
More information about the Devel
mailing list