[Devel] [VZ8 PATCH] ve/mmap: protect from unsecure library load from CT image

Pavel Tikhomirov ptikhomirov at virtuozzo.com
Tue Jun 1 11:32:25 MSK 2021



On 31.05.2021 19:55, Valeriy Vdovin wrote:
> Current kernel version wards a priviledged task from running unsecure
> binaries via exec or uselib syscalls. The condition of unsecurity of
> the subject binary is decided via a function 've_trusted_exec', which
> performs a series of sub-checks to see if the binary is located inside
> of a container image. This is considered unsafe and is only allowed in
> case if admin explicitly allows to run executable code from that image
> or the system as a whole. The problem is that dynamic librarie also
> have another way aside from uselib to link the library to a running
> process. Simple userspace linker would simply read out elf sections
> and load all the loadable sections from the file to the process address
> space via mmap syscall. If the secion is of executable type the linker
> will mmap this section with PROC_EXEC flag. This way it is possible
> to load potetialy malicious library into a priviledged task and there
> is no mechanism to stop it.
> This can happen if the priviledged task has entered a mount namespace,
> that belongs to some container and call some function that is not linked
> into the process already. Another possible way to link to a container
> library is to provide LD_LIBRARY_PATH=/vz/root/VEID/lib64 env var to
> a starting process. In both cases any lazy-linked function will trigger
> library loading at the moment the function from that library gets called
> for the first time.
> To stop the code from a container library to get executed in the host-
> level process, let's add ve_trusted_exec to the mmap code under
> condition that mmap has been called in PROT_EXEC non-anonymous mode.
> 
> Signed-off-by: Valeriy Vdovin <valeriy.vdovin at virtuozzo.com>
> ---
>   fs/exec.c          | 44 +++----------------------------
>   include/linux/ve.h |  2 ++
>   kernel/ve/ve.c     | 64 ++++++++++++++++++++++++++++++++++++++++++++++
>   mm/mmap.c          |  5 ++++
>   4 files changed, 74 insertions(+), 41 deletions(-)
> 
> diff --git a/fs/exec.c b/fs/exec.c
> index 61195aee6cfb..1201cd64b366 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -63,6 +63,7 @@
>   #include <linux/compat.h>
>   #include <linux/vmalloc.h>
>   #include <linux/sysctl.h>
> +#include <linux/ve.h>
>   
>   #include <linux/uaccess.h>
>   #include <asm/mmu_context.h>
> @@ -112,45 +113,6 @@ bool path_noexec(const struct path *path)
>   	       (path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC);
>   }
>   
> -/* Send signal only 3 times a day so that coredumps don't overflow the disk */
> -#define SIGSEGV_RATELIMIT_INTERVAL	(24 * 60 * 60 * HZ)
> -#define SIGSEGV_RATELIMIT_BURST		3
> -
> -/*
> - * We don't want a VE0-privileged user intentionally or by mistake
> - * to execute files of container, these files are untrusted.
> - */
> -bool ve_exec_trusted(struct file *file, struct filename *name)
> -{
> -	struct block_device *bdev = file->f_inode->i_sb->s_bdev;
> -	bool exec_from_ct = !ve_is_super(get_exec_env());
> -	bool file_on_ct_mount = !ve_is_super(
> -		real_mount(file->f_path.mnt)->ve_owner);
> -	bool file_on_trusted_disk = true;
> -	static DEFINE_RATELIMIT_STATE(sigsegv_rs, SIGSEGV_RATELIMIT_INTERVAL,
> -						  SIGSEGV_RATELIMIT_BURST);
> -
> -	if (trusted_exec)
> -		return true;
> -
> -	if (bdev && bdev->bd_disk)
> -		file_on_trusted_disk = bdev->bd_disk->vz_trusted_exec;
> -
> -	if (exec_from_ct)
> -		return true;
> -
> -	if (file_on_trusted_disk && !file_on_ct_mount)
> -		return true;
> -
> -	if (!__ratelimit(&sigsegv_rs))
> -		return false;
> -
> -	WARN(1, "VE0's %s tried to execute untrusted file %s from VEX\n",
> -		current->comm, name->name);
> -	force_sigsegv(SIGSEGV, current);
> -	return false;
> -}
> -
>   #ifdef CONFIG_USELIB
>   /*
>    * Note that a shared library must be both readable and executable due to
> @@ -187,7 +149,7 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
>   	if (path_noexec(&file->f_path))
>   		goto exit;
>   
> -	if (!ve_exec_trusted(file, tmp))
> +	if (!ve_check_trusted_exec(file, tmp))
>   		goto exit;
>   
>   	putname(tmp);
> @@ -910,7 +872,7 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags)
>   	if (path_noexec(&file->f_path))
>   		goto exit;
>   
> -	if (!ve_exec_trusted(file, name))
> +	if (!ve_check_trusted_exec(file, name))
>   		goto exit;
>   
>   	err = deny_write_access(file);
> diff --git a/include/linux/ve.h b/include/linux/ve.h
> index 4ca1f122a413..a24202fbb3a3 100644
> --- a/include/linux/ve.h
> +++ b/include/linux/ve.h
> @@ -159,6 +159,8 @@ extern void put_ve(struct ve_struct *ve);
>   
>   void ve_stop_ns(struct pid_namespace *ns);
>   void ve_exit_ns(struct pid_namespace *ns);
> +bool ve_check_trusted_exec(struct file *file, struct filename *name);
> +bool ve_check_trusted_mmap(struct file *file, int prot);
>   
>   static inline struct ve_struct *css_to_ve(struct cgroup_subsys_state *css)
>   {
> diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
> index 5bf187705f2a..39144e9059f0 100644
> --- a/kernel/ve/ve.c
> +++ b/kernel/ve/ve.c
> @@ -1644,6 +1644,70 @@ int vz_security_protocol_check(struct net *net, int protocol)
>   }
>   EXPORT_SYMBOL_GPL(vz_security_protocol_check);
>   
> +static bool ve_check_trusted_file(struct file *file)
> +{
> +	struct block_device *bdev = file->f_inode->i_sb->s_bdev;
> +	bool exec_from_ct = !ve_is_super(get_exec_env());
> +	bool file_on_ct_mount = !ve_is_super(
> +		real_mount(file->f_path.mnt)->ve_owner);
> +	bool file_on_trusted_disk = true;
> +
> +	if (trusted_exec)
> +		return true;
> +
> +	if (bdev && bdev->bd_disk)
> +		file_on_trusted_disk = bdev->bd_disk->vz_trusted_exec;
> +
> +	if (exec_from_ct)
> +		return true;

While on it, can we reorder ifs the way file_on_trusted_disk is set and 
check would be one after another, without exec_from_ct if in between ?

> +
> +	if (file_on_trusted_disk && !file_on_ct_mount)
> +		return true;
> +	return false;
> +}
> +
> +
> +/* Send signal only 3 times a day so that coredumps don't overflow the disk */
> +#define SIGSEGV_RATELIMIT_INTERVAL	(24 * 60 * 60 * HZ)
> +#define SIGSEGV_RATELIMIT_BURST		3
> +
> +bool ve_check_trusted_mmap(struct file *file, int prot)
> +{
> +	static DEFINE_RATELIMIT_STATE(sigsegv_rs,
> +		SIGSEGV_RATELIMIT_INTERVAL, SIGSEGV_RATELIMIT_BURST);
> +
> +	if ((prot & PROT_EXEC) && !ve_check_exec_trusted(file)) {

ve_check_exec_trusted() looks missing. Probably missprint and 
ve_check_trusted_file() was meant?

> +		if (!__ratelimit(&sigsegv_rs))
> +			return false;
> +
> +		WARN(1, "VE0 %s tried to map untrusted code from VEX\n",
> +			current->comm);

Can we at least print name of file file->f_path->dentry->d_name.name (if 
available) ?

> +		force_sigsegv(SIGSEGV, current);
> +		return false;
> +	}
> +	return true;
> +}
> +
> +/*
> + * We don't want a VE0-privileged user intentionally or by mistake
> + * to execute files of container, these files are untrusted.
> + */
> +bool ve_check_trusted_exec(struct file *file, struct filename *name) > +{
> +	static DEFINE_RATELIMIT_STATE(sigsegv_rs, SIGSEGV_RATELIMIT_INTERVAL,
> +						  SIGSEGV_RATELIMIT_BURST);
> +	if (ve_check_trusted_file(file))
> +		return true;
> +
> +	if (!__ratelimit(&sigsegv_rs))
> +		return false;
> +
> +	WARN(1, "VE0's %s tried to execute untrusted file %s from VEX\n",
> +		current->comm, name->name);
> +	force_sigsegv(SIGSEGV, current);
> +	return false;
> +}
> +
>   #ifdef CONFIG_CGROUP_SCHED
>   int cpu_cgroup_proc_stat(struct cgroup_subsys_state *cpu_css,
>   			 struct cgroup_subsys_state *cpuacct_css,
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 2d736d66df95..ad6b1f646460 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -46,6 +46,7 @@
>   #include <linux/pkeys.h>
>   #include <linux/oom.h>
>   #include <linux/sched/mm.h>
> +#include <linux/ve.h>
>   
>   #include <linux/uaccess.h>
>   #include <asm/cacheflush.h>
> @@ -1547,6 +1548,10 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
>   		file = fget(fd);
>   		if (!file)
>   			return -EBADF;
> +
> +		if (ve_check_trusted_mmap(file, prot))
> +			return -EBADF;

Do we need to handle case of mm/nommu.c:ksys_mmap_pgoff() too ?
Probably it would be better if we move the check to vm_mmap_pgoff() ?

> +
>   		if (is_file_hugepages(file))
>   			len = ALIGN(len, huge_page_size(hstate_file(file)));
>   		retval = -EINVAL;
> 

-- 
Best regards, Tikhomirov Pavel
Software Developer, Virtuozzo.


More information about the Devel mailing list