[Devel] [PATCH SERIES] prctl: Continue adding stuff from vanilla kernel

Cyrill Gorcunov gorcunov at virtuozzo.com
Fri Oct 16 06:03:54 PDT 2015


Here are three additional patches missed from previous merge. I stack
them in a row for easier appliance (see numbers in patchnames).

	Cyrill
-------------- next part --------------
>From 8764b338b37524ab1a78aee527318ebee9762487 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov at openvz.org>
Date: Thu, 9 Oct 2014 15:27:32 -0700
Subject: [PATCH] mm: use may_adjust_brk helper

ML: 8764b338b37524ab1a78aee527318ebee9762487

https://jira.sw.ru/browse/PSBM-39834

Signed-off-by: Cyrill Gorcunov <gorcunov at openvz.org>
Cc: Kees Cook <keescook at chromium.org>
Cc: Tejun Heo <tj at kernel.org>
Cc: Andrew Vagin <avagin at openvz.org>
Cc: Eric W. Biederman <ebiederm at xmission.com>
Cc: H. Peter Anvin <hpa at zytor.com>
Acked-by: Serge Hallyn <serge.hallyn at canonical.com>
Cc: Pavel Emelyanov <xemul at parallels.com>
Cc: Vasiliy Kulikov <segoon at openwall.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu at jp.fujitsu.com>
Cc: Michael Kerrisk <mtk.manpages at gmail.com>
Cc: Julien Tinnes <jln at google.com>
Signed-off-by: Andrew Morton <akpm at linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds at linux-foundation.org>
Signed-off-by: Cyrill Gorcunov <gorcunov at virtuozzo.com>
---
 kernel/sys.c |   11 ++++-------
 mm/mmap.c    |    7 +++----
 2 files changed, 7 insertions(+), 11 deletions(-)

Index: linux-pcs7.git/kernel/sys.c
===================================================================
--- linux-pcs7.git.orig/kernel/sys.c
+++ linux-pcs7.git/kernel/sys.c
@@ -2279,7 +2279,6 @@ out:
 static int prctl_set_mm(int opt, unsigned long addr,
 			unsigned long arg4, unsigned long arg5)
 {
-	unsigned long rlim = rlimit(RLIMIT_DATA);
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma;
 	int error;
@@ -2330,9 +2329,8 @@ static int prctl_set_mm(int opt, unsigne
 		if (addr <= mm->end_data)
 			goto out;
 
-		if (rlim < RLIM_INFINITY &&
-		    (mm->brk - addr) +
-		    (mm->end_data - mm->start_data) > rlim)
+		if (check_data_rlimit(rlimit(RLIMIT_DATA), mm->brk, addr,
+				      mm->end_data, mm->start_data))
 			goto out;
 
 		mm->start_brk = addr;
@@ -2342,9 +2340,8 @@ static int prctl_set_mm(int opt, unsigne
 		if (addr <= mm->end_data)
 			goto out;
 
-		if (rlim < RLIM_INFINITY &&
-		    (addr - mm->start_brk) +
-		    (mm->end_data - mm->start_data) > rlim)
+		if (check_data_rlimit(rlimit(RLIMIT_DATA), addr, mm->start_brk,
+				      mm->end_data, mm->start_data))
 			goto out;
 
 		mm->brk = addr;
Index: linux-pcs7.git/mm/mmap.c
===================================================================
--- linux-pcs7.git.orig/mm/mmap.c
+++ linux-pcs7.git/mm/mmap.c
@@ -272,7 +272,7 @@ static unsigned long do_brk(unsigned lon
 
 SYSCALL_DEFINE1(brk, unsigned long, brk)
 {
-	unsigned long rlim, retval;
+	unsigned long retval;
 	unsigned long newbrk, oldbrk;
 	struct mm_struct *mm = current->mm;
 	unsigned long min_brk;
@@ -302,9 +302,8 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
 	 * segment grow beyond its set limit the in case where the limit is
 	 * not page aligned -Ram Gupta
 	 */
-	rlim = rlimit(RLIMIT_DATA);
-	if (rlim < RLIM_INFINITY && (brk - mm->start_brk) +
-			(mm->end_data - mm->start_data) > rlim)
+	if (check_data_rlimit(rlimit(RLIMIT_DATA), brk, mm->start_brk,
+			      mm->end_data, mm->start_data))
 		goto out;
 
 	newbrk = PAGE_ALIGN(brk);
-------------- next part --------------
>From 4a00e9df293d010acbea118b9521e08cb85016c6 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan at gmail.com>
Date: Thu, 25 Jun 2015 15:00:51 -0700
Subject: [PATCH] prctl: more prctl(PR_SET_MM_*) checks

ML: 4a00e9df293d010acbea118b9521e08cb85016c6

https://jira.sw.ru/browse/PSBM-39834

From: Alexey Dobriyan <adobriyan at gmail.com>

Individual prctl(PR_SET_MM_*) calls do some checking to maintain a
consistent view of mm->arg_start et al fields, but not enough.  In
particular PR_SET_MM_ARG_START/PR_SET_MM_ARG_END/ R_SET_MM_ENV_START/
PR_SET_MM_ENV_END only check that the address lies in an existing VMA,
but don't check that the start address is lower than the end address _at
all_.

Consolidate all consistency checks, so there will be no difference in
the future between PR_SET_MM_MAP and individual PR_SET_MM_* calls.

The program below makes both ARGV and ENVP areas be reversed.  It makes
/proc/$PID/cmdline show garbage (it doesn't oops by luck).

#include <sys/mman.h>
#include <sys/prctl.h>
#include <unistd.h>

enum {PAGE_SIZE=4096};

int main(void)
{
	void *p;

	p = mmap(NULL, PAGE_SIZE, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);

#define PR_SET_MM               35
#define PR_SET_MM_ARG_START     8
#define PR_SET_MM_ARG_END       9
#define PR_SET_MM_ENV_START     10
#define PR_SET_MM_ENV_END       11
	prctl(PR_SET_MM, PR_SET_MM_ARG_START, (unsigned long)p + PAGE_SIZE - 1, 0, 0);
	prctl(PR_SET_MM, PR_SET_MM_ARG_END,   (unsigned long)p, 0, 0);
	prctl(PR_SET_MM, PR_SET_MM_ENV_START, (unsigned long)p + PAGE_SIZE - 1, 0, 0);
	prctl(PR_SET_MM, PR_SET_MM_ENV_END,   (unsigned long)p, 0, 0);

	pause();
	return 0;
}

[akpm at linux-foundation.org: tidy code, tweak comment]
Signed-off-by: Alexey Dobriyan <adobriyan at gmail.com>
Acked-by: Cyrill Gorcunov <gorcunov at openvz.org>
Cc: Jarod Wilson <jarod at redhat.com>
Cc: Jan Stancek <jstancek at redhat.com>
Signed-off-by: Andrew Morton <akpm at linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds at linux-foundation.org>
Signed-off-by: Cyrill Gorcunov <gorcunov at virtuozzo.com>
---
 kernel/sys.c |  158 +++++++++++++++++++++++++++++++++--------------------------
 1 file changed, 91 insertions(+), 67 deletions(-)

Index: linux-pcs7.git/kernel/sys.c
===================================================================
--- linux-pcs7.git.orig/kernel/sys.c
+++ linux-pcs7.git/kernel/sys.c
@@ -2095,7 +2095,6 @@ exit:
 	return err;
 }
 
-#ifdef CONFIG_CHECKPOINT_RESTORE
 /*
  * WARNING: we don't require any capability here so be very careful
  * in what is allowed for modification from userspace.
@@ -2191,6 +2190,7 @@ out:
 	return error;
 }
 
+#ifdef CONFIG_CHECKPOINT_RESTORE
 static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data_size)
 {
 	struct prctl_mm_map prctl_map = { .exe_fd = (u32)-1, };
@@ -2276,10 +2276,41 @@ out:
 }
 #endif /* CONFIG_CHECKPOINT_RESTORE */
 
+static int prctl_set_auxv(struct mm_struct *mm, unsigned long addr,
+			  unsigned long len)
+{
+	/*
+	 * This doesn't move the auxiliary vector itself since it's pinned to
+	 * mm_struct, but it permits filling the vector with new values.  It's
+	 * up to the caller to provide sane values here, otherwise userspace
+	 * tools which use this vector might be unhappy.
+	 */
+	unsigned long user_auxv[AT_VECTOR_SIZE];
+
+	if (len > sizeof(user_auxv))
+		return -EINVAL;
+
+	if (copy_from_user(user_auxv, (const void __user *)addr, len))
+		return -EFAULT;
+
+	/* Make sure the last entry is always AT_NULL */
+	user_auxv[AT_VECTOR_SIZE - 2] = 0;
+	user_auxv[AT_VECTOR_SIZE - 1] = 0;
+
+	BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv));
+
+	task_lock(current);
+	memcpy(mm->saved_auxv, user_auxv, len);
+	task_unlock(current);
+
+	return 0;
+}
+
 static int prctl_set_mm(int opt, unsigned long addr,
 			unsigned long arg4, unsigned long arg5)
 {
 	struct mm_struct *mm = current->mm;
+	struct prctl_mm_map prctl_map;
 	struct vm_area_struct *vma;
 	int error;
 
@@ -2303,6 +2334,9 @@ static int prctl_set_mm(int opt, unsigne
 		return error;
 	}
 
+	if (opt == PR_SET_MM_AUXV)
+		return prctl_set_auxv(mm, addr, arg4);
+
 	if (addr >= TASK_SIZE || addr < mmap_min_addr)
 		return -EINVAL;
 
@@ -2311,42 +2345,64 @@ static int prctl_set_mm(int opt, unsigne
 	down_read(&mm->mmap_sem);
 	vma = find_vma(mm, addr);
 
+	prctl_map.start_code	= mm->start_code;
+	prctl_map.end_code	= mm->end_code;
+	prctl_map.start_data	= mm->start_data;
+	prctl_map.end_data	= mm->end_data;
+	prctl_map.start_brk	= mm->start_brk;
+	prctl_map.brk		= mm->brk;
+	prctl_map.start_stack	= mm->start_stack;
+	prctl_map.arg_start	= mm->arg_start;
+	prctl_map.arg_end	= mm->arg_end;
+	prctl_map.env_start	= mm->env_start;
+	prctl_map.env_end	= mm->env_end;
+	prctl_map.auxv		= NULL;
+	prctl_map.auxv_size	= 0;
+	prctl_map.exe_fd	= -1;
+
 	switch (opt) {
 	case PR_SET_MM_START_CODE:
-		mm->start_code = addr;
+		prctl_map.start_code = addr;
 		break;
 	case PR_SET_MM_END_CODE:
-		mm->end_code = addr;
+		prctl_map.end_code = addr;
 		break;
 	case PR_SET_MM_START_DATA:
-		mm->start_data = addr;
+		prctl_map.start_data = addr;
 		break;
 	case PR_SET_MM_END_DATA:
-		mm->end_data = addr;
+		prctl_map.end_data = addr;
+		break;
+	case PR_SET_MM_START_STACK:
+		prctl_map.start_stack = addr;
 		break;
-
 	case PR_SET_MM_START_BRK:
-		if (addr <= mm->end_data)
-			goto out;
-
-		if (check_data_rlimit(rlimit(RLIMIT_DATA), mm->brk, addr,
-				      mm->end_data, mm->start_data))
-			goto out;
-
-		mm->start_brk = addr;
+		prctl_map.start_brk = addr;
 		break;
-
 	case PR_SET_MM_BRK:
-		if (addr <= mm->end_data)
-			goto out;
-
-		if (check_data_rlimit(rlimit(RLIMIT_DATA), addr, mm->start_brk,
-				      mm->end_data, mm->start_data))
-			goto out;
-
-		mm->brk = addr;
+		prctl_map.brk = addr;
+		break;
+	case PR_SET_MM_ARG_START:
+		prctl_map.arg_start = addr;
+		break;
+	case PR_SET_MM_ARG_END:
+		prctl_map.arg_end = addr;
+		break;
+	case PR_SET_MM_ENV_START:
+		prctl_map.env_start = addr;
 		break;
+	case PR_SET_MM_ENV_END:
+		prctl_map.env_end = addr;
+		break;
+	default:
+		goto out;
+	}
 
+	error = validate_prctl_map(&prctl_map);
+	if (error)
+		goto out;
+
+	switch (opt) {
 	/*
 	 * If command line arguments and environment
 	 * are placed somewhere else on stack, we can
@@ -2363,52 +2419,20 @@ static int prctl_set_mm(int opt, unsigne
 			error = -EFAULT;
 			goto out;
 		}
-		if (opt == PR_SET_MM_START_STACK)
-			mm->start_stack = addr;
-		else if (opt == PR_SET_MM_ARG_START)
-			mm->arg_start = addr;
-		else if (opt == PR_SET_MM_ARG_END)
-			mm->arg_end = addr;
-		else if (opt == PR_SET_MM_ENV_START)
-			mm->env_start = addr;
-		else if (opt == PR_SET_MM_ENV_END)
-			mm->env_end = addr;
-		break;
-
-	/*
-	 * This doesn't move auxiliary vector itself
-	 * since it's pinned to mm_struct, but allow
-	 * to fill vector with new values. It's up
-	 * to a caller to provide sane values here
-	 * otherwise user space tools which use this
-	 * vector might be unhappy.
-	 */
-	case PR_SET_MM_AUXV: {
-		unsigned long user_auxv[AT_VECTOR_SIZE];
-
-		if (arg4 > sizeof(user_auxv))
-			goto out;
-		up_read(&mm->mmap_sem);
-
-		if (copy_from_user(user_auxv, (const void __user *)addr, arg4))
-			return -EFAULT;
-
-		/* Make sure the last entry is always AT_NULL */
-		user_auxv[AT_VECTOR_SIZE - 2] = 0;
-		user_auxv[AT_VECTOR_SIZE - 1] = 0;
-
-		BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv));
-
-		task_lock(current);
-		memcpy(mm->saved_auxv, user_auxv, arg4);
-		task_unlock(current);
-
-		return 0;
-	}
-	default:
-		goto out;
 	}
 
+	mm->start_code	= prctl_map.start_code;
+	mm->end_code	= prctl_map.end_code;
+	mm->start_data	= prctl_map.start_data;
+	mm->end_data	= prctl_map.end_data;
+	mm->start_brk	= prctl_map.start_brk;
+	mm->brk		= prctl_map.brk;
+	mm->start_stack	= prctl_map.start_stack;
+	mm->arg_start	= prctl_map.arg_start;
+	mm->arg_end	= prctl_map.arg_end;
+	mm->env_start	= prctl_map.env_start;
+	mm->env_end	= prctl_map.env_end;
+
 	error = 0;
 out:
 	up_read(&mm->mmap_sem);
-------------- next part --------------
From: Cyrill Gorcunov <gorcunov at virtuozzo.com>
Subject: [PATCH] prctl: avoid using mmap_sem for exe_file serialization

ML: 6e399cd144d8500ffb5d40fa6848890e2580a80a

https://jira.sw.ru/browse/PSBM-39834

From: Davidlohr Bueso <dave at stgolabs.net>

Oleg cleverly suggested using xchg() to set the new mm->exe_file instead
of calling set_mm_exe_file() which requires some form of serialization --
mmap_sem in this case.  For archs that do not have atomic rmw instructions
we still fallback to a spinlock alternative, so this should always be
safe.  As such, we only need the mmap_sem for looking up the backing
vm_file, which can be done sharing the lock.  Naturally, this means we
need to manually deal with both the new and old file reference counting,
and we need not worry about the MMF_EXE_FILE_CHANGED bits, which can
probably be deleted in the future anyway.

Signed-off-by: Davidlohr Bueso <dbueso at suse.de>
Suggested-by: Oleg Nesterov <oleg at redhat.com>
Acked-by: Oleg Nesterov <oleg at redhat.com>
Reviewed-by: Konstantin Khlebnikov <khlebnikov at yandex-team.ru>
Signed-off-by: Andrew Morton <akpm at linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds at linux-foundation.org>
Signed-off-by: Cyrill Gorcunov <gorcunov at virtuozzo.com>
---
 kernel/sys.c |   47 ++++++++++++++++++++++++++++-------------------
 1 file changed, 28 insertions(+), 19 deletions(-)

Index: linux-pcs7.git/kernel/sys.c
===================================================================
--- linux-pcs7.git.orig/kernel/sys.c
+++ linux-pcs7.git/kernel/sys.c
@@ -2036,14 +2036,13 @@ SYSCALL_DEFINE1(umask, int, mask)
 	return mask;
 }
 
-static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd)
+static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
 {
 	struct fd exe;
+	struct file *old_exe, *exe_file;
 	struct inode *inode;
 	int err;
 
-	VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem));
-
 	exe = fdget(fd);
 	if (!exe.file)
 		return -EBADF;
@@ -2067,15 +2066,22 @@ static int prctl_set_mm_exe_file_locked(
 	/*
 	 * Forbid mm->exe_file change if old file still mapped.
 	 */
+	exe_file = get_mm_exe_file(mm);
 	err = -EBUSY;
-	if (mm->exe_file) {
+	if (exe_file) {
 		struct vm_area_struct *vma;
 
-		for (vma = mm->mmap; vma; vma = vma->vm_next)
-			if (vma->vm_file &&
-			    path_equal(&vma->vm_file->f_path,
-				       &mm->exe_file->f_path))
-				goto exit;
+		down_read(&mm->mmap_sem);
+		for (vma = mm->mmap; vma; vma = vma->vm_next) {
+			if (!vma->vm_file)
+				continue;
+			if (path_equal(&vma->vm_file->f_path,
+				       &exe_file->f_path))
+				goto exit_err;
+		}
+
+		up_read(&mm->mmap_sem);
+		fput(exe_file);
 	}
 
 	/*
@@ -2089,10 +2095,18 @@ static int prctl_set_mm_exe_file_locked(
 		goto exit;
 
 	err = 0;
-	set_mm_exe_file(mm, exe.file);	/* this grabs a reference to exe.file */
+	/* set the new file, lockless */
+	get_file(exe.file);
+	old_exe = xchg(&mm->exe_file, exe.file);
+	if (old_exe)
+		fput(old_exe);
 exit:
 	fdput(exe);
 	return err;
+exit_err:
+	up_read(&mm->mmap_sem);
+	fput(exe_file);
+	goto exit;
 }
 
 /*
@@ -2227,10 +2241,9 @@ static int prctl_set_mm_map(int opt, con
 		user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL;
 	}
 
-	down_write(&mm->mmap_sem);
 	if (prctl_map.exe_fd != (u32)-1)
-		error = prctl_set_mm_exe_file_locked(mm, prctl_map.exe_fd);
-	downgrade_write(&mm->mmap_sem);
+		error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd);
+	down_read(&mm->mmap_sem);
 	if (error)
 		goto out;
 
@@ -2327,12 +2340,8 @@ static int prctl_set_mm(int opt, unsigne
 	if (!capable(CAP_SYS_RESOURCE))
 		return -EPERM;
 
-	if (opt == PR_SET_MM_EXE_FILE) {
-		down_write(&mm->mmap_sem);
-		error = prctl_set_mm_exe_file_locked(mm, (unsigned int)addr);
-		up_write(&mm->mmap_sem);
-		return error;
-	}
+	if (opt == PR_SET_MM_EXE_FILE)
+		return prctl_set_mm_exe_file(mm, (unsigned int)addr);
 
 	if (opt == PR_SET_MM_AUXV)
 		return prctl_set_auxv(mm, addr, arg4);


More information about the Devel mailing list