[Devel] [PATCH RHEL8 COMMIT] ve/pid: Export kernel.pid_max via ve cgroup

Konstantin Khorenko khorenko at virtuozzo.com
Wed Jun 23 17:32:25 MSK 2021


The commit is pushed to "branch-rh8-4.18.0-240.1.1.vz8.5.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-240.1.1.vz8.5.48
------>
commit 61dae3cc69b5d574f4adfdaa19f335e15dab85b4
Author: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
Date:   Wed Jun 23 17:32:25 2021 +0300

    ve/pid: Export kernel.pid_max via ve cgroup
    
    This member represents the kernel.pid_max sysctl; it is vz-specific but
    lives in the pid namespace. To be able to c/r from the libvzctl script it
    is better to put pid_max in the ve cgroup; this way we do not need to enter
    the container's root pid namespace to get/set this sysctl.
    
    Note: we need to be able to set pid_max on running Container,
    as we can't set pid_max before we have ve's pidns.
    
    https://jira.sw.ru/browse/PSBM-48397
    
    Signed-off-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
    
    Acked-by: Cyrill Gorcunov <gorcunov at openvz.org>
    
    Cherry-picked from vz7 commit be980b3141ca ("ve/pid: Export
    kernel.pid_max via ve cgroup")
    
    v2 changes:
    * vz8 note: read and write handlers do not need to get ve->op_sem,
      ve->ve_ns is rcu protected, so rcu_read_(un)lock() is enough.
    
      See ve_drop_context():
            rcu_assign_pointer(ve->ve_ns, NULL);
            synchronize_rcu();
            put_nsproxy(ve_ns);
    
    * Also, the check for ve->is_running is redundant and has been removed.
      Regardless of the ve->is_running value (even if it's already 0 and the
      CT is being stopped), if we dereference ve->ve_ns under rcu and get
      !NULL, we are safe to write the pid_max value.
    
    https://jira.sw.ru/browse/PSBM-102629
    
    Signed-off-by: Konstantin Khorenko <khorenko at virtuozzo.com>
    Reviewed-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>
---
 kernel/ve/ve.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index 954aa8127d99..9667f9051c02 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -1337,6 +1337,50 @@ enum {
 	VE_CF_CLOCK_BOOTBASED,
 };
 
+/* Report pid_max of the VE's pid namespace; 0 if the VE has none. */
+static u64 ve_pid_max_read_u64(struct cgroup_subsys_state *css,
+			       struct cftype *cft)
+{
+	struct ve_struct *ve = css_to_ve(css);
+	struct nsproxy *nsp;
+	u64 limit;
+
+	rcu_read_lock();	/* ve->ve_ns is only dereferenced under RCU */
+	nsp = rcu_dereference(ve->ve_ns);
+	limit = (nsp && nsp->pid_ns_for_children) ?
+		nsp->pid_ns_for_children->pid_max : 0;
+	rcu_read_unlock();
+
+	return limit;
+}
+
+extern int pid_max_min, pid_max_max;	/* bounds enforced on pid_max writes */
+
+/* Set pid_max of the VE's pid namespace; returns 0 or a negative errno. */
+static int ve_pid_max_write_running_u64(struct cgroup_subsys_state *css,
+					struct cftype *cft, u64 val)
+{
+	struct ve_struct *ve = css_to_ve(css);
+	struct nsproxy *ve_ns;
+	int err = 0;
+
+	/* -EPERM unless the caller's VE is super or the target is pseudosuper */
+	if (!ve_is_super(get_exec_env()) && !ve->is_pseudosuper)
+		return -EPERM;
+
+	rcu_read_lock();
+	ve_ns = rcu_dereference(ve->ve_ns);
+	if (!ve_ns || !ve_ns->pid_ns_for_children)
+		err = -EBUSY;	/* VE has no pid namespace to update */
+	else if (pid_max_min > val || pid_max_max < val)
+		err = -EINVAL;	/* outside [pid_max_min, pid_max_max] */
+	else	/* write via ve_ns: ve->ve_ns may be NULLed concurrently */
+		ve_ns->pid_ns_for_children->pid_max = val;
+	rcu_read_unlock();	/* single exit: every path drops the read lock */
+
+	return err;
+}
+
 static int ve_ts_read(struct seq_file *sf, void *v)
 {
 	struct ve_struct *ve = css_to_ve(seq_css(sf));
@@ -1735,6 +1779,12 @@ static struct cftype ve_cftypes[] = {
 		.write			= ve_ts_write,
 		.private		= VE_CF_CLOCK_BOOTBASED,
 	},
+	{
+		.name			= "pid_max",
+		.flags			= CFTYPE_NOT_ON_ROOT,
+		.read_u64		= ve_pid_max_read_u64,
+		.write_u64		= ve_pid_max_write_running_u64,
+	},
 	{
 		.name			= "netns_max_nr",
 		.flags			= CFTYPE_NOT_ON_ROOT,


More information about the Devel mailing list