[Devel] [PATCH RHEL8 COMMIT] arch/x86: introduce cpuid override

Konstantin Khorenko khorenko at virtuozzo.com
Tue Nov 3 16:32:58 MSK 2020


The commit is pushed to "branch-rh8-4.18.0-193.6.3.vz8.4.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-193.6.3.vz8.4.15
------>
commit c4eda1a1f99dd7a00df91dd1975874df4b64266a
Author: Vladimir Davydov <vdavydov.dev at gmail.com>
Date:   Tue Nov 3 16:32:58 2020 +0300

    arch/x86: introduce cpuid override
    
    Port diff-arch-x86-introduce-cpuid-override
    
    Recent Intel CPUs dropped CPUID masking, which is required for flex
    migration, in favor of CPUID faulting. So we need to support it in
    the kernel.
    
    This patch adds user writable file /proc/vz/cpuid_override, which
    contains CPUID override table. Each table entry must have the following
    format:
    
      op[ count]: eax ebx ecx edx
    
    where @op and optional @count define a CPUID function, whose output one
    would like to override (@op and @count are loaded to EAX and ECX
    registers respectively before calling CPUID); @eax, @ebx, @ecx, @edx -
    the desired CPUID output for the specified function. All values must be
    in HEX, 0x prefix is optional.
    
    Notes:
    
     - the file is only present on hosts that support CPUID faulting;
     - CPUID faulting is always enabled if it is supported;
     - CPUID output is overridden on all present CPUs;
     - the maximal number of entries one can override equals 16;
     - each write(2) to the file removes all existing entries before adding
       new ones, so the whole table must be written in one write(2); in
       particular writing an empty line to the file removes all existing
       rules.
    
    Example:
    
    Suppose we want to mask out SSE2 (CPUID.01H:EDX:26) and RDTSCP
    (CPUID.80000001H:EDX:27). Then we should execute the following sequence:
    
     - get the current cpuid value:
    
       # cpuid -r | grep -e '^\s*0x00000001' -e '^\s*0x80000001' | head -n 2
          0x00000001 0x00: eax=0x000306e4 ebx=0x00200800 ecx=0x7fbee3ff edx=0xbfebfbff
          0x80000001 0x00: eax=0x00000000 ebx=0x00000000 ecx=0x00000001 edx=0x2c100800
    
     - clear the feature bits we want to mask out and write the result to
       /proc/vz/cpuid_override:
    
       # cat >/proc/vz/cpuid_override <<EOF
       0x00000001: 0x000306e4 0x00200800 0x7fbee3ff 0xbbebfbff
       0x80000001: 0x00000000 0x00000000 0x00000001 0x24100800
       EOF
    
     - check that cpuid output was overridden:
    
       # cpuid -r | grep -e '^\s*0x00000001' -e '^\s*0x80000001' | head -n 2
          0x00000001 0x00: eax=0x000306e4 ebx=0x00200800 ecx=0x7fbee3ff edx=0xbbebfbff
          0x80000001 0x00: eax=0x00000000 ebx=0x00000000 ecx=0x00000001 edx=0x24100800
    
    https://jira.sw.ru/browse/PSBM-28682
    
    Signed-off-by: Vladimir Davydov <vdavydov at parallels.com>
    
    Acked-by: Cyrill Gorcunov <gorcunov at parallels.com>
    =============================================================================
    
    https://jira.sw.ru/browse/PSBM-33638
    
    Signed-off-by: Vladimir Davydov <vdavydov at virtuozzo.com>
    
    Rebase:
    Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
    
    https://jira.sw.ru/browse/PSBM-121823
    [aryabinin: vz8 rebase]
    Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
    Reviewed-by: Kirill Tkhai <ktkhai at virtuozzo.com>
---
 arch/x86/include/asm/msr-index.h |   1 +
 arch/x86/include/asm/traps.h     |   2 +
 arch/x86/kernel/Makefile         |   1 +
 arch/x86/kernel/cpu/proc.c       |   4 +
 arch/x86/kernel/cpuid_fault.c    | 258 +++++++++++++++++++++++++++++++++++++++
 arch/x86/kernel/traps.c          |  24 ++++
 6 files changed, 290 insertions(+)

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 6a21c227775c..9668ec6a064d 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -114,6 +114,7 @@
 
 #define MSR_IA32_BBL_CR_CTL		0x00000119
 #define MSR_IA32_BBL_CR_CTL3		0x0000011e
+#define MSR_MISC_FEATURES_ENABLES	0x00000140
 
 #define MSR_IA32_TSX_CTRL		0x00000122
 #define TSX_CTRL_RTM_DISABLE		BIT(0)	/* Disable RTM feature */
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 0ae298ea01a1..0282c81719e7 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -124,6 +124,8 @@ void __noreturn handle_stack_overflow(const char *message,
 				      unsigned long fault_address);
 #endif
 
+void do_cpuid_fault(struct pt_regs *);
+
 /* Interrupts/Exceptions */
 enum {
 	X86_TRAP_DE = 0,	/*  0, Divide-by-zero */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 431d8c6e641d..b9451b653b04 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -63,6 +63,7 @@ obj-y			+= pci-iommu_table.o
 obj-y			+= resource.o
 obj-y			+= irqflags.o
 obj-y			+= spec_ctrl.o
+obj-y			+= cpuid_fault.o
 
 obj-y				+= process.o
 obj-y				+= fpu/
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 2c8522a39ed5..d6b17a60acf6 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -54,6 +54,10 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
 }
 #endif
 
+extern void __do_cpuid_fault(unsigned int op, unsigned int count,
+			     unsigned int *eax, unsigned int *ebx,
+			     unsigned int *ecx, unsigned int *edx);
+
 static int show_cpuinfo(struct seq_file *m, void *v)
 {
 	struct cpuinfo_x86 *c = v;
diff --git a/arch/x86/kernel/cpuid_fault.c b/arch/x86/kernel/cpuid_fault.c
new file mode 100644
index 000000000000..339e2638c3b8
--- /dev/null
+++ b/arch/x86/kernel/cpuid_fault.c
@@ -0,0 +1,258 @@
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/rcupdate.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/ve.h>
+#include <asm/uaccess.h>
+
+struct cpuid_override_entry {	/* one "op[ count]: eax ebx ecx edx" rule */
+	unsigned int op;	/* CPUID function the rule applies to */
+	unsigned int count;	/* sub-function, compared only if has_count */
+	bool has_count;		/* false: the rule matches any count */
+	unsigned int eax;	/* register values reported on a match */
+	unsigned int ebx;
+	unsigned int ecx;
+	unsigned int edx;
+};
+
+#define MAX_CPUID_OVERRIDE_ENTRIES	16
+/* The complete rule set; it is always replaced as a whole, never edited. */
+struct cpuid_override_table {
+	struct rcu_head rcu_head;	/* used by kfree_rcu() on replacement */
+	int size;			/* number of entries[] in use */
+	struct cpuid_override_entry entries[MAX_CPUID_OVERRIDE_ENTRIES];
+};
+/* Current table, NULL if none; the spinlock is taken on update only. */
+static struct cpuid_override_table __rcu *cpuid_override __read_mostly;
+static DEFINE_SPINLOCK(cpuid_override_lock);
+
+static void cpuid_override_update(struct cpuid_override_table *new_table)
+{
+	struct cpuid_override_table *old_table;
+	/* Publish @new_table (NULL means no overrides), retire the old one. */
+	spin_lock(&cpuid_override_lock);
+	old_table = rcu_access_pointer(cpuid_override);
+	rcu_assign_pointer(cpuid_override, new_table);
+	spin_unlock(&cpuid_override_lock);
+	/* readers may still be using the old table, so free it via RCU */
+	if (old_table)
+		kfree_rcu(old_table, rcu_head);
+}
+
+static bool cpuid_override_match(unsigned int op, unsigned int count,
+				 unsigned int *eax, unsigned int *ebx,
+				 unsigned int *ecx, unsigned int *edx)
+{
+	bool ret = false;
+	struct cpuid_override_table *t;
+	struct cpuid_override_entry *e;
+	int i;
+	/* Look up @op/@count; outputs are written only on a match. */
+	rcu_read_lock();
+	t = rcu_dereference(cpuid_override);
+	if (!t)
+		goto out;
+	/* first match wins; an entry without a count matches any @count */
+	for (i = 0; i < t->size; i++) {
+		e = &t->entries[i];
+		if (e->op != op)
+			continue;
+		if (e->has_count && e->count != count)
+			continue;
+		*eax = e->eax;
+		*ebx = e->ebx;
+		*ecx = e->ecx;
+		*edx = e->edx;
+		ret = true;
+		break;
+	}
+out:
+	rcu_read_unlock();
+	return ret;
+}
+
+void __do_cpuid_fault(unsigned int op, unsigned int count,
+		      unsigned int *eax, unsigned int *ebx,
+		      unsigned int *ecx, unsigned int *edx)
+{
+	/* a matching entry in the override table supplies all four outputs */
+	if (cpuid_override_match(op, count, eax, ebx, ecx, edx))
+		return;
+
+	/* no override for @op/@count: fall back to the real cpuid */
+	cpuid_count(op, count, eax, ebx, ecx, edx);
+}
+
+void do_cpuid_fault(struct pt_regs *regs)
+{
+	unsigned int eax, ebx, ecx, edx;
+	/* Emulate CPUID on @regs: function in ax, sub-function in cx. */
+	__do_cpuid_fault(regs->ax, regs->cx, &eax, &ebx, &ecx, &edx);
+	/* hand the four results back through the saved registers */
+	regs->ax = eax;
+	regs->bx = ebx;
+	regs->cx = ecx;
+	regs->dx = edx;
+}
+
+/*
+ * Parse one CPUID override entry at @s into @e. Entry format, all in HEX:
+ *
+ * op[ count]: eax ebx ecx edx
+ *
+ * Returns 0 and sets *@endp past the entry, or -EINVAL on malformed input.
+ */
+static int cpuid_override_entry_parse(const char *s, char **endp,
+				      struct cpuid_override_entry *e)
+{
+	int taken;
+	char *end;
+	/* try the form with a count first, then the one without */
+	if (sscanf(s, "%x %x: %x %x %x %x%n",
+		   &e->op, &e->count, &e->eax, &e->ebx, &e->ecx, &e->edx,
+		   &taken) == 6)
+		e->has_count = true;
+	else if (sscanf(s, "%x: %x %x %x %x%n",
+			&e->op, &e->eax, &e->ebx, &e->ecx, &e->edx,
+			&taken) == 5)
+		e->has_count = false;
+	else
+		return -EINVAL;
+	/* only the end of input or a '\n' may directly follow an entry */
+	end = (char *)s + taken;
+	if (*end) {
+		if (*end != '\n')
+			return -EINVAL;
+		++end;
+	}
+	*endp = end;
+	return 0;
+}
+
+/*
+ * Parse a whole override table from one write(2) and install it instead of
+ * the current one. Returns @count on success or a negative errno.
+ */
+static ssize_t cpuid_override_write(struct file *file, const char __user *buf,
+				    size_t count, loff_t *ppos)
+{
+	struct cpuid_override_table *t = NULL;
+	void *page = NULL;
+	char *s;
+	int err;
+
+	err = -E2BIG;
+	if (count >= PAGE_SIZE)	/* leave room for the terminating NUL */
+		goto out;
+	err = -ENOMEM;
+	t = kmalloc(sizeof(*t), GFP_KERNEL);
+	if (!t)
+		goto out;
+	page = (void *)__get_free_page(GFP_KERNEL);
+	if (!page)
+		goto out;
+	/* copy_from_user() returns the number of bytes left, not an errno */
+	err = -EFAULT;
+	if (copy_from_user(page, buf, count))
+		goto out;
+
+	s = page;
+	s[count] = '\0';
+	t->size = 0;
+	while (*(s = skip_spaces(s))) {
+		err = -E2BIG;
+		if (t->size == MAX_CPUID_OVERRIDE_ENTRIES)
+			goto out;
+		err = -EINVAL;
+		if (cpuid_override_entry_parse(s, &s, &t->entries[t->size++]))
+			goto out;
+	}
+	if (!t->size) {	/* no entries: just drop the current table */
+		kfree(t);
+		t = NULL;
+	}
+	err = 0;
+out:
+	free_page((unsigned long)page);
+	if (!err)
+		cpuid_override_update(t);
+	else
+		kfree(t);
+	return err ?: count;
+}
+
+static void *__cpuid_override_seq_start(loff_t pos)
+{	/* entry at index @pos, or NULL if there is no table or no such entry */
+	struct cpuid_override_table *t = rcu_dereference(cpuid_override);
+	return t && pos < t->size ? &t->entries[pos] : NULL;
+}
+
+static void *cpuid_override_seq_start(struct seq_file *seq, loff_t *ppos)
+{
+	rcu_read_lock();	/* dropped in cpuid_override_seq_stop() */
+	return __cpuid_override_seq_start(*ppos);
+}
+
+static void *cpuid_override_seq_next(struct seq_file *seq,
+				     void *v, loff_t *ppos)
+{
+	++*ppos;	/* the position is simply the table index */
+	return __cpuid_override_seq_start(*ppos);
+}
+
+static void cpuid_override_seq_stop(struct seq_file *s, void *v)
+{
+	rcu_read_unlock();	/* pairs with cpuid_override_seq_start() */
+}
+
+static int cpuid_override_seq_show(struct seq_file *s, void *v)
+{
+	struct cpuid_override_entry *e = v;
+	/* print one entry as "op[ count]: eax ebx ecx edx", 0x-prefixed hex */
+	seq_printf(s, "0x%08x", e->op);
+	if (e->has_count)
+		seq_printf(s, " 0x%08x", e->count);
+	seq_printf(s, ": 0x%08x 0x%08x 0x%08x 0x%08x\n",
+		   e->eax, e->ebx, e->ecx, e->edx);
+	return 0;
+}
+
+static struct seq_operations cpuid_override_seq_ops = {
+	.start = cpuid_override_seq_start,	/* takes rcu_read_lock() */
+	.next  = cpuid_override_seq_next,
+	.stop  = cpuid_override_seq_stop,	/* drops it again */
+	.show  = cpuid_override_seq_show,
+};
+
+static int cpuid_override_seq_open(struct inode *inode, struct file *file)
+{	/* reads of the file are served by cpuid_override_seq_ops */
+	return seq_open(file, &cpuid_override_seq_ops);
+}
+
+static struct file_operations proc_cpuid_override_ops = {
+	.owner   = THIS_MODULE,
+	.open    = cpuid_override_seq_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release,
+	.write   = cpuid_override_write,	/* replaces the whole table */
+};
+
+static int __init cpuid_fault_init(void)
+{
+	struct proc_dir_entry *proc;
+	/* The file is created only on CPUs that support CPUID faulting. */
+	if (!static_cpu_has(X86_FEATURE_CPUID_FAULT))
+		return 0;
+
+	proc = proc_create("cpuid_override", 0644, proc_vz_dir,
+			   &proc_cpuid_override_ops);
+	if (!proc)
+		return -ENOMEM;
+
+	return 0;
+}
+module_init(cpuid_fault_init);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 4b96d9a574ff..c43e3b80e50f 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -518,6 +518,27 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
 	do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, 0, NULL);
 }
 
+static int check_cpuid_fault(struct pt_regs *regs, long error_code)
+{
+	unsigned long addr;
+	unsigned short opcode;
+	/* Emulate CPUID if it raised this #GP; returns 1 if handled, else 0. */
+	if (error_code != 0)
+		return 0;
+
+	addr = convert_ip_to_linear(current, regs);
+	if (get_user(opcode, (unsigned short __user *)addr))
+		return 0;
+	/* bytes 0x0f 0xa2 (CPUID), read as one little-endian 16-bit value */
+	if (opcode != 0xa20f)
+		return 0;
+
+	do_cpuid_fault(regs);
+	/* step over the two-byte instruction */
+	regs->ip += 2;
+	return 1;
+}
+
 dotraplinkage void
 do_general_protection(struct pt_regs *regs, long error_code)
 {
@@ -551,6 +572,9 @@ do_general_protection(struct pt_regs *regs, long error_code)
 		return;
 	}
 
+	if (check_cpuid_fault(regs, error_code))
+		return;
+
 	tsk->thread.error_code = error_code;
 	tsk->thread.trap_nr = X86_TRAP_GP;
 


More information about the Devel mailing list