[Devel] [PATCH vz8 1/3] arch/x86: introduce cpuid override

Andrey Ryabinin aryabinin at virtuozzo.com
Fri Oct 30 14:45:13 MSK 2020


From: Vladimir Davydov <vdavydov at virtuozzo.com>

Port diff-arch-x86-introduce-cpuid-override

Recent Intel CPUs dropped CPUID masking, which is required for flex
migration, in favor of CPUID faulting. So we need to support it in the
kernel.

This patch adds user writable file /proc/vz/cpuid_override, which
contains CPUID override table. Each table entry must have the following
format:

  op[ count]: eax ebx ecx edx

where @op and optional @count define a CPUID function, whose output one
would like to override (@op and @count are loaded to EAX and ECX
registers respectively before calling CPUID); @eax, @ebx, @ecx, @edx -
the desired CPUID output for the specified function. All values must be
in HEX, 0x prefix is optional.

Notes:

 - the file is only present on hosts that support CPUID faulting;
 - CPUID faulting is always enabled if it is supported;
 - CPUID output is overridden on all present CPUs;
 - the maximal number of entries one can override equals 16;
 - each write(2) to the file removes all existing entries before adding
   new ones, so the whole table must be written in one write(2); in
   particular writing an empty line to the file removes all existing
   rules.

Example:

Suppose we want to mask out SSE2 (CPUID.01H:EDX:26) and RDTSCP
(CPUID.80000001H:EDX:27). Then we should execute the following sequence:

 - get the current cpuid value:

   # cpuid -r | grep -e '^\s*0x00000001' -e '^\s*0x80000001' | head -n 2
      0x00000001 0x00: eax=0x000306e4 ebx=0x00200800 ecx=0x7fbee3ff edx=0xbfebfbff
      0x80000001 0x00: eax=0x00000000 ebx=0x00000000 ecx=0x00000001 edx=0x2c100800

 - clear the feature bits we want to mask out and write the result to
   /proc/vz/cpuid_override:

   # cat >/proc/vz/cpuid_override <<EOF
   0x00000001: 0x000306e4 0x00200800 0x7fbee3ff 0xbbebfbff
   0x80000001: 0x00000000 0x00000000 0x00000001 0x24100800
   EOF

 - check that cpuid output was overridden:

   # cpuid -r | grep -e '^\s*0x00000001' -e '^\s*0x80000001' | head -n 2
      0x00000001 0x00: eax=0x000306e4 ebx=0x00200800 ecx=0x7fbee3ff edx=0xbbebfbff
      0x80000001 0x00: eax=0x00000000 ebx=0x00000000 ecx=0x00000001 edx=0x24100800

https://jira.sw.ru/browse/PSBM-28682

Signed-off-by: Vladimir Davydov <vdavydov at parallels.com>

Acked-by: Cyrill Gorcunov <gorcunov at parallels.com>
=============================================================================

https://jira.sw.ru/browse/PSBM-33638

Signed-off-by: Vladimir Davydov <vdavydov at virtuozzo.com>
Rebase:
Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>

https://jira.sw.ru/browse/PSBM-121823
[aryabinin: vz8 rebase]
Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
---
 arch/x86/include/asm/msr-index.h |   1 +
 arch/x86/include/asm/traps.h     |   2 +
 arch/x86/kernel/Makefile         |   1 +
 arch/x86/kernel/cpu/proc.c       |   4 +
 arch/x86/kernel/cpuid_fault.c    | 258 +++++++++++++++++++++++++++++++
 arch/x86/kernel/traps.c          |  24 +++
 6 files changed, 290 insertions(+)
 create mode 100644 arch/x86/kernel/cpuid_fault.c

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 6a21c227775c..9668ec6a064d 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -114,6 +114,7 @@
 
 #define MSR_IA32_BBL_CR_CTL		0x00000119
 #define MSR_IA32_BBL_CR_CTL3		0x0000011e
+#define MSR_MISC_FEATURES_ENABLES	0x00000140
 
 #define MSR_IA32_TSX_CTRL		0x00000122
 #define TSX_CTRL_RTM_DISABLE		BIT(0)	/* Disable RTM feature */
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 0ae298ea01a1..0282c81719e7 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -124,6 +124,8 @@ void __noreturn handle_stack_overflow(const char *message,
 				      unsigned long fault_address);
 #endif
 
+void do_cpuid_fault(struct pt_regs *);
+
 /* Interrupts/Exceptions */
 enum {
 	X86_TRAP_DE = 0,	/*  0, Divide-by-zero */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 431d8c6e641d..b9451b653b04 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -63,6 +63,7 @@ obj-y			+= pci-iommu_table.o
 obj-y			+= resource.o
 obj-y			+= irqflags.o
 obj-y			+= spec_ctrl.o
+obj-y			+= cpuid_fault.o
 
 obj-y				+= process.o
 obj-y				+= fpu/
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 2c8522a39ed5..d6b17a60acf6 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -54,6 +54,10 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
 }
 #endif
 
+extern void __do_cpuid_fault(unsigned int op, unsigned int count,
+			     unsigned int *eax, unsigned int *ebx,
+			     unsigned int *ecx, unsigned int *edx);
+
 static int show_cpuinfo(struct seq_file *m, void *v)
 {
 	struct cpuinfo_x86 *c = v;
diff --git a/arch/x86/kernel/cpuid_fault.c b/arch/x86/kernel/cpuid_fault.c
new file mode 100644
index 000000000000..339e2638c3b8
--- /dev/null
+++ b/arch/x86/kernel/cpuid_fault.c
@@ -0,0 +1,258 @@
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/rcupdate.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/ve.h>
+#include <asm/uaccess.h>
+
+struct cpuid_override_entry {
+	unsigned int op;	/* CPUID function to match */
+	unsigned int count;	/* CPUID sub-function to match */
+	bool has_count;		/* compare @count only when this is set */
+	unsigned int eax;	/* values reported in place of the real output */
+	unsigned int ebx;
+	unsigned int ecx;
+	unsigned int edx;
+};
+
+#define MAX_CPUID_OVERRIDE_ENTRIES	16
+
+struct cpuid_override_table {
+	struct rcu_head rcu_head;	/* used by kfree_rcu() on replacement */
+	int size;			/* number of entries[] in use */
+	struct cpuid_override_entry entries[MAX_CPUID_OVERRIDE_ENTRIES];
+};
+
+static struct cpuid_override_table __rcu *cpuid_override __read_mostly;
+static DEFINE_SPINLOCK(cpuid_override_lock);
+
+static void cpuid_override_update(struct cpuid_override_table *new_table)
+{
+	struct cpuid_override_table *old_table;
+
+	spin_lock(&cpuid_override_lock);	/* serializes table swaps */
+	old_table = rcu_access_pointer(cpuid_override);
+	rcu_assign_pointer(cpuid_override, new_table);
+	spin_unlock(&cpuid_override_lock);
+	/* @new_table may be NULL; the old table, if any, goes to kfree_rcu(). */
+	if (old_table)
+		kfree_rcu(old_table, rcu_head);
+}
+
+static bool cpuid_override_match(unsigned int op, unsigned int count,
+				 unsigned int *eax, unsigned int *ebx,
+				 unsigned int *ecx, unsigned int *edx)
+{
+	const struct cpuid_override_table *tbl;
+	const struct cpuid_override_entry *ent;
+	bool found = false;
+	int idx;
+
+	/*
+	 * Scan the current table under RCU. On a hit, store the entry's
+	 * values through @eax..@edx and return true; else leave them untouched.
+	 */
+	rcu_read_lock();
+	tbl = rcu_dereference(cpuid_override);
+	/* A NULL table means no overrides are installed. */
+	for (idx = 0; tbl && idx < tbl->size; idx++) {
+		ent = &tbl->entries[idx];
+		/* Entries without a count match every sub-function. */
+		if (ent->op != op || (ent->has_count && ent->count != count))
+			continue;
+		*eax = ent->eax;
+		*ebx = ent->ebx;
+		*ecx = ent->ecx;
+		*edx = ent->edx;
+		found = true;
+		break;	/* first matching entry wins */
+	}
+	rcu_read_unlock();
+	return found;
+}
+
+void __do_cpuid_fault(unsigned int op, unsigned int count,
+		      unsigned int *eax, unsigned int *ebx,
+		      unsigned int *ecx, unsigned int *edx)
+{
+	/*
+	 * Report the overridden values when the table has an entry for
+	 * (@op, @count); otherwise fall back to the real cpuid.
+	 */
+	if (!cpuid_override_match(op, count, eax, ebx, ecx, edx))
+		cpuid_count(op, count, eax, ebx, ecx, edx);
+}
+
+void do_cpuid_fault(struct pt_regs *regs)
+{
+	unsigned int eax, ebx, ecx, edx;
+
+	__do_cpuid_fault(regs->ax, regs->cx, &eax, &ebx, &ecx, &edx);
+	/* Return the (possibly overridden) result in the saved registers. */
+	regs->ax = eax;
+	regs->bx = ebx;
+	regs->cx = ecx;
+	regs->dx = edx;
+}
+
+/*
+ * Parse one "op[ count]: eax ebx ecx edx" entry (all values in HEX) at @s
+ * into @e; @e->has_count records whether the optional count was given.
+ * The entry must end at a newline, which is consumed, or at the end of the
+ * string. Returns 0 and sets *@endp past the entry on success, or -EINVAL
+ * (leaving *@endp untouched) on malformed input.
+ */
+static int cpuid_override_entry_parse(const char *s, char **endp,
+				      struct cpuid_override_entry *e)
+{
+	int taken;
+	char *end;
+
+	if (sscanf(s, "%x %x: %x %x %x %x%n",
+		   &e->op, &e->count, &e->eax, &e->ebx, &e->ecx, &e->edx,
+		   &taken) == 6)
+		e->has_count = true;
+	else if (sscanf(s, "%x: %x %x %x %x%n",
+			&e->op, &e->eax, &e->ebx, &e->ecx, &e->edx,
+			&taken) == 5)
+		e->has_count = false;
+	else
+		return -EINVAL;
+	/* @taken (from %n) is the length of the text that was matched. */
+	end = (char *)s + taken;
+	if (*end) {
+		if (*end != '\n')
+			return -EINVAL;
+		++end;
+	}
+	*endp = end;
+	return 0;
+}
+
+static ssize_t cpuid_override_write(struct file *file, const char __user *buf,
+				    size_t count, loff_t *ppos)
+{
+	struct cpuid_override_table *t = NULL;
+	void *page = NULL;
+	char *s;
+	int err;
+
+	/* Replace the whole override table with entries parsed from @buf. */
+	err = -E2BIG;	/* must fit in a page along with the trailing NUL */
+	if (count >= PAGE_SIZE)
+		goto out;
+	/* Build the new table aside so a failure leaves the old one intact. */
+	err = -ENOMEM;
+	t = kmalloc(sizeof(*t), GFP_KERNEL);
+	if (!t)
+		goto out;
+	page = (void *)__get_free_page(GFP_KERNEL);
+	if (!page)
+		goto out;
+	/* copy_from_user() returns bytes left uncopied, not an errno. */
+	err = -EFAULT;
+	if (copy_from_user(page, buf, count))
+		goto out;
+
+	s = page;
+	s[count] = '\0';
+	t->size = 0;
+	while (*(s = skip_spaces(s))) {
+		err = -E2BIG;
+		if (t->size == MAX_CPUID_OVERRIDE_ENTRIES)
+			goto out;
+		err = -EINVAL;
+		if (cpuid_override_entry_parse(s, &s, &t->entries[t->size++]))
+			goto out;
+	}
+	if (!t->size) {	/* nothing parsed: just drop the current table */
+		kfree(t);
+		t = NULL;
+	}
+	err = 0;
+out:
+	free_page((unsigned long)page);
+
+	if (!err)
+		cpuid_override_update(t);
+	else
+		kfree(t);
+
+	return err ?: count;	/* negative errno, or all of @count consumed */
+}
+
+static void *__cpuid_override_seq_start(loff_t pos)
+{
+	struct cpuid_override_table *t = rcu_dereference(cpuid_override);
+	return t && pos < t->size ? &t->entries[pos] : NULL; /* NULL past end */
+}
+
+static void *cpuid_override_seq_start(struct seq_file *seq, loff_t *ppos)
+{
+	rcu_read_lock();	/* dropped in cpuid_override_seq_stop() */
+	return __cpuid_override_seq_start(*ppos);
+}
+
+static void *cpuid_override_seq_next(struct seq_file *seq,
+				     void *v, loff_t *ppos)
+{
+	++*ppos;	/* advance to the following entry, if there is one */
+	return __cpuid_override_seq_start(*ppos);
+}
+
+static void cpuid_override_seq_stop(struct seq_file *s, void *v)
+{
+	rcu_read_unlock();	/* pairs with cpuid_override_seq_start() */
+}
+
+static int cpuid_override_seq_show(struct seq_file *s, void *v)
+{
+	struct cpuid_override_entry *e = v;
+
+	seq_printf(s, "0x%08x", e->op);
+	if (e->has_count)	/* the count is printed only when it was given */
+		seq_printf(s, " 0x%08x", e->count);
+	seq_printf(s, ": 0x%08x 0x%08x 0x%08x 0x%08x\n",
+		   e->eax, e->ebx, e->ecx, e->edx);
+	return 0;
+}
+
+static const struct seq_operations cpuid_override_seq_ops = {
+	.start = cpuid_override_seq_start,	/* takes rcu_read_lock() */
+	.next  = cpuid_override_seq_next,
+	.stop  = cpuid_override_seq_stop,	/* drops rcu_read_lock() */
+	.show  = cpuid_override_seq_show,	/* prints one table entry */
+};
+
+static int cpuid_override_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &cpuid_override_seq_ops); /* seq_file reader */
+}
+
+static const struct file_operations proc_cpuid_override_ops = {
+	.owner   = THIS_MODULE,
+	.open    = cpuid_override_seq_open,	/* reads go through seq_file */
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release,
+	.write   = cpuid_override_write,	/* replaces the whole table */
+};
+
+static int __init cpuid_fault_init(void)
+{
+	struct proc_dir_entry *proc;
+
+	if (!static_cpu_has(X86_FEATURE_CPUID_FAULT))
+		return 0;	/* no CPUID faulting: no file, not an error */
+	/* Expose the table as "cpuid_override" under proc_vz_dir, mode 0644. */
+	proc = proc_create("cpuid_override", 0644, proc_vz_dir,
+			   &proc_cpuid_override_ops);
+	if (!proc)
+		return -ENOMEM;
+
+	return 0;
+}
+module_init(cpuid_fault_init);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 4b96d9a574ff..c43e3b80e50f 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -518,6 +518,27 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
 	do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, 0, NULL);
 }
 
+static int check_cpuid_fault(struct pt_regs *regs, long error_code)
+{
+	unsigned long addr;
+	unsigned short opcode;
+
+	if (error_code != 0)
+		return 0;	/* 0 = not handled, caller continues with #GP */
+	/* Fetch the two bytes at the faulting instruction pointer. */
+	addr = convert_ip_to_linear(current, regs);
+	if (get_user(opcode, (unsigned short __user *)addr))
+		return 0;
+	/* Bytes 0x0f 0xa2 (CPUID), read here as one little-endian u16. */
+	if (opcode != 0xa20f)
+		return 0;
+	/* Emulate the instruction, then step over its two bytes. */
+	do_cpuid_fault(regs);
+
+	regs->ip += 2;
+	return 1;	/* handled: the caller returns without raising #GP */
+}
+
 dotraplinkage void
 do_general_protection(struct pt_regs *regs, long error_code)
 {
@@ -551,6 +572,9 @@ do_general_protection(struct pt_regs *regs, long error_code)
 		return;
 	}
 
+	if (check_cpuid_fault(regs, error_code))
+		return;
+
 	tsk->thread.error_code = error_code;
 	tsk->thread.trap_nr = X86_TRAP_GP;
 
-- 
2.26.2



More information about the Devel mailing list