[Devel] [PATCH rh7 03/11] Port diff-arch-x86-introduce-cpuid-override
Vladimir Davydov
vdavydov at virtuozzo.com
Fri Oct 16 09:22:47 PDT 2015
Author: Vladimir Davydov
Email: vdavydov at parallels.com
Subject: arch: x86: introduce cpuid override
Date: Tue, 2 Dec 2014 18:21:59 +0300
Recent Intel CPUs dropped CPUID masking, which is required for flex
migration, in favor of CPUID faulting. So we need to support it in the
kernel.
This patch adds a user-writable file, /proc/vz/cpuid_override, which
contains the CPUID override table. Each table entry must have the following
format:
op[ count]: eax ebx ecx edx
where @op and optional @count define a CPUID function, whose output one
would like to override (@op and @count are loaded to EAX and ECX
registers respectively before calling CPUID); @eax, @ebx, @ecx, @edx -
the desired CPUID output for the specified function. All values must be
in HEX, 0x prefix is optional.
Notes:
- the file is only present on hosts that support CPUID faulting;
- CPUID faulting is always enabled if it is supported;
- CPUID output is overridden on all present CPUs;
- the maximal number of entries one can override equals 16;
- each write(2) to the file removes all existing entries before adding
new ones, so the whole table must be written in one write(2); in
particular writing an empty line to the file removes all existing
rules.
Example:
Suppose we want to mask out SSE2 (CPUID.01H:EDX:26) and RDTSCP
(CPUID.80000001H:EDX:27). Then we should execute the following sequence:
- get the current cpuid value:
# cpuid -r | grep -e '^\s*0x00000001' -e '^\s*0x80000001' | head -n 2
0x00000001 0x00: eax=0x000306e4 ebx=0x00200800 ecx=0x7fbee3ff edx=0xbfebfbff
0x80000001 0x00: eax=0x00000000 ebx=0x00000000 ecx=0x00000001 edx=0x2c100800
- clear the feature bits we want to mask out and write the result to
/proc/vz/cpuid_override:
# cat >/proc/vz/cpuid_override <<EOF
0x00000001: 0x000306e4 0x00200800 0x7fbee3ff 0xbbebfbff
0x80000001: 0x00000000 0x00000000 0x00000001 0x24100800
EOF
- check that cpuid output was overridden:
# cpuid -r | grep -e '^\s*0x00000001' -e '^\s*0x80000001' | head -n 2
0x00000001 0x00: eax=0x000306e4 ebx=0x00200800 ecx=0x7fbee3ff edx=0xbbebfbff
0x80000001 0x00: eax=0x00000000 ebx=0x00000000 ecx=0x00000001 edx=0x24100800
https://jira.sw.ru/browse/PSBM-28682
Signed-off-by: Vladimir Davydov <vdavydov at parallels.com>
Acked-by: Cyrill Gorcunov <gorcunov at parallels.com>
=============================================================================
https://jira.sw.ru/browse/PSBM-33638
Signed-off-by: Vladimir Davydov <vdavydov at virtuozzo.com>
---
arch/x86/include/asm/cpufeature.h | 2 +
arch/x86/include/asm/traps.h | 2 +
arch/x86/include/uapi/asm/msr-index.h | 1 +
arch/x86/kernel/Makefile | 1 +
arch/x86/kernel/cpu/intel.c | 18 +++
arch/x86/kernel/cpuid_fault.c | 258 ++++++++++++++++++++++++++++++++++
arch/x86/kernel/traps.c | 24 ++++
7 files changed, 306 insertions(+)
create mode 100644 arch/x86/kernel/cpuid_fault.c
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index b337e669ed0f..f3f63fde5eb3 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -102,6 +102,7 @@
#define X86_FEATURE_APERFMPERF (3*32+28) /* APERFMPERF */
#define X86_FEATURE_EAGER_FPU (3*32+29) /* "eagerfpu" Non lazy FPU restore */
#define X86_FEATURE_NONSTOP_TSC_S3 (3*32+30) /* TSC doesn't stop in S3 state */
+#define X86_FEATURE_CPUID_FAULTING (3*32+31) /* cpuid faulting */
/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
#define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */
@@ -337,6 +338,7 @@ extern const char * const x86_power_flags[32];
#define cpu_has_cx16 boot_cpu_has(X86_FEATURE_CX16)
#define cpu_has_eager_fpu boot_cpu_has(X86_FEATURE_EAGER_FPU)
#define cpu_has_topoext boot_cpu_has(X86_FEATURE_TOPOEXT)
+#define cpu_has_cpuid_faulting boot_cpu_has(X86_FEATURE_CPUID_FAULTING)
#ifdef CONFIG_X86_64
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index d017966d93ee..85f8b7b33af4 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -115,6 +115,8 @@ asmlinkage void smp_thermal_interrupt(void);
asmlinkage void mce_threshold_interrupt(void);
#endif
+void do_cpuid_fault(struct pt_regs *);
+
/* Interrupts/Exceptions */
enum {
X86_TRAP_DE = 0, /* 0, Divide-by-zero */
diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h
index f14ab2b5f55a..c23a1e140857 100644
--- a/arch/x86/include/uapi/asm/msr-index.h
+++ b/arch/x86/include/uapi/asm/msr-index.h
@@ -48,6 +48,7 @@
#define MSR_MTRRcap 0x000000fe
#define MSR_IA32_BBL_CR_CTL 0x00000119
#define MSR_IA32_BBL_CR_CTL3 0x0000011e
+#define MSR_MISC_FEATURES_ENABLES 0x00000140
#define MSR_IA32_SYSENTER_CS 0x00000174
#define MSR_IA32_SYSENTER_ESP 0x00000175
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 4d5df57dc43d..6ef2f77084dc 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -41,6 +41,7 @@ obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
obj-y += tsc.o io_delay.o rtc.o
obj-y += pci-iommu_table.o
obj-y += resource.o
+obj-y += cpuid_fault.o
obj-y += process.o
obj-y += i387.o xsave.o
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 7f670c8841d6..98da793c2069 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -352,6 +352,22 @@ static void detect_vmx_virtcap(struct cpuinfo_x86 *c)
}
}
+/*
+ * Detect CPUID faulting and turn it on for the CPU being initialized.
+ *
+ * Support is reported by bit 31 of MSR_PLATFORM_INFO; the feature is enabled
+ * by setting bit 0 of MSR_MISC_FEATURES_ENABLES.  The feature flag is set in
+ * @c only after the enable bit has been written successfully.
+ */
+static void intel_cpuid_faulting_init(struct cpuinfo_x86 *c)
+{
+	unsigned int l1, l2;
+
+	/* check if cpuid faulting is supported (1U: avoid shifting into the sign bit) */
+	if (rdmsr_safe(MSR_PLATFORM_INFO, &l1, &l2) != 0 ||
+	    !(l1 & (1U << 31)))
+		return;
+
+	/*
+	 * enable cpuid faulting; the _safe accessors are used so that a
+	 * platform which advertises the feature but does not implement the
+	 * MSR (e.g. under a hypervisor) makes us back off instead of faulting
+	 */
+	if (rdmsr_safe(MSR_MISC_FEATURES_ENABLES, &l1, &l2) != 0 ||
+	    wrmsr_safe(MSR_MISC_FEATURES_ENABLES, l1 | 1, l2) != 0)
+		return;
+
+	set_cpu_cap(c, X86_FEATURE_CPUID_FAULTING);
+}
+
static void init_intel(struct cpuinfo_x86 *c)
{
unsigned int l2 = 0;
@@ -469,6 +485,8 @@ static void init_intel(struct cpuinfo_x86 *c)
wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
}
}
+
+ intel_cpuid_faulting_init(c);
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpuid_fault.c b/arch/x86/kernel/cpuid_fault.c
new file mode 100644
index 000000000000..24d16c21614f
--- /dev/null
+++ b/arch/x86/kernel/cpuid_fault.c
@@ -0,0 +1,258 @@
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/rcupdate.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/ve.h>
+#include <asm/uaccess.h>
+
+/* One override rule: the CPUID function it applies to and the output to report. */
+struct cpuid_override_entry {
+ unsigned int op; /* CPUID function, compared against the EAX input */
+ unsigned int count; /* sub-leaf, compared against the ECX input if has_count */
+ bool has_count; /* false: the rule matches regardless of the ECX input */
+ unsigned int eax; /* register values reported instead of the real output */
+ unsigned int ebx;
+ unsigned int ecx;
+ unsigned int edx;
+};
+
+/* Upper bound on the number of rules one table can hold. */
+#define MAX_CPUID_OVERRIDE_ENTRIES 16
+
+/* A complete set of override rules; a table is replaced as a whole, never edited. */
+struct cpuid_override_table {
+ struct rcu_head rcu_head; /* used by kfree_rcu() when the table is replaced */
+ int size; /* number of valid slots in entries[] */
+ struct cpuid_override_entry entries[MAX_CPUID_OVERRIDE_ENTRIES];
+};
+
+/* Current table, NULL when nothing is overridden; readers dereference it under RCU. */
+static struct cpuid_override_table __rcu *cpuid_override;
+/* Taken while the cpuid_override pointer is being swapped. */
+static DEFINE_SPINLOCK(cpuid_override_lock);
+
+/*
+ * Publish @new_table (may be NULL, meaning "no overrides") as the current
+ * override table and release the one it replaces after an RCU grace period.
+ */
+static void cpuid_override_update(struct cpuid_override_table *new_table)
+{
+	struct cpuid_override_table *prev;
+
+	/* the swap itself is done under the lock; readers rely on RCU only */
+	spin_lock(&cpuid_override_lock);
+	prev = rcu_access_pointer(cpuid_override);
+	rcu_assign_pointer(cpuid_override, new_table);
+	spin_unlock(&cpuid_override_lock);
+
+	if (!prev)
+		return;
+
+	kfree_rcu(prev, rcu_head);
+}
+
+/*
+ * Look up the override for CPUID function @op, sub-leaf @count.
+ *
+ * A rule without an explicit count matches every sub-leaf of its function;
+ * the first matching rule wins.  On a hit the overridden register values are
+ * stored through @eax, @ebx, @ecx and @edx and true is returned.  On a miss
+ * the outputs are left untouched and false is returned.
+ */
+static bool cpuid_override_match(unsigned int op, unsigned int count,
+				 unsigned int *eax, unsigned int *ebx,
+				 unsigned int *ecx, unsigned int *edx)
+{
+	struct cpuid_override_table *table;
+	struct cpuid_override_entry *entry;
+	bool found = false;
+	int idx, nr;
+
+	rcu_read_lock();
+	table = rcu_dereference(cpuid_override);
+	/* no table installed behaves like an empty one */
+	nr = table ? table->size : 0;
+
+	for (idx = 0; idx < nr && !found; idx++) {
+		entry = &table->entries[idx];
+		if (entry->op != op ||
+		    (entry->has_count && entry->count != count))
+			continue;
+
+		*eax = entry->eax;
+		*ebx = entry->ebx;
+		*ecx = entry->ecx;
+		*edx = entry->edx;
+		found = true;
+	}
+	rcu_read_unlock();
+
+	return found;
+}
+
+/*
+ * Produce the CPUID output for function @op, sub-leaf @count: the overridden
+ * values if a matching rule exists, the output of the real instruction
+ * otherwise.
+ */
+static void __do_cpuid_fault(unsigned int op, unsigned int count,
+			     unsigned int *eax, unsigned int *ebx,
+			     unsigned int *ecx, unsigned int *edx)
+{
+	bool overridden;
+
+	overridden = cpuid_override_match(op, count, eax, ebx, ecx, edx);
+
+	/* nothing configured for this function: execute the real cpuid */
+	if (!overridden)
+		cpuid_count(op, count, eax, ebx, ecx, edx);
+}
+
+/*
+ * Emulate a faulted CPUID instruction: take the function and sub-leaf from
+ * the saved AX and CX registers in @regs and store the (possibly overridden)
+ * result back into AX, BX, CX and DX.
+ */
+void do_cpuid_fault(struct pt_regs *regs)
+{
+	unsigned int out_a, out_b, out_c, out_d;
+
+	__do_cpuid_fault(regs->ax, regs->cx, &out_a, &out_b, &out_c, &out_d);
+
+	regs->ax = out_a;
+	regs->bx = out_b;
+	regs->cx = out_c;
+	regs->dx = out_d;
+}
+
+/*
+ * Parse one CPUID override entry from @s into @e.
+ *
+ * Entry format (all values in HEX):
+ *
+ *   op[ count]: eax ebx ecx edx
+ *
+ * The entry must be followed by a newline or by the end of the string.  On
+ * success 0 is returned and *@endp points past the entry (and past its
+ * newline, if any); -EINVAL is returned on malformed input.
+ */
+static int cpuid_override_entry_parse(const char *s, char **endp,
+				      struct cpuid_override_entry *e)
+{
+	int consumed;
+	char *next;
+
+	/* try the form with an explicit count first, then the short one */
+	if (sscanf(s, "%x %x: %x %x %x %x%n",
+		   &e->op, &e->count, &e->eax, &e->ebx, &e->ecx, &e->edx,
+		   &consumed) == 6) {
+		e->has_count = true;
+	} else {
+		if (sscanf(s, "%x: %x %x %x %x%n",
+			   &e->op, &e->eax, &e->ebx, &e->ecx, &e->edx,
+			   &consumed) != 5)
+			return -EINVAL;
+		e->has_count = false;
+	}
+
+	next = (char *)s + consumed;
+	switch (*next) {
+	case '\0':
+		break;
+	case '\n':
+		/* step over the line terminator */
+		++next;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	*endp = next;
+	return 0;
+}
+
+/*
+ * write(2) handler of the cpuid_override proc file.
+ *
+ * The whole user buffer is parsed as a sequence of override entries and, on
+ * success, the resulting table replaces the current one.  A buffer holding
+ * nothing but whitespace installs no table at all, i.e. removes every
+ * override.  On any error the current table is left untouched.
+ *
+ * Returns @count on success, -E2BIG if the buffer does not fit into one page
+ * or holds more than MAX_CPUID_OVERRIDE_ENTRIES entries, -ENOMEM on
+ * allocation failure, -EFAULT if the user buffer cannot be read and -EINVAL
+ * on a malformed entry.
+ */
+static ssize_t cpuid_override_write(struct file *file, const char __user *buf,
+				    size_t count, loff_t *ppos)
+{
+	struct cpuid_override_table *t = NULL;
+	void *page = NULL;
+	char *s;
+	int err;
+
+	/* one byte of the page is needed for the terminating NUL */
+	err = -E2BIG;
+	if (count >= PAGE_SIZE)
+		goto out;
+
+	err = -ENOMEM;
+	t = kmalloc(sizeof(*t), GFP_KERNEL);
+	if (!t)
+		goto out;
+
+	page = (void *)__get_free_page(GFP_KERNEL);
+	if (!page)
+		goto out;
+
+	/*
+	 * copy_from_user() returns the number of bytes it could not copy,
+	 * not an errno; passing that value on would make write(2) report a
+	 * bogus positive result instead of failing.
+	 */
+	err = -EFAULT;
+	if (copy_from_user(page, buf, count))
+		goto out;
+
+	s = page;
+	s[count] = '\0';
+	t->size = 0;
+	while (*(s = skip_spaces(s))) {
+		err = -E2BIG;
+		if (t->size == MAX_CPUID_OVERRIDE_ENTRIES)
+			goto out;
+		err = -EINVAL;
+		if (cpuid_override_entry_parse(s, &s, &t->entries[t->size]))
+			goto out;
+		/* count the slot only once it has been parsed successfully */
+		t->size++;
+	}
+	if (!t->size) {
+		/* empty input: install "no table" rather than an empty one */
+		kfree(t);
+		t = NULL;
+	}
+	err = 0;
+out:
+	free_page((unsigned long)page);
+
+	if (!err)
+		cpuid_override_update(t);
+	else
+		kfree(t);
+
+	return err ?: count;
+}
+
+/*
+ * Return the entry at index @pos of the current override table, or NULL if
+ * there is no table or @pos is past its last entry.  The caller is expected
+ * to be inside an RCU read-side critical section.
+ */
+static void *__cpuid_override_seq_start(loff_t pos)
+{
+	struct cpuid_override_table *table;
+
+	table = rcu_dereference(cpuid_override);
+	if (!table || pos >= table->size)
+		return NULL;
+
+	return &table->entries[pos];
+}
+
+/*
+ * seq_file ->start: enter the RCU read-side section (left again in ->stop)
+ * and return the entry at position *@ppos, or NULL if there is none.
+ */
+static void *cpuid_override_seq_start(struct seq_file *seq, loff_t *ppos)
+{
+	loff_t pos = *ppos;
+
+	rcu_read_lock();
+
+	return __cpuid_override_seq_start(pos);
+}
+
+/*
+ * seq_file ->next: advance *@ppos by one and return the entry found there,
+ * or NULL once the end of the table has been reached.
+ */
+static void *cpuid_override_seq_next(struct seq_file *seq,
+				     void *v, loff_t *ppos)
+{
+	loff_t idx = *ppos + 1;
+
+	*ppos = idx;
+
+	return __cpuid_override_seq_start(idx);
+}
+
+/* seq_file ->stop: leave the RCU read-side section entered in ->start. */
+static void cpuid_override_seq_stop(struct seq_file *s, void *v)
+{
+ rcu_read_unlock();
+}
+
+/*
+ * seq_file ->show: print one override entry (@v) in the same
+ * "op[ count]: eax ebx ecx edx" form that the write handler parses.
+ */
+static int cpuid_override_seq_show(struct seq_file *s, void *v)
+{
+	struct cpuid_override_entry *entry = v;
+
+	seq_printf(s, "0x%08x", entry->op);
+
+	/* the count column is printed only for rules that specify one */
+	if (entry->has_count)
+		seq_printf(s, " 0x%08x", entry->count);
+
+	seq_printf(s, ": 0x%08x 0x%08x 0x%08x 0x%08x\n",
+		   entry->eax, entry->ebx, entry->ecx, entry->edx);
+
+	return 0;
+}
+
+/* seq_file iterator over the entries of the current override table; never modified */
+static const struct seq_operations cpuid_override_seq_ops = {
+	.start	= cpuid_override_seq_start,
+	.next	= cpuid_override_seq_next,
+	.stop	= cpuid_override_seq_stop,
+	.show	= cpuid_override_seq_show,
+};
+
+/* open handler: attach the override table seq_file iterator to @file. */
+static int cpuid_override_seq_open(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &cpuid_override_seq_ops);
+}
+
+/*
+ * File operations of the cpuid_override proc file: reads go through
+ * seq_file, writes replace the whole override table.  Never modified.
+ */
+static const struct file_operations proc_cpuid_override_ops = {
+	.owner		= THIS_MODULE,
+	.open		= cpuid_override_seq_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+	.write		= cpuid_override_write,
+};
+
+/*
+ * Create the "cpuid_override" file under proc_vz_dir.  Nothing is created
+ * when the boot CPU lacks CPUID faulting.  Returns 0 on success (or when
+ * there is nothing to do) and -ENOMEM if the proc entry cannot be created.
+ */
+static int __init cpuid_fault_init(void)
+{
+	if (cpu_has_cpuid_faulting &&
+	    !proc_create("cpuid_override", 0644, proc_vz_dir,
+			 &proc_cpuid_override_ops))
+		return -ENOMEM;
+
+	return 0;
+}
+module_init(cpuid_fault_init);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 9823443df079..09223d59076e 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -247,6 +247,27 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
}
#endif
+/*
+ * Check whether a general protection fault was raised by a CPUID
+ * instruction and, if so, emulate it.
+ *
+ * Only faults with a zero error code are considered.  The two bytes at the
+ * faulting instruction pointer are read and compared with the CPUID opcode
+ * (0x0f 0xa2, read here as the 16-bit value 0xa20f).
+ *
+ * Returns 1 if the instruction was emulated and skipped, 0 if the fault is
+ * not ours and must be handled by the caller.
+ */
+static int check_cpuid_fault(struct pt_regs *regs, long error_code)
+{
+	unsigned short insn;
+	unsigned long ip;
+
+	if (error_code)
+		return 0;
+
+	ip = convert_ip_to_linear(current, regs);
+	if (get_user(insn, (unsigned short __user *)ip) || insn != 0xa20f)
+		return 0;
+
+	do_cpuid_fault(regs);
+
+	/* step over the two-byte instruction that has just been emulated */
+	regs->ip += 2;
+	return 1;
+}
+
dotraplinkage void __kprobes
do_general_protection(struct pt_regs *regs, long error_code)
{
@@ -277,6 +298,9 @@ do_general_protection(struct pt_regs *regs, long error_code)
goto exit;
}
+ if (check_cpuid_fault(regs, error_code))
+ return;
+
tsk->thread.error_code = error_code;
tsk->thread.trap_nr = X86_TRAP_GP;
--
2.1.4
More information about the Devel
mailing list