[Devel] [PATCH vz8 1/3] arch/x86: introduce cpuid override
Kirill Tkhai
ktkhai at virtuozzo.com
Tue Nov 3 13:04:36 MSK 2020
On 30.10.2020 14:45, Andrey Ryabinin wrote:
> From: Vladimir Davydov <vdavydov at virtuozzo.com>
>
> Port diff-arch-x86-introduce-cpuid-override
>
> Recent Intel CPUs rejected CPUID masking, which is required for flex
> migration, in favor of CPUID faulting. So we need to support it in
> the kernel.
>
> This patch adds user writable file /proc/vz/cpuid_override, which
> contains CPUID override table. Each table entry must have the following
> format:
>
> op[ count]: eax ebx ecx edx
>
> where @op and optional @count define a CPUID function, whose output one
> would like to override (@op and @count are loaded to EAX and ECX
> registers respectively before calling CPUID); @eax, @ebx, @ecx, @edx -
> the desired CPUID output for the specified function. All values must be
> in HEX, 0x prefix is optional.
>
> Notes:
>
> - the file is only present on hosts that support CPUID faulting;
> - CPUID faulting is always enabled if it is supported;
> - CPUID output is overridden on all present CPUs;
> - the maximal number of entries one can override equals 16;
> - each write(2) to the file removes all existing entries before adding
> new ones, so the whole table must be written in one write(2); in
> particular writing an empty line to the file removes all existing
> rules.
>
> Example:
>
> Suppose we want to mask out SSE2 (CPUID.01H:EDX:26) and RDTSCP
> (CPUID.80000001H:EDX:27). Then we should execute the following sequence:
>
> - get the current cpuid value:
>
> # cpuid -r | grep -e '^\s*0x00000001' -e '^\s*0x80000001' | head -n 2
> 0x00000001 0x00: eax=0x000306e4 ebx=0x00200800 ecx=0x7fbee3ff edx=0xbfebfbff
> 0x80000001 0x00: eax=0x00000000 ebx=0x00000000 ecx=0x00000001 edx=0x2c100800
>
> - clear the feature bits we want to mask out and write the result to
> /proc/vz/cpuid_override:
>
> # cat >/proc/vz/cpuid_override <<EOF
> 0x00000001: 0x000306e4 0x00200800 0x7fbee3ff 0xbbebfbff
> 0x80000001: 0x00000000 0x00000000 0x00000001 0x24100800
> EOF
>
> - check that cpuid output was overridden:
>
> # cpuid -r | grep -e '^\s*0x00000001' -e '^\s*0x80000001' | head -n 2
> 0x00000001 0x00: eax=0x000306e4 ebx=0x00200800 ecx=0x7fbee3ff edx=0xbbebfbff
> 0x80000001 0x00: eax=0x00000000 ebx=0x00000000 ecx=0x00000001 edx=0x24100800
>
> https://jira.sw.ru/browse/PSBM-28682
>
> Signed-off-by: Vladimir Davydov <vdavydov at parallels.com>
>
> Acked-by: Cyrill Gorcunov <gorcunov at parallels.com>
> =============================================================================
>
> https://jira.sw.ru/browse/PSBM-33638
>
> Signed-off-by: Vladimir Davydov <vdavydov at virtuozzo.com>
> Rebase:
> Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
>
> https://jira.sw.ru/browse/PSBM-121823
> [aryabinin: vz8 rebase]
> Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
For the series:
Reviewed-by: Kirill Tkhai <ktkhai at virtuozzo.com>
> ---
> arch/x86/include/asm/msr-index.h | 1 +
> arch/x86/include/asm/traps.h | 2 +
> arch/x86/kernel/Makefile | 1 +
> arch/x86/kernel/cpu/proc.c | 4 +
> arch/x86/kernel/cpuid_fault.c | 258 +++++++++++++++++++++++++++++++
> arch/x86/kernel/traps.c | 24 +++
> 6 files changed, 290 insertions(+)
> create mode 100644 arch/x86/kernel/cpuid_fault.c
>
> diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
> index 6a21c227775c..9668ec6a064d 100644
> --- a/arch/x86/include/asm/msr-index.h
> +++ b/arch/x86/include/asm/msr-index.h
> @@ -114,6 +114,7 @@
>
> #define MSR_IA32_BBL_CR_CTL 0x00000119
> #define MSR_IA32_BBL_CR_CTL3 0x0000011e
> +#define MSR_MISC_FEATURES_ENABLES 0x00000140
>
> #define MSR_IA32_TSX_CTRL 0x00000122
> #define TSX_CTRL_RTM_DISABLE BIT(0) /* Disable RTM feature */
> diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
> index 0ae298ea01a1..0282c81719e7 100644
> --- a/arch/x86/include/asm/traps.h
> +++ b/arch/x86/include/asm/traps.h
> @@ -124,6 +124,8 @@ void __noreturn handle_stack_overflow(const char *message,
> unsigned long fault_address);
> #endif
>
> +void do_cpuid_fault(struct pt_regs *);
> +
> /* Interrupts/Exceptions */
> enum {
> X86_TRAP_DE = 0, /* 0, Divide-by-zero */
> diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
> index 431d8c6e641d..b9451b653b04 100644
> --- a/arch/x86/kernel/Makefile
> +++ b/arch/x86/kernel/Makefile
> @@ -63,6 +63,7 @@ obj-y += pci-iommu_table.o
> obj-y += resource.o
> obj-y += irqflags.o
> obj-y += spec_ctrl.o
> +obj-y += cpuid_fault.o
>
> obj-y += process.o
> obj-y += fpu/
> diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
> index 2c8522a39ed5..d6b17a60acf6 100644
> --- a/arch/x86/kernel/cpu/proc.c
> +++ b/arch/x86/kernel/cpu/proc.c
> @@ -54,6 +54,10 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
> }
> #endif
>
> +extern void __do_cpuid_fault(unsigned int op, unsigned int count,
> + unsigned int *eax, unsigned int *ebx,
> + unsigned int *ecx, unsigned int *edx);
> +
> static int show_cpuinfo(struct seq_file *m, void *v)
> {
> struct cpuinfo_x86 *c = v;
> diff --git a/arch/x86/kernel/cpuid_fault.c b/arch/x86/kernel/cpuid_fault.c
> new file mode 100644
> index 000000000000..339e2638c3b8
> --- /dev/null
> +++ b/arch/x86/kernel/cpuid_fault.c
> @@ -0,0 +1,258 @@
> +#include <linux/gfp.h>
> +#include <linux/slab.h>
> +#include <linux/spinlock.h>
> +#include <linux/rcupdate.h>
> +#include <linux/module.h>
> +#include <linux/proc_fs.h>
> +#include <linux/seq_file.h>
> +#include <linux/ve.h>
> +#include <asm/uaccess.h>
> +
> +struct cpuid_override_entry { /* one override rule: CPUID function -> forced output */
> + unsigned int op; /* CPUID function; compared with the requested op */
> + unsigned int count; /* sub-function; compared only when has_count is set */
> + bool has_count; /* true if the rule was written with an explicit count */
> + unsigned int eax; /* value reported in EAX on a match */
> + unsigned int ebx; /* value reported in EBX on a match */
> + unsigned int ecx; /* value reported in ECX on a match */
> + unsigned int edx; /* value reported in EDX on a match */
> +};
> +
> +#define MAX_CPUID_OVERRIDE_ENTRIES 16
> +
> +struct cpuid_override_table { /* the full rule set, replaced as a whole on every write */
> + struct rcu_head rcu_head; /* used by kfree_rcu() when the table is replaced */
> + int size; /* number of valid elements in entries[] */
> + struct cpuid_override_entry entries[MAX_CPUID_OVERRIDE_ENTRIES];
> +};
> +
> +static struct cpuid_override_table __rcu *cpuid_override __read_mostly;
> +static DEFINE_SPINLOCK(cpuid_override_lock);
> +
> +/*
> + * Install @new_table (which may be NULL) as the current override table.
> + * The pointer swap is done under cpuid_override_lock; the table that was
> + * installed before, if any, is released with kfree_rcu().
> + */
> +static void cpuid_override_update(struct cpuid_override_table *new_table)
> +{
> +	struct cpuid_override_table *prev;
> +
> +	spin_lock(&cpuid_override_lock);
> +	prev = rcu_access_pointer(cpuid_override);
> +	rcu_assign_pointer(cpuid_override, new_table);
> +	spin_unlock(&cpuid_override_lock);
> +
> +	if (!prev)
> +		return;
> +
> +	kfree_rcu(prev, rcu_head);
> +}
> +
> +/*
> + * Look up (@op, @count) in the current override table.
> + *
> + * An entry matches when its op equals @op and, if the entry carries an
> + * explicit count, that count equals @count.  The first matching entry
> + * wins: its register values are stored through @eax..@edx and true is
> + * returned.  Without a table or a matching entry the output pointers are
> + * left untouched and false is returned.
> + */
> +static bool cpuid_override_match(unsigned int op, unsigned int count,
> +				 unsigned int *eax, unsigned int *ebx,
> +				 unsigned int *ecx, unsigned int *edx)
> +{
> +	const struct cpuid_override_table *table;
> +	bool found = false;
> +	int idx;
> +
> +	rcu_read_lock();
> +	table = rcu_dereference(cpuid_override);
> +	for (idx = 0; table && idx < table->size; idx++) {
> +		const struct cpuid_override_entry *ent = &table->entries[idx];
> +
> +		if (ent->op == op &&
> +		    (!ent->has_count || ent->count == count)) {
> +			*eax = ent->eax;
> +			*ebx = ent->ebx;
> +			*ecx = ent->ecx;
> +			*edx = ent->edx;
> +			found = true;
> +			break;
> +		}
> +	}
> +	rcu_read_unlock();
> +
> +	return found;
> +}
> +
> +/*
> + * Produce the CPUID output for function @op / sub-function @count.
> + * Values from the override table take precedence; when no entry matches,
> + * the result of the real cpuid_count() call is returned instead.
> + */
> +void __do_cpuid_fault(unsigned int op, unsigned int count,
> +		      unsigned int *eax, unsigned int *ebx,
> +		      unsigned int *ecx, unsigned int *edx)
> +{
> +	bool overridden = cpuid_override_match(op, count, eax, ebx, ecx, edx);
> +
> +	if (!overridden)
> +		cpuid_count(op, count, eax, ebx, ecx, edx);
> +}
> +
> +/*
> + * Emulate CPUID for the register state in @regs: the function and
> + * sub-function are taken from regs->ax and regs->cx, and the four result
> + * words are written back to regs->ax, ->bx, ->cx and ->dx.
> + */
> +void do_cpuid_fault(struct pt_regs *regs)
> +{
> +	unsigned int out[4];
> +
> +	__do_cpuid_fault(regs->ax, regs->cx,
> +			 &out[0], &out[1], &out[2], &out[3]);
> +
> +	regs->ax = out[0];
> +	regs->bx = out[1];
> +	regs->cx = out[2];
> +	regs->dx = out[3];
> +}
> +
> +/*
> + * Parse one CPUID override entry from @s into @e.
> + *
> + * Entry format (all values are in HEX):
> + *
> + * op[ count]: eax ebx ecx edx
> + *
> + * The form with an explicit count is tried first.  The entry must be
> + * followed either by the end of the string or by a newline; on success
> + * *@endp points past the entry (and past that newline, if present) and 0
> + * is returned.  Anything else yields -EINVAL.
> + */
> +static int cpuid_override_entry_parse(const char *s, char **endp,
> +				      struct cpuid_override_entry *e)
> +{
> +	const char *tail;
> +	int consumed;
> +
> +	if (sscanf(s, "%x %x: %x %x %x %x%n",
> +		   &e->op, &e->count, &e->eax, &e->ebx, &e->ecx, &e->edx,
> +		   &consumed) == 6) {
> +		e->has_count = true;
> +	} else if (sscanf(s, "%x: %x %x %x %x%n",
> +			  &e->op, &e->eax, &e->ebx, &e->ecx, &e->edx,
> +			  &consumed) == 5) {
> +		e->has_count = false;
> +	} else {
> +		return -EINVAL;
> +	}
> +
> +	tail = s + consumed;
> +	if (*tail == '\n')
> +		tail++;
> +	else if (*tail != '\0')
> +		return -EINVAL;
> +
> +	*endp = (char *)tail;
> +	return 0;
> +}
> +
> +/*
> + * Write handler of the cpuid_override proc file.
> + *
> + * The whole user buffer is parsed as a whitespace-separated list of
> + * override entries (see cpuid_override_entry_parse()).  On success the
> + * freshly built table replaces the current one; a buffer that contains
> + * nothing but whitespace installs a NULL table, i.e. removes all rules.
> + *
> + * Returns @count on success, or
> + *   -E2BIG   if @count >= PAGE_SIZE or more than
> + *            MAX_CPUID_OVERRIDE_ENTRIES entries are supplied,
> + *   -ENOMEM  if an allocation fails,
> + *   -EFAULT  if the user buffer cannot be read,
> + *   -EINVAL  if an entry is malformed.
> + * On any error the currently installed table is left untouched.
> + */
> +static ssize_t cpuid_override_write(struct file *file, const char __user *buf,
> +				    size_t count, loff_t *ppos)
> +{
> +	struct cpuid_override_table *t = NULL;
> +	void *page = NULL;
> +	char *s;
> +	int err;
> +
> +	/* One byte of the page is needed for the terminating NUL. */
> +	err = -E2BIG;
> +	if (count >= PAGE_SIZE)
> +		goto out;
> +
> +	err = -ENOMEM;
> +	t = kmalloc(sizeof(*t), GFP_KERNEL);
> +	if (!t)
> +		goto out;
> +
> +	page = (void *)__get_free_page(GFP_KERNEL);
> +	if (!page)
> +		goto out;
> +
> +	/*
> +	 * copy_from_user() returns the number of bytes it could not copy,
> +	 * not an errno.  Passing that value on would make a faulting write
> +	 * look like a short successful one, so map it to -EFAULT.
> +	 */
> +	err = -EFAULT;
> +	if (copy_from_user(page, buf, count))
> +		goto out;
> +
> +	s = page;
> +	s[count] = '\0';
> +	t->size = 0;
> +	while (*(s = skip_spaces(s))) {
> +		err = -E2BIG;
> +		if (t->size == MAX_CPUID_OVERRIDE_ENTRIES)
> +			goto out;
> +		err = -EINVAL;
> +		if (cpuid_override_entry_parse(s, &s, &t->entries[t->size++]))
> +			goto out;
> +	}
> +	/* No entries at all: install a NULL table instead of an empty one. */
> +	if (!t->size) {
> +		kfree(t);
> +		t = NULL;
> +	}
> +	err = 0;
> +out:
> +	free_page((unsigned long)page);
> +
> +	if (!err)
> +		cpuid_override_update(t);
> +	else
> +		kfree(t);
> +
> +	return err ?: count;
> +}
> +
> +/*
> + * Return the entry at index @pos of the current override table, or NULL
> + * when there is no table or @pos is past its last entry.
> + */
> +static void *__cpuid_override_seq_start(loff_t pos)
> +{
> +	struct cpuid_override_table *table = rcu_dereference(cpuid_override);
> +
> +	if (!table || pos >= table->size)
> +		return NULL;
> +
> +	return &table->entries[pos];
> +}
> +
> +/*
> + * seq_file ->start: take the RCU read lock and return the entry at *@ppos
> + * (NULL if there is none).
> + */
> +static void *cpuid_override_seq_start(struct seq_file *seq, loff_t *ppos)
> +{
> +	loff_t pos = *ppos;
> +
> +	rcu_read_lock();
> +
> +	return __cpuid_override_seq_start(pos);
> +}
> +
> +/* seq_file ->next: advance *@ppos by one and return the entry found there. */
> +static void *cpuid_override_seq_next(struct seq_file *seq,
> +				     void *v, loff_t *ppos)
> +{
> +	*ppos += 1;
> +
> +	return __cpuid_override_seq_start(*ppos);
> +}
> +
> +/* seq_file ->stop: release the RCU read lock. */
> +static void cpuid_override_seq_stop(struct seq_file *s, void *v)
> +{
> +	rcu_read_unlock();
> +}
> +
> +/*
> + * seq_file ->show: print one entry as "op[ count]: eax ebx ecx edx", each
> + * value as 0x-prefixed, zero-padded 8-digit hex.  The count is printed
> + * only for entries that carry one.  Always returns 0.
> + */
> +static int cpuid_override_seq_show(struct seq_file *s, void *v)
> +{
> +	const struct cpuid_override_entry *ent = v;
> +
> +	seq_printf(s, "0x%08x", ent->op);
> +
> +	if (ent->has_count)
> +		seq_printf(s, " 0x%08x", ent->count);
> +
> +	seq_printf(s, ": 0x%08x 0x%08x 0x%08x 0x%08x\n",
> +		   ent->eax, ent->ebx, ent->ecx, ent->edx);
> +
> +	return 0;
> +}
> +
> +/*
> + * seq_file iterator over the entries of the override table.  The table of
> + * callbacks is never modified, so it is declared const.
> + */
> +static const struct seq_operations cpuid_override_seq_ops = {
> +	.start	= cpuid_override_seq_start,
> +	.next	= cpuid_override_seq_next,
> +	.stop	= cpuid_override_seq_stop,
> +	.show	= cpuid_override_seq_show,
> +};
> +
> +/* ->open: attach the override-table seq_file iterator to @file. */
> +static int cpuid_override_seq_open(struct inode *inode, struct file *file)
> +{
> +	int ret = seq_open(file, &cpuid_override_seq_ops);
> +
> +	return ret;
> +}
> +
> +/*
> + * File operations of the cpuid_override proc entry: reads go through
> + * seq_file, writes replace the whole override table.  The structure is
> + * never modified, so it is declared const.
> + */
> +static const struct file_operations proc_cpuid_override_ops = {
> +	.owner		= THIS_MODULE,
> +	.open		= cpuid_override_seq_open,
> +	.read		= seq_read,
> +	.llseek		= seq_lseek,
> +	.release	= seq_release,
> +	.write		= cpuid_override_write,
> +};
> +
> +/*
> + * Create the "cpuid_override" entry (mode 0644) under proc_vz_dir.
> + * Nothing is created, and 0 is returned, when the CPU lacks
> + * X86_FEATURE_CPUID_FAULT.  Returns -ENOMEM if the entry cannot be
> + * created.
> + */
> +static int __init cpuid_fault_init(void)
> +{
> +	if (!static_cpu_has(X86_FEATURE_CPUID_FAULT))
> +		return 0;
> +
> +	if (!proc_create("cpuid_override", 0644, proc_vz_dir,
> +			 &proc_cpuid_override_ops))
> +		return -ENOMEM;
> +
> +	return 0;
> +}
> +module_init(cpuid_fault_init);
> diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
> index 4b96d9a574ff..c43e3b80e50f 100644
> --- a/arch/x86/kernel/traps.c
> +++ b/arch/x86/kernel/traps.c
> @@ -518,6 +518,27 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
> do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, 0, NULL);
> }
>
> +/*
> + * Check whether a general protection fault was raised by a CPUID
> + * instruction and, if so, emulate it.
> + *
> + * Only faults with a zero error code are considered.  The two bytes at
> + * the faulting instruction pointer are read from user memory and compared
> + * with 0xa20f, i.e. the byte sequence 0x0f 0xa2 (the CPUID opcode) loaded
> + * as a little-endian 16-bit value.
> + *
> + * Returns 1 if the instruction was emulated (regs->ip has then been moved
> + * past the two opcode bytes), 0 if the fault must be handled elsewhere.
> + */
> +static int check_cpuid_fault(struct pt_regs *regs, long error_code)
> +{
> +	unsigned short insn;
> +	unsigned long ip;
> +
> +	if (error_code)
> +		return 0;
> +
> +	ip = convert_ip_to_linear(current, regs);
> +	if (get_user(insn, (unsigned short __user *)ip) || insn != 0xa20f)
> +		return 0;
> +
> +	do_cpuid_fault(regs);
> +	regs->ip += 2;
> +
> +	return 1;
> +}
> +
> dotraplinkage void
> do_general_protection(struct pt_regs *regs, long error_code)
> {
> @@ -551,6 +572,9 @@ do_general_protection(struct pt_regs *regs, long error_code)
> return;
> }
>
> + if (check_cpuid_fault(regs, error_code))
> + return;
> +
> tsk->thread.error_code = error_code;
> tsk->thread.trap_nr = X86_TRAP_GP;
>
>
More information about the Devel
mailing list