[Devel] [PATCH vz8 1/3] arch/x86: introduce cpuid override

Kirill Tkhai ktkhai at virtuozzo.com
Tue Nov 3 13:04:36 MSK 2020


On 30.10.2020 14:45, Andrey Ryabinin wrote:
> From: Vladimir Davydov <vdavydov at virtuozzo.com>
> 
> Port diff-arch-x86-introduce-cpuid-override
> 
> Recent Intel CPUs rejected CPUID masking, which is required for flex
> migration, in favor of CPUID faulting. So we need to support it in
> kernel.
> 
> This patch adds user writable file /proc/vz/cpuid_override, which
> contains CPUID override table. Each table entry must have the following
> format:
> 
>   op[ count]: eax ebx ecx edx
> 
> where @op and optional @count define a CPUID function, whose output one
> would like to override (@op and @count are loaded to EAX and ECX
> registers respectively before calling CPUID); @eax, @ebx, @ecx, @edx -
> the desired CPUID output for the specified function. All values must be
> in HEX, 0x prefix is optional.
> 
> Notes:
> 
>  - the file is only present on hosts that support CPUID faulting;
>  - CPUID faulting is always enabled if it is supported;
>  - CPUID output is overridden on all present CPUs;
>  - at most 16 entries can be overridden;
>  - each write(2) to the file removes all existing entries before adding
>    new ones, so the whole table must be written in one write(2); in
>    particular writing an empty line to the file removes all existing
>    rules.
> 
> Example:
> 
> Suppose we want to mask out SSE2 (CPUID.01H:EDX:26) and RDTSCP
> (CPUID.80000001H:EDX:27). Then we should execute the following sequence:
> 
>  - get the current cpuid value:
> 
>    # cpuid -r | grep -e '^\s*0x00000001' -e '^\s*0x80000001' | head -n 2
>       0x00000001 0x00: eax=0x000306e4 ebx=0x00200800 ecx=0x7fbee3ff edx=0xbfebfbff
>       0x80000001 0x00: eax=0x00000000 ebx=0x00000000 ecx=0x00000001 edx=0x2c100800
> 
>  - clear the feature bits we want to mask out and write the result to
>    /proc/vz/cpuid_override:
> 
>    # cat >/proc/vz/cpuid_override <<EOF
>    0x00000001: 0x000306e4 0x00200800 0x7fbee3ff 0xbbebfbff
>    0x80000001: 0x00000000 0x00000000 0x00000001 0x24100800
>    EOF
> 
>  - check that cpuid output was overridden:
> 
>    # cpuid -r | grep -e '^\s*0x00000001' -e '^\s*0x80000001' | head -n 2
>       0x00000001 0x00: eax=0x000306e4 ebx=0x00200800 ecx=0x7fbee3ff edx=0xbbebfbff
>       0x80000001 0x00: eax=0x00000000 ebx=0x00000000 ecx=0x00000001 edx=0x24100800
> 
> https://jira.sw.ru/browse/PSBM-28682
> 
> Signed-off-by: Vladimir Davydov <vdavydov at parallels.com>
> 
> Acked-by: Cyrill Gorcunov <gorcunov at parallels.com>
> =============================================================================
> 
> https://jira.sw.ru/browse/PSBM-33638
> 
> Signed-off-by: Vladimir Davydov <vdavydov at virtuozzo.com>
> Rebase:
> Signed-off-by: Kirill Tkhai <ktkhai at virtuozzo.com>
> 
> https://jira.sw.ru/browse/PSBM-121823
> [aryabinin: vz8 rebase]
> Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>

For the series:

Reviewed-by: Kirill Tkhai <ktkhai at virtuozzo.com>

> ---
>  arch/x86/include/asm/msr-index.h |   1 +
>  arch/x86/include/asm/traps.h     |   2 +
>  arch/x86/kernel/Makefile         |   1 +
>  arch/x86/kernel/cpu/proc.c       |   4 +
>  arch/x86/kernel/cpuid_fault.c    | 258 +++++++++++++++++++++++++++++++
>  arch/x86/kernel/traps.c          |  24 +++
>  6 files changed, 290 insertions(+)
>  create mode 100644 arch/x86/kernel/cpuid_fault.c
> 
> diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
> index 6a21c227775c..9668ec6a064d 100644
> --- a/arch/x86/include/asm/msr-index.h
> +++ b/arch/x86/include/asm/msr-index.h
> @@ -114,6 +114,7 @@
>  
>  #define MSR_IA32_BBL_CR_CTL		0x00000119
>  #define MSR_IA32_BBL_CR_CTL3		0x0000011e
> +#define MSR_MISC_FEATURES_ENABLES	0x00000140
>  
>  #define MSR_IA32_TSX_CTRL		0x00000122
>  #define TSX_CTRL_RTM_DISABLE		BIT(0)	/* Disable RTM feature */
> diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
> index 0ae298ea01a1..0282c81719e7 100644
> --- a/arch/x86/include/asm/traps.h
> +++ b/arch/x86/include/asm/traps.h
> @@ -124,6 +124,8 @@ void __noreturn handle_stack_overflow(const char *message,
>  				      unsigned long fault_address);
>  #endif
>  
> +void do_cpuid_fault(struct pt_regs *);
> +
>  /* Interrupts/Exceptions */
>  enum {
>  	X86_TRAP_DE = 0,	/*  0, Divide-by-zero */
> diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
> index 431d8c6e641d..b9451b653b04 100644
> --- a/arch/x86/kernel/Makefile
> +++ b/arch/x86/kernel/Makefile
> @@ -63,6 +63,7 @@ obj-y			+= pci-iommu_table.o
>  obj-y			+= resource.o
>  obj-y			+= irqflags.o
>  obj-y			+= spec_ctrl.o
> +obj-y			+= cpuid_fault.o
>  
>  obj-y				+= process.o
>  obj-y				+= fpu/
> diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
> index 2c8522a39ed5..d6b17a60acf6 100644
> --- a/arch/x86/kernel/cpu/proc.c
> +++ b/arch/x86/kernel/cpu/proc.c
> @@ -54,6 +54,10 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
>  }
>  #endif
>  
> +extern void __do_cpuid_fault(unsigned int op, unsigned int count,
> +			     unsigned int *eax, unsigned int *ebx,
> +			     unsigned int *ecx, unsigned int *edx);
> +
>  static int show_cpuinfo(struct seq_file *m, void *v)
>  {
>  	struct cpuinfo_x86 *c = v;
> diff --git a/arch/x86/kernel/cpuid_fault.c b/arch/x86/kernel/cpuid_fault.c
> new file mode 100644
> index 000000000000..339e2638c3b8
> --- /dev/null
> +++ b/arch/x86/kernel/cpuid_fault.c
> @@ -0,0 +1,258 @@
> +#include <linux/gfp.h>
> +#include <linux/slab.h>
> +#include <linux/spinlock.h>
> +#include <linux/rcupdate.h>
> +#include <linux/module.h>
> +#include <linux/proc_fs.h>
> +#include <linux/seq_file.h>
> +#include <linux/ve.h>
> +#include <asm/uaccess.h>
> +
> +struct cpuid_override_entry {
> +	unsigned int op;
> +	unsigned int count;
> +	bool has_count;
> +	unsigned int eax;
> +	unsigned int ebx;
> +	unsigned int ecx;
> +	unsigned int edx;
> +};
> +
> +#define MAX_CPUID_OVERRIDE_ENTRIES	16
> +
> +struct cpuid_override_table {
> +	struct rcu_head rcu_head;
> +	int size;
> +	struct cpuid_override_entry entries[MAX_CPUID_OVERRIDE_ENTRIES];
> +};
> +
> +static struct cpuid_override_table __rcu *cpuid_override __read_mostly;
> +static DEFINE_SPINLOCK(cpuid_override_lock);
> +
> +static void cpuid_override_update(struct cpuid_override_table *new_table)
> +{
> +	struct cpuid_override_table *old_table;
> +
> +	spin_lock(&cpuid_override_lock);
> +	old_table = rcu_access_pointer(cpuid_override);
> +	rcu_assign_pointer(cpuid_override, new_table);
> +	spin_unlock(&cpuid_override_lock);
> +
> +	if (old_table)
> +		kfree_rcu(old_table, rcu_head);
> +}
> +
> +static bool cpuid_override_match(unsigned int op, unsigned int count,
> +				 unsigned int *eax, unsigned int *ebx,
> +				 unsigned int *ecx, unsigned int *edx)
> +{
> +	bool ret = false;
> +	struct cpuid_override_table *t;
> +	struct cpuid_override_entry *e;
> +	int i;
> +
> +	rcu_read_lock();
> +	t = rcu_dereference(cpuid_override);
> +	if (!t)
> +		goto out;
> +
> +	for (i = 0; i < t->size; i++) {
> +		e = &t->entries[i];
> +		if (e->op != op)
> +			continue;
> +		if (e->has_count && e->count != count)
> +			continue;
> +		*eax = e->eax;
> +		*ebx = e->ebx;
> +		*ecx = e->ecx;
> +		*edx = e->edx;
> +		ret = true;
> +		break;
> +	}
> +out:
> +	rcu_read_unlock();
> +	return ret;
> +}
> +
> +void __do_cpuid_fault(unsigned int op, unsigned int count,
> +		      unsigned int *eax, unsigned int *ebx,
> +		      unsigned int *ecx, unsigned int *edx)
> +{
> +	/* check if op is overridden */
> +	if (cpuid_override_match(op, count, eax, ebx, ecx, edx))
> +		return;
> +
> +	/* fallback to real cpuid */
> +	cpuid_count(op, count, eax, ebx, ecx, edx);
> +}
> +
> +void do_cpuid_fault(struct pt_regs *regs)
> +{
> +	unsigned int eax, ebx, ecx, edx;
> +
> +	__do_cpuid_fault(regs->ax, regs->cx, &eax, &ebx, &ecx, &edx);
> +
> +	regs->ax = eax;
> +	regs->bx = ebx;
> +	regs->cx = ecx;
> +	regs->dx = edx;
> +}
> +
> +/*
> + * CPUID override entry format:
> + *
> + * op[ count]: eax ebx ecx edx
> + *
> + * All values are in HEX.
> + */
> +static int cpuid_override_entry_parse(const char *s, char **endp,
> +				      struct cpuid_override_entry *e)
> +{
> +	int taken;
> +	char *end;
> +
> +	if (sscanf(s, "%x %x: %x %x %x %x%n",
> +		   &e->op, &e->count, &e->eax, &e->ebx, &e->ecx, &e->edx,
> +		   &taken) == 6)
> +		e->has_count = true;
> +	else if (sscanf(s, "%x: %x %x %x %x%n",
> +			&e->op, &e->eax, &e->ebx, &e->ecx, &e->edx,
> +			&taken) == 5)
> +		e->has_count = false;
> +	else
> +		return -EINVAL;
> +
> +	end = (char *)s + taken;
> +	if (*end) {
> +		if (*end != '\n')
> +			return -EINVAL;
> +		++end;
> +	}
> +	*endp = end;
> +	return 0;
> +}
> +
> +static ssize_t cpuid_override_write(struct file *file, const char __user *buf,
> +				    size_t count, loff_t *ppos)
> +{
> +	struct cpuid_override_table *t = NULL;
> +	void *page = NULL;
> +	char *s;
> +	int err;
> +
> +	err = -E2BIG;
> +	if (count >= PAGE_SIZE)
> +		goto out;
> +
> +	err = -ENOMEM;
> +	t = kmalloc(sizeof(*t), GFP_KERNEL);
> +	if (!t)
> +		goto out;
> +
> +	page = (void *)__get_free_page(GFP_KERNEL);
> +	if (!page)
> +		goto out;
> +
> +	err = copy_from_user(page, buf, count);
> +	if (err)
> +		goto out;
> +
> +	s = page;
> +	s[count] = '\0';
> +	t->size = 0;
> +	while (*(s = skip_spaces(s))) {
> +		err = -E2BIG;
> +		if (t->size == MAX_CPUID_OVERRIDE_ENTRIES)
> +			goto out;
> +		err = -EINVAL;
> +		if (cpuid_override_entry_parse(s, &s, &t->entries[t->size++]))
> +			goto out;
> +	}
> +	if (!t->size) {
> +		kfree(t);
> +		t = NULL;
> +	}
> +	err = 0;
> +out:
> +	free_page((unsigned long)page);
> +
> +	if (!err)
> +		cpuid_override_update(t);
> +	else
> +		kfree(t);
> +
> +	return err ?: count;
> +}
> +
> +static void *__cpuid_override_seq_start(loff_t pos)
> +{
> +	struct cpuid_override_table *t = rcu_dereference(cpuid_override);
> +	return t && pos < t->size ? &t->entries[pos] : NULL;
> +}
> +
> +static void *cpuid_override_seq_start(struct seq_file *seq, loff_t *ppos)
> +{
> +	rcu_read_lock();
> +	return __cpuid_override_seq_start(*ppos);
> +}
> +
> +static void *cpuid_override_seq_next(struct seq_file *seq,
> +				     void *v, loff_t *ppos)
> +{
> +	++*ppos;
> +	return __cpuid_override_seq_start(*ppos);
> +}
> +
> +static void cpuid_override_seq_stop(struct seq_file *s, void *v)
> +{
> +	rcu_read_unlock();
> +}
> +
> +static int cpuid_override_seq_show(struct seq_file *s, void *v)
> +{
> +	struct cpuid_override_entry *e = v;
> +
> +	seq_printf(s, "0x%08x", e->op);
> +	if (e->has_count)
> +		seq_printf(s, " 0x%08x", e->count);
> +	seq_printf(s, ": 0x%08x 0x%08x 0x%08x 0x%08x\n",
> +		   e->eax, e->ebx, e->ecx, e->edx);
> +	return 0;
> +}
> +
> +static struct seq_operations cpuid_override_seq_ops = {
> +	.start = cpuid_override_seq_start,
> +	.next  = cpuid_override_seq_next,
> +	.stop  = cpuid_override_seq_stop,
> +	.show  = cpuid_override_seq_show,
> +};
> +
> +static int cpuid_override_seq_open(struct inode *inode, struct file *file)
> +{
> +	return seq_open(file, &cpuid_override_seq_ops);
> +}
> +
> +static struct file_operations proc_cpuid_override_ops = {
> +	.owner   = THIS_MODULE,
> +	.open    = cpuid_override_seq_open,
> +	.read    = seq_read,
> +	.llseek  = seq_lseek,
> +	.release = seq_release,
> +	.write   = cpuid_override_write,
> +};
> +
> +static int __init cpuid_fault_init(void)
> +{
> +	struct proc_dir_entry *proc;
> +
> +	if (!static_cpu_has(X86_FEATURE_CPUID_FAULT))
> +		return 0;
> +
> +	proc = proc_create("cpuid_override", 0644, proc_vz_dir,
> +			   &proc_cpuid_override_ops);
> +	if (!proc)
> +		return -ENOMEM;
> +
> +	return 0;
> +}
> +module_init(cpuid_fault_init);
> diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
> index 4b96d9a574ff..c43e3b80e50f 100644
> --- a/arch/x86/kernel/traps.c
> +++ b/arch/x86/kernel/traps.c
> @@ -518,6 +518,27 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
>  	do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, 0, NULL);
>  }
>  
> +static int check_cpuid_fault(struct pt_regs *regs, long error_code)
> +{
> +	unsigned long addr;
> +	unsigned short opcode;
> +
> +	if (error_code != 0)
> +		return 0;
> +
> +	addr = convert_ip_to_linear(current, regs);
> +	if (get_user(opcode, (unsigned short __user *)addr))
> +		return 0;
> +
> +	if (opcode != 0xa20f)
> +		return 0;
> +
> +	do_cpuid_fault(regs);
> +
> +	regs->ip += 2;
> +	return 1;
> +}
> +
>  dotraplinkage void
>  do_general_protection(struct pt_regs *regs, long error_code)
>  {
> @@ -551,6 +572,9 @@ do_general_protection(struct pt_regs *regs, long error_code)
>  		return;
>  	}
>  
> +	if (check_cpuid_fault(regs, error_code))
> +		return;
> +
>  	tsk->thread.error_code = error_code;
>  	tsk->thread.trap_nr = X86_TRAP_GP;
>  
> 



More information about the Devel mailing list