[CRIU] [PATCH 04/18] x86: cpu -- Detect and save fpu status in info

Cyrill Gorcunov gorcunov at gmail.com
Thu Jul 19 15:47:32 MSK 2018


 - extend compel_cpuinfo_t to keep all fpu information
   needed for xsaves mode

 - fetch xsaves data in compel_cpuid

All this will allow us to extend criu to support
avx-512 instructions.

Signed-off-by: Cyrill Gorcunov <gorcunov at gmail.com>
---
 compel/arch/x86/src/lib/cpu.c                  | 206 ++++++++++++++++++++++++-
 compel/arch/x86/src/lib/include/uapi/asm/cpu.h |  15 ++
 compel/arch/x86/src/lib/include/uapi/asm/fpu.h | 185 +++++++++++++++++++++-
 3 files changed, 399 insertions(+), 7 deletions(-)

diff --git a/compel/arch/x86/src/lib/cpu.c b/compel/arch/x86/src/lib/cpu.c
index 93e430face6f..4657f9723ba6 100644
--- a/compel/arch/x86/src/lib/cpu.c
+++ b/compel/arch/x86/src/lib/cpu.c
@@ -6,6 +6,7 @@
 #include "common/compiler.h"
 
 #include "log.h"
+#include "common/bug.h"
 
 #undef	LOG_PREFIX
 #define LOG_PREFIX "cpu: "
@@ -13,6 +14,40 @@
 static compel_cpuinfo_t rt_info;
 static bool rt_info_done = false;
 
+/*
+ * Although we spell it out in here, the Processor Trace
+ * xfeature is completely unused. We use other mechanisms
+ * to save/restore PT state in Linux.
+ */
+
+static const char * const xfeature_names[] = {
+	"x87 floating point registers"	,
+	"SSE registers"			,
+	"AVX registers"			,
+	"MPX bounds registers"		,
+	"MPX CSR"			,
+	"AVX-512 opmask"		,
+	"AVX-512 Hi256"			,
+	"AVX-512 ZMM_Hi256"		,
+	"Processor Trace"		,
+	"Protection Keys User registers",
+	"Hardware Duty Cycling"		,
+};
+
+static short xsave_cpuid_features[] = {
+	X86_FEATURE_FPU,
+	X86_FEATURE_XMM,
+	X86_FEATURE_AVX,
+	X86_FEATURE_MPX,
+	X86_FEATURE_MPX,
+	X86_FEATURE_AVX512F,
+	X86_FEATURE_AVX512F,
+	X86_FEATURE_AVX512F,
+	X86_FEATURE_INTEL_PT,
+	X86_FEATURE_PKU,
+	X86_FEATURE_HDC,
+};
+
 void compel_set_cpu_cap(compel_cpuinfo_t *c, unsigned int feature)
 {
 	if (likely(feature < NCAPINTS_BITS))
@@ -32,6 +67,172 @@ int compel_test_cpu_cap(compel_cpuinfo_t *c, unsigned int feature)
 	return 0;
 }
 
+static int compel_fpuid(compel_cpuinfo_t *c)
+{
+	unsigned int last_good_offset;
+	uint32_t eax, ebx, ecx, edx;
+	size_t i;
+
+	BUILD_BUG_ON(ARRAY_SIZE(xsave_cpuid_features) !=
+		     ARRAY_SIZE(xfeature_names));
+
+	if (!compel_test_cpu_cap(c, X86_FEATURE_FPU)) {
+		pr_err("fpu: No FPU detected\n");
+		return -1;
+	}
+
+	if (!compel_test_cpu_cap(c, X86_FEATURE_XSAVE)) {
+		pr_info("fpu: x87 FPU will use %s\n",
+			compel_test_cpu_cap(c, X86_FEATURE_FXSR) ?
+			"FXSAVE" : "FSAVE");
+		return 0;
+	}
+
+	cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
+	c->xfeatures_mask = eax + ((uint64_t)edx << 32);
+
+	if ((c->xfeatures_mask & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
+		/*
+		 * This indicates that something really unexpected happened
+		 * with the enumeration.
+		 */
+		pr_err("fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx\n",
+		       (unsigned long long)c->xfeatures_mask);
+		return -1;
+	}
+
+	/*
+	 * Clear XSAVE features that are disabled in the normal CPUID.
+	 */
+	for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) {
+		if (!compel_test_cpu_cap(c, xsave_cpuid_features[i]))
+			c->xfeatures_mask &= ~(1 << i);
+	}
+
+	c->xfeatures_mask &= XCNTXT_MASK;
+	c->xfeatures_mask &= ~XFEATURE_MASK_SUPERVISOR;
+
+	/*
+	 * xsaves is not enabled in userspace, so
+	 * xsaves is mostly for debug purpose.
+	 */
+	cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
+	c->xsave_size = ebx;
+	c->xsave_size_max = ecx;
+
+	cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
+	c->xsaves_size = ebx;
+
+	pr_debug("fpu: xfeatures_mask 0x%llx xsave_size %u xsave_size_max %u xsaves_size %u\n",
+		 (unsigned long long)c->xfeatures_mask,
+		 c->xsave_size, c->xsave_size_max, c->xsaves_size);
+
+	if (c->xsave_size_max > sizeof(struct xsave_struct))
+		pr_warn_once("fpu: max xsave frame exceeds xsave_struct (%u %u)\n",
+			     c->xsave_size_max, (unsigned)sizeof(struct xsave_struct));
+
+	memset(c->xstate_offsets, 0xff, sizeof(c->xstate_offsets));
+	memset(c->xstate_sizes, 0xff, sizeof(c->xstate_sizes));
+	memset(c->xstate_comp_offsets, 0xff, sizeof(c->xstate_comp_offsets));
+	memset(c->xstate_comp_sizes, 0xff, sizeof(c->xstate_comp_sizes));
+
+	/* start at the beginning of the "extended state" */
+	last_good_offset = offsetof(struct xsave_struct, extended_state_area);
+
+	/*
+	 * The FP xstates and SSE xstates are legacy states. They are always
+	 * in the fixed offsets in the xsave area in either compacted form
+	 * or standard form.
+	 */
+	c->xstate_offsets[0]	= 0;
+	c->xstate_sizes[0]	= offsetof(struct i387_fxsave_struct, xmm_space);
+	c->xstate_offsets[1]	= c->xstate_sizes[0];
+	c->xstate_sizes[1]	= FIELD_SIZEOF(struct i387_fxsave_struct, xmm_space);
+
+	for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
+		if (!(c->xfeatures_mask & (1UL << i)))
+			continue;
+
+		/*
+		 * If an xfeature is supervisor state, the offset
+		 * in EBX is invalid. We leave it to -1.
+		 *
+		 * SDM says: If state component 'i' is a user state component,
+		 * ECX[0] return 0; if state component i is a supervisor
+		 * state component, ECX[0] returns 1.
+		 */
+		cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
+		if (!(ecx & 1))
+			c->xstate_offsets[i] = ebx;
+
+		c->xstate_sizes[i] = eax;
+
+		/*
+		 * In our xstate size checks, we assume that the
+		 * highest-numbered xstate feature has the
+		 * highest offset in the buffer.  Ensure it does.
+		 */
+		if (last_good_offset > c->xstate_offsets[i])
+			pr_warn_once("fpu: misordered xstate %d %d\n",
+				     last_good_offset, c->xstate_offsets[i]);
+
+		last_good_offset = c->xstate_offsets[i];
+	}
+
+	BUILD_BUG_ON(sizeof(c->xstate_offsets) != sizeof(c->xstate_sizes));
+	BUILD_BUG_ON(sizeof(c->xstate_comp_offsets) != sizeof(c->xstate_comp_sizes));
+
+	c->xstate_comp_offsets[0]	= 0;
+	c->xstate_comp_sizes[0]		= offsetof(struct i387_fxsave_struct, xmm_space);
+	c->xstate_comp_offsets[1]	= c->xstate_comp_sizes[0];
+	c->xstate_comp_sizes[1]		= FIELD_SIZEOF(struct i387_fxsave_struct, xmm_space);
+
+	if (!compel_test_cpu_cap(c, X86_FEATURE_XSAVES)) {
+		for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
+			if ((c->xfeatures_mask & (1UL << i))) {
+				c->xstate_comp_offsets[i] = c->xstate_offsets[i];
+				c->xstate_comp_sizes[i] = c->xstate_sizes[i];
+			}
+		}
+	} else {
+		c->xstate_comp_offsets[FIRST_EXTENDED_XFEATURE] =
+			FXSAVE_SIZE + XSAVE_HDR_SIZE;
+
+		for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
+			if ((c->xfeatures_mask & (1UL << i)))
+				c->xstate_comp_sizes[i] = c->xstate_sizes[i];
+			else
+				c->xstate_comp_sizes[i] = 0;
+
+			if (i > FIRST_EXTENDED_XFEATURE) {
+				c->xstate_comp_offsets[i] = c->xstate_comp_offsets[i-1]
+					+ c->xstate_comp_sizes[i-1];
+
+				/*
+				 * The value returned by ECX[1] indicates the alignment
+				 * of state component 'i' when the compacted format
+				 * of the extended region of an XSAVE area is used:
+				 */
+				cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx);
+				if (ecx & 2)
+					c->xstate_comp_offsets[i] = ALIGN(c->xstate_comp_offsets[i], 64);
+			}
+		}
+	}
+
+	if (!pr_quelled(COMPEL_LOG_DEBUG)) {
+		for (i = 0; i < ARRAY_SIZE(c->xstate_offsets); i++) {
+			if (!(c->xfeatures_mask & (1UL << i)))
+				continue;
+			pr_debug("fpu: %-32s xstate_offsets %6d / %-6d xstate_sizes %6d / %-6d\n",
+				 xfeature_names[i], c->xstate_offsets[i], c->xstate_comp_offsets[i],
+				 c->xstate_sizes[i], c->xstate_comp_sizes[i]);
+		}
+	}
+
+	return 0;
+}
+
 int compel_cpuid(compel_cpuinfo_t *c)
 {
 	uint32_t eax, ebx, ecx, edx;
@@ -222,7 +423,10 @@ int compel_cpuid(compel_cpuinfo_t *c)
 		break;
 	}
 
-	return 0;
+	pr_debug("x86_family %u x86_vendor_id %s x86_model_id %s\n",
+		 c->x86_family, c->x86_vendor_id, c->x86_model_id);
+
+	return compel_fpuid(c);
 }
 
 bool compel_cpu_has_feature(unsigned int feature)
diff --git a/compel/arch/x86/src/lib/include/uapi/asm/cpu.h b/compel/arch/x86/src/lib/include/uapi/asm/cpu.h
index 65f0576f2689..6a0c91af490d 100644
--- a/compel/arch/x86/src/lib/include/uapi/asm/cpu.h
+++ b/compel/arch/x86/src/lib/include/uapi/asm/cpu.h
@@ -3,6 +3,8 @@
 
 #include <stdint.h>
 
+#include <compel/asm/fpu.h>
+
 /*
  * Adopted from linux kernel and enhanced from Intel/AMD manuals.
  * Note these bits are not ABI for linux kernel but they _are_
@@ -277,6 +279,7 @@ enum cpuid_leafs {
 #define X86_FEATURE_HWP_ACT_WINDOW	(14*32+ 9) /* HWP Activity Window */
 #define X86_FEATURE_HWP_EPP		(14*32+10) /* HWP Energy Perf. Preference */
 #define X86_FEATURE_HWP_PKG_REQ		(14*32+11) /* HWP Package Level Request */
+#define X86_FEATURE_HDC			(14*32+13) /* HDC base registers present */
 
 /* AMD SVM Feature Identification, CPUID level 0x8000000a (EDX), word 15 */
 #define X86_FEATURE_NPT			(15*32+ 0) /* Nested Page Table support */
@@ -318,6 +321,7 @@ enum {
 };
 
 struct cpuinfo_x86 {
+	/* cpu context */
 	uint8_t			x86_family;
 	uint8_t			x86_vendor;
 	uint8_t			x86_model;
@@ -328,6 +332,17 @@ struct cpuinfo_x86 {
 	int			cpuid_level;
 	char			x86_vendor_id[16];
 	char			x86_model_id[64];
+
+	/* fpu context */
+	uint64_t		xfeatures_mask;
+	uint32_t		xsave_size_max;
+	uint32_t		xsave_size;
+	uint32_t		xstate_offsets[XFEATURE_MAX];
+	uint32_t		xstate_sizes[XFEATURE_MAX];
+
+	uint32_t		xsaves_size;
+	uint32_t		xstate_comp_offsets[XFEATURE_MAX];
+	uint32_t		xstate_comp_sizes[XFEATURE_MAX];
 };
 
 typedef struct cpuinfo_x86 compel_cpuinfo_t;
diff --git a/compel/arch/x86/src/lib/include/uapi/asm/fpu.h b/compel/arch/x86/src/lib/include/uapi/asm/fpu.h
index dca280bdb61d..b18c9175768f 100644
--- a/compel/arch/x86/src/lib/include/uapi/asm/fpu.h
+++ b/compel/arch/x86/src/lib/include/uapi/asm/fpu.h
@@ -19,7 +19,66 @@
 #define XSTATE_YMM			0x4
 
 #define FXSAVE_SIZE			512
-#define XSAVE_SIZE			832
+#define XSAVE_SIZE			4096
+
+#define XSAVE_HDR_SIZE			64
+#define XSAVE_HDR_OFFSET		FXSAVE_SIZE
+
+#define XSAVE_YMM_SIZE			256
+#define XSAVE_YMM_OFFSET		(XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET)
+
+/*
+ * List of XSAVE features Linux knows about:
+ */
+enum xfeature {
+	XFEATURE_FP,
+	XFEATURE_SSE,
+	/*
+	 * Values above here are "legacy states".
+	 * Those below are "extended states".
+	 */
+	XFEATURE_YMM,
+	XFEATURE_BNDREGS,
+	XFEATURE_BNDCSR,
+	XFEATURE_OPMASK,
+	XFEATURE_ZMM_Hi256,
+	XFEATURE_Hi16_ZMM,
+	XFEATURE_PT,
+	XFEATURE_PKRU,
+	XFEATURE_HDC,
+
+	XFEATURE_MAX,
+};
+
+#define XSTATE_CPUID			0x0000000d
+
+#define XFEATURE_MASK_FP		(1 << XFEATURE_FP)
+#define XFEATURE_MASK_SSE		(1 << XFEATURE_SSE)
+#define XFEATURE_MASK_YMM		(1 << XFEATURE_YMM)
+#define XFEATURE_MASK_BNDREGS		(1 << XFEATURE_BNDREGS)
+#define XFEATURE_MASK_BNDCSR		(1 << XFEATURE_BNDCSR)
+#define XFEATURE_MASK_OPMASK		(1 << XFEATURE_OPMASK)
+#define XFEATURE_MASK_ZMM_Hi256		(1 << XFEATURE_ZMM_Hi256)
+#define XFEATURE_MASK_Hi16_ZMM		(1 << XFEATURE_Hi16_ZMM)
+#define XFEATURE_MASK_PT		(1 << XFEATURE_PT)
+#define XFEATURE_MASK_PKRU		(1 << XFEATURE_PKRU)
+#define XFEATURE_MASK_HDC		(1 << XFEATURE_HDC)
+
+#define XFEATURE_MASK_FPSSE		(XFEATURE_MASK_FP | XFEATURE_MASK_SSE)
+#define XFEATURE_MASK_AVX512		(XFEATURE_MASK_OPMASK | XFEATURE_MASK_ZMM_Hi256 | XFEATURE_MASK_Hi16_ZMM)
+
+#define FIRST_EXTENDED_XFEATURE		XFEATURE_YMM
+
+/* Supervisor features */
+#define XFEATURE_MASK_SUPERVISOR	(XFEATURE_MASK_PT | XFEATURE_MASK_HDC)
+
+/* All currently supported features */
+#define XCNTXT_MASK							  \
+	(XFEATURE_MASK_FP		| XFEATURE_MASK_SSE		| \
+	 XFEATURE_MASK_YMM		| XFEATURE_MASK_OPMASK		| \
+	 XFEATURE_MASK_ZMM_Hi256	| XFEATURE_MASK_Hi16_ZMM	| \
+	 XFEATURE_MASK_PKRU		| XFEATURE_MASK_BNDREGS		| \
+	 XFEATURE_MASK_BNDCSR)
 
 struct fpx_sw_bytes {
 	uint32_t			magic1;
@@ -66,27 +125,141 @@ struct i387_fxsave_struct {
 
 struct xsave_hdr_struct {
 	uint64_t			xstate_bv;
-	uint64_t			reserved1[2];
-	uint64_t			reserved2[5];
+	uint64_t			xcomp_bv;
+	uint64_t			reserved[6];
 } __packed;
 
+/*
+ * xstate_header.xcomp_bv[63] indicates that the extended_state_area
+ * is in compacted format.
+ */
+#define XCOMP_BV_COMPACTED_FORMAT	((uint64_t)1 << 63)
+
+/*
+ * State component 2:
+ *
+ * There are 16x 256-bit AVX registers named YMM0-YMM15.
+ * The low 128 bits are aliased to the 16 SSE registers (XMM0-XMM15)
+ * and are stored in 'struct fxregs_state::xmm_space[]' in the
+ * "legacy" area.
+ *
+ * The high 128 bits are stored here.
+ */
 struct ymmh_struct {
-	uint32_t			ymmh_space[64];
+	uint32_t                        ymmh_space[64];
+} __packed;
+
+/* Intel MPX support: */
+
+struct mpx_bndreg {
+	uint64_t			lower_bound;
+	uint64_t			upper_bound;
+} __packed;
+
+/*
+ * State component 3 is used for the 4 128-bit bounds registers
+ */
+struct mpx_bndreg_state {
+	struct mpx_bndreg		bndreg[4];
+} __packed;
+
+/*
+ * State component 4 is used for the 64-bit user-mode MPX
+ * configuration register BNDCFGU and the 64-bit MPX status
+ * register BNDSTATUS.  We call the pair "BNDCSR".
+ */
+struct mpx_bndcsr {
+	uint64_t			bndcfgu;
+	uint64_t			bndstatus;
 } __packed;
 
+/*
+ * The BNDCSR state is padded out to be 64-bytes in size.
+ */
+struct mpx_bndcsr_state {
+	union {
+		struct mpx_bndcsr	bndcsr;
+		uint8_t			pad_to_64_bytes[64];
+	};
+} __packed;
+
+/* AVX-512 Components: */
+
+/*
+ * State component 5 is used for the 8 64-bit opmask registers
+ * k0-k7 (opmask state).
+ */
+struct avx_512_opmask_state {
+	uint64_t			opmask_reg[8];
+} __packed;
+
+/*
+ * State component 6 is used for the upper 256 bits of the
+ * registers ZMM0-ZMM15. These 16 256-bit values are denoted
+ * ZMM0_H-ZMM15_H (ZMM_Hi256 state).
+ */
+struct avx_512_zmm_uppers_state {
+	uint64_t			zmm_upper[16 * 4];
+} __packed;
+
+/*
+ * State component 7 is used for the 16 512-bit registers
+ * ZMM16-ZMM31 (Hi16_ZMM state).
+ */
+struct avx_512_hi16_state {
+	uint64_t			hi16_zmm[16 * 8];
+} __packed;
+
+/*
+ * State component 9: 32-bit PKRU register.  The state is
+ * 8 bytes long but only 4 bytes is used currently.
+ */
+struct pkru_state {
+	uint32_t			pkru;
+	uint32_t			pad;
+} __packed;
+
+/*
+ * This is our most modern FPU state format, as saved by the XSAVE
+ * and restored by the XRSTOR instructions.
+ *
+ * It consists of a legacy fxregs portion, an xstate header and
+ * subsequent areas as defined by the xstate header. Not all CPUs
+ * support all the extensions, so the size of the extended area
+ * can vary quite a bit between CPUs.
+ *
+ *
+ * One page should be enough for the whole xsave state.
+ */
+#define EXTENDED_STATE_AREA_SIZE	(4096 - sizeof(struct i387_fxsave_struct) - sizeof(struct xsave_hdr_struct))
+
 /*
  * cpu requires it to be 64 byte aligned
  */
 struct xsave_struct {
 	struct i387_fxsave_struct	i387;
 	struct xsave_hdr_struct		xsave_hdr;
-	struct ymmh_struct		ymmh;
+	union {
+		/*
+		 * This ymmh is unneeded; kept for
+		 * backward compatibility.
+		 */
+		struct ymmh_struct	ymmh;
+		uint8_t			extended_state_area[EXTENDED_STATE_AREA_SIZE];
+	};
 } __aligned(FP_MIN_ALIGN_BYTES) __packed;
 
 struct xsave_struct_ia32 {
 	struct i387_fxsave_struct	i387;
 	struct xsave_hdr_struct		xsave_hdr;
-	struct ymmh_struct		ymmh;
+	union {
+		/*
+		 * This ymmh is unneeded; kept for
+		 * backward compatibility.
+		 */
+		struct ymmh_struct	ymmh;
+		uint8_t			extended_state_area[EXTENDED_STATE_AREA_SIZE];
+	};
 } __aligned(FXSAVE_ALIGN_BYTES) __packed;
 
 typedef struct {
-- 
2.14.4



More information about the CRIU mailing list