[coreboot] New patch to review for coreboot: 8474b56 haswell: support for parallel SMM relocation

Tue Mar 19 01:59:08 CET 2013

Stefan Reinauer (stefan.reinauer at coreboot.org) just uploaded a new patch set to gerrit, which you can find at http://review.coreboot.org/2801

-gerrit

commit 8474b569ddb0b072ed07bce6ec40d50a4010f8b6
Author: Aaron Durbin <adurbin at chromium.org>
Date:   Wed Feb 13 11:22:25 2013 -0600

    haswell: support for parallel SMM relocation
    
    The haswell processors support the ability to save their SMM state
    into MSR space instead of the memory. This feaure allows for parallel
    SMM relocation handlers as well as setting the same SMBASE for each
    CPU since the save state memory area is not used.
    
    The catch is that in order determine if this feature is available the
    CPU needs to be in SMM context. In order to implement parallel SMM
    relocation the BSP enters the relocation handler twice. The first time
    is to determine if that feature is available. If it is, then that
    feature is enabled the BSP exits the relocation handler without
    relocating SMBASE. It then releases the APs to run the SMM relocation
    handler. After the APs have completed the relocation the BSP will
    re-enter the SMM relocation handler to relocate its own SMBASE to the
    final location.  If the parallel SMM feature is not available the BSP
    relocates its SMBASE as it did before.
    
    This change also introduces the BSP waiting for the APs to relocate
    their SMBASE before proceeding with the remainder of the boot process.
    
    Ensured both the parallel path and the serial path still continue
    to work on cold, warm, and S3 resume paths.
    
    Change-Id: Iea24fd8f9561f1b194393cdb77c79adb48039ea2
    Signed-off-by: Aaron Durbin <adurbin at chromium.org>
---
 src/cpu/intel/haswell/haswell.h      |   3 +-
 src/cpu/intel/haswell/haswell_init.c |   3 -
 src/cpu/intel/haswell/mp_init.c      |  52 ++++++++++---
 src/cpu/intel/haswell/smmrelocate.c  | 142 +++++++++++++++++++++++++++++------
 4 files changed, 163 insertions(+), 37 deletions(-)

diff --git a/src/cpu/intel/haswell/haswell.h b/src/cpu/intel/haswell/haswell.h
index 26807e9..a510e7d 100644
--- a/src/cpu/intel/haswell/haswell.h
+++ b/src/cpu/intel/haswell/haswell.h
@@ -144,6 +144,7 @@ int cpu_config_tdp_levels(void);
 /* Returns 0 on success, < 0 on failure. */
 int smm_initialize(void);
 void smm_initiate_relocation(void);
+void smm_initiate_relocation_parallel(void);
 struct bus;
 void bsp_init_and_start_aps(struct bus *cpu_bus);
 /* Returns 0 on succes. < 0 on failure. */
@@ -151,7 +152,7 @@ int setup_ap_init(struct bus *cpu_bus, int *max_cpus,
                   const void *microcode_patch);
 /* Returns 0 on success, < 0 on failure. */
 int start_aps(struct bus *cpu_bus, int max_cpus);
-void release_aps_for_smm_relocation(void);
+void release_aps_for_smm_relocation(int do_parallel_relocation);
 #endif
 #endif
 
diff --git a/src/cpu/intel/haswell/haswell_init.c b/src/cpu/intel/haswell/haswell_init.c
index 82430b7..c7f89ee 100644
--- a/src/cpu/intel/haswell/haswell_init.c
+++ b/src/cpu/intel/haswell/haswell_init.c
@@ -549,9 +549,6 @@ void bsp_init_and_start_aps(struct bus *cpu_bus)
 		return;
 	}
 
-	/* Release APs to perform SMM relocation. */
-	release_aps_for_smm_relocation();
-
 	/* After SMM relocation a 2nd microcode load is required. */
 	intel_microcode_load_unlocked(microcode_patch);
 }
diff --git a/src/cpu/intel/haswell/mp_init.c b/src/cpu/intel/haswell/mp_init.c
index 7f15c39..c8bd5c2 100644
--- a/src/cpu/intel/haswell/mp_init.c
+++ b/src/cpu/intel/haswell/mp_init.c
@@ -75,9 +75,16 @@ static device_t cpu_devs[CONFIG_MAX_CPUS];
 
 /* Number of APs checked that have checked in. */
 static atomic_t num_aps;
+/* Number of APs that have relocated their SMM handler. */
+static atomic_t num_aps_relocated_smm;
 /* Barrier to stop APs from performing SMM relcoation. */
 static int smm_relocation_barrier_begin __attribute__ ((aligned (64)));
 
+static inline void mfence(void)
+{
+	__asm__ __volatile__("mfence\t\n": : :"memory");
+}
+
 static inline void wait_for_barrier(volatile int *barrier)
 {
 	while (*barrier == 0) {
@@ -95,13 +102,18 @@ static void ap_wait_for_smm_relocation_begin(void)
 	wait_for_barrier(&smm_relocation_barrier_begin);
 }
 
+/* This function pointer is used by the non-BSP CPUs to initiate relocation. It
+ * points to either a serial or parallel SMM initiation. */
+static void (*ap_initiate_smm_relocation)(void) = &smm_initiate_relocation;
+
 
 /* Returns 1 if timeout waiting for APs. 0 if target aps found. */
-static int wait_for_aps(int target, int total_delay, int delay_step)
+static int wait_for_aps(atomic_t *val, int target, int total_delay,
+                        int delay_step)
 {
 	int timeout = 0;
 	int delayed = 0;
-	while (atomic_read(&num_aps) != target) {
+	while (atomic_read(val) != target) {
 		udelay(delay_step);
 		delayed += delay_step;
 		if (delayed >= total_delay) {
@@ -113,9 +125,19 @@ static int wait_for_aps(int target, int total_delay, int delay_step)
 	return timeout;
 }
 
-void release_aps_for_smm_relocation(void)
+void release_aps_for_smm_relocation(int do_parallel)
 {
+	/* Change the AP SMM initiation function, and ensure it is visible
+	 * before releasing the APs. */
+	if (do_parallel) {
+		ap_initiate_smm_relocation = &smm_initiate_relocation_parallel;
+		mfence();
+	}
 	release_barrier(&smm_relocation_barrier_begin);
+	/* Wait for CPUs to relocate their SMM handler up to 100ms. */
+	if (wait_for_aps(&num_aps_relocated_smm, atomic_read(&num_aps),
+	                 100000 /* 100 ms */, 200 /* us */))
+		printk(BIOS_DEBUG, "Timed out waiting for AP SMM relocation\n");
 }
 
 /* The mtrr code sets up ROM caching on the BSP, but not the others. However,
@@ -172,7 +194,10 @@ ap_init(unsigned int cpu, void *microcode_ptr)
 
 	ap_wait_for_smm_relocation_begin();
 
-	smm_initiate_relocation();
+	ap_initiate_smm_relocation();
+
+	/* Indicate that SMM relocation has occured on this thread. */
+	atomic_inc(&num_aps_relocated_smm);
 
 	/* After SMM relocation a 2nd microcode load is required. */
 	intel_microcode_load_unlocked(microcode_ptr);
@@ -483,7 +508,7 @@ int start_aps(struct bus *cpu_bus, int ap_count)
 		printk(BIOS_DEBUG, "done.\n");
 	}
 	/* Wait for CPUs to check in up to 200 us. */
-	wait_for_aps(ap_count, 200 /* us */, 15 /* us */);
+	wait_for_aps(&num_aps, ap_count, 200 /* us */, 15 /* us */);
 
 	/* Send 2nd SIPI */
 	if ((lapic_read(LAPIC_ICR) & LAPIC_ICR_BUSY)) {
@@ -507,7 +532,7 @@ int start_aps(struct bus *cpu_bus, int ap_count)
 	}
 
 	/* Wait for CPUs to check in. */
-	if (wait_for_aps(ap_count, 10000 /* 10 ms */, 50 /* us */)) {
+	if (wait_for_aps(&num_aps, ap_count, 10000 /* 10 ms */, 50 /* us */)) {
 		printk(BIOS_DEBUG, "Not all APs checked in: %d/%d.\n",
 		       atomic_read(&num_aps), ap_count);
 		return -1;
@@ -516,17 +541,12 @@ int start_aps(struct bus *cpu_bus, int ap_count)
 	return 0;
 }
 
-DECLARE_SPIN_LOCK(smm_relocation_lock);
-
-void smm_initiate_relocation(void)
+void smm_initiate_relocation_parallel(void)
 {
-	spin_lock(&smm_relocation_lock);
-
 	if ((lapic_read(LAPIC_ICR) & LAPIC_ICR_BUSY)) {
 		printk(BIOS_DEBUG, "Waiting for ICR not to be busy...");
 		if (apic_wait_timeout(1000 /* 1 ms */, 50)) {
 			printk(BIOS_DEBUG, "timed out. Aborting.\n");
-			spin_unlock(&smm_relocation_lock);
 			return;
 		} else
 			printk(BIOS_DEBUG, "done.\n");
@@ -539,6 +559,14 @@ void smm_initiate_relocation(void)
 	} else
 		printk(BIOS_DEBUG, "Relocation complete.\n");
 
+}
+
+DECLARE_SPIN_LOCK(smm_relocation_lock);
+
+void smm_initiate_relocation(void)
+{
+	spin_lock(&smm_relocation_lock);
+	smm_initiate_relocation_parallel();
 	spin_unlock(&smm_relocation_lock);
 }
 
diff --git a/src/cpu/intel/haswell/smmrelocate.c b/src/cpu/intel/haswell/smmrelocate.c
index 2bf304e..2a322a7 100644
--- a/src/cpu/intel/haswell/smmrelocate.c
+++ b/src/cpu/intel/haswell/smmrelocate.c
@@ -36,6 +36,14 @@
 #define EMRRphysMask_MSR 0x1f5
 #define UNCORE_EMRRphysBase_MSR 0x2f4
 #define UNCORE_EMRRphysMask_MSR 0x2f5
+#define SMM_MCA_CAP_MSR 0x17d
+#define   SMM_CPU_SVRSTR_BIT 57
+#define   SMM_CPU_SVRSTR_MASK (1 << (SMM_CPU_SVRSTR_BIT - 32))
+#define SMM_FEATURE_CONTROL_MSR 0x4e0
+#define   SMM_CPU_SAVE_EN (1 << 1)
+/* SMM save state MSRs */
+#define SMBASE_MSR 0xc20
+#define IEDBASE_MSR 0xc22
 
 #define SMRR_SUPPORTED (1<<11)
 #define EMRR_SUPPORTED (1<<12)
@@ -51,6 +59,10 @@ struct smm_relocation_params {
 	msr_t emrr_mask;
 	msr_t uncore_emrr_base;
 	msr_t uncore_emrr_mask;
+	/* The smm_save_state_in_msrs field indicates if SMM save state
+	 * locations live in MSRs. This indicates to the CPUs how to adjust
+	 * the SMMBASE and IEDBASE */
+	int smm_save_state_in_msrs;
 };
 
 /* This gets filled in and used during relocation. */
@@ -82,13 +94,79 @@ static inline void write_uncore_emrr(struct smm_relocation_params *relo_params)
 	wrmsr(UNCORE_EMRRphysMask_MSR, relo_params->uncore_emrr_mask);
 }
 
+static void update_save_state(int cpu,
+                              struct smm_relocation_params *relo_params,
+                              const struct smm_runtime *runtime)
+{
+	u32 smbase;
+	u32 iedbase;
+
+	/* The relocated handler runs with all CPUs concurrently. Therefore
+	 * stagger the entry points adjusting SMBASE downwards by save state
+	 * size * CPU num. */
+	smbase = relo_params->smram_base - cpu * runtime->save_state_size;
+	iedbase = relo_params->ied_base;
+
+	printk(BIOS_DEBUG, "New SMBASE=0x%08x IEDBASE=0x%08x\n",
+	       smbase, iedbase);
+
+	/* All threads need to set IEDBASE and SMBASE to the relocated
+	 * handler region. However, the save state location depends on the
+	 * smm_save_state_in_msrs field in the relocation parameters. If
+	 * smm_save_state_in_msrs is non-zero then the CPUs are relocating
+	 * the SMM handler in parallel, and each CPUs save state area is
+	 * located in their respective MSR space. If smm_save_state_in_msrs
+	 * is zero then the SMM relocation is happening serially so the
+	 * save state is at the same default location for all CPUs. */
+	if (relo_params->smm_save_state_in_msrs) {
+		msr_t smbase_msr;
+		msr_t iedbase_msr;
+
+		smbase_msr.lo = smbase;
+		smbase_msr.hi = 0;
+
+		/* According the BWG the IEDBASE MSR is in bits 63:32. It's
+		 * not clear why it differs from the SMBASE MSR. */
+		iedbase_msr.lo = 0;
+		iedbase_msr.hi = iedbase;
+
+		wrmsr(SMBASE_MSR, smbase_msr);
+		wrmsr(IEDBASE_MSR, iedbase_msr);
+	} else {
+		em64t101_smm_state_save_area_t *save_state;
+
+		save_state = (void *)(runtime->smbase + SMM_DEFAULT_SIZE -
+				      runtime->save_state_size);
+
+		save_state->smbase = smbase;
+		save_state->iedbase = iedbase;
+	}
+}
+
+/* Returns 1 if SMM MSR save state was set. */
+static int bsp_setup_msr_save_state(struct smm_relocation_params *relo_params)
+{
+	msr_t smm_mca_cap;
+
+	smm_mca_cap = rdmsr(SMM_MCA_CAP_MSR);
+	if (smm_mca_cap.hi & SMM_CPU_SVRSTR_MASK) {
+		msr_t smm_feature_control;
+
+		smm_feature_control = rdmsr(SMM_FEATURE_CONTROL_MSR);
+		smm_feature_control.hi = 0;
+		smm_feature_control.lo |= SMM_CPU_SAVE_EN;
+		wrmsr(SMM_FEATURE_CONTROL_MSR, smm_feature_control);
+		relo_params->smm_save_state_in_msrs = 1;
+	}
+	return relo_params->smm_save_state_in_msrs;
+}
+
 /* The relocation work is actually performed in SMM context, but the code
  * resides in the ramstage module. This occurs by trampolining from the default
  * SMRAM entry point to here. */
 static void __attribute__((cdecl))
 cpu_smm_do_relocation(void *arg, int cpu, const struct smm_runtime *runtime)
 {
-	em64t101_smm_state_save_area_t *save_state;
 	msr_t mtrr_cap;
 	struct smm_relocation_params *relo_params = arg;
 
@@ -100,21 +178,32 @@ cpu_smm_do_relocation(void *arg, int cpu, const struct smm_runtime *runtime)
 
 	printk(BIOS_DEBUG, "In relocation handler: cpu %d\n", cpu);
 
-	/* All threads need to set IEDBASE and SMBASE in the save state area.
-	 * Since one thread runs at a time during the relocation the save state
-	 * is the same for all cpus. */
-	save_state = (void *)(runtime->smbase + SMM_DEFAULT_SIZE -
-                              runtime->save_state_size);
-
-	/* The relocated handler runs with all CPUs concurrently. Therefore
-	 * stagger the entry points adjusting SMBASE downwards by save state
-	 * size * CPU num. */
-	save_state->smbase = relo_params->smram_base -
-	                     cpu * runtime->save_state_size;
-	save_state->iedbase = relo_params->ied_base;
+	/* Determine if the processor supports saving state in MSRs. If so,
+	 * enable it before the non-BSPs run so that SMM relocation can occur
+	 * in parallel in the non-BSP CPUs. */
+	if (cpu == 0) {
+		/* If smm_save_state_in_msrs is 1 then that means this is the
+		 * 2nd time through the relocation handler for the BSP.
+		 * Parallel SMM handler relocation is taking place. However,
+		 * it is desired to access other CPUs save state in the real
+		 * SMM handler. Therefore, disable the SMM save state in MSRs
+		 * feature. */
+		if (relo_params->smm_save_state_in_msrs) {
+			msr_t smm_feature_control;
+
+			smm_feature_control = rdmsr(SMM_FEATURE_CONTROL_MSR);
+			smm_feature_control.lo &= ~SMM_CPU_SAVE_EN;
+			wrmsr(SMM_FEATURE_CONTROL_MSR, smm_feature_control);
+		} else if (bsp_setup_msr_save_state(relo_params))
+			/* Just return from relocation handler if MSR save
+			 * state is enabled. In that case the BSP will come
+			 * back into the relocation handler to setup the new
+			 * SMBASE as well disabling SMM save state in MSRs. */
+			return;
+	}
 
-	printk(BIOS_DEBUG, "New SMBASE=0x%08x IEDBASE=0x%08x @ %p\n",
-	       save_state->smbase, save_state->iedbase, save_state);
+	/* Make appropriate changes to the save state map. */
+	update_save_state(cpu, relo_params, runtime);
 
 	/* Write EMRR and SMRR MSRs based on indicated support. */
 	mtrr_cap = rdmsr(MTRRcap_MSR);
@@ -128,8 +217,6 @@ cpu_smm_do_relocation(void *arg, int cpu, const struct smm_runtime *runtime)
 		if (cpu == 0)
 			write_uncore_emrr(relo_params);
 	}
-
-	southbridge_clear_smi_status();
 }
 
 static u32 northbridge_get_base_reg(device_t dev, int reg)
@@ -199,10 +286,12 @@ static void fill_in_relocation_params(device_t dev,
 static int install_relocation_handler(int num_cpus,
                                       struct smm_relocation_params *relo_params)
 {
-	/* The default SMM entry happens serially at the default location.
-	 * Therefore, there is only 1 concurrent save state area. Set the
-	 * stack size to the save state size, and call into the
-	 * do_relocation handler. */
+	/* The default SMM entry can happen in parallel or serially. If the
+	 * default SMM entry is done in parallel the BSP has already setup
+	 * the saving state to each CPU's MSRs. At least one save state size
+	 * is required for the initial SMM entry for the BSP to determine if
+	 * parallel SMM relocation is even feasible.  Set the stack size to
+	 * the save state size, and call into the do_relocation handler. */
 	int save_state_size = sizeof(em64t101_smm_state_save_area_t);
 	struct smm_loader_params smm_params = {
 		.per_cpu_stack_size = save_state_size,
@@ -309,6 +398,17 @@ int smm_initialize(void)
 	/* Run the relocation handler. */
 	smm_initiate_relocation();
 
+	/* If smm_save_state_in_msrs is non-zero then parallel SMM relocation
+	 * shall take place. Run the relocation handler a second time to do
+	 * the final move. */
+	if (smm_reloc_params.smm_save_state_in_msrs) {
+		printk(BIOS_DEBUG, "Doing parallel SMM relocation.\n");
+		release_aps_for_smm_relocation(1);
+		smm_initiate_relocation_parallel();
+	} else {
+		release_aps_for_smm_relocation(0);
+	}
+
 	/* Lock down the SMRAM space. */
 	smm_lock();