[coreboot] New patch to review for coreboot: 6483510 haswell: Parallel AP bringup

Tue Mar 19 01:58:45 CET 2013

Stefan Reinauer (stefan.reinauer at coreboot.org) just uploaded a new patch set to gerrit, which you can find at http://review.coreboot.org/2779

-gerrit

commit 648351002891186c84bd2c4a5c5d876c160e393a
Author: Aaron Durbin <adurbin at chromium.org>
Date:   Tue Jan 15 08:27:05 2013 -0600

    haswell: Parallel AP bringup
    
    This patch parallelizes the AP startup for Haswell-based devices. It
    does not touch the generic secondary startup code. Instead it provides
    its own MP support matching up with the Haswell BWG. It seemed to be too
    much trouble to support both the old startup path and this new one, so
    parallel bringup is the only method supported.
    
    A couple of things to note:
    1. Microcode needs to be loaded twice: once before MTRRs and caching
       are enabled, and a second time after SMM relocation.
    2. The sipi_vector is entirely self-contained. Once it is loaded and
       written back to RAM the APs do not access memory outside of the
       sipi_vector load location until a sync up in ramstage.
    3. SMM relocation is kicked off by an IPI to self with SMI set as the
       delivery mode.
    
    The following are timings from cbmem with dev mode disabled and recovery mode
    enabled to boot directly into the kernel. This was done on the
    baskingridge CRB with a 4-core 8-thread CPU and 2 DIMMs 1GiB each. The
    kernel has console enabled on the serial port. Entry 70 is the device
    initialization, and that is where the APs are brought up. With these two
    examples it looks to shave off ~200 ms of boot time.
    
    Before:
       1:55,382
       2:57,606 (2,223)
       3:3,108,983 (3,051,377)
       4:3,110,084 (1,101)
       8:3,113,109 (3,024)
       9:3,156,694 (43,585)
      10:3,156,815 (120)
      30:3,157,110 (295)
      40:3,158,180 (1,069)
      50:3,160,157 (1,977)
      60:3,160,366 (208)
      70:4,221,044 (1,060,677)
      75:4,221,062 (18)
      80:4,227,185 (6,122)
      90:4,227,669 (484)
      99:4,265,596 (37,927)
    1000:4,267,822 (2,225)
    1001:4,268,507 (685)
    1002:4,268,780 (272)
    1003:4,398,676 (129,896)
    1004:4,398,979 (303)
    1100:7,477,601 (3,078,621)
    1101:7,480,210 (2,608)
    
    After:
       1:49,518
       2:51,778 (2,259)
       3:3,081,186 (3,029,407)
       4:3,082,252 (1,066)
       8:3,085,137 (2,884)
       9:3,130,339 (45,202)
      10:3,130,518 (178)
      30:3,130,544 (26)
      40:3,131,125 (580)
      50:3,133,023 (1,897)
      60:3,133,278 (255)
      70:4,009,259 (875,980)
      75:4,009,273 (13)
      80:4,015,947 (6,674)
      90:4,016,430 (482)
      99:4,056,265 (39,835)
    1000:4,058,492 (2,226)
    1001:4,059,176 (684)
    1002:4,059,450 (273)
    1003:4,189,333 (129,883)
    1004:4,189,770 (436)
    1100:7,262,358 (3,072,588)
    1101:7,263,926 (1,567)
    
    Booted the baskingridge board as noted above. Also analyzed serial
    messages with pcserial enabled.
    
    Change-Id: Ifedc7f787953647c228b11afdb725686e38c4098
    Signed-off-by: Aaron Durbin <adurbin at chromium.org>
---
 src/cpu/intel/haswell/Kconfig        |   1 +
 src/cpu/intel/haswell/Makefile.inc   |  23 ++
 src/cpu/intel/haswell/haswell.h      |   9 +
 src/cpu/intel/haswell/haswell_init.c | 145 ++++-----
 src/cpu/intel/haswell/mp_init.c      | 550 +++++++++++++++++++++++++++++++++++
 src/cpu/intel/haswell/sipi_header.c  |   6 +
 src/cpu/intel/haswell/sipi_vector.S  | 190 ++++++++++++
 src/cpu/intel/haswell/smmrelocate.c  |  25 +-
 8 files changed, 845 insertions(+), 104 deletions(-)

diff --git a/src/cpu/intel/haswell/Kconfig b/src/cpu/intel/haswell/Kconfig
index bf0abbc..95d0b18 100644
--- a/src/cpu/intel/haswell/Kconfig
+++ b/src/cpu/intel/haswell/Kconfig
@@ -11,6 +11,7 @@ config CPU_SPECIFIC_OPTIONS
 	select UDELAY_LAPIC
 	select SMM_TSEG
 	select SMM_MODULES
+	select RELOCATABLE_MODULES
 	select CPU_MICROCODE_IN_CBFS
 	#select AP_IN_SIPI_WAIT
 	select TSC_SYNC_MFENCE
diff --git a/src/cpu/intel/haswell/Makefile.inc b/src/cpu/intel/haswell/Makefile.inc
index b2116f2..a19a8c5 100644
--- a/src/cpu/intel/haswell/Makefile.inc
+++ b/src/cpu/intel/haswell/Makefile.inc
@@ -1,5 +1,6 @@
 ramstage-y += haswell_init.c
 subdirs-y += ../../x86/name
+ramstage-y += mp_init.c
 romstage-y += romstage.c
 
 ramstage-$(CONFIG_GENERATE_ACPI_TABLES) += acpi.c
@@ -10,3 +11,25 @@ cpu_microcode-$(CONFIG_CPU_MICROCODE_CBFS_GENERATE)  += microcode_blob.c
 smm-$(CONFIG_HAVE_SMI_HANDLER) += finalize.c
 
 cpu_incs += $(src)/cpu/intel/haswell/cache_as_ram.inc
+
+# AP startup stub
+SIPI_ELF=$(obj)/cpu/intel/haswell/sipi_vector.elf
+SIPI_BIN=$(SIPI_ELF:.elf=)
+SIPI_DOTO=$(SIPI_ELF:.elf=.o)
+
+ramstage-srcs += $(SIPI_BIN)
+rmodules-y += sipi_vector.S
+rmodules-y += sipi_header.c
+
+$(SIPI_DOTO): $(dir $(SIPI_ELF))sipi_vector.rmodules.o $(dir $(SIPI_ELF))sipi_header.rmodules.o
+	$(CC) $(LDFLAGS) -nostdlib -r -o $@ $^
+
+$(eval $(call rmodule_link,$(SIPI_ELF), $(SIPI_ELF:.elf=.o), 0))
+
+$(SIPI_BIN): $(SIPI_ELF)
+	$(OBJCOPY) -O binary $< $@
+
+$(SIPI_BIN).ramstage.o: $(SIPI_BIN)
+	@printf "    OBJCOPY    $(subst $(obj)/,,$(@))\n"
+	cd $(dir $@); $(OBJCOPY) -I binary $(notdir $<) -O elf32-i386 -B i386 $(notdir $@)
+
diff --git a/src/cpu/intel/haswell/haswell.h b/src/cpu/intel/haswell/haswell.h
index 733ddd3..26807e9 100644
--- a/src/cpu/intel/haswell/haswell.h
+++ b/src/cpu/intel/haswell/haswell.h
@@ -141,8 +141,17 @@ void intel_cpu_haswell_finalize_smm(void);
 /* Configure power limits for turbo mode */
 void set_power_limits(u8 power_limit_1_time);
 int cpu_config_tdp_levels(void);
+/* Returns 0 on success, < 0 on failure. */
+int smm_initialize(void);
+void smm_initiate_relocation(void);
 struct bus;
 void bsp_init_and_start_aps(struct bus *cpu_bus);
+/* Returns 0 on success, < 0 on failure. */
+int setup_ap_init(struct bus *cpu_bus, int *max_cpus,
+                  const void *microcode_patch);
+/* Returns 0 on success, < 0 on failure. */
+int start_aps(struct bus *cpu_bus, int max_cpus);
+void release_aps_for_smm_relocation(void);
 #endif
 #endif
 
diff --git a/src/cpu/intel/haswell/haswell_init.c b/src/cpu/intel/haswell/haswell_init.c
index 9e62b31..82430b7 100644
--- a/src/cpu/intel/haswell/haswell_init.c
+++ b/src/cpu/intel/haswell/haswell_init.c
@@ -442,71 +442,30 @@ static void configure_mca(void)
 static unsigned ehci_debug_addr;
 #endif
 
-/*
- * Initialize any extra cores/threads in this package.
- */
-static void intel_cores_init(device_t cpu)
-{
-	struct cpuid_result result;
-	unsigned threads_per_package, threads_per_core, i;
-
-	/* Logical processors (threads) per core */
-	result = cpuid_ext(0xb, 0);
-	threads_per_core = result.ebx & 0xffff;
-
-	/* Logical processors (threads) per package */
-	result = cpuid_ext(0xb, 1);
-	threads_per_package = result.ebx & 0xffff;
-
-	/* Only initialize extra cores from BSP */
-	if (cpu->path.apic.apic_id)
-		return;
-
-	printk(BIOS_DEBUG, "CPU: %u has %u cores, %u threads per core\n",
-	       cpu->path.apic.apic_id, threads_per_package/threads_per_core,
-	       threads_per_core);
-
-	for (i = 1; i < threads_per_package; ++i) {
-		struct device_path cpu_path;
-		device_t new;
-
-		/* Build the cpu device path */
-		cpu_path.type = DEVICE_PATH_APIC;
-		cpu_path.apic.apic_id =
-			cpu->path.apic.apic_id + i;
-
-		/* Update APIC ID if no hyperthreading */
-		if (threads_per_core == 1)
-			cpu_path.apic.apic_id <<= 1;
-
-		/* Allocate the new cpu device structure */
-		new = alloc_dev(cpu->bus, &cpu_path);
-		if (!new)
-			continue;
-
-		printk(BIOS_DEBUG, "CPU: %u has core %u\n",
-		       cpu->path.apic.apic_id,
-		       new->path.apic.apic_id);
-
-#if CONFIG_SMP && CONFIG_MAX_CPUS > 1
-		/* Start the new cpu */
-		if (!start_cpu(new)) {
-			/* Record the error in cpu? */
-			printk(BIOS_ERR, "CPU %u would not start!\n",
-			       new->path.apic.apic_id);
-		}
-#endif
-	}
-}
-
-static void bsp_init_before_ap_bringup(void)
+static void bsp_init_before_ap_bringup(struct bus *cpu_bus)
 {
+	struct device_path cpu_path;
+	struct cpu_info *info;
 	char processor_name[49];
 
 	/* Print processor name */
 	fill_processor_name(processor_name);
 	printk(BIOS_INFO, "CPU: %s.\n", processor_name);
 
+	/* Ensure the local apic is enabled */
+	enable_lapic();
+
+	/* Set the device path of the boot cpu. */
+	cpu_path.type = DEVICE_PATH_APIC;
+	cpu_path.apic.apic_id = lapicid();
+
+	/* Find the device structure for the boot cpu. */
+	info = cpu_info();
+	info->cpu = alloc_find_dev(cpu_bus, &cpu_path);
+
+	if (info->index != 0)
+		printk(BIOS_CRIT, "BSP index(%d) != 0!\n", info->index);
+
 #if CONFIG_USBDEBUG
 	// Is this caution really needed?
 	if(!ehci_debug_addr)
@@ -523,23 +482,12 @@ static void bsp_init_before_ap_bringup(void)
 	set_ehci_debug(ehci_debug_addr);
 #endif
 
-	enable_lapic();
-}
-
-static void ap_init(device_t cpu)
-{
-	/* Microcode needs to be loaded before caching is enabled. */
-	intel_update_microcode_from_cbfs();
-
-	/* Turn on caching if we haven't already */
-	x86_enable_cache();
-	x86_setup_fixed_mtrrs();
-	x86_setup_var_mtrrs(cpuid_eax(0x80000008) & 0xff, 2);
-
-	enable_lapic();
+	/* Call through the cpu driver's initialization. */
+	cpu_initialize(0);
 }
 
-static void cpu_common_init(device_t cpu)
+/* All CPUs including BSP will run the following function. */
+static void haswell_init(device_t cpu)
 {
 	/* Clear out pending MCEs */
 	configure_mca();
@@ -572,33 +520,40 @@ static void cpu_common_init(device_t cpu)
 
 void bsp_init_and_start_aps(struct bus *cpu_bus)
 {
+	int max_cpus;
+	int num_aps;
+	const void *microcode_patch;
+
 	/* Perform any necesarry BSP initialization before APs are brought up.
 	 * This call alos allows the BSP to prepare for any secondary effects
 	 * from calling cpu_initialize() such as smm_init(). */
-	bsp_init_before_ap_bringup();
-
-	/*
-	 * This calls into the gerneic initialize_cpus() which attempts to
-	 * start APs on the APIC bus in the devicetree.  No APs get started
-	 * because there is only the BSP and a placeholder (disabled) in the
-	 * devicetree. initialize_cpus() also does SMM initialization by way
-	 * of smm_init(). It will eventually call cpu_initialize(0) which calls
-	 * dev_ops->init(). For Haswell the dev_ops->init() starts up the APs
-	 * by way of intel_cores_init().
-	 */
-	initialize_cpus(cpu_bus);
-}
+	bsp_init_before_ap_bringup(cpu_bus);
 
-static void haswell_init(device_t cpu)
-{
-	if (cpu->path.apic.apic_id == 0) {
-		cpu_common_init(cpu);
-		/* Start up extra cores */
-		intel_cores_init(cpu);
-	} else {
-		ap_init(cpu);
-		cpu_common_init(cpu);
+	microcode_patch = intel_microcode_find();
+
+	/* This needs to be called after the mtrr setup so the BSP mtrrs
+	 * can be mirrored by the APs. */
+	if (setup_ap_init(cpu_bus, &max_cpus, microcode_patch)) {
+		printk(BIOS_CRIT, "AP setup initialization failed. "
+		       "No APs will be brought up.\n");
+		return;
+	}
+
+	num_aps = max_cpus - 1;
+	if (start_aps(cpu_bus, num_aps)) {
+		printk(BIOS_CRIT, "AP startup failed. Trying to continue.\n");
 	}
+
+	if (smm_initialize()) {
+		printk(BIOS_CRIT, "SMM Initialiazation failed...\n");
+		return;
+	}
+
+	/* Release APs to perform SMM relocation. */
+	release_aps_for_smm_relocation();
+
+	/* After SMM relocation a 2nd microcode load is required. */
+	intel_microcode_load_unlocked(microcode_patch);
 }
 
 static struct device_operations cpu_dev_ops = {
diff --git a/src/cpu/intel/haswell/mp_init.c b/src/cpu/intel/haswell/mp_init.c
new file mode 100644
index 0000000..b1567ba
--- /dev/null
+++ b/src/cpu/intel/haswell/mp_init.c
@@ -0,0 +1,550 @@
+/*
+ * This file is part of the coreboot project.
+ *
+ * Copyright (C) 2013 ChromeOS Authors
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; version 2 of
+ * the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
+ * MA 02110-1301 USA
+ */
+
+#include <console/console.h>
+#include <stdint.h>
+#include <rmodule.h>
+#include <arch/cpu.h>
+#include <cpu/cpu.h>
+#include <cpu/intel/microcode.h>
+#include <cpu/x86/cache.h>
+#include <cpu/x86/lapic.h>
+#include <cpu/x86/msr.h>
+#include <cpu/x86/mtrr.h>
+#include <cpu/x86/smm.h>
+#include <delay.h>
+#include <device/device.h>
+#include <device/path.h>
+#include <lib.h>
+#include <smp/atomic.h>
+#include <smp/spinlock.h>
+#include "haswell.h"
+
+/* This needs to match the layout in the .module_parameters section. */
+struct sipi_params {
+	u16 gdtlimit;
+	u32 gdt;
+	u16 unused;
+	u32 idt_ptr;
+	u32 stack_top;
+	u32 stack_size;
+	u32 microcode_ptr;
+	u32 msr_table_ptr;
+	u32 msr_count;
+	u32 c_handler;
+	u32 c_handler_arg;
+	u8 apic_to_cpu_num[CONFIG_MAX_CPUS];
+} __attribute__((packed));
+
+/* This also needs to match the assembly code for saved MSR encoding. */
+struct saved_msr {
+	u32 index;
+	u32 lo;
+	u32 hi;
+} __attribute__((packed));
+
+
+/* The sipi vector rmodule is included in the ramstage using 'objcopy -B'. */
+extern char _binary_sipi_vector_start[];
+/* These symbols are defined in c_start.S. */
+extern char gdt[];
+extern char gdt_limit[];
+extern char idtarg[];
+
+/* This table keeps track of each CPU's APIC id. */
+static u8 apic_id_table[CONFIG_MAX_CPUS];
+static device_t cpu_devs[CONFIG_MAX_CPUS];
+
+/* Number of APs that have checked in. */
+static atomic_t num_aps;
+/* Barrier to stop APs from performing SMM relocation. */
+static int smm_relocation_barrier_begin __attribute__ ((aligned (64)));
+
+/* Spin, executing pause, until *barrier becomes non-zero. */
+static inline void wait_for_barrier(volatile int *barrier)
+{
+	while (!*barrier)
+		asm ("pause");
+}
+
+static inline void release_barrier(volatile int *barrier)
+{
+	*barrier = 1;
+}
+
+static void ap_wait_for_smm_relocation_begin(void)
+{
+	wait_for_barrier(&smm_relocation_barrier_begin);
+}
+
+
+/* Poll until `target` APs have checked in. Returns 0 once they have, or 1 if
+ * total_delay microseconds elapse first (polled every delay_step us). */
+static int wait_for_aps(int target, int total_delay, int delay_step)
+{
+	int elapsed;
+
+	for (elapsed = 0; atomic_read(&num_aps) != target; ) {
+		udelay(delay_step);
+		elapsed += delay_step;
+		/* Give up once the time budget is exhausted. */
+		if (elapsed >= total_delay)
+			return 1;
+	}
+
+	return 0;
+}
+
+void release_aps_for_smm_relocation(void)
+{
+	release_barrier(&smm_relocation_barrier_begin);
+}
+
+/* The mtrr code sets up ROM caching on the BSP, but not the others. However,
+ * the boot loader payload disables this. In order for Linux not to complain
+ * ensure the caching is disabled for the APs before going to sleep. */
+static void cleanup_rom_caching(void)
+{
+#if CONFIG_CACHE_ROM
+	msr_t msr;
+	unsigned int last_var_mtrr;
+
+	msr = rdmsr(MTRRcap_MSR);
+	last_var_mtrr = (msr.lo & 0xff) - 1;
+
+	/* Nothing to clean up if the last variable MTRR is not valid. */
+	msr = rdmsr(MTRRphysMask_MSR(last_var_mtrr));
+	if ((msr.lo & MTRRphysMaskValid) == 0)
+		return;
+	msr = rdmsr(MTRRphysBase_MSR(last_var_mtrr));
+	/* Assume that if the MTRR is of write protected type, the MTRR is used
+	 * to cache the ROM. */
+	if ((msr.lo & MTRR_NUM_TYPES) == MTRR_TYPE_WRPROT) {
+		msr.lo = msr.hi = 0;
+		disable_cache();
+		wrmsr(MTRRphysMask_MSR(last_var_mtrr), msr);
+		wrmsr(MTRRphysBase_MSR(last_var_mtrr), msr);
+		enable_cache();
+	}
+#endif
+}
+
+/* C entry point for APs, called from the SIPI stub with the CPU number and
+ * the microcode pointer. Caching and microcode are already set up by then. */
+static void ap_init(unsigned int cpu, void *microcode_ptr)
+{
+	struct cpu_info *info;
+
+	/* Signal that the AP has arrived; start_aps() polls this counter. */
+	atomic_inc(&num_aps);
+
+	/* Ensure the local apic is enabled */
+	enable_lapic();
+
+	info = cpu_info();
+	info->index = cpu;
+	info->cpu = cpu_devs[cpu];
+
+	apic_id_table[info->index] = lapicid();
+	info->cpu->path.apic.apic_id = apic_id_table[info->index];
+
+	/* Call through the cpu driver's initialization. */
+	cpu_initialize(info->index);
+
+	ap_wait_for_smm_relocation_begin();
+
+	smm_initiate_relocation();
+
+	/* After SMM relocation a 2nd microcode load is required. */
+	intel_microcode_load_unlocked(microcode_ptr);
+
+	/* Undo ROM caching (if any) before this CPU is parked. */
+	cleanup_rom_caching();
+
+	/* FIXME(adurbin): park CPUs properly -- preferably somewhere in a
+	 * reserved part of memory that the OS cannot get to. */
+	stop_this_cpu();
+}
+
+/* Fill in the SIPI parameters taken from ramstage symbols and Kconfig. */
+static void setup_default_sipi_vector_params(struct sipi_params *sp)
+{
+	int cpu;
+
+	sp->gdtlimit = (u32)&gdt_limit;
+	sp->gdt = (u32)&gdt;
+	sp->idt_ptr = (u32)&idtarg;
+	/* Leave room for struct cpu_info at the top of the stack region. */
+	sp->stack_top = (u32)&_estack - sizeof(struct cpu_info);
+	sp->stack_size = CONFIG_STACK_SIZE;
+	/* Default to linear APIC id space. */
+	for (cpu = 0; cpu < CONFIG_MAX_CPUS; cpu++)
+		sp->apic_to_cpu_num[cpu] = cpu;
+}
+
+#define NUM_FIXED_MTRRS 11
+static unsigned int fixed_mtrrs[NUM_FIXED_MTRRS] = {
+	MTRRfix64K_00000_MSR, MTRRfix16K_80000_MSR, MTRRfix16K_A0000_MSR,
+	MTRRfix4K_C0000_MSR, MTRRfix4K_C8000_MSR, MTRRfix4K_D0000_MSR,
+	MTRRfix4K_D8000_MSR, MTRRfix4K_E0000_MSR, MTRRfix4K_E8000_MSR,
+	MTRRfix4K_F0000_MSR, MTRRfix4K_F8000_MSR,
+};
+
+/* Read MSR `index` and record its index and 64-bit value in *entry.
+ * Returns a pointer to the slot following the one just filled. */
+static inline struct saved_msr *save_msr(int index, struct saved_msr *entry)
+{
+	const msr_t value = rdmsr(index);
+
+	entry->index = index;
+	entry->hi = value.hi;
+	entry->lo = value.lo;
+
+	/* Hand back the next free slot so calls can be chained. */
+	return entry + 1;
+}
+
+/* Snapshot the BSP's fixed MTRRs, variable MTRRs and MTRRdefType into the
+ * table at start, which has room for size bytes. Returns the number of
+ * entries written, or -1 if the table would not fit. */
+static int save_bsp_msrs(char *start, int size)
+{
+	struct saved_msr *slot = (void *)start;
+	int total;
+	int var_count;
+	int n;
+
+	/* The low byte of MTRRcap holds the variable MTRR count. */
+	var_count = rdmsr(MTRRcap_MSR).lo & 0xff;
+
+	/* Fixed MTRRs, a base/mask pair per variable MTRR, and MTRRdefType. */
+	total = NUM_FIXED_MTRRS + 2 * var_count + 1;
+
+	if ((total * sizeof(struct saved_msr)) > size) {
+		printk(BIOS_CRIT, "Cannot mirror all %d msrs.\n", total);
+		return -1;
+	}
+
+	for (n = 0; n < NUM_FIXED_MTRRS; n++)
+		slot = save_msr(fixed_mtrrs[n], slot);
+
+	for (n = 0; n < var_count; n++) {
+		slot = save_msr(MTRRphysBase_MSR(n), slot);
+		slot = save_msr(MTRRphysMask_MSR(n), slot);
+	}
+
+	/* MTRRdefType goes last in the table. */
+	save_msr(MTRRdefType_MSR, slot);
+
+	return total;
+}
+
+/* The SIPI vector is loaded at the SMM_DEFAULT_BASE. The reason is that the
+ * memory range is already reserved so the OS cannot use it. That region is
+ * free to use for AP bringup before SMM is initialized. */
+static u32 sipi_vector_location = SMM_DEFAULT_BASE;
+static int sipi_vector_location_size = SMM_DEFAULT_SIZE;
+
+static int load_sipi_vector(const void *microcode_patch)
+{
+	struct rmodule sipi_mod;
+	int module_size;
+	int num_msrs;
+	struct sipi_params *sp;
+	char *mod_loc = (void *)sipi_vector_location;
+	const int loc_size = sipi_vector_location_size;
+
+	if (rmodule_parse(&_binary_sipi_vector_start, &sipi_mod)) {
+		printk(BIOS_CRIT, "Unable to parse sipi module.\n");
+		return -1;
+	}
+
+	if (rmodule_entry_offset(&sipi_mod) != 0) {
+		printk(BIOS_CRIT, "SIPI module entry offset is not 0!\n");
+		return -1;
+	}
+
+	if (rmodule_load_alignment(&sipi_mod) != 4096) {
+		printk(BIOS_CRIT, "SIPI module load alignment(%d) != 4096.\n",
+		       rmodule_load_alignment(&sipi_mod));
+		return -1;
+	}
+
+	module_size = rmodule_memory_size(&sipi_mod);
+
+	/* Align to 4 bytes. */
+	module_size += 3;
+	module_size &= ~3;
+
+	if (module_size > loc_size) {
+		printk(BIOS_CRIT, "SIPI module size (%d) > region size (%d).\n",
+		       module_size, loc_size);
+		return -1;
+	}
+
+	num_msrs = save_bsp_msrs(&mod_loc[module_size], loc_size - module_size);
+
+	if (num_msrs < 0) {
+		printk(BIOS_CRIT, "Error mirroring BSP's msrs.\n");
+		return -1;
+	}
+
+	if (rmodule_load(mod_loc, &sipi_mod)) {
+		printk(BIOS_CRIT, "Unable to load SIPI module.\n");
+		return -1;
+	}
+
+	sp = rmodule_parameters(&sipi_mod);
+
+	if (sp == NULL) {
+		printk(BIOS_CRIT, "SIPI module has no parameters.\n");
+		return -1;
+	}
+
+	setup_default_sipi_vector_params(sp);
+	/* Setup MSR table. */
+	sp->msr_table_ptr = (u32)&mod_loc[module_size];
+	sp->msr_count = num_msrs;
+	/* Provide pointer to microcode patch. */
+	sp->microcode_ptr = (u32)microcode_patch;
+	/* The microcode pointer is passed on through to the c handler so
+	 * that it can be loaded again after SMM relocation. */
+	sp->c_handler_arg = (u32)microcode_patch;
+	sp->c_handler = (u32)&ap_init;
+
+	/* Make sure SIPI vector hits RAM so the APs that come up will see
+	 * the startup code even if the caches are disabled.  */
+	wbinvd();
+
+	return 0;
+}
+
+/* Allocate device nodes for the APs and record the BSP in cpu_devs[]. Stores
+ * the enabled thread count in *total_hw_threads; returns the usable CPU count. */
+static int allocate_cpu_devices(struct bus *cpu_bus, int *total_hw_threads)
+{
+	int i, max_cpus;
+	int num_threads, num_cores;
+	struct cpu_info *info;
+	msr_t msr;
+
+	info = cpu_info();
+	cpu_devs[info->index] = info->cpu;
+	apic_id_table[info->index] = info->cpu->path.apic.apic_id;
+
+	msr = rdmsr(CORE_THREAD_COUNT_MSR);
+	num_threads = (msr.lo >> 0) & 0xffff;
+	num_cores = (msr.lo >> 16) & 0xffff;
+	printk(BIOS_DEBUG, "CPU has %u cores, %u threads enabled.\n",
+	       num_cores, num_threads);
+
+	max_cpus = num_threads;
+	*total_hw_threads = num_threads;
+	if (num_threads > CONFIG_MAX_CPUS) {
+		printk(BIOS_CRIT, "CPU count(%d) exceeds CONFIG_MAX_CPUS(%d)\n",
+		       num_threads, CONFIG_MAX_CPUS);
+		max_cpus = CONFIG_MAX_CPUS;
+	}
+
+	for (i = 1; i < max_cpus; i++) {
+		struct device_path cpu_path;
+		device_t new;
+
+		/* Build the cpu device path */
+		cpu_path.type = DEVICE_PATH_APIC;
+		cpu_path.apic.apic_id = info->cpu->path.apic.apic_id + i;
+
+		/* Allocate the device; on failure only indices < i are usable. */
+		new = alloc_find_dev(cpu_bus, &cpu_path);
+		if (new == NULL) {
+			printk(BIOS_CRIT, "Could not allocte cpu device\n");
+			return i;
+		}
+		cpu_devs[i] = new;
+	}
+
+	return max_cpus;
+}
+
+int setup_ap_init(struct bus *cpu_bus, int *max_cpus,
+                  const void *microcode_patch)
+{
+	int num_cpus;
+	int hw_threads;
+
+	/* Allocate CPU devices; num_cpus includes the CPU running this code. */
+	num_cpus = allocate_cpu_devices(cpu_bus, &hw_threads);
+
+	/* Load the SIPI vector; *max_cpus is left untouched if this fails. */
+	if (load_sipi_vector(microcode_patch))
+		return -1;
+
+	*max_cpus = num_cpus;
+
+	if (num_cpus < hw_threads) {
+		printk(BIOS_CRIT,
+		       "ERROR: More HW threads (%d) than support (%d).\n",
+		       hw_threads, num_cpus);
+		return -1;
+	}
+
+	return 0;
+}
+
+/* Wait for the local APIC ICR busy bit to clear, polling every delay_step
+ * microseconds. Returns 0 once idle, or 1 if total_delay us pass first. */
+static int apic_wait_timeout(int total_delay, int delay_step)
+{
+	int waited = 0;
+
+	for (;;) {
+		/* Done as soon as the ICR no longer reports busy. */
+		if (!(lapic_read(LAPIC_ICR) & LAPIC_ICR_BUSY))
+			return 0;
+		udelay(delay_step);
+		waited += delay_step;
+		/* Out of time budget. */
+		if (waited >= total_delay)
+			return 1;
+	}
+}
+
+/* Bring up the APs: broadcast INIT, then two SIPIs pointing at
+ * sipi_vector_location, then wait for ap_count APs to check in.
+ * Returns 0 on success (or when ap_count is 0), -1 on timeout or error. */
+int start_aps(struct bus *cpu_bus, int ap_count)
+{
+	int sipi_vector;
+
+	if (ap_count == 0)
+		return 0;
+
+	/* The vector is sent as a 4k aligned address in one byte (0-0xff). */
+	sipi_vector = sipi_vector_location >> 12;
+
+	if (sipi_vector > 0xff) {
+		printk(BIOS_CRIT, "SIPI vector too large! 0x%08x\n",
+		       sipi_vector);
+		return -1;
+	}
+
+	printk(BIOS_DEBUG, "Attempting to start %d APs\n", ap_count);
+
+	if ((lapic_read(LAPIC_ICR) & LAPIC_ICR_BUSY)) {
+		printk(BIOS_DEBUG, "Waiting for ICR not to be busy...");
+		if (apic_wait_timeout(1000 /* 1 ms */, 50)) {
+			printk(BIOS_DEBUG, "timed out. Aborting.\n");
+			return -1;
+		} else
+			printk(BIOS_DEBUG, "done.\n");
+	}
+
+	/* Send INIT IPI to all but self. */
+	lapic_write_around(LAPIC_ICR2, SET_LAPIC_DEST_FIELD(0));
+	lapic_write_around(LAPIC_ICR, LAPIC_DEST_ALLBUT | LAPIC_INT_ASSERT |
+	                   LAPIC_DM_INIT);
+	printk(BIOS_DEBUG, "Waiting for INIT to complete...");
+
+	/* Wait for 10 ms to complete. */
+	if (apic_wait_timeout(10000 /* 10 ms */, 100 /* us */)) {
+		printk(BIOS_DEBUG, "timed out. Bailing. \n");
+		return -1;
+	}
+	printk(BIOS_DEBUG, "done.\n");
+
+	/* Send 1st SIPI */
+	if ((lapic_read(LAPIC_ICR) & LAPIC_ICR_BUSY)) {
+		printk(BIOS_DEBUG, "Waiting for ICR not to be busy...");
+		if (apic_wait_timeout(1000 /* 1 ms */, 50)) {
+			printk(BIOS_DEBUG, "timed out. Aborting.\n");
+			return -1;
+		} else
+			printk(BIOS_DEBUG, "done.\n");
+	}
+
+	lapic_write_around(LAPIC_ICR2, SET_LAPIC_DEST_FIELD(0));
+	lapic_write_around(LAPIC_ICR, LAPIC_DEST_ALLBUT | LAPIC_INT_ASSERT |
+	                   LAPIC_DM_STARTUP | sipi_vector);
+	printk(BIOS_DEBUG, "Waiting for 1st SIPI to complete...");
+	if (apic_wait_timeout(10000 /* 10 ms */, 50 /* us */)) {
+		printk(BIOS_DEBUG, "timed out.\n");
+		return -1;
+	}
+	printk(BIOS_DEBUG, "done.\n");
+	/* Wait for CPUs to check in up to 200 us. */
+	wait_for_aps(ap_count, 200 /* us */, 15 /* us */);
+
+	/* Send 2nd SIPI */
+	if ((lapic_read(LAPIC_ICR) & LAPIC_ICR_BUSY)) {
+		printk(BIOS_DEBUG, "Waiting for ICR not to be busy...");
+		if (apic_wait_timeout(1000 /* 1 ms */, 50)) {
+			printk(BIOS_DEBUG, "timed out. Aborting.\n");
+			return -1;
+		} else
+			printk(BIOS_DEBUG, "done.\n");
+	}
+
+	lapic_write_around(LAPIC_ICR2, SET_LAPIC_DEST_FIELD(0));
+	lapic_write_around(LAPIC_ICR, LAPIC_DEST_ALLBUT | LAPIC_INT_ASSERT |
+	                   LAPIC_DM_STARTUP | sipi_vector);
+	printk(BIOS_DEBUG, "Waiting for 2nd SIPI to complete...");
+	if (apic_wait_timeout(10000 /* 10 ms */, 50 /* us */)) {
+		printk(BIOS_DEBUG, "timed out.\n");
+		return -1;
+	}
+	printk(BIOS_DEBUG, "done.\n");
+
+	/* Wait for CPUs to check in. */
+	if (wait_for_aps(ap_count, 10000 /* 10 ms */, 50 /* us */)) {
+		printk(BIOS_DEBUG, "Not all APs checked in: %d/%d.\n",
+		       atomic_read(&num_aps), ap_count);
+		return -1;
+	}
+
+	return 0;
+}
+
+DECLARE_SPIN_LOCK(smm_relocation_lock);
+/* Send an SMI IPI to the calling CPU while holding smm_relocation_lock. */
+void smm_initiate_relocation(void)
+{
+	spin_lock(&smm_relocation_lock);
+
+	if ((lapic_read(LAPIC_ICR) & LAPIC_ICR_BUSY)) {
+		printk(BIOS_DEBUG, "Waiting for ICR not to be busy...");
+		if (apic_wait_timeout(1000 /* 1 ms */, 50)) {
+			printk(BIOS_DEBUG, "timed out. Aborting.\n");
+			spin_unlock(&smm_relocation_lock);
+			return;
+		} else
+			printk(BIOS_DEBUG, "done.\n");
+	}
+	/* Address the IPI to this CPU's own APIC id, delivery mode SMI. */
+	lapic_write_around(LAPIC_ICR2, SET_LAPIC_DEST_FIELD(lapicid()));
+	lapic_write_around(LAPIC_ICR, LAPIC_INT_ASSERT | LAPIC_DM_SMI);
+	if (apic_wait_timeout(1000 /* 1 ms */, 100 /* us */)) {
+		printk(BIOS_DEBUG, "SMI Relocation timed out.\n");
+	} else
+		printk(BIOS_DEBUG, "Relocation complete.\n");
+
+	spin_unlock(&smm_relocation_lock);
+}
+
diff --git a/src/cpu/intel/haswell/sipi_header.c b/src/cpu/intel/haswell/sipi_header.c
new file mode 100644
index 0000000..846a82d
--- /dev/null
+++ b/src/cpu/intel/haswell/sipi_header.c
@@ -0,0 +1,6 @@
+#include <rmodule.h>
+
+
+extern void *ap_start;
+
+DEFINE_RMODULE_HEADER(sipi_vector_header, ap_start, RMODULE_TYPE_SIPI_VECTOR);
diff --git a/src/cpu/intel/haswell/sipi_vector.S b/src/cpu/intel/haswell/sipi_vector.S
new file mode 100644
index 0000000..664a9ee
--- /dev/null
+++ b/src/cpu/intel/haswell/sipi_vector.S
@@ -0,0 +1,190 @@
+/*
+ * This file is part of the coreboot project.
+ *
+ * Copyright (C) 2013 ChromeOS Authors
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; version 2 of
+ * the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
+ * MA 02110-1301 USA
+ */
+
+/* The SIPI vector is responsible for initializing the APs in the system. It
+ * loads microcode, sets up MSRs, and enables caching before calling into
+ * C code. */
+
+/* These segment selectors need to match the gdt entries in c_start.S. */
+#define CODE_SEG 0x10
+#define DATA_SEG 0x18
+
+#define IA32_UPDT_TRIG 0x79
+#define IA32_BIOS_SIGN_ID 0x8b
+
+.section ".module_parameters", "aw", @progbits
+ap_start_params:
+gdtaddr:
+.word 0 /* limit */
+.long 0 /* table */
+.word 0 /* unused */
+idt_ptr:
+.long 0
+stack_top:
+.long 0
+stack_size:
+.long 0
+microcode_ptr:
+.long 0
+msr_table_ptr:
+.long 0
+msr_count:
+.long 0
+c_handler:
+.long 0
+c_handler_arg:
+.long 0
+apic_to_cpu_num:
+.fill CONFIG_MAX_CPUS,1,0xff
+
+.text
+.code16
+.global ap_start
+ap_start:
+	cli
+	xorl	%eax, %eax
+	movl	%eax, %cr3    /* Invalidate TLB*/
+
+	/* On hyper threaded cpus, invalidating the cache here is
+	 * very very bad.  Don't.
+	 */
+
+	/* setup the data segment */
+	movw	%cs, %ax
+	movw	%ax, %ds
+
+	/* The gdtaddr needs to be relative to the data segment in order
+	 * to properly dereference it. The .text section comes first in an
+	 * rmodule so ap_start can be used as a proxy for the load address. */
+	movl	$(gdtaddr), %ebx
+	sub	$(ap_start), %ebx
+
+	data32 lgdt (%ebx)
+
+	movl	%cr0, %eax
+	andl	$0x7FFAFFD1, %eax /* PG,AM,WP,NE,TS,EM,MP = 0 */
+	orl	$0x60000001, %eax /* CD, NW, PE = 1 */
+	movl	%eax, %cr0
+
+	ljmpl	$CODE_SEG, $1f
+1:
+	.code32
+	movw	$DATA_SEG, %ax
+	movw	%ax, %ds
+	movw	%ax, %es
+	movw	%ax, %ss
+	movw	%ax, %fs
+	movw	%ax, %gs
+
+	/* Load the Interrupt descriptor table */
+	mov	idt_ptr, %ebx
+	lidt	(%ebx)
+
+	/* The CPU number is calculated by reading the initial APIC id. */
+	mov	$1, %eax
+	cpuid
+	/* Default APIC id in ebx[31:24]. Move it to bl. */
+	bswap	%ebx
+	mov	$(apic_to_cpu_num), %eax
+	xor	%ecx, %ecx
+
+1:
+	cmp	(%eax, %ecx, 1), %bl
+	je	1f
+	inc	%ecx
+	cmp	$CONFIG_MAX_CPUS, %ecx
+	jne	1b
+
+	/* This is bad. No CPU number found. However, the BSP should have setup
+	 * the AP handler properly. Just park the CPU. */
+	mov	$0x80, %dx
+	movw	$0xdead, %ax
+	outw	%ax, %dx
+	jmp	halt_jump
+1:
+	/* Setup stacks for each CPU. */
+	movl	stack_size, %eax
+	mul	%ecx
+	movl	stack_top, %edx
+	subl	%eax, %edx
+	mov	%edx, %esp
+	/* Save cpu number. */
+	mov	%ecx, %esi
+
+	/* Determine if one should check microcode versions. */
+	mov	microcode_ptr, %edi
+	test	%edi, %edi
+	jz	1f /* Bypass if no microde exists. */
+
+	/* Get the Microcode version. */
+	mov	$1, %eax
+	cpuid
+	mov	$IA32_BIOS_SIGN_ID, %ecx
+	rdmsr
+	/* If something already loaded skip loading again. */
+	test	%edx, %edx
+	jnz	1f
+
+	/* Load new microcode. */
+	mov	$IA32_UPDT_TRIG, %ecx
+	xor	%edx, %edx
+	mov	%edi, %eax
+	/* The microcode pointer is passed in pointing to the header. Adjust
+	 * pointer to reflect the payload (header size is 48 bytes). */
+	add	$48, %eax
+	pusha
+	wrmsr
+	popa
+
+1:
+	/*
+	 * Load MSRs. Each entry in the table consists of:
+	 * 0: index,
+	 * 4: value[31:0]
+	 * 8: value[63:32]
+	 */
+	mov	msr_table_ptr, %edi
+	mov	msr_count, %ebx
+	test	%ebx, %ebx
+	jz	1f
+load_msr:
+	mov	(%edi), %ecx
+	mov	4(%edi), %eax
+	mov	8(%edi), %edx
+	wrmsr
+	add	$12, %edi
+	dec	%ebx
+	jnz	load_msr
+
+1:
+	/* Enable caching. */
+	mov	%cr0, %eax
+	and	$0x9fffffff, %eax /* CD, NW = 0 */
+	mov	%eax, %cr0
+
+	/* c_handler(cpu_num, *c_handler_arg) */
+	push	c_handler_arg
+	push	%esi	/* cpu_num */
+	mov	c_handler, %eax
+	call	*%eax
+halt_jump:
+	hlt
+	jmp	halt_jump
diff --git a/src/cpu/intel/haswell/smmrelocate.c b/src/cpu/intel/haswell/smmrelocate.c
index 1ccc9bb..4312d79 100644
--- a/src/cpu/intel/haswell/smmrelocate.c
+++ b/src/cpu/intel/haswell/smmrelocate.c
@@ -23,6 +23,7 @@
 #include <device/pci.h>
 #include <cpu/cpu.h>
 #include <cpu/x86/cache.h>
+#include <cpu/x86/lapic.h>
 #include <cpu/x86/msr.h>
 #include <cpu/x86/mtrr.h>
 #include <cpu/x86/smm.h>
@@ -297,24 +298,30 @@ static int cpu_smm_setup(void)
 	return 0;
 }
 
-void smm_init(void)
+int smm_initialize(void)
 {
 	/* Return early if CPU SMM setup failed. */
 	if (cpu_smm_setup())
-		return;
+		return -1;
 
 	southbridge_smm_init();
 
-	/* Initiate first SMI to kick off SMM-context relocation. Note: this
-	 * SMI being triggered here queues up an SMI in the APs which are in
-	 * wait-for-SIPI state. Once an AP gets an SIPI it will service the SMI
-	 * at the SMM_DEFAULT_BASE before jumping to startup vector. */
-	southbridge_trigger_smi();
-
-	printk(BIOS_DEBUG, "Relocation complete.\n");
+	/* Run the relocation handler. */
+	smm_initiate_relocation();
 
 	/* Lock down the SMRAM space. */
 	smm_lock();
+
+	return 0;
+}
+
+void smm_init(void)
+{
+	/* smm_init() is normally called from initialize_cpus() in
+	 * lapic_cpu_init.c. However, that path is no longer used. Don't reuse
+	 * the function name because that would cause confusion.
+	 * The smm_initialize() function above is used to setup SMM at the
+	 * appropriate time. */
 }
 
 void smm_lock(void)