[coreboot-gerrit] New patch to review for coreboot: 40c68b3 libpayload: Make EHCI driver cache-aware

Isaac Christensen (isaac.christensen@se-eng.com) gerrit at coreboot.org
Wed Aug 13 00:13:34 CEST 2014


Isaac Christensen (isaac.christensen at se-eng.com) just uploaded a new patch set to gerrit, which you can find at http://review.coreboot.org/6633

-gerrit

commit 40c68b3590b6f6953d996de1f03d646b72990fa2
Author: Julius Werner <jwerner at chromium.org>
Date:   Wed Aug 28 12:29:28 2013 -0700

    libpayload: Make EHCI driver cache-aware
    
    This patch makes the EHCI driver work on ARM platforms which usually do
    not support automatic cache snooping. It uses the new DMA memory
    mechanism (which needs to be correctly set up in the Coreboot mainboard
    code) to allocate all EHCI-internal communication structures in
    cache-coherent memory, and cleans/invalidates the externally supplied
    transfer buffers in Bulk and Control functions with explicit calls as
    necessary.
    
    Old-Change-Id: Ie8a62545d905b7a4fdd2a56b9405774be69779e5
    Signed-off-by: Julius Werner <jwerner at chromium.org>
    Reviewed-on: https://chromium-review.googlesource.com/167339
    (cherry picked from commit 322338934add36a5372ffe7d2a45e61a4fdd4a54)
    
    libpayload: ehci: Cache management is hard, let's go copying...
    
    It turns out that my previous commit to make the EHCI stack cache aware
    on ARM devices wasn't quite correct, and the problem is actually much
    trickier than I thought. After having some fun with more weird transfer
    problems that appear/disappear based on stack alignment, this is my
    current worst-case threat model that any cache managing implementation
    would need to handle correctly:
    
    Some upper layer calls ehci_bulk() with a transfer buffer on its stack.
    Due to stack alignment, it happens to start just at the top of a cache
    line, so up to 64 - 4 bytes of ehci_bulk's stack will share that line.
    ehci_bulk() calls dcache_clean() and initializes the USB transfer.
    Between that point and the call to dcache_invalidate() at the end of
    ehci_bulk(), any access to the stack variables in that cache line (even
    a speculative prefetch) will refetch the line into the cache. Afterwards
    any other access to a random memory location that just happens to get
    aliased to the same cache line may evict it again, causing the processor
    to write out stale data to the transfer buffer and possibly overwrite
    data that has already been received over USB.
    
    In short, any dcache_clean/dcache_invalidate-based implementation that
    preserves correctness while allowing any arbitrary (non cache-aligned)
    memory location as a transfer buffer is presumed to be impossible.
    Instead, this patch causes all transfer data to be copied to/from a
    cache-coherent bounce buffer. It will still transfer directly if the
    supplied buffer is already cache-coherent, which can be used by callers
    to optimize their transfers (and is true by default on x86).
    
    Old-Change-Id: I112908410bdbc8ca028d44f2f5d388c529f8057f
    Signed-off-by: Julius Werner <jwerner at chromium.org>
    Reviewed-on: https://chromium-review.googlesource.com/169231
    Reviewed-by: Stefan Reinauer <reinauer at chromium.org>
    (cherry picked from commit 702dc50f1d56fe206442079fa443437f4336daed)
    
    Squashed the initial commit and a follow up fix.
    
    Change-Id: Idf7e5aa855b4f0221f82fa380a76049f273e4c88
    Signed-off-by: Isaac Christensen <isaac.christensen at se-eng.com>
---
 payloads/libpayload/drivers/usb/ehci.c         | 105 +++++++++++++++++++------
 payloads/libpayload/drivers/usb/ehci_private.h |   3 +-
 payloads/libpayload/include/stdlib.h           |   2 +
 payloads/libpayload/include/x86/arch/cache.h   |   6 +-
 payloads/libpayload/libc/malloc.c              |  17 +++-
 5 files changed, 104 insertions(+), 29 deletions(-)

diff --git a/payloads/libpayload/drivers/usb/ehci.c b/payloads/libpayload/drivers/usb/ehci.c
index af7daf1..129b9b8 100644
--- a/payloads/libpayload/drivers/usb/ehci.c
+++ b/payloads/libpayload/drivers/usb/ehci.c
@@ -30,6 +30,7 @@
 //#define USB_DEBUG
 
 #include <libpayload.h>
+#include <arch/cache.h>
 #include "ehci.h"
 #include "ehci_private.h"
 
@@ -358,9 +359,10 @@ static int ehci_process_async_schedule(
 	return result;
 }
 
-static int ehci_bulk (endpoint_t *ep, int size, u8 *data, int finalize)
+static int ehci_bulk (endpoint_t *ep, int size, u8 *src, int finalize)
 {
 	int result = 0;
+	u8 *end = src + size;
 	int remaining = size;
 	int endp = ep->endpoint & 0xf;
 	int pid = (ep->direction==IN)?EHCI_IN:EHCI_OUT;
@@ -372,30 +374,42 @@ static int ehci_bulk (endpoint_t *ep, int size, u8 *data, int finalize)
 			return -1;
 	}
 
-	qtd_t *head = memalign(64, sizeof(qtd_t));
+	if (!dma_coherent(src)) {
+		end = EHCI_INST(ep->dev->controller)->dma_buffer + size;
+		if (size > DMA_SIZE) {
+			usb_debug("EHCI bulk transfer too large for DMA buffer: %d\n", size);
+			return -1;
+		}
+		if (pid == EHCI_OUT)
+			memcpy(end - size, src, size);
+	}
+
+	ehci_qh_t *qh = dma_memalign(64, sizeof(ehci_qh_t));
+	qtd_t *head = dma_memalign(64, sizeof(qtd_t));
 	qtd_t *cur = head;
+	if (!qh || !head)
+		goto oom;
 	while (1) {
 		memset((void *)cur, 0, sizeof(qtd_t));
 		cur->token = QTD_ACTIVE |
 			(pid << QTD_PID_SHIFT) |
 			(0 << QTD_CERR_SHIFT);
-		u32 chunk = fill_td(cur, data, remaining);
-		remaining -= chunk;
-		data += chunk;
+		remaining -= fill_td(cur, end - remaining, remaining);
 
 		cur->alt_next_qtd = QTD_TERMINATE;
-		if (remaining == 0) {
+		if (remaining <= 0) {
 			cur->next_qtd = virt_to_phys(0) | QTD_TERMINATE;
 			break;
 		} else {
-			qtd_t *next = memalign(64, sizeof(qtd_t));
+			qtd_t *next = dma_memalign(64, sizeof(qtd_t));
+			if (!next)
+				goto oom;
 			cur->next_qtd = virt_to_phys(next);
 			cur = next;
 		}
 	}
 
 	/* create QH */
-	ehci_qh_t *qh = memalign(64, sizeof(ehci_qh_t));
 	memset((void *)qh, 0, sizeof(ehci_qh_t));
 	qh->horiz_link_ptr = virt_to_phys(qh) | QH_QH;
 	qh->epchar = ep->dev->address |
@@ -415,21 +429,31 @@ static int ehci_bulk (endpoint_t *ep, int size, u8 *data, int finalize)
 
 	result = ehci_process_async_schedule(
 			EHCI_INST(ep->dev->controller), qh, head);
-	if (result >= 0)
+	if (result >= 0) {
 		result = size - result;
+		if (pid == EHCI_IN && end != src + size)
+			memcpy(src, end - size, result);
+	}
 
 	ep->toggle = (cur->token & QTD_TOGGLE_MASK) >> QTD_TOGGLE_SHIFT;
 
 	free_qh_and_tds(qh, head);
 
 	return result;
+
+oom:
+	usb_debug("Not enough DMA memory for EHCI control structures!\n");
+	free_qh_and_tds(qh, head);
+	return -1;
 }
 
 
 /* FIXME: Handle control transfers as 3 QHs, so the 2nd stage can be >0x4000 bytes */
-static int ehci_control (usbdev_t *dev, direction_t dir, int drlen, void *devreq,
-			 int dalen, u8 *data)
+static int ehci_control (usbdev_t *dev, direction_t dir, int drlen, void *setup,
+			 int dalen, u8 *src)
 {
+	u8 *data = src;
+	u8 *devreq = setup;
 	int endp = 0; // this is control. always 0 (for now)
 	int toggle = 0;
 	int mlen = dev->endpoints[0].maxpacketsize;
@@ -443,9 +467,26 @@ static int ehci_control (usbdev_t *dev, direction_t dir, int drlen, void *devreq
 		non_hs_ctrl_ep = 1;
 	}
 
+	if (!dma_coherent(setup)) {
+		devreq = EHCI_INST(dev->controller)->dma_buffer;
+		memcpy(devreq, setup, drlen);
+	}
+	if (dalen > 0 && !dma_coherent(src)) {
+		data = EHCI_INST(dev->controller)->dma_buffer + drlen;
+		if (drlen + dalen > DMA_SIZE) {
+			usb_debug("EHCI control transfer too large for DMA buffer: %d\n", drlen + dalen);
+			return -1;
+		}
+		if (dir == OUT)
+			memcpy(data, src, dalen);
+	}
+
 	/* create qTDs */
-	qtd_t *head = memalign(64, sizeof(qtd_t));
+	qtd_t *head = dma_memalign(64, sizeof(qtd_t));
+	ehci_qh_t *qh = dma_memalign(64, sizeof(ehci_qh_t));
 	qtd_t *cur = head;
+	if (!qh || !head)
+		goto oom;
 	memset((void *)cur, 0, sizeof(qtd_t));
 	cur->token = QTD_ACTIVE |
 		(toggle?QTD_TOGGLE_DATA1:0) |
@@ -454,9 +495,11 @@ static int ehci_control (usbdev_t *dev, direction_t dir, int drlen, void *devreq
 	if (fill_td(cur, devreq, drlen) != drlen) {
 		usb_debug("ERROR: couldn't send the entire device request\n");
 	}
-	qtd_t *next = memalign(64, sizeof(qtd_t));
+	qtd_t *next = dma_memalign(64, sizeof(qtd_t));
 	cur->next_qtd = virt_to_phys(next);
 	cur->alt_next_qtd = QTD_TERMINATE;
+	if (!next)
+		goto oom;
 
 	/* FIXME: We're limited to 16-20K (depending on alignment) for payload for now.
 	 * Figure out, how toggle can be set sensibly in this scenario */
@@ -471,7 +514,9 @@ static int ehci_control (usbdev_t *dev, direction_t dir, int drlen, void *devreq
 		if (fill_td(cur, data, dalen) != dalen) {
 			usb_debug("ERROR: couldn't send the entire control payload\n");
 		}
-		next = memalign(64, sizeof(qtd_t));
+		next = dma_memalign(64, sizeof(qtd_t));
+		if (!next)
+			goto oom;
 		cur->next_qtd = virt_to_phys(next);
 		cur->alt_next_qtd = QTD_TERMINATE;
 	}
@@ -488,7 +533,6 @@ static int ehci_control (usbdev_t *dev, direction_t dir, int drlen, void *devreq
 	cur->alt_next_qtd = QTD_TERMINATE;
 
 	/* create QH */
-	ehci_qh_t *qh = memalign(64, sizeof(ehci_qh_t));
 	memset((void *)qh, 0, sizeof(ehci_qh_t));
 	qh->horiz_link_ptr = virt_to_phys(qh) | QH_QH;
 	qh->epchar = dev->address |
@@ -506,11 +550,19 @@ static int ehci_control (usbdev_t *dev, direction_t dir, int drlen, void *devreq
 
 	result = ehci_process_async_schedule(
 			EHCI_INST(dev->controller), qh, head);
-	if (result >= 0)
+	if (result >= 0) {
 		result = dalen - result;
+		if (dir == IN && data != src)
+			memcpy(src, data, result);
+	}
 
 	free_qh_and_tds(qh, head);
 	return result;
+
+oom:
+	usb_debug("Not enough DMA memory for EHCI control structures!\n");
+	free_qh_and_tds(qh, head);
+	return -1;
 }
 
 
@@ -581,13 +633,12 @@ static void *ehci_create_intr_queue(
 			return NULL;
 	}
 
-	intr_queue_t *const intrq =
-		(intr_queue_t *)memalign(64, sizeof(intr_queue_t));
+	intr_queue_t *const intrq = (intr_queue_t *)malloc(sizeof(intr_queue_t));
 	/*
 	 * reqcount data chunks
 	 * plus one more spare, which we'll leave out of queue
 	 */
-	u8 *data = (u8 *)malloc(reqsize * (reqcount + 1));
+	u8 *data = (u8 *)dma_malloc(reqsize * (reqcount + 1));
 	if (!intrq || !data)
 		fatal("Not enough memory to create USB interrupt queue.\n");
 	intrq->data = data;
@@ -595,7 +646,7 @@ static void *ehci_create_intr_queue(
 	intrq->reqsize = reqsize;
 
 	/* create #reqcount transfer descriptors (qTDs) */
-	intrq->head = (intr_qtd_t *)memalign(64, sizeof(intr_qtd_t));
+	intrq->head = (intr_qtd_t *)dma_memalign(64, sizeof(intr_qtd_t));
 	intr_qtd_t *cur_td = intrq->head;
 	for (i = 0; i < reqcount; ++i) {
 		fill_intr_queue_td(intrq, cur_td, data);
@@ -603,7 +654,7 @@ static void *ehci_create_intr_queue(
 		if (i < reqcount - 1) {
 			/* create one more qTD */
 			intr_qtd_t *const next_td =
-				(intr_qtd_t *)memalign(64, sizeof(intr_qtd_t));
+				(intr_qtd_t *)dma_memalign(64, sizeof(intr_qtd_t));
 			cur_td->td.next_qtd = virt_to_phys(&next_td->td);
 			cur_td->next = next_td;
 			cur_td = next_td;
@@ -612,7 +663,7 @@ static void *ehci_create_intr_queue(
 	intrq->tail = cur_td;
 
 	/* create spare qTD */
-	intrq->spare = (intr_qtd_t *)memalign(64, sizeof(intr_qtd_t));
+	intrq->spare = (intr_qtd_t *)dma_memalign(64, sizeof(intr_qtd_t));
 	fill_intr_queue_td(intrq, intrq->spare, data);
 
 	/* initialize QH */
@@ -780,16 +831,22 @@ ehci_init (unsigned long physical_bar)
 
 	/* Initialize periodic frame list */
 	/* 1024 32-bit pointers, 4kb aligned */
-	u32 *const periodic_list = (u32 *)memalign(4096, 1024 * sizeof(u32));
+	u32 *const periodic_list = (u32 *)dma_memalign(4096, 1024 * sizeof(u32));
 	if (!periodic_list)
 		fatal("Not enough memory creating EHCI periodic frame list.\n");
 
+	if (dma_initialized()) {
+		EHCI_INST(controller)->dma_buffer = dma_memalign(4096, DMA_SIZE);
+		if (!EHCI_INST(controller)->dma_buffer)
+			fatal("Not enough DMA memory for EHCI bounce buffer.\n");
+	}
+
 	/*
 	 * Insert dummy QH in periodic frame list
 	 * This helps with broken host controllers
 	 * and doesn't violate the standard.
 	 */
-	EHCI_INST(controller)->dummy_qh = (ehci_qh_t *)memalign(64, sizeof(ehci_qh_t));
+	EHCI_INST(controller)->dummy_qh = (ehci_qh_t *)dma_memalign(64, sizeof(ehci_qh_t));
 	memset((void *)EHCI_INST(controller)->dummy_qh, 0,
 		sizeof(*EHCI_INST(controller)->dummy_qh));
 	EHCI_INST(controller)->dummy_qh->horiz_link_ptr = QH_TERMINATE;
diff --git a/payloads/libpayload/drivers/usb/ehci_private.h b/payloads/libpayload/drivers/usb/ehci_private.h
index 986ee79..c9fa6e0 100644
--- a/payloads/libpayload/drivers/usb/ehci_private.h
+++ b/payloads/libpayload/drivers/usb/ehci_private.h
@@ -135,13 +135,14 @@ typedef struct ehci {
 	hc_cap_t *capabilities;
 	hc_op_t *operation;
 	ehci_qh_t *dummy_qh;
+#define DMA_SIZE (64 * 1024)
+	void *dma_buffer;
 } ehci_t;
 
 #define PS_TERMINATE 1
 #define PS_TYPE_QH 1 << 1
 #define PS_PTR_MASK ~0x1f
 
-
 #define EHCI_INST(controller) ((ehci_t*)((controller)->instance))
 
 #endif
diff --git a/payloads/libpayload/include/stdlib.h b/payloads/libpayload/include/stdlib.h
index 8c8c200..d3448b9 100644
--- a/payloads/libpayload/include/stdlib.h
+++ b/payloads/libpayload/include/stdlib.h
@@ -115,6 +115,8 @@ void *memalign(size_t align, size_t size);
 void init_dma_memory(void *start, u32 size);
 void *dma_malloc(size_t size);
 void *dma_memalign(size_t align, size_t size);
+int dma_initialized(void);
+int dma_coherent(void *ptr);
 /** @} */
 
 /**
diff --git a/payloads/libpayload/include/x86/arch/cache.h b/payloads/libpayload/include/x86/arch/cache.h
index 73849b7..4a80736 100644
--- a/payloads/libpayload/include/x86/arch/cache.h
+++ b/payloads/libpayload/include/x86/arch/cache.h
@@ -32,9 +32,13 @@
 #ifndef __ARCH_CACHE_H__
 #define __ARCH_CACHE_H__
 
-/* These are noops, needed by the USB stack on ARM */
+/* NOOPs mirroring ARM's cache API, since x86 devices usually cache snoop */
 #define dmb()
 #define dsb()
+#define dcache_clean_all()
+#define dcache_clean_by_mva(addr, len)
+#define dcache_invalidate_all()
+#define dcache_invalidate_by_mva(addr, len)
 #define dcache_clean_invalidate_all()
 #define dcache_clean_invalidate_by_mva(addr, len)
 
diff --git a/payloads/libpayload/libc/malloc.c b/payloads/libpayload/libc/malloc.c
index 618d8b3..11dfeef 100644
--- a/payloads/libpayload/libc/malloc.c
+++ b/payloads/libpayload/libc/malloc.c
@@ -83,7 +83,7 @@ static int minimal_free = 0;
 void init_dma_memory(void *start, u32 size)
 {
 #ifdef CONFIG_LP_DEBUG_MALLOC
-	if (dma != heap) {
+	if (dma_initialized()) {
 		printf("WARNING: %s called twice!\n");
 		return;
 	}
@@ -97,6 +97,17 @@ void init_dma_memory(void *start, u32 size)
 	dma->align_regions = NULL;
 }
 
+int dma_initialized()
+{
+	return dma != heap;
+}
+
+/* For boards that don't initialize DMA we assume all locations are coherent */
+int dma_coherent(void *ptr)
+{
+	return !dma_initialized() || (dma->start <= ptr && dma->end > ptr);
+}
+
 static void setup(hdrtype_t volatile *start, int size)
 {
 	*start = FREE_BLOCK(size);
@@ -120,7 +131,7 @@ static void *alloc(int len, struct memory_type *type)
 
 	/* Make sure the region is setup correctly. */
 	if (!HAS_MAGIC(*ptr))
-		setup(ptr, (int)((&_eheap - &_heap) - HDRSIZE));
+		setup(ptr, (int)((type->end - type->start) - HDRSIZE));
 
 	/* Find some free space. */
 	do {
@@ -470,6 +481,6 @@ void print_malloc_map(void)
 	if (free_memory && (minimal_free > free_memory))
 		minimal_free = free_memory;
 	printf("Maximum memory consumption: %d bytes\n",
-		(unsigned int)(&_eheap - &_heap) - HDRSIZE - minimal_free);
+		(unsigned int)(heap->end - heap->start) - HDRSIZE - minimal_free);
 }
 #endif



More information about the coreboot-gerrit mailing list