[PATCH] usb: ehci: Fix "EHCI timed out on TD - token=XXXX" error on ehci-hcd

Lukasz Majewski lukma at denx.de
Wed Feb 26 12:29:55 CET 2020


This patch aims to improve robustness of 'usb' command operation on the
ehci-hcd IP block as it ports to contemporary U-Boot the patch described
and provided in [1] (originally applicable to U-Boot 2016.05).

According to the fix author - "rayvt" (from [1]):

TD stands for "Queue Element Transfer Descriptor (qTD)", which is a micro-code
instruction for the EHCI USB chip.
The "token" field is detailed information, control, and status of the TD's data
transfer. In particular, the rightmost byte is the status field. 0x80 bit means
the TD is active and the data transfer has not yet completed. 0x08 bit means
there was some sort of data transfer error (XactErr).

If the drive gets a transfer error, it refuses to do any other I/O transfer
until the error is properly cleared and reset. U-boot did not do this, so every
subsequent disk read would get a timeout error because the drive would not send
any data. The fix is to properly clear and reset the USB disk when it gets a
transfer error.

Every read operation starts at the maximum block size. When the USB pendrive is
not able to correctly serve this data read request, the dynamic reduction of IO
size is performed. Up to six tries (with smaller IO block each time) are
attempted.

A related problem is that some drives are slow to come up. Linux handles this by
issuing a spinup command and allowing more time for the drive to respond. The
same idea is applied in this fix.

On TPC70 (i.MX6Q) once per ~10 times (without this patch):

Bus usb at 2184200: USB EHCI 1.00
scanning bus usb at 2184200 for devices... 2 USB Device(s) found
       scanning usb for storage devices... 1 Storage Device(s) found
EHCI timed out on TD - token=0x1f8c80

Performance impact:

With U-Boot -master in mainline:
16869440 bytes read in 979 ms (16.4 MiB/s)

With this patch:
16869440 bytes read in 1219 ms (13.2 MiB/s)

Links:
[1] - https://forum.doozan.com/read.php?3,35295,35295#msg-35295
[2] - https://www.dropbox.com/s/nrkrd1no63viuu8/uboot-bodhi-2016.05-timeoutTD.patch?dl=0

Signed-off-by: Lukasz Majewski <lukma at denx.de>
[Unfortunately, the original patch [2] did not contain S-o-B from the original
author - "rayvt"]
---

 common/usb.c                |  10 +++-
 common/usb_storage.c        | 106 +++++++++++++++++++++++++++++++++---
 drivers/usb/host/ehci-hcd.c |  55 +++++++++++++++++--
 include/usb.h               |   1 +
 include/usb_defs.h          |   1 +
 5 files changed, 160 insertions(+), 13 deletions(-)

diff --git a/common/usb.c b/common/usb.c
index 349e838f1d..305482b5bb 100644
--- a/common/usb.c
+++ b/common/usb.c
@@ -925,14 +925,20 @@ static int get_descriptor_len(struct usb_device *dev, int len, int expect_len)
 	__maybe_unused struct usb_device_descriptor *desc;
 	ALLOC_CACHE_ALIGN_BUFFER(unsigned char, tmpbuf, USB_BUFSIZ);
 	int err;
+	int retry = 5;
 
 	desc = (struct usb_device_descriptor *)tmpbuf;
 
+again:
 	err = usb_get_descriptor(dev, USB_DT_DEVICE, 0, desc, len);
 	if (err < expect_len) {
 		if (err < 0) {
-			printf("unable to get device descriptor (error=%d)\n",
-				err);
+			printf("unable to get device descriptor (error=%d) retry: %d\n",
+			       err, retry);
+			mdelay(50);
+			if (--retry >= 0)
+				/* Some drives are just slow to wake up. */
+				goto again;
 			return err;
 		} else {
 			printf("USB device descriptor short read (expected %i, got %i)\n",
diff --git a/common/usb_storage.c b/common/usb_storage.c
index 097b6729c1..48c8c2ae64 100644
--- a/common/usb_storage.c
+++ b/common/usb_storage.c
@@ -111,6 +111,18 @@ int usb_stor_get_info(struct usb_device *dev, struct us_data *us,
 		      struct blk_desc *dev_desc);
 int usb_storage_probe(struct usb_device *dev, unsigned int ifnum,
 		      struct us_data *ss);
+
+#ifdef CONFIG_USB_EHCI_HCD
+	/*
+	 * The U-Boot EHCI driver can handle any transfer length as long as
+	 * there is enough free heap space left, but the SCSI READ(10) and
+	 * WRITE(10) commands are limited to 65535 blocks.
+	 */
+int usb_max_xfer_blk = 4096;
+#else
+int usb_max_xfer_blk = 20;
+#endif
+
 #if CONFIG_IS_ENABLED(BLK)
 static unsigned long usb_stor_read(struct udevice *dev, lbaint_t blknr,
 				   lbaint_t blkcnt, void *buffer);
@@ -729,6 +741,7 @@ static int usb_stor_BBB_transport(struct scsi_cmd *srb, struct us_data *us)
 	pipeout = usb_sndbulkpipe(us->pusb_dev, us->ep_out);
 	/* DATA phase + error handling */
 	data_actlen = 0;
+	mdelay(10);		/* Like linux does. */
 	/* no data, go immediately to the STATUS phase */
 	if (srb->datalen == 0)
 		goto st;
@@ -740,6 +753,16 @@ static int usb_stor_BBB_transport(struct scsi_cmd *srb, struct us_data *us)
 
 	result = usb_bulk_msg(us->pusb_dev, pipe, srb->pdata, srb->datalen,
 			      &data_actlen, USB_CNTL_TIMEOUT * 5);
+
+	/* special handling of XACTERR in DATA phase */
+	if (result < 0 && (us->pusb_dev->status & USB_ST_XACTERR)) {
+		debug("XACTERR in data phase - clr, reset, and return fail.\n");
+		usb_stor_BBB_clear_endpt_stall(us, dir_in ?
+					       us->ep_in : us->ep_out);
+		usb_stor_BBB_reset(us);
+		return USB_STOR_TRANSPORT_FAILED;
+	}
+
 	/* special handling of STALL in DATA phase */
 	if ((result < 0) && (us->pusb_dev->status & USB_ST_STALLED)) {
 		debug("DATA:stall\n");
@@ -1013,9 +1036,32 @@ static int usb_request_sense(struct scsi_cmd *srb, struct us_data *ss)
 	return 0;
 }
 
+/*
+ * This spins up the disk and also consumes the time that the
+ * disk takes to become active and ready to read data.
+ * Some drives (like Western Digital) can take more than 5 seconds.
+ * The delay occurs on the 1st data read from the disk.
+ * Extending the timeout here works better than handling the timeout
+ * as an error on a "real" read.
+ */
+static int usb_spinup(struct scsi_cmd *srb, struct us_data *ss)
+{
+	memset(&srb->cmd[0], 0, 12);
+	srb->cmd[0] = SCSI_START_STP;
+	srb->cmd[1] = srb->lun << 5;
+	srb->cmd[4] = 1;	/* Start spinup. */
+	srb->datalen = 0;
+	srb->cmdlen = 6;
+	ss->pusb_dev->extra_timeout = 9876;
+	ss->transport(srb, ss);
+	ss->pusb_dev->extra_timeout = 0;
+	return 0;
+}
+
 static int usb_test_unit_ready(struct scsi_cmd *srb, struct us_data *ss)
 {
 	int retries = 10;
+	int gave_extra_time = 0;
 
 	do {
 		memset(&srb->cmd[0], 0, 12);
@@ -1038,6 +1084,17 @@ static int usb_test_unit_ready(struct scsi_cmd *srb, struct us_data *ss)
 		if ((srb->sense_buf[2] == 0x02) &&
 		    (srb->sense_buf[12] == 0x3a))
 			return -1;
+		/*
+		 * If the status is "Not Ready - becoming ready", give it
+		 * more time.  Linux issues a spinup command (once) and gives
+		 * it 100 seconds.
+		 */
+		if (srb->sense_buf[2] == 0x02 &&
+		    srb->sense_buf[12] == 0x04 &&
+		    gave_extra_time == 0) {
+			retries = 100; /* Allow 10 seconds. */
+			gave_extra_time = retries;
+		}
 		mdelay(100);
 	} while (retries--);
 
@@ -1166,22 +1223,57 @@ static unsigned long usb_stor_read(struct blk_desc *block_dev, lbaint_t blknr,
 	      block_dev->devnum, start, blks, buf_addr);
 
 	do {
-		/* XXX need some comment here */
+		/*
+		 * Probably most errors are USB errors, not hard disk error.
+		 * Many disks use a USB chip that is flaky when doing large
+		 * transfers. The workaround is to dynamically reduce the
+		 * transfer size and allow an additional try.
+		 * This should pick up flaky disks. Linux uses a quirks table.
+		 * We'll use observation:
+		 * Give it 1 try very large, 1 try large, 2 tries medium and 2
+		 * tries small(ish).
+		 * On a solid fail (actual disk error - which should be rare),
+		 * this will give us 6 tries max, and only that many if the read
+		 * is quite large.
+		 * A fail on a very short read obviously doesn't have a
+		 * too-large max_blks.  Timeout due to spinup being a case in
+		 * this point.
+		 */
 		retry = 2;
 		srb->pdata = (unsigned char *)buf_addr;
-		if (blks > ss->max_xfer_blk)
-			smallblks = ss->max_xfer_blk;
+retry_it:
+		if (blks > usb_max_xfer_blk)
+			smallblks = usb_max_xfer_blk;
 		else
 			smallblks = (unsigned short) blks;
-retry_it:
-		if (smallblks == ss->max_xfer_blk)
+
+		if (smallblks == usb_max_xfer_blk)
 			usb_show_progress();
+
 		srb->datalen = block_dev->blksz * smallblks;
 		srb->pdata = (unsigned char *)buf_addr;
 		if (usb_read_10(srb, ss, start, smallblks)) {
 			debug("Read ERROR\n");
 			ss->flags &= ~USB_READY;
 			usb_request_sense(srb, ss);
+
+			/* Dynamically reduce the I/O size. */
+			if (smallblks > 2047) {
+				usb_max_xfer_blk = 2047;
+				++retry;
+			} else if (smallblks > 512) {
+				usb_max_xfer_blk = 512;
+				++retry;
+			} else if (smallblks > 511) {
+				usb_max_xfer_blk = 511;
+				++retry;
+			} else if (smallblks > 63) {
+				usb_max_xfer_blk = 63;
+				retry += 2;
+			}
+			debug("step down usb_max_xfer_blk to %d\n",
+			      usb_max_xfer_blk);
+
 			if (retry--)
 				goto retry_it;
 			blkcnt -= blks;
@@ -1196,8 +1288,7 @@ retry_it:
 	      start, smallblks, buf_addr);
 
 	usb_disable_asynch(0); /* asynch transfer allowed */
-	if (blkcnt >= ss->max_xfer_blk)
-		debug("\n");
+
 	return blkcnt;
 }
 
@@ -1492,6 +1583,7 @@ int usb_stor_get_info(struct usb_device *dev, struct us_data *ss,
 	dev_desc->log2blksz = LOG2(dev_desc->blksz);
 	dev_desc->type = perq;
 	debug(" address %d\n", dev_desc->target);
+	usb_spinup(pccb, ss);
 
 	return 1;
 }
diff --git a/drivers/usb/host/ehci-hcd.c b/drivers/usb/host/ehci-hcd.c
index 1cc02052f5..c4041f1e9c 100644
--- a/drivers/usb/host/ehci-hcd.c
+++ b/drivers/usb/host/ehci-hcd.c
@@ -315,7 +315,9 @@ ehci_submit_async(struct usb_device *dev, unsigned long pipe, void *buffer,
 	int timeout;
 	int ret = 0;
 	struct ehci_ctrl *ctrl = ehci_get_ctrl(dev);
+	int trynum;
 
+	debug("\n***** %s ****\n", __func__);
 	debug("dev=%p, pipe=%lx, buffer=%p, length=%d, req=%p\n", dev, pipe,
 	      buffer, length, req);
 	if (req != NULL)
@@ -557,6 +559,15 @@ ehci_submit_async(struct usb_device *dev, unsigned long pipe, void *buffer,
 	ehci_writel(&ctrl->hcor->or_usbsts, (usbsts & 0x3f));
 
 	/* Enable async. schedule. */
+	trynum = 1;	/* No more than 2 tries, in case of XACTERR. */
+	/* When the 1st try gets xacterr,
+	 * 2nd try gets xacterr and often babble and/or halted.
+	 * 3rd try times out.
+	 * After the 2nd try, the disk has recovered, so we need to clear and
+	 * reset the USB port, then return fail so the upper layer can retry.
+	 */
+retry_xacterr:
+	vtd = &qtd[qtd_counter - 1];
 	cmd = ehci_readl(&ctrl->hcor->or_usbcmd);
 	if (!(cmd & CMD_ASE)) {
 		cmd |= CMD_ASE;
@@ -572,8 +583,9 @@ ehci_submit_async(struct usb_device *dev, unsigned long pipe, void *buffer,
 
 	/* Wait for TDs to be processed. */
 	ts = get_timer(0);
-	vtd = &qtd[qtd_counter - 1];
+
 	timeout = USB_TIMEOUT_MS(pipe);
+	timeout += dev->extra_timeout;
 	do {
 		/* Invalidate dcache */
 		invalidate_dcache_range((unsigned long)&ctrl->qh_list,
@@ -588,9 +600,10 @@ ehci_submit_async(struct usb_device *dev, unsigned long pipe, void *buffer,
 			break;
 		WATCHDOG_RESET();
 	} while (get_timer(ts) < timeout);
-	qhtoken = hc32_to_cpu(qh->qh_overlay.qt_token);
 
-	ctrl->qh_list.qh_link = cpu_to_hc32(virt_to_phys(&ctrl->qh_list) | QH_LINK_TYPE_QH);
+	debug("took %4lu ms of %4d\n", get_timer(ts), timeout);
+	ctrl->qh_list.qh_link =
+		cpu_to_hc32(virt_to_phys(&ctrl->qh_list) | QH_LINK_TYPE_QH);
 	flush_dcache_range((unsigned long)&ctrl->qh_list,
 		ALIGN_END_ADDR(struct QH, &ctrl->qh_list, 1));
 
@@ -605,14 +618,48 @@ ehci_submit_async(struct usb_device *dev, unsigned long pipe, void *buffer,
 	 */
 	if (buffer != NULL && length > 0)
 		invalidate_dcache_range((unsigned long)buffer,
-			ALIGN((unsigned long)buffer + length, ARCH_DMA_MINALIGN));
+			ALIGN((unsigned long)buffer + length,
+			      ARCH_DMA_MINALIGN));
 
 	/* Check that the TD processing happened */
 	if (QT_TOKEN_GET_STATUS(token) & QT_TOKEN_STATUS_ACTIVE)
 		printf("EHCI timed out on TD - token=%#x\n", token);
 
+	/* Disable async schedule. */
+	cmd = ehci_readl(&ctrl->hcor->or_usbcmd);
+	cmd &= ~CMD_ASE;
+	ehci_writel(&ctrl->hcor->or_usbcmd, cmd);
+
+	ret = handshake((uint32_t *)&ctrl->hcor->or_usbsts, STS_ASS, 0,
+			100 * 1000);
+	if (ret < 0) {
+		printf("EHCI fail timeout STS_ASS reset\n");
+		goto fail;
+	}
+
+	qhtoken = hc32_to_cpu(qh->qh_overlay.qt_token);
 	if (!(QT_TOKEN_GET_STATUS(qhtoken) & QT_TOKEN_STATUS_ACTIVE)) {
 		debug("TOKEN=%#x\n", qhtoken);
+		if (qhtoken & QT_TOKEN_STATUS_XACTERR) {
+			if (--trynum >= 0) {
+				/*
+				 * It is necessary to do this, otherwise the
+				 * disk is clagged.
+				 */
+				debug("reset the TD and redo, because of XACTERR\n");
+				qhtoken &= ~QT_TOKEN_STATUS_HALTED;
+				qhtoken |= QT_TOKEN_STATUS_ACTIVE |
+					QT_TOKEN_CERR(2);
+				vtd->qt_token = cpu_to_hc32(qhtoken);
+				qh->qh_overlay.qt_token = cpu_to_hc32(qhtoken);
+				goto retry_xacterr;
+			}
+			dev->status = USB_ST_XACTERR;
+			dev->act_len = length -
+				QT_TOKEN_GET_TOTALBYTES(qhtoken);
+			goto fail;
+		}
+
 		switch (QT_TOKEN_GET_STATUS(qhtoken) &
 			~(QT_TOKEN_STATUS_SPLITXSTATE | QT_TOKEN_STATUS_PERR)) {
 		case 0:
diff --git a/include/usb.h b/include/usb.h
index efb67ea33f..41c8c47f89 100644
--- a/include/usb.h
+++ b/include/usb.h
@@ -140,6 +140,7 @@ struct usb_device {
 	int act_len;			/* transferred bytes */
 	int maxchild;			/* Number of ports if hub */
 	int portnr;			/* Port number, 1=first */
+	int extra_timeout;	/* Add to timeout in ehci_submit_async */
 #if !CONFIG_IS_ENABLED(DM_USB)
 	/* parent hub, or NULL if this is the root hub */
 	struct usb_device *parent;
diff --git a/include/usb_defs.h b/include/usb_defs.h
index 6dd2c997f9..572f6ab296 100644
--- a/include/usb_defs.h
+++ b/include/usb_defs.h
@@ -197,6 +197,7 @@
 #define USB_ST_NAK_REC          0x10	/* NAK Received*/
 #define USB_ST_CRC_ERR          0x20	/* CRC/timeout Error */
 #define USB_ST_BIT_ERR          0x40	/* Bitstuff error */
+#define USB_ST_XACTERR          0x80	/* XACTERR error */
 #define USB_ST_NOT_PROC         0x80000000L	/* Not yet processed */
 
 
-- 
2.20.1



More information about the U-Boot mailing list