[U-Boot] [PATCH] Davinci: SPI performance enhancements

Mon Jun 21 16:41:13 CEST 2010

> 
> The following restructuring and optimisations increase the SPI
> read performance from 1.3MiB/s (on da850) to 2.87MiB/s (on da830):
> 
> Remove continual re-evaluation of driver state from the core of the
> copy loop. State cannot change during the copy loop, so it is
> possible to move these evaluations to before the copy loop.
> 
> The cost is more code space, as loop variants are required for each
> set of possible configurations. The loops are simpler, however, so
> the extra is only 128 bytes on da830 with CONFIG_SPI_HALF_DUPLEX
> defined.
> 
> Unrolling the first copy loop iteration allows the TX buffer to be
> pre-loaded, reducing SPI clock starvation.
> 
> Unrolling the last copy loop iteration removes testing for the
> final loop iteration every time round the loop.
> 
> Using the RX buffer empty flag as a transfer throttle allows the
> assumption that it is always safe to write to the TX buffer, so
> polling of TX buffer full flag can be removed.
> 
> Signed-off-by: Nick Thompson <nick.thompson at ge.com>
> ---
> da850 and da830 are similar devices. The SPI module is common to
> both, but da850 uses DDR and da830 uses SDRAM. The EVMs might
> not actually be comparable, but they appear to be at least similar.
> 
> The speed was tested with an 8MiB transfer from SPI FLASH using:
> 
> sf read 0xc0008000 0 0x800000
> 
>  drivers/spi/davinci_spi.c |  195 +++++++++++++++++++++++++++++-----------

This patch does not apply to Wolfgang's next branch.
The patch should be generated against u-boot/next.

> ----
>  1 files changed, 128 insertions(+), 67 deletions(-)
> 
> diff --git a/drivers/spi/davinci_spi.c b/drivers/spi/davinci_spi.c
> index 08f837b..4518ecb 100644
> --- a/drivers/spi/davinci_spi.c
> +++ b/drivers/spi/davinci_spi.c
> @@ -66,7 +66,7 @@ void spi_free_slave(struct spi_slave *slave)
>  int spi_claim_bus(struct spi_slave *slave)
>  {
>  	struct davinci_spi_slave *ds = to_davinci_spi(slave);
> -	unsigned int scalar, data1_reg_val = 0;
> +	unsigned int scalar;
> 
>  	/* Enable the SPI hardware */
>  	writel(SPIGCR0_SPIRST_MASK, &ds->regs->gcr0);
> @@ -93,11 +93,6 @@ int spi_claim_bus(struct spi_slave *slave)
>  	writel(8 | (scalar << SPIFMT_PRESCALE_SHIFT) |
>  		(1 << SPIFMT_PHASE_SHIFT), &ds->regs->fmt0);
> 
> -	/* hold cs active at end of transfer until explicitly de-asserted */
> -	data1_reg_val = (1 << SPIDAT1_CSHOLD_SHIFT) |
> -			(slave->cs << SPIDAT1_CSNR_SHIFT);
> -	writel(data1_reg_val, &ds->regs->dat1);
> -
>  	/*
>  	 * Including a minor delay. No science here. Should be good even
> with
>  	 * no delay
> @@ -113,8 +108,7 @@ int spi_claim_bus(struct spi_slave *slave)
>  	writel(0, &ds->regs->lvl);
> 
>  	/* enable SPI */
> -	writel((readl(&ds->regs->gcr1) |
> -		SPIGCR1_SPIENA_MASK), &ds->regs->gcr1);
> +	writel((readl(&ds->regs->gcr1) | SPIGCR1_SPIENA_MASK), &ds->regs-
> >gcr1);
> 
>  	return 0;
>  }
> @@ -127,14 +121,125 @@ void spi_release_bus(struct spi_slave *slave)
>  	writel(SPIGCR0_SPIRST_MASK, &ds->regs->gcr0);
>  }
> 
> +/*
> + * This function needs to act like a macro to avoid pipeline reloads in
> the
> + * loops below. Use always_inline. This gains us about 160KiB/s and the
> bloat
> + * appears to be zero bytes (da830).
> + */
> +__attribute__((always_inline))
> +static inline u32 davinci_spi_xfer_data(struct davinci_spi_slave *ds, u32
> data)
> +{
> +	u32	buf_reg_val;
> +
> +	/* send out data */
> +	writel(data, &ds->regs->dat1);
> +
> +	/* wait for the data to clock in/out */
> +	while ((buf_reg_val = readl(&ds->regs->buf)) & SPIBUF_RXEMPTY_MASK)
> +		;
> +
> +	return buf_reg_val;
> +}
> +
> +static int davinci_spi_read(struct spi_slave *slave, unsigned int len,
> +			    u8 *rxp, unsigned long flags)
> +{
> +	struct davinci_spi_slave *ds = to_davinci_spi(slave);
> +	unsigned int data1_reg_val;
> +
> +	/* enable CS hold, CS[n] and clear the data bits */
> +	data1_reg_val = ((1 << SPIDAT1_CSHOLD_SHIFT) |
> +			 (slave->cs << SPIDAT1_CSNR_SHIFT));
> +
> +	/* wait till TXFULL is deasserted */
> +	while (readl(&ds->regs->buf) & SPIBUF_TXFULL_MASK)
> +		;
> +
> +	/* preload the TX buffer to avoid clock starvation */
> +	writel(data1_reg_val, &ds->regs->dat1);
> +
> +	/* keep reading 1 byte until only 1 byte left */
> +	while ((len--) > 1)
> +		*rxp++ = davinci_spi_xfer_data(ds, data1_reg_val);
> +
> +	/* clear CS hold when we reach the end */
> +	if (flags & SPI_XFER_END)
> +		data1_reg_val &= ~(1 << SPIDAT1_CSHOLD_SHIFT);
> +
> +	/* read the last byte */
> +	*rxp = davinci_spi_xfer_data(ds, data1_reg_val);
> +
> +	return 0;
> +}
> +
> +static int davinci_spi_write(struct spi_slave *slave, unsigned int len,
> +		const u8 *txp, unsigned long flags)
> +{
> +	struct davinci_spi_slave *ds = to_davinci_spi(slave);
> +	unsigned int data1_reg_val;
> +
> +	/* enable CS hold and clear the data bits */
> +	data1_reg_val = ((1 << SPIDAT1_CSHOLD_SHIFT) |
> +			 (slave->cs << SPIDAT1_CSNR_SHIFT));
> +
> +	/* wait till TXFULL is deasserted */
> +	while (readl(&ds->regs->buf) & SPIBUF_TXFULL_MASK)
> +		;
> +
> +	/* preload the TX buffer to avoid clock starvation */
> +	if (len > 2) {
> +		writel(data1_reg_val | *txp++, &ds->regs->dat1);
> +		len--;
> +	}
> +
> +	/* keep writing 1 byte until only 1 byte left */
> +	while ((len--) > 1)
> +		davinci_spi_xfer_data(ds, data1_reg_val | *txp++);
> +
> +	/* clear CS hold when we reach the end */
> +	if (flags & SPI_XFER_END)
> +		data1_reg_val &= ~(1 << SPIDAT1_CSHOLD_SHIFT);
> +
> +	/* write the last byte */
> +	davinci_spi_xfer_data(ds, data1_reg_val | *txp);
> +
> +	return 0;
> +}
> +
> +#ifndef CONFIG_SPI_HALF_DUPLEX
> +static int davinci_spi_read_write(struct spi_slave *slave, unsigned int
> len,
> +				  u8 *rxp, const u8 *txp, unsigned long flags)
> +{
> +	struct davinci_spi_slave *ds = to_davinci_spi(slave);
> +	unsigned int data1_reg_val;
> +
> +	/* enable CS hold and clear the data bits */
> +	data1_reg_val = ((1 << SPIDAT1_CSHOLD_SHIFT) |
> +			 (slave->cs << SPIDAT1_CSNR_SHIFT));
> +
> +	/* wait till TXFULL is deasserted */
> +	while (readl(&ds->regs->buf) & SPIBUF_TXFULL_MASK)
> +		;
> +
> +	/* keep reading and writing 1 byte until only 1 byte left */
> +	while ((len--) > 1)
> +		*rxp++ = davinci_spi_xfer_data(ds, data1_reg_val | *txp++);
> +
> +	/* clear CS hold when we reach the end */
> +	if (flags & SPI_XFER_END)
> +		data1_reg_val &= ~(1 << SPIDAT1_CSHOLD_SHIFT);
> +
> +	/* read and write the last byte */
> +	*rxp = davinci_spi_xfer_data(ds, data1_reg_val | *txp);
> +
> +	return 0;
> +}
> +#endif
> +
>  int spi_xfer(struct spi_slave *slave, unsigned int bitlen,
>  		const void *dout, void *din, unsigned long flags)
>  {
> -	struct davinci_spi_slave *ds = to_davinci_spi(slave);
> -	unsigned int	len, data1_reg_val = readl(&ds->regs->dat1);
> -	unsigned int	i_cnt = 0, o_cnt = 0, buf_reg_val;
> -	const u8	*txp = dout; /* dout can be NULL for read operation */
> -	u8		*rxp = din;  /* din can be NULL for write operation */
> +	unsigned int len;
> 
>  	if (bitlen == 0)
>  		/* Finish any previously submitted transfers */
> @@ -154,63 +259,19 @@ int spi_xfer(struct spi_slave *slave, unsigned int
> bitlen,
> 
>  	len = bitlen / 8;
> 
> -	/* do an empty read to clear the current contents */
> -	readl(&ds->regs->buf);
> -
> -	/* keep writing and reading 1 byte until done */
> -	while ((i_cnt < len) || (o_cnt < len)) {
> -		/* read RX buffer and flags */
> -		buf_reg_val = readl(&ds->regs->buf);
> -
> -		/* if data is available */
> -		if ((i_cnt < len) &&
> -			(buf_reg_val & SPIBUF_RXEMPTY_MASK) == 0) {
> -			/*
> -			 * If there is no read buffer simply
> -			 * ignore the read character
> -			 */
> -			if (rxp)
> -				*rxp++ = buf_reg_val & 0xFF;
> -			/* increment read words count */
> -			i_cnt++;
> -		}
> -
> -		/*
> -		 * if the tx buffer is empty and there
> -		 * is still data to transmit
> -		 */
> -		if ((o_cnt < len) &&
> -			((buf_reg_val & SPIBUF_TXFULL_MASK) == 0)) {
> -			/* write the data */
> -			data1_reg_val &= ~0xFFFF;
> -			if (txp)
> -				data1_reg_val |= *txp++;
> -			/*
> -			 * Write to DAT1 is required to keep
> -			 * the serial transfer going.
> -			 * We just terminate when we reach the end.
> -			 */
> -			if ((o_cnt == (len - 1)) && (flags & SPI_XFER_END)) {
> -				/* clear CS hold */
> -				writel(data1_reg_val &
> -						~(1 << SPIDAT1_CSHOLD_SHIFT),
> -						&ds->regs->dat1);
> -			} else {
> -				/* enable CS hold and write TX register */
> -				data1_reg_val |= ((1 << SPIDAT1_CSHOLD_SHIFT) |
> -					(slave->cs << SPIDAT1_CSNR_SHIFT));
> -				writel(data1_reg_val, &ds->regs->dat1);
> -			}
> -			/* increment written words count */
> -			o_cnt++;
> -		}
> -	}
> -	return 0;
> +	if (!dout)
> +		return davinci_spi_read(slave, len, din, flags);
> +	else if (!din)
> +		return davinci_spi_write(slave, len, dout, flags);
> +#ifndef CONFIG_SPI_HALF_DUPLEX
> +	else
> +		return davinci_spi_read_write(slave, len, din, dout, flags);
> +#endif
> 
>  out:
>  	if (flags & SPI_XFER_END) {
> -		writel(data1_reg_val &
> -			~(1 << SPIDAT1_CSHOLD_SHIFT), &ds->regs->dat1);
> +		u8 dummy = 0;
> +		davinci_spi_write(slave, 1, &dummy, flags);
>  	}
>  	return 0;
>  }
> --
> 1.7.0.4