[U-Boot] [PATCH V2] net: fec_mxc: allow use with cache enabled

Eric Nelson eric.nelson at boundarydevices.com
Tue Mar 6 18:06:47 CET 2012


On 03/05/2012 01:06 PM, Marek Vasut wrote:
> Dear Eric Nelson,
>
>> 	ensure that transmit and receive buffers are cache-line aligned
>> 	invalidate cache after each packet received
>> 	flush cache before transmitting
>>
>> 	Original patch by Marek:
>> 		http://lists.denx.de/pipermail/u-boot/2012-February/117695.html
>
> Would be cool to Cc me :-p
>

Sorry about that.

> All right, let's review this!
>
>>
>> Signed-off-by: Eric Nelson <eric.nelson at boundarydevices.com>
>> ---
>>   drivers/net/fec_mxc.c |  243 +++++++++++++++++++++++++++++++++++++------------
>>   drivers/net/fec_mxc.h |   19 +----
>>   2 files changed, 184 insertions(+), 78 deletions(-)
>>
>> diff --git a/drivers/net/fec_mxc.c b/drivers/net/fec_mxc.c
>> index 1fdd071..0e35377 100644
>> --- a/drivers/net/fec_mxc.c
>> +++ b/drivers/net/fec_mxc.c
>> @@ -50,6 +50,33 @@ DECLARE_GLOBAL_DATA_PTR;
>>   #define	CONFIG_FEC_MXC_SWAP_PACKET
>>   #endif
>>
>> +#ifndef	CONFIG_FEC_DESC_ALIGNMENT
>> +#define	CONFIG_FEC_DESC_ALIGNMENT	ARCH_DMA_MINALIGN
>> +#endif
>> +
>> +#ifndef	CONFIG_FEC_DATA_ALIGNMENT
>> +#define	CONFIG_FEC_DATA_ALIGNMENT	ARCH_DMA_MINALIGN
>> +#endif
>
> The above should be the bigger of CONFIG_SYS_CACHELINE_SIZE and ARCH_DMA_MINALIGN
> (use max()).
>
> Eventually, we should unify cache and DMA stuff and make only one macro (ALIGN)
> and be done with it.
>
> btw. I can't seem to find a platform using different alignment for DATA and
> DESC, let's make it one ( ... #define CONFIG_FEC_ALIGN max(ARCH_DMA_MINALIGN,
> CONFIG_SYS_CACHELINE_SIZE), ok?
>

I was thinking the same thing but concerned that I missed something.
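
Something like this, perhaps? Just a sketch, assuming the max() macro
from include/common.h (note the preprocessor can't evaluate max() in
the compile-time #if checks, but it's fine as an argument to
memalign() or roundup()):

	#ifndef	CONFIG_FEC_ALIGN
	#define	CONFIG_FEC_ALIGN	max(ARCH_DMA_MINALIGN, CONFIG_SYS_CACHELINE_SIZE)
	#endif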

>> +
>> +/* Check various alignment issues at compile time */
>> +#if ((CONFIG_FEC_DESC_ALIGNMENT < 16) || (CONFIG_FEC_DESC_ALIGNMENT % 16 != 0))
>> +#error	"CONFIG_FEC_DESC_ALIGNMENT must be multiple of 16!"
>> +#endif
>> +
>> +#if ((CONFIG_FEC_DATA_ALIGNMENT < 16) || (CONFIG_FEC_DATA_ALIGNMENT % 16 != 0))
>> +#error	"CONFIG_FEC_DATA_ALIGNMENT must be multiple of 16!"
>> +#endif
>> +
>> +#if ((PKTALIGN < CONFIG_FEC_DATA_ALIGNMENT) || \
>> +	(PKTALIGN % CONFIG_FEC_DATA_ALIGNMENT != 0))
>> +#error	"PKTALIGN must be multiple of CONFIG_FEC_DATA_ALIGNMENT!"
>> +#endif
>> +
>> +#if ((PKTSIZE_ALIGN < CONFIG_FEC_DATA_ALIGNMENT) || \
>> +	(PKTSIZE_ALIGN % CONFIG_FEC_DATA_ALIGNMENT != 0))
>> +#error	"PKTSIZE_ALIGN must be multiple of CONFIG_FEC_DATA_ALIGNMENT!"
>> +#endif
>> +
>>   #undef DEBUG
>>
>>   struct nbuf {
>> @@ -259,43 +286,52 @@ static int fec_tx_task_disable(struct fec_priv *fec)
>>    * Initialize receive task's buffer descriptors
>>    * @param[in] fec all we know about the device yet
>>    * @param[in] count receive buffer count to be allocated
>> - * @param[in] size size of each receive buffer
>> + * @param[in] dsize desired size of each receive buffer
>>    * @return 0 on success
>>    *
>>    * For this task we need additional memory for the data buffers. And each
>>    * data buffer requires some alignment. Thy must be aligned to a specific
>> - * boundary each (DB_DATA_ALIGNMENT).
>> + * boundary each.
>>    */
>> -static int fec_rbd_init(struct fec_priv *fec, int count, int size)
>> +static int fec_rbd_init(struct fec_priv *fec, int count, int dsize)
>>   {
>> -	int ix;
>> -	uint32_t p = 0;
>> -
>> -	/* reserve data memory and consider alignment */
>> -	if (fec->rdb_ptr == NULL)
>> -		fec->rdb_ptr = malloc(size * count + DB_DATA_ALIGNMENT);
>> -	p = (uint32_t)fec->rdb_ptr;
>> -	if (!p) {
>> -		puts("fec_mxc: not enough malloc memory\n");
>> -		return -ENOMEM;
>> -	}
>> -	memset((void *)p, 0, size * count + DB_DATA_ALIGNMENT);
>> -	p += DB_DATA_ALIGNMENT-1;
>> -	p &= ~(DB_DATA_ALIGNMENT-1);
>> -
>> -	for (ix = 0; ix < count; ix++) {
>> -		writel(p, &fec->rbd_base[ix].data_pointer);
>> -		p += size;
>> -		writew(FEC_RBD_EMPTY, &fec->rbd_base[ix].status);
>> -		writew(0, &fec->rbd_base[ix].data_length);
>> -	}
>> +	uint32_t size;
>> +	int i;
>> +
>>   	/*
>> -	 * mark the last RBD to close the ring
>> +	 * Allocate memory for the buffers. This allocation respects the
>> +	 * alignment
>>   	 */
>> -	writew(FEC_RBD_WRAP | FEC_RBD_EMPTY, &fec->rbd_base[ix - 1].status);
>> +	size = roundup(dsize, CONFIG_FEC_DATA_ALIGNMENT);
>> +	for (i = 0; i < count; i++) {
>> +		uint32_t data_ptr = readl(&fec->rbd_base[i].data_pointer);
>> +		if (0 == data_ptr) {
>
> We should fix this yoda condition while at it.
>

Wise, Yoda was...

I've been in the habit of putting constants on the left ever since reading
"Code Complete" many years ago.

That said, I'll swap it in V3.
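
i.e. something like this, matching the !data test used a few lines below:

		if (!data_ptr) {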

>> +			uint8_t *data = memalign(CONFIG_FEC_DATA_ALIGNMENT,
>> +						 size);
>
> NOTE[1]: This is good, each RECEIVE DATA BUFFER is properly aligned at
> allocation time and therefore IS CACHE SAFE.
>
>> +			if (!data) {
>> +				printf("%s: error allocating rxbuf %d\n",
>> +				       __func__, i);
>> +				goto err;
>> +			}
>> +			writel((uint32_t)data, &fec->rbd_base[i].data_pointer);
>
> I think this "writel()" call is bogus and should be removed in a subsequent
> patch and replaced with a simple assignment. It was here probably due to cache
> issues on PPC?
>

The RBDs have me puzzled. Do we treat them like registers and use readx/writex
or like in-RAM data structures...
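
i.e. one of these two (the second is a plain store; if we went that way
we'd probably want the descriptor fields marked volatile, or an explicit
barrier, so the compiler doesn't reorder or elide the accesses):

	writel((uint32_t)data, &fec->rbd_base[i].data_pointer);
	fec->rbd_base[i].data_pointer = (uint32_t)data;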

>> +		} /* needs allocation */
>> +		writew(FEC_RBD_EMPTY, &fec->rbd_base[i].status);
>> +		writew(0, &fec->rbd_base[i].data_length);
>> +	}
>> +
>> +	/* Mark the last RBD to close the ring. */
>> +	writew(FEC_RBD_WRAP | FEC_RBD_EMPTY, &fec->rbd_base[i - 1].status);
>>   	fec->rbd_index = 0;
>>
>>   	return 0;
>> +
>> +err:
>> +	for (; i >= 0; i--) {
>> +		uint32_t data_ptr = readl(&fec->rbd_base[i].data_pointer);
>> +		free((void *)data_ptr);
>> +	}
>> +
>> +	return -ENOMEM;
>>   }
>>
>>   /**
>> @@ -387,12 +423,25 @@ static int fec_open(struct eth_device *edev)
>>   {
>>   	struct fec_priv *fec = (struct fec_priv *)edev->priv;
>>   	int speed;
>> +	uint32_t addr, size;
>> +	int i;
>>
>>   	debug("fec_open: fec_open(dev)\n");
>>   	/* full-duplex, heartbeat disabled */
>>   	writel(1 << 2, &fec->eth->x_cntrl);
>>   	fec->rbd_index = 0;
>>
>> +	/* Invalidate all descriptors */
>> +	for (i = 0; i < FEC_RBD_NUM - 1; i++)
>> +		fec_rbd_clean(0, &fec->rbd_base[i]);
>> +	fec_rbd_clean(1, &fec->rbd_base[i]);
>> +
>> +	/* Flush the descriptors into RAM */
>> +	size = roundup(FEC_RBD_NUM * sizeof(struct fec_bd),
>> +			CONFIG_FEC_DATA_ALIGNMENT);
>> +	addr = (uint32_t)fec->rbd_base;
>> +	flush_dcache_range(addr, addr + size);
>> +
>>   #ifdef FEC_QUIRK_ENET_MAC
>>   	/* Enable ENET HW endian SWAP */
>>   	writel(readl(&fec->eth->ecntrl) | FEC_ECNTRL_DBSWAP,
>> @@ -478,38 +527,44 @@ static int fec_open(struct eth_device *edev)
>>
>>   static int fec_init(struct eth_device *dev, bd_t* bd)
>>   {
>> -	uint32_t base;
>>   	struct fec_priv *fec = (struct fec_priv *)dev->priv;
>>   	uint32_t mib_ptr = (uint32_t)&fec->eth->rmon_t_drop;
>>   	uint32_t rcntrl;
>> -	int i;
>> +	uint32_t size;
>> +	int i, ret;
>>
>>   	/* Initialize MAC address */
>>   	fec_set_hwaddr(dev);
>>
>>   	/*
>> -	 * reserve memory for both buffer descriptor chains at once
>> -	 * Datasheet forces the startaddress of each chain is 16 byte
>> -	 * aligned
>> +	 * Allocate transmit descriptors, there are two in total. This
>> +	 * allocation respects cache alignment.
>>   	 */
>> -	if (fec->base_ptr == NULL)
>> -		fec->base_ptr = malloc((2 + FEC_RBD_NUM) *
>> -				sizeof(struct fec_bd) + DB_ALIGNMENT);
>> -	base = (uint32_t)fec->base_ptr;
>> -	if (!base) {
>> -		puts("fec_mxc: not enough malloc memory\n");
>> -		return -ENOMEM;
>> +	if (!fec->tbd_base) {
>> +		size = roundup(2 * sizeof(struct fec_bd),
>> +				CONFIG_FEC_DATA_ALIGNMENT);
>> +		fec->tbd_base = memalign(CONFIG_FEC_DESC_ALIGNMENT, size);
>
> NOTE[2]: This allocates exactly 2 * 8 bytes == 2 * sizeof(struct fec_bd). This
> means on systems with a cacheline size of 32 bytes, we need to be especially careful.
> This matter is handled by the up-alignment of this data, so this part IS CACHE
> SAFE.
>
>> +		if (!fec->tbd_base) {
>> +			ret = -ENOMEM;
>> +			goto err1;
>> +		}
>> +		memset(fec->tbd_base, 0, size);
>
> We might want to flush the descriptors to memory after they have been inited?
>

Good catch.

>>   	}
>> -	memset((void *)base, 0, (2 + FEC_RBD_NUM) *
>> -			sizeof(struct fec_bd) + DB_ALIGNMENT);
>> -	base += (DB_ALIGNMENT-1);
>> -	base &= ~(DB_ALIGNMENT-1);
>> -
>> -	fec->rbd_base = (struct fec_bd *)base;
>>
>> -	base += FEC_RBD_NUM * sizeof(struct fec_bd);
>> -
>> -	fec->tbd_base = (struct fec_bd *)base;
>> +	/*
>> +	 * Allocate receive descriptors. This allocation respects cache
>> +	 * alignment.
>> +	 */
>> +	if (!fec->rbd_base) {
>> +		size = roundup(FEC_RBD_NUM * sizeof(struct fec_bd),
>> +				CONFIG_FEC_DATA_ALIGNMENT);
>> +		fec->rbd_base = memalign(CONFIG_FEC_DESC_ALIGNMENT, size);
>
> NOTE[3]: This allocates exactly 64 * 8 bytes == 64 * sizeof(struct fec_bd). This
> means on systems with a cacheline size of 32 bytes, we need to be especially careful.
> This matter is handled by the up-alignment of this data, so this part IS CACHE
> SAFE.
>
>> +		if (!fec->rbd_base) {
>> +			ret = -ENOMEM;
>> +			goto err2;
>> +		}
>> +		memset(fec->rbd_base, 0, size);
>> +	}
>
> We might want to flush the descriptors to memory after they have been inited?
>

Again, good catch.

On this topic (initialization of the RBDs), I had a bit of a concern
about when this initialization happens.

In fec_open, the receive buffer descriptors are initialized and the
last one is set to 'wrap'. If this code were to execute while the
controller is live, bad things would surely happen.

I traced through all of the paths I can see, and I believe that
we're safe. It appears that fec_halt() will be called prior to
any call to fec_init() and fec_open().

In fec_open() a number of calls to fec_rbd_clean() are made and
a single flush_dcache() is made afterwards.

While this works and causes less thrashing than per-RBD flushes,
I think the code would be simpler if fec_rbd_clean just did the
flush itself. This would also remove the need for a separate
flush in fec_recv.
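
Roughly like this (just a sketch; rounding the descriptor address down
to its cacheline is my assumption about how the flush would be done):

	static void fec_rbd_clean(int last, struct fec_bd *prbd)
	{
		unsigned short flags = FEC_RBD_EMPTY;
		uint32_t addr;

		if (last)
			flags |= FEC_RBD_WRAP;
		writew(flags, &prbd->status);
		writew(0, &prbd->data_length);

		/* flush the cacheline holding this descriptor out to RAM */
		addr = (uint32_t)prbd & ~(CONFIG_FEC_DATA_ALIGNMENT - 1);
		flush_dcache_range(addr, addr + CONFIG_FEC_DATA_ALIGNMENT);
	}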

>>
>>   	/*
>>   	 * Set interrupt mask register
>> @@ -570,9 +625,8 @@ static int fec_init(struct eth_device *dev, bd_t* bd)
>>   	 * Initialize RxBD/TxBD rings
>>   	 */
>>   	if (fec_rbd_init(fec, FEC_RBD_NUM, FEC_MAX_PKT_SIZE) < 0) {
>> -		free(fec->base_ptr);
>> -		fec->base_ptr = NULL;
>> -		return -ENOMEM;
>> +		ret = -ENOMEM;
>> +		goto err3;
>>   	}
>>   	fec_tbd_init(fec);
>>
>> @@ -583,6 +637,13 @@ static int fec_init(struct eth_device *dev, bd_t* bd)
>>   #endif
>>   	fec_open(dev);
>>   	return 0;
>> +
>> +err3:
>> +	free(fec->rbd_base);
>> +err2:
>> +	free(fec->tbd_base);
>> +err1:
>> +	return ret;
>>   }
>>
>
> #####
> To sum it up, so far, we have allocated and properly aligned both the
> descriptors and the receive data buffers.
>
> Important remark: there will likely be more descriptors in one cacheline!
> #####
>
>>   /**
>> @@ -631,9 +692,11 @@ static void fec_halt(struct eth_device *dev)
>>    * @param[in] length Data count in bytes
>>    * @return 0 on success
>>    */
>> -static int fec_send(struct eth_device *dev, volatile void* packet, int length)
>> +static int fec_send(struct eth_device *dev, void *packet, int length)
>>   {
>>   	unsigned int status;
>> +	uint32_t size;
>> +	uint32_t addr;
>>
>>   	/*
>>   	 * This routine transmits one frame.  This routine only accepts
>> @@ -650,15 +713,21 @@ static int fec_send(struct eth_device *dev, volatile void* packet, int length)
>>   	}
>>
>>   	/*
>> -	 * Setup the transmit buffer
>> -	 * Note: We are always using the first buffer for transmission,
>> -	 * the second will be empty and only used to stop the DMA engine
>> +	 * Setup the transmit buffer. We are always using the first buffer for
>> +	 * transmission, the second will be empty and only used to stop the DMA
>> +	 * engine. We also flush the packet to RAM here to avoid cache trouble.
>>   	 */
>>   #ifdef	CONFIG_FEC_MXC_SWAP_PACKET
>>   	swap_packet((uint32_t *)packet, length);
>>   #endif
>> +
>> +	addr = (uint32_t)packet;
>> +	size = roundup(length, CONFIG_FEC_DATA_ALIGNMENT);
>> +	flush_dcache_range(addr, addr + size);
>> +
>
> ERROR[1]: The above seems wrong, it might flush more than necessary. Drop the
> roundup() call here so flush_dcache_range() can scream about cache issues and so
> we can find them.
>

I did this and found that all of the buffers passed in for transmission
are aligned to PKTALIGN (32 bytes).
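
So the transmit path can flush the exact range, something like:

	addr = (uint32_t)packet;
	flush_dcache_range(addr, addr + length);

and let flush_dcache_range() complain if a caller ever hands us a
misaligned buffer.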

>>   	writew(length, &fec->tbd_base[fec->tbd_index].data_length);
>> -	writel((uint32_t)packet, &fec->tbd_base[fec->tbd_index].data_pointer);
>> +	writel(addr, &fec->tbd_base[fec->tbd_index].data_pointer);
>> +
>>   	/*
>>   	 * update BD's status now
>>   	 * This block:
>> @@ -672,16 +741,30 @@ static int fec_send(struct eth_device *dev, volatile void* packet, int length)
>>   	writew(status, &fec->tbd_base[fec->tbd_index].status);
>>
>>   	/*
>> +	 * Flush data cache. This code flushes both TX descriptors to RAM.
>> +	 * After this code this code, the descritors will be safely in RAM
>
> Please fix "this code this code".
>
Done. Also 'descritors'.

>> +	 * and we can start DMA.
>> +	 */
>> +	size = roundup(2 * sizeof(struct fec_bd), CONFIG_FEC_DATA_ALIGNMENT);
>> +	addr = (uint32_t)fec->tbd_base;
>> +	flush_dcache_range(addr, addr + size);
>
> NOTE[4]: This is correct, we must flush both TX descriptors here.
>
>> +
>> +	/*
>>   	 * Enable SmartDMA transmit task
>>   	 */
>>   	fec_tx_task_enable(fec);
>>
>>   	/*
>> -	 * wait until frame is sent .
>> +	 * Wait until frame is sent. On each turn of the wait cycle, we must
>> +	 * invalidate data cache to see what's really in RAM. Also, we need
>> +	 * barrier here.
>>   	 */
>> +	invalidate_dcache_range(addr, addr + size);
>>   	while (readw(&fec->tbd_base[fec->tbd_index].status) & FEC_TBD_READY) {
>>   		udelay(1);
>> +		invalidate_dcache_range(addr, addr + size);
>>   	}
>
> NOTE[5]: This is correct, we must invalidate cache every time we want to read
> a fresh version of the descriptor.
>
>> +
>>   	debug("fec_send: status 0x%x index %d\n",
>>   			readw(&fec->tbd_base[fec->tbd_index].status),
>>   			fec->tbd_index);
>> @@ -707,6 +790,8 @@ static int fec_recv(struct eth_device *dev)
>>   	int frame_length, len = 0;
>>   	struct nbuf *frame;
>>   	uint16_t bd_status;
>> +	uint32_t addr, size;
>> +	int i;
>>   	uchar buff[FEC_MAX_PKT_SIZE];
>>
>>   	/*
>> @@ -737,8 +822,23 @@ static int fec_recv(struct eth_device *dev)
>>   	}
>>
>>   	/*
>> -	 * ensure reading the right buffer status
>> +	 * Read the buffer status. Before the status can be read, the data cache
>> +	 * must be invalidated, because the data in RAM might have been changed
>> +	 * by DMA. The descriptors are properly aligned to cachelines so there's
>> +	 * no need to worry they'd overlap.
>> +	 *
>> +	 * WARNING: By invalidating the descriptor here, we also invalidate
>> +	 * the descriptors surrounding this one. Therefore we can NOT change the
>> +	 * contents of this descriptor nor the surrounding ones. The problem is
>> +	 * that in order to mark the descriptor as processed, we need to change
>> +	 * the descriptor. The solution is to mark the whole cache line when all
>> +	 * descriptors in the cache line are processed.
>>   	 */
>> +	addr = (uint32_t)rbd;
>> +	addr &= ~(CONFIG_FEC_DATA_ALIGNMENT - 1);
>
> Aka some kind of round_down()
>
>> +	size = roundup(sizeof(struct fec_bd), CONFIG_FEC_DATA_ALIGNMENT);
>
> Aka roundup.
>
>> +	invalidate_dcache_range(addr, addr + size);
>> +
>
> The idea here is the following (demo uses 32byte long cachelines):
>
> [DESC1][DESC2][DESC3][DESC4][DESC5][DESC6]
> `------- cacheline --------'
>
> We want to start retrieving contents of DESC3, therefore "addr" points to DESC1,
> "size" is size of cacheline (I hope there's no hardware with 8 byte cachelines,
> but this should be ok here).
>
> NOTE[5]: Here we can invalidate the whole cacheline, because the descriptors so
> far were modified only by hardware, not by us. We are not writing anything there
> so we won't lose any information.
>
> NOTE[6]: This invalidation ensures that we always have a fresh copy of the
> cacheline containing all the descriptors, therefore we always have a fresh
> status of the descriptors we are about to pick. Since this is a sequential
> execution, the cache eviction should not kick in here (or might it?).
>
>
Another way to look at this:

	After fec_open(), the hardware owns the rbd, and all we should do
	is read it. In order to make sure we don't have a stale copy, we
	need to call invalidate() before looking at the values.

Tracing the code to find out whether this is true, the only write I see
is within fec_recv(): when the last descriptor is full, the driver
takes ownership of **all** of the descriptors, calling fec_rbd_clean()
on each.

The only thing that looks funky is this:

		size = (CONFIG_FEC_ALIGN / sizeof(struct fec_bd)) - 1;
		if ((fec->rbd_index & size) == size) {

Wouldn't a test of rbd_index against FEC_RBD_NUM be more appropriate?
i.e.
		if (fec->rbd_index == FEC_RBD_NUM-1) {

>>   	bd_status = readw(&rbd->status);
>>   	debug("fec_recv: status 0x%x\n", bd_status);
>>
>> @@ -751,6 +851,13 @@ static int fec_recv(struct eth_device *dev)
>>   			frame = (struct nbuf *)readl(&rbd->data_pointer);
>>   			frame_length = readw(&rbd->data_length) - 4;
>
> NOTE[7]: We received a frame, we know how long it is. The frame is loaded into
> the rbd->data_pointer, which IS CACHE ALIGNED.
>
> I detect a problem here that frame_length might overflow, but that's not related
> to caches and might be just false accusation.
>
>>   			/*
>> +			 * Invalidate data cache over the buffer
>> +			 */
>> +			addr = (uint32_t)frame;
>> +			size = roundup(frame_length, CONFIG_FEC_DATA_ALIGNMENT);
>> +			invalidate_dcache_range(addr, addr + size);
>
> NOTE[8]: The roundup here is a valid operation, we can flush up to the size of
> rbd->data_pointer, which is cache aligned and considering frame_length is less
> or equal to the memory allocated for rbd->data_pointer, the invalidation of
> cache here IS SAFE.
>
>> +
>> +			/*
>>   			 *  Fill the buffer and pass it to upper layers
>>   			 */
>>   #ifdef	CONFIG_FEC_MXC_SWAP_PACKET
>> @@ -765,11 +872,27 @@ static int fec_recv(struct eth_device *dev)
>>   						(ulong)rbd->data_pointer,
>>   						bd_status);
>>   		}
>> +
>>   		/*
>> -		 * free the current buffer, restart the engine
>> -		 * and move forward to the next buffer
>> +		 * Free the current buffer, restart the engine and move forward
>> +		 * to the next buffer. Here we check if the whole cacheline of
>> +		 * descriptors was already processed and if so, we mark it free
>> +		 * as whole.
>>   		 */
>
> NOTE[9]: This is the most problematic part, handling the marking of RX
> descriptors with a flag that they are ready again. Obviously, directly writing
> to the descriptor won't work, because:
>
> 1) There are multiple descriptors on a cacheline, therefore by writing one, we'd
> need to flush the whole cacheline back into DRAM immediatelly so the hardware
> knows about it. But that'd mean we can loose some information from the hardware,
> refer to demo before NOTE[5]:
>
> We modify DESC2 and hardware is concurrently changing DESC3. We flush DESC2 and
> the whole cacheline, we lose part of DESC3.
>
> 2) Cache eviction algorithm might flush data for us, therefore causing loss of
> information as well.
>
> The solution is the following:
>
> 1) Compute how many descriptors are per-cache line
> 2) Make sure FEC_RBD_NUM * sizeof(struct fec_bd) is at least 2 *
> CONFIG_FEC_DATA_ALIGNMENT in size, see NOTE[11].
> 3) Once the last RX buffer in the cacheline is processed, mark them all clean
> and flush them all, see NOTE[10].
>
> NOTE[10]: This is legal, because the hardware won't use RX descriptors that it
> marked dirty (which means not picked up by software yet). We clean the
> descriptors in an order the hardware would pick them up again so there's no
> problem with race condition either. The only possible issue here is if there was
> hardware with cacheline size smaller than descriptor size (we should add a check
> for this at the beginning of the file).
>
> NOTE[11]: This is because we want the FEC to overwrite descriptors below the
> other cacheline while we're marking the one containing retrieved descriptors
> clean.
>

Ahah! Now I see what the size calculation is doing.

A well-named constant, maybe "RXDESC_PER_CACHELINE", would be useful here.

	#define RXDESC_PER_CACHELINE	(CONFIG_FEC_ALIGN/sizeof(struct fec_bd))

>> -		fec_rbd_clean(fec->rbd_index == (FEC_RBD_NUM - 1) ? 1 : 0, rbd);
>> +		size = (CONFIG_FEC_DATA_ALIGNMENT / sizeof(struct fec_bd)) - 1;
		size = RXDESC_PER_CACHELINE-1;
>> +		if ((fec->rbd_index & size) == size) {

The line above only works if RXDESC_PER_CACHELINE is a power of 2, which
is likely to hold here, since sizeof(struct fec_bd) == 8 and cacheline
sizes are themselves powers of 2.
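
Because RXDESC_PER_CACHELINE involves sizeof(), the preprocessor can't
verify this in an #if; a negative-array-size trick would work as a
compile-time guard (hypothetical, not part of the patch):

	/* fails to compile if RXDESC_PER_CACHELINE is not a power of 2 */
	typedef char rxdesc_per_cl_is_pow2[
		(RXDESC_PER_CACHELINE & (RXDESC_PER_CACHELINE - 1)) ? -1 : 1];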

>> +			i = fec->rbd_index - size;
>> +			addr = (uint32_t)&fec->rbd_base[i];
>> +			for (; i <= fec->rbd_index + size; i++) {

This cleans too many descriptors! It should be:
			for (; i <= fec->rbd_index; i++) {

>> +				if (i == (FEC_RBD_NUM - 1))
>> +					fec_rbd_clean(1, &fec->rbd_base[i]);
>> +				else
>> +					fec_rbd_clean(0, &fec->rbd_base[i]);
>> +			}
>> +			flush_dcache_range(addr,
>> +				addr + CONFIG_FEC_DATA_ALIGNMENT);
>> +		}
>> +
>>   		fec_rx_task_enable(fec);
>>   		fec->rbd_index = (fec->rbd_index + 1) % FEC_RBD_NUM;
>>   	}
>> diff --git a/drivers/net/fec_mxc.h b/drivers/net/fec_mxc.h
>> index 2eb7803..852b2e0 100644
>> --- a/drivers/net/fec_mxc.h
>> +++ b/drivers/net/fec_mxc.h
>> @@ -234,22 +234,6 @@ struct ethernet_regs {
>>   #endif
>>
>>   /**
>> - * @brief Descriptor buffer alignment
>> - *
>> - * i.MX27 requires a 16 byte alignment (but for the first element only)
>> - */
>> -#define DB_ALIGNMENT		16
>> -
>> -/**
>> - * @brief Data buffer alignment
>> - *
>> - * i.MX27 requires a four byte alignment for transmit and 16 bits
>> - * alignment for receive so take 16
>> - * Note: Valid for member data_pointer in struct buffer_descriptor
>> - */
>> -#define DB_DATA_ALIGNMENT	16
>> -
>> -/**
>>    * @brief Receive & Transmit Buffer Descriptor definitions
>>    *
>>    * Note: The first BD must be aligned (see DB_ALIGNMENT)
>> @@ -282,8 +266,7 @@ struct fec_priv {
>>   	struct fec_bd *tbd_base;	/* TBD ring */
>>   	int tbd_index;			/* next transmit BD to write */
>>   	bd_t *bd;
>> -	void *rdb_ptr;
>> -	void *base_ptr;
>> +	uint8_t *tdb_ptr;
>>   	int dev_id;
>>   	int phy_id;
>>   	struct mii_dev *bus;
> Uh, this was tough.

How bad do we want cache?

