[PATCH v1 01/10] mips: octeon: Initial minimal support for the Marvell Octeon SoC

Daniel Schwierzeck daniel.schwierzeck at gmail.com
Thu May 14 01:43:13 CEST 2020



Am 02.05.20 um 10:59 schrieb Stefan Roese:
> From: Aaron Williams <awilliams at marvell.com>
> 
> This patch adds very basic support for the Octeon III SoCs. Only
> CFI parallel NOR flash and UART is supported for now.
> 
> Please note that the basic Octeon port does not include the DDR3/4
> initialization yet. This will be added in some follow-up patches
> later. To still use U-Boot on with this port, the L2 cache (4MiB on
> Octeon III CN73xx) is used as RAM. This way, U-Boot can boot to the
> prompt on such boards.
> 
> Signed-off-by: Aaron Williams <awilliams at marvell.com>
> Signed-off-by: Stefan Roese <sr at denx.de>
> ---
> 
>  MAINTAINERS                                  |    6 +
>  arch/Kconfig                                 |    1 +
>  arch/mips/Kconfig                            |   49 +-
>  arch/mips/Makefile                           |    7 +
>  arch/mips/cpu/Makefile                       |    4 +-
>  arch/mips/include/asm/arch-octeon/cavm-reg.h |   42 +
>  arch/mips/include/asm/arch-octeon/clock.h    |   24 +
>  arch/mips/mach-octeon/Kconfig                |   92 ++
>  arch/mips/mach-octeon/Makefile               |   10 +
>  arch/mips/mach-octeon/clock.c                |   22 +
>  arch/mips/mach-octeon/cpu.c                  |   55 +
>  arch/mips/mach-octeon/dram.c                 |   27 +
>  arch/mips/mach-octeon/include/ioremap.h      |   30 +
>  arch/mips/mach-octeon/start.S                | 1241 ++++++++++++++++++
>  14 files changed, 1608 insertions(+), 2 deletions(-)
>  create mode 100644 arch/mips/include/asm/arch-octeon/cavm-reg.h
>  create mode 100644 arch/mips/include/asm/arch-octeon/clock.h
>  create mode 100644 arch/mips/mach-octeon/Kconfig
>  create mode 100644 arch/mips/mach-octeon/Makefile
>  create mode 100644 arch/mips/mach-octeon/clock.c
>  create mode 100644 arch/mips/mach-octeon/cpu.c
>  create mode 100644 arch/mips/mach-octeon/dram.c
>  create mode 100644 arch/mips/mach-octeon/include/ioremap.h
>  create mode 100644 arch/mips/mach-octeon/start.S
> 

I couldn't completely understand the start.S. There is too much stuff in
it for an initial merge. But I don't see a hard reason against using the
generic start.S. So the first patch series should only implement the
bare minimum needed to boot from flash, init the boot CPU core, maybe
suspend all other cores and relocate to L2 cache.

I know the current start.S is not really suited yet but I'm working on a
refactoring to add some more hooks which a SoC/CPU can implement. Once
we have your initial patch series and the refactoring in mainline, it
should be possible to gradually add more Octeon stuff like memory init.

Basic idea for refactoring is something like this:

reset:
    - mips_cpu_early_init()       # custom early init, fix errata
    - init CP0 registers, Watch registers
    - mips_cache_disable()        # set K0 CCA to uncached
    - mips_cpu_core_init()        # per CPU core init
                                  # -> generic code issues wait instr.
                                  # -> custom code can do custom init
                                  #    or custom boot protocols
    - mips_cm_map()               # init CM if available
    - mips_cache_init()           # init caches, set K0 CCA to non-coh.
    - mips_sram_init()            # init SRAM, Scratch RAM if avail
    - setup initial stack and global_data
    - debug_uart_init()
    - mips_mem_init()             # init external memory, C env avail.
    - init malloc_f
    - board_init_f()

> +
> +#endif /* __ASM_MACH_OCTEON_IOREMAP_H */
> diff --git a/arch/mips/mach-octeon/start.S b/arch/mips/mach-octeon/start.S
> new file mode 100644
> index 0000000000..acb967201a
> --- /dev/null
> +++ b/arch/mips/mach-octeon/start.S
> @@ -0,0 +1,1241 @@
> +/* SPDX-License-Identifier: GPL-2.0+ */
> +/*
> + *  Startup Code for OCTEON 64-bit CPU-core
> + *
> + *  Copyright (c) 2003	Wolfgang Denk <wd at denx.de>
> + *  Copyright 2004, 2005, 2010 - 2015 Cavium Inc..
> + */
> +
> +#include <asm-offsets.h>
> +#include <config.h>
> +#include <asm/regdef.h>
> +#include <asm/mipsregs.h>
> +#include <asm/asm.h>
> +
> +#define BOOT_VECTOR_NUM_WORDS		8
> +
> +#define OCTEON_BOOT_MOVEABLE_MAGIC_OFFSET	0x70
> +#define OCTEON_BOOT_VECTOR_MOVEABLE_OFFSET	0x78
> +
> +#define OCTEON_BOOT_MOVEABLE_MAGIC1_RAW	0xdb00110ad358eacd
> +#define OCTEON_BOOT_MOVEABLE_MAGIC1	OCTEON_BOOT_MOVEABLE_MAGIC1_RAW
> +
> +#define OCTEON_CIU_SOFT_RST		0x8001070000000740
> +
> +#define	OCTEON_L2C_WPAR_PP0		0x8001180080840000
> +#define OCTEON_MIO_BOOT_BASE		0x8001180000000000
> +#define OCTEON_MIO_BOOT_REG_CFG0_OFF	0x0000
> +#define OCTEON_MIO_BOOT_LOC_CFG0_OFF	0x0080
> +#define OCTEON_MIO_BOOT_LOC_ADR_OFF	0x0090
> +#define OCTEON_MIO_BOOT_LOC_DAT_OFF	0x0098
> +#define	OCTEON_MIO_RST_BOOT		0x8001180000001600
> +#define OCTEON_MIO_BOOT_REG_CFG0	0x8001180000000000
> +#define	OCTEON_MIO_BOOT_REG_TIM0	0x8001180000000040
> +#define OCTEON_MIO_BOOT_LOC_CFG0	0x8001180000000080
> +#define OCTEON_MIO_BOOT_LOC_ADR		0x8001180000000090
> +#define OCTEON_MIO_BOOT_LOC_DAT		0x8001180000000098
> +#define	OCTEON_MIO_FUSE_DAT3		0x8001180000001418
> +#define OCTEON_L2D_FUS3			0x80011800800007B8
> +#define	OCTEON_LMC0_DDR_PLL_CTL		0x8001180088000258
> +
> +#define OCTEON_RST			0x8001180006000000
> +#define OCTEON_RST_BOOT_OFFSET		0x1600
> +#define OCTEON_RST_SOFT_RST_OFFSET	0x1680
> +#define OCTEON_RST_COLD_DATAX_OFFSET(X)	(0x17C0 + (X) * 8)
> +#define OCTEON_RST_BOOT			0x8001180006001600
> +#define OCTEON_RST_SOFT_RST		0x8001180006001680
> +#define OCTEON_RST_COLD_DATAX(X)	(0x80011800060017C0 + (X) * 8)
> +
> +#define OCTEON_OCX_COM_NODE		0x8001180011000000
> +#define OCTEON_L2C_OCI_CTL		0x8001180080800020
> +#define OCTEON_L2C_TAD_CTL		0x8001180080800018
> +#define OCTEON_L2C_CTL			0x8001180080800000
> +
> +#define OCTEON_DBG_DATA			0x80011F00000001E8
> +#define OCTEON_PCI_READ_CMD_E		0x80011F0000001188
> +#define OCTEON_NPEI_DBG_DATA		0x80011F0000008510
> +#define OCTEON_CIU_WDOG(X)		(0x8001070000000500 + (X) * 8)
> +#define OCTEON_CIU_PP_POKE(X)		(0x8001070000000580 + (X) * 8)
> +#define OCTEON_CIU3_WDOG(X)		(0x8001010000020000 + (X) * 8)
> +#define OCTEON_CIU3_PP_POKE(X)		(0x8001010000030000 + (X) * 8)
> +#define OCTEON_OCX_COM_LINKX_CTL(X)	(0x8001180011000020 + (X) * 8)
> +#define OCTEON_SLI_CTL_STATUS		0x80011F0000028570
> +#define OCTEON_GSERX_SCRATCH(X)		(0x8001180090000020 + (X) * 0x1000000)
> +
> +/** PRID for CN56XX */
> +#define OCTEON_PRID_CN56XX		0x04
> +/** PRID for CN52XX */
> +#define OCTEON_PRID_CN52XX		0x07
> +/** PRID for CN63XX */
> +#define OCTEON_PRID_CN63XX		0x90
> +/** PRID for CN68XX */
> +#define OCTEON_PRID_CN68XX		0x91
> +/** PRID for CN66XX */
> +#define OCTEON_PRID_CN66XX		0x92
> +/** PRID for CN61XX */
> +#define OCTEON_PRID_CN61XX		0x93
> +/** PRID for CNF71XX */
> +#define OCTEON_PRID_CNF71XX		0x94
> +/** PRID for CN78XX */
> +#define OCTEON_PRID_CN78XX		0x95
> +/** PRID for CN70XX */
> +#define OCTEON_PRID_CN70XX		0x96
> +/** PRID for CN73XX */
> +#define OCTEON_PRID_CN73XX		0x97
> +/** PRID for CNF75XX */
> +#define OCTEON_PRID_CNF75XX		0x98
> +
> +/* func argument is used to create a  mark, must be unique */
> +#define GETOFFSET(reg, func)	\
> +	.balign	8;		\
> +	bal	func ##_mark;	\
> +	nop;			\
> +	.dword	.;		\
> +func ##_mark:			\
> +	ld	reg, 0(ra);	\
> +	dsubu	reg, ra, reg;
> +
> +#define JAL(func)		\
> +	.balign	8;		\
> +	bal	func ##_mark;	\
> +	 nop;			\
> +	.dword .;		\
> +func ##_mark:			\
> +	ld	t8, 0(ra);	\
> +	dsubu	t8, ra, t8;	\
> +	dla	t9, func;	\
> +	daddu	t9, t9, t8;	\
> +	jalr	t9;		\
> +	 nop;
> +
> +	.set	arch=octeon3
> +	.set	noreorder
> +
> +	.macro uhi_mips_exception
> +	move	k0, t9		# preserve t9 in k0
> +	move	k1, a0		# preserve a0 in k1
> +	li	t9, 15		# UHI exception operation
> +	li	a0, 0		# Use hard register context
> +	sdbbp	1		# Invoke UHI operation
> +	.endm
> +
> +	.macro setup_stack_gd
> +	li	t0, -16
> +	PTR_LI	t1, big_stack_start
> +	and	sp, t1, t0		# force 16 byte alignment
> +	PTR_SUBU \
> +		sp, sp, GD_SIZE		# reserve space for gd
> +	and	sp, sp, t0		# force 16 byte alignment
> +	move	k0, sp			# save gd pointer
> +#if CONFIG_VAL(SYS_MALLOC_F_LEN) && \
> +    !CONFIG_IS_ENABLED(INIT_STACK_WITHOUT_MALLOC_F)
> +	li	t2, CONFIG_VAL(SYS_MALLOC_F_LEN)
> +	PTR_SUBU \
> +		sp, sp, t2		# reserve space for early malloc
> +	and	sp, sp, t0		# force 16 byte alignment
> +#endif
> +	move	fp, sp
> +
> +	/* Clear gd */
> +	move	t0, k0
> +1:
> +	PTR_S	zero, 0(t0)
> +	PTR_ADDIU t0, PTRSIZE
> +	blt	t0, t1, 1b
> +	 nop
> +
> +#if CONFIG_VAL(SYS_MALLOC_F_LEN) && \
> +    !CONFIG_IS_ENABLED(INIT_STACK_WITHOUT_MALLOC_F)
> +	PTR_S	sp, GD_MALLOC_BASE(k0)	# gd->malloc_base offset
> +#endif
> +	.endm
> +
> +/* Saved register usage:
> + * s0:	not used
> + * s1:	not used
> + * s2:	Address U-Boot loaded into in L2 cache
> + * s3:	Start address
> + * s4:	flags
> + *		1:	booting from RAM
> + *		2:	executing out of cache
> + *		4:	booting from flash
> + * s5:	u-boot size (data end - _start)
> + * s6:	offset in flash.
> + * s7:	_start physical address
> + * s8:
> + */
> +
> +ENTRY(_start)
> +	/* U-Boot entry point */
> +	b	reset
> +
> +	/* The above jump instruction/nop are considered part of the
> +	 * bootloader_header_t structure but are not changed when the header is
> +	 * updated.
> +	 */
> +
> +	/* Leave room for bootloader_header_t header at start of binary.  This
> +	 * header is used to identify the board the bootloader is for, what
> +	 * address it is linked at, failsafe/normal, etc.  It also contains a
> +	 * CRC of the entire image.
> +	 */
> +
> +#if defined(CONFIG_ROM_EXCEPTION_VECTORS)
> +	/*
> +	 * Exception vector entry points. When running from ROM, an exception
> +	 * cannot be handled. Halt execution and transfer control to debugger,
> +	 * if one is attached.
> +	 */
> +	.org 0x200
> +	/* TLB refill, 32 bit task */
> +	uhi_mips_exception
> +
> +	.org 0x280
> +	/* XTLB refill, 64 bit task */
> +	uhi_mips_exception
> +
> +	.org 0x300
> +	/* Cache error exception */
> +	uhi_mips_exception
> +
> +	.org 0x380
> +	/* General exception */
> +	uhi_mips_exception
> +
> +	.org 0x400
> +	/* Catch interrupt exceptions */
> +	uhi_mips_exception
> +
> +	.org 0x480
> +	/* EJTAG debug exception */
> +1:	b	1b
> +	 nop
> +
> +	.org 0x500
> +#endif
> +
> +/* Reserve extra space so that when we use the boot bus local memory
> + * segment to remap the debug exception vector we don't overwrite
> + * anything useful
> + */
> +
> +/* Basic exception handler (dump registers) in all ASM.	 When using the TLB for
> + * mapping u-boot C code, we can't branch to that C code for exception handling
> + * (TLB is disabled for some exceptions.
> + */
> +
> +/* RESET/start here */
> +	.balign	8
> +reset:
> +	nop
> +	synci	0(zero)
> +	mfc0	k0, CP0_STATUS
> +	ori	k0, 0x00E0		/* enable 64 bit mode for CSR access */
> +	mtc0	k0, CP0_STATUS
> +
> +	/* Save the address we're booting from, strip off low bits */
> +	bal	1f
> +	 nop
> +1:
> +	move	s3, ra
> +	dins	s3, zero, 0, 12
> +
> +	/* Disable boot bus moveable regions */
> +	PTR_LI	k0, OCTEON_MIO_BOOT_LOC_CFG0
> +	sd	zero, 0(k0)
> +	sd	zero, 8(k0)
> +
> +	/* Disable the watchdog timer
> +	 * First we check if we're running on CN78XX, CN73XX or CNF75XX to see
> +	 * if we use CIU3 or CIU.
> +	 */
> +	mfc0	t0, CP0_PRID
> +	ext	t0, t0, 8, 8
> +	/* Assume CIU */
> +	PTR_LI	t1, OCTEON_CIU_WDOG(0)
> +	PTR_LI	t2, OCTEON_CIU_PP_POKE(0)
> +	blt	t0, OCTEON_PRID_CN78XX, wd_use_ciu
> +	 nop
> +	beq	t0, OCTEON_PRID_CN70XX, wd_use_ciu
> +	 nop
> +	/* Use CIU3 */
> +	PTR_LI	t1, OCTEON_CIU3_WDOG(0)
> +	PTR_LI	t2, OCTEON_CIU3_PP_POKE(0)
> +wd_use_ciu:
> +	sd	zero, 0(t2)		/* Pet the dog */
> +	sd	zero, 0(t1)		/* Disable watchdog timer */
> +
> +	/* Errata: CN76XX has a node ID of 3. change it to zero here.
> +	 * This needs to be done before we relocate to L2 as addresses change
> +	 * For 76XX pass 1.X we need to zero out the OCX_COM_NODE[ID],
> +	 * L2C_OCI_CTL[GKSEGNODE] and CP0 of Root.CvmMemCtl2[KSEGNODE].
> +	 */
> +	mfc0	a4, CP0_PRID
> +	/* Check for 78xx pass 1.x processor ID */
> +	andi	a4, 0xffff
> +	blt	a4, (OCTEON_PRID_CN78XX << 8), 1f
> +	 nop
> +
> +	/* Zero out alternate package for now */
> +	dins	a4, zero, 6, 1
> +	bge	a4, ((OCTEON_PRID_CN78XX << 8) | 0x08), 1f
> +	 nop
> +
> +	/* 78xx or 76xx here, first check for bug #27141 */
> +	PTR_LI	a5, OCTEON_SLI_CTL_STATUS
> +	ld	a6, 0(a5)
> +	andi	a7, a4, 0xff
> +	andi	a6, a6, 0xff
> +
> +	beq	a6, a7, not_bug27141
> +	 nop
> +
> +	/* core 0 proc_id rev_id field does not match SLI_CTL_STATUS rev_id */
> +	/* We just hit bug #27141.  Need to reset the chip and try again */
> +
> +	PTR_LI	a4, OCTEON_RST_SOFT_RST
> +	ori	a5, zero, 0x1	/* set the reset bit */
> +
> +reset_78xx_27141:
> +	sync
> +	synci	0(zero)
> +	cache	9, 0(zero)
> +	sd	a5, 0(a4)
> +	wait
> +	b	reset_78xx_27141
> +	 nop
> +
> +not_bug27141:
> +	/* 76XX pass 1.x has the node number set to 3 */
> +	mfc0	a4, CP0_EBASE
> +	ext	a4, a4, 0, 10
> +	bne	a4, 0x180, 1f	/* Branch if not node 3 core 0 */
> +	 nop
> +
> +	/* Clear OCX_COM_NODE[ID] */
> +	PTR_LI	a5, OCTEON_OCX_COM_NODE
> +	ld	a4, 0(a5)
> +	dins	a4, zero, 0, 2
> +	sd	a4, 0(a5)
> +	ld	zero, 0(a5)
> +
> +	/* Clear L2C_OCI_CTL[GKSEGNODE] */
> +	PTR_LI	a5, OCTEON_L2C_OCI_CTL
> +	ld	a4, 0(a5)
> +	dins	a4, zero, 4, 2
> +	sd	a4, 0(a5)
> +	ld	zero, 0(a5)
> +
> +	/* Clear CP0 Root.CvmMemCtl2[KSEGNODE] */
> +	dmfc0	a4, CP0_CVMMEMCTL2
> +	dins	a4, zero, 12, 2
> +	dmtc0	a4, CP0_CVMMEMCTL2
> +
> +	/* Put the flash address in the start of the EBASE register to
> +	 * enable our exception handler but only for core 0.
> +	 */
> +	mfc0	a4, CP0_EBASE
> +	dext	a4, a4, 0, 10
> +	bnez	a4, no_flash
> +	/* OK in delay slot */
> +	dext	a6, a6, 0, 16		/* Get the base address in flash */
> +	sll	a6, a6, 16
> +	mtc0	a6, CP0_EBASE	/* Enable exceptions */
> +
> +no_flash:
> +	/* Zero out various registers */
> +	mtc0	zero, CP0_DEPC
> +	mtc0	zero, CP0_EPC
> +	mtc0	zero, CP0_CAUSE
> +	mfc0	a4, CP0_PRID
> +	ext	a4, a4, 8, 8
> +	mtc0	zero, CP0_DESAVE
> +
> +	/* The following are only available on Octeon 2 or later */
> +	mtc0	zero, CP0_KSCRATCH1
> +	mtc0	zero, CP0_KSCRATCH2
> +	mtc0	zero, CP0_KSCRATCH3
> +	mtc0	zero, CP0_USERLOCAL
> +
> +	/* Turn off ROMEN bit to disable ROM */
> +	PTR_LI	a1, OCTEON_MIO_RST_BOOT
> +	/* For OCTEON 3 we use RST_BOOT instead of MIO_RST_BOOT.
> +	 * The difference is bits 24-26 are 6 instead of 0 for the address.
> +	 */
> +	/* For Octeon 2 and CN70XX we can ignore the watchdog */
> +	blt	a4, OCTEON_PRID_CN78XX, watchdog_ok
> +	 nop
> +
> +	PTR_LI	a1, OCTEON_RST_BOOT
> +
> +	beq	a4, OCTEON_PRID_CN70XX, watchdog_ok
> +	 nop
> +
> +	ld	a2, 0(a1)
> +	/* There is a bug where some registers don't get properly reset when
> +	 * the watchdog timer causes a reset.  In this case we need to force
> +	 * a reset.
> +	 */
> +	bbit0	a2, 11, watchdog_ok	/* Skip if watchdog not hit */
> +	 dins	a2, zero, 2, 18	/* Don't clear LBOOT, LBOOT_EXT or LBOOT_OCI */
> +	/* Clear bit indicating reset due to watchdog */
> +	ori	a2, 1 << 11
> +	sd	a2, 0(a1)
> +
> +	/* Disable watchdog */
> +	PTR_LI	a1, OCTEON_CIU3_PP_POKE(0)
> +	sd	zero, 0(a1)
> +	PTR_LI	a1, OCTEON_CIU3_WDOG(0)
> +	sd	zero, 0(a1)
> +
> +	/* Record this in the GSER0_SCRATCH register in bit 11 */
> +	PTR_LI	a1, OCTEON_GSERX_SCRATCH(0)
> +	ld	a2, 0(a1)
> +	ori	a2, 1 << 11
> +	sd	a2, 0(a1)
> +
> +	PTR_LI	a1, OCTEON_RST_SOFT_RST
> +	li	a2, 1
> +	sd	a2, 0(a1)
> +	wait
> +
> +	/* We should never get here */
> +
> +watchdog_ok:
> +	ld	a2, 0(a1)
> +	/* Don't clear LBOOT/LBOOT_EXT or LBOOT_OCI */
> +	dins	a2, zero, 2, 18
> +	dins	a2, zero, 60, 1	/* Clear ROMEN bit */
> +	sd	a2, 0(a1)
> +
> +	/* Start of Octeon setup */
> +
> +	/* Check what core we are - if core 0, branch to init tlb
> +	 * loop in flash.  Otherwise, look up address of init tlb
> +	 * loop that was saved in the boot vector block.
> +	 */
> +	mfc0	a0, CP0_EBASE
> +	andi	a0, EBASE_CPUNUM		/* get core */
> +	beqz	a0, InitTLBStart_local
> +	 nop
> +
> +	break
> +	/* We should never get here - non-zero cores now go directly to
> +	 * tlb init from the boot stub in movable region.
> +	 */
> +
> +	.globl InitTLBStart
> +InitTLBStart:
> +InitTLBStart_local:
> +	/* If we don't have working memory yet configure a bunch of
> +	 * scratch memory, and set the stack pointer to the top
> +	 * of it.  This allows us to go to C code without having
> +	 * memory set up
> +	 *
> +	 * Warning: do not change SCRATCH_STACK_LINES as this can impact the
> +	 * transition from start.S to crti.asm. crti requires 590 bytes of
> +	 * stack space.
> +	 */
> +	cache	1,0(zero)	/* Clear Dcache so cvmseg works right */
> +#if CONFIG_OCTEON_BIG_STACK_SIZE
> +	rdhwr	v0, $0
> +	bnez	v0, 1f
> +	 nop
> +	PTR_LA	sp, big_stack_start - 16
> +	b	stack_clear_done
> +	 nop
> +1:
> +#endif
> +#define SCRATCH_STACK_LINES 0x36   /* MAX is 0x36 */
> +	dmfc0	v0, CP0_CVMMEMCTL
> +	dins	v0, zero, 0, 9
> +	/* setup SCRATCH_STACK_LINES scratch lines of scratch */
> +	ori	v0, 0x100 | SCRATCH_STACK_LINES
> +	dmtc0	v0, CP0_CVMMEMCTL
> +	/* set stack to top of scratch memory */
> +	li	sp, 0xffffffffffff8000 + (SCRATCH_STACK_LINES * 128)
> +	/* Clear scratch for CN63XX pass 2.0 errata Core-15169*/
> +	li	t0, 0xffffffffffff8000
> +clear_scratch:
> +	sd	zero, 0(t0)
> +	addiu	t0, 8
> +	bne	t0, sp, clear_scratch
> +	 nop
> +
> +	/* This code run on all cores - core 0 from flash,
> +	 * the rest from DRAM.	When booting from PCI, non-zero cores
> +	 * come directly here from the boot vector - no earlier code in this
> +	 * file is executed.
> +	 */
> +
> +	/* Some generic initialization is done here as well, as we need this
> +	 * done on all cores even when booting from PCI
> +	 */
> +stack_clear_done:
> +	/* Clear watch registers. */
> +	mtc0	zero, CP0_WATCHLO
> +	mtc0	zero, CP0_WATCHHI
> +
> +	/* STATUS register */
> +	mfc0	k0, CP0_STATUS
> +	li	k1, ~ST0_IE
> +	and	k0, k1
> +	mtc0	k0, CP0_STATUS
> +
> +	/* CAUSE register */
> +	mtc0	zero, CP0_CAUSE
> +
> +	/* Init Timer */
> +	dmtc0	zero, CP0_COUNT
> +	dmtc0	zero, CP0_COMPARE
> +
> +
> +	mfc0	a5, CP0_STATUS
> +	li	v0, 0xE0		/* enable 64 bit mode for CSR access */
> +	or	v0, v0, a5
> +	mtc0	v0, CP0_STATUS
> +
> +
> +	dli	v0, 1 << 29  /* Enable large physical address support in TLB */
> +	mtc0	v0, CP0_PAGEGRAIN
> +
> +InitTLB:
> +	dmtc0	zero, CP0_ENTRYLO0
> +	dmtc0	zero, CP0_ENTRYLO1
> +	mtc0	zero, CP0_PAGEMASK
> +	dmtc0	zero, CP0_CONTEXT
> +	/* Use an offset into kseg0 so we won't conflict with Mips1 legacy
> +	 * TLB clearing
> +	 */
> +	PTR_LI	v0, 0xFFFFFFFF90000000
> +	mfc0	a0, CP0_CONFIG1
> +	srl	a0, a0, 25
> +	/* Check if config4 reg present */
> +	mfc0	a1, CP0_CONFIG3
> +	bbit0	a1, 31, 2f
> +	 and	a0, a0, 0x3F		/* a0 now has the max mmu entry index */
> +	mfc0	a1, CP0_CONFIG4
> +	bbit0	a1, 14, 2f		/* check config4[MMUExtDef] */
> +	 nop
> +	/* append config4[MMUSizeExt] to most significant bit of
> +	 * config1[MMUSize-1]
> +	 */
> +	ins	a0, a1, 6, 8
> +	and	a0, a0, 0x3fff	/* a0 now includes max entries for cn6xxx */
> +2:
> +	dmtc0	zero, CP0_XCONTEXT
> +	mtc0	zero, CP0_WIRED
> +
> +InitTLBloop:
> +	dmtc0	v0, CP0_ENTRYHI
> +	tlbp
> +	mfc0	v1, CP0_INDEX
> +	daddiu	v0, v0, 1<<13
> +	bgez	v1, InitTLBloop
> +
> +	mtc0	a0, CP0_INDEX
> +	tlbwi
> +	bnez	a0, InitTLBloop
> +	 daddiu	a0, -1
> +
> +	mthi	zero
> +	mtlo	zero
> +
> +	/* Set up status register */
> +	mfc0	v0, CP0_STATUS
> +	/* Enable COP0 and COP2 access */
> +	li	a4, (1 << 28) | (1 << 30)
> +	or	v0, a4
> +
> +	/* Must leave BEV set here, as DRAM is not configured for core 0.
> +	 * Also, BEV must be 1 later on when the exception base address is set.
> +	 */
> +
> +	/* Mask all interrupts */
> +	ins	v0, zero, 0, 16
> +	/* Clear NMI (used to start cores other than core 0) */
> +	ori	v0, 0xE4		/* enable 64 bit, disable interrupts */
> +	mtc0	v0, CP0_STATUS
> +
> +	dli	v0,0xE000000F		/* enable all readhw locations */
> +	mtc0	v0, CP0_HWRENA
> +
> +	dmfc0	v0, CP0_CVMCTL
> +	ori	v0, 1<<14	/* enable fixup of unaligned mem access */
> +	dmtc0	v0, CP0_CVMCTL
> +
> +	/* Setup scratch memory.  This is also done in
> +	 * cvmx_user_app_init, and this code will be removed
> +	 * from the bootloader in the near future.
> +	 */
> +
> +	/* Set L2C_LAD_CTL[MAXLFB] = 0 on CN73XX */
> +	mfc0	a4, CP0_PRID
> +	ext	a4, a4, 8, 8
> +	blt	a4, OCTEON_PRID_CN73XX, 72f
> +	nop
> +	PTR_LI	v0, OCTEON_L2C_TAD_CTL
> +	ld	t1, 0(v0)
> +	dins	t1, zero, 0, 4
> +	sd	t1, 0(v0)
> +	ld	zero, 0(v0)
> +
> +72:
> +
> +	/* clear these to avoid immediate interrupt in noperf mode */
> +	dmtc0	zero, CP0_COMPARE	/* clear timer interrupt */
> +	dmtc0	zero, CP0_COUNT		/* clear timer interrupt */
> +	dmtc0	zero, CP0_PERF_CNT0	/* clear perfCnt0 */
> +	dmtc0	zero, CP0_PERF_CNT1	/* clear perfCnt1 */
> +	dmtc0	zero, CP0_PERF_CNT2
> +	dmtc0	zero, CP0_PERF_CNT3
> +
> +	/* If we're running on a node other than 0 then we need to set KSEGNODE
> +	 * to 0.  The nice thing with this code is that it also autodetects if
> +	 * we're running on a processor that supports CVMMEMCTL2 or not since
> +	 * only processors that have this will have a non-zero node ID.  Because
> +	 * of this there's no need to check if we're running on a 78XX.
> +	 */
> +	mfc0    t1, CP0_EBASE
> +	dext    t1, t1, 7, 3            /* Extract node number */
> +	beqz    t1, is_node0            /* If non-zero then we're not node 0 */
> +	 nop
> +	dmfc0   t1, CP0_CVMMEMCTL2
> +	dins    t1, zero, 12, 4
> +	dmtc0   t1, CP0_CVMMEMCTL2
> +is_node0:
> +
> +	/* Set up TLB mappings for u-boot code in flash. */
> +
> +	/* Use a bal to get the current PC into ra.  Since this bal is to
> +	 * the address immediately following the delay slot, the ra is
> +	 * the address of the label.  We then use this to get the actual
> +	 * address that we are executing from.
> +	 */
> +	bal	__dummy
> +	 nop
> +
> +__dummy:
> +	/* Get the actual address that we are running at */
> +	PTR_LA	a6, _start		/* Linked address of _start */
> +	PTR_LA	a7, __dummy
> +	dsubu	t0, a7, a6		/* offset of __dummy label from _start*/
> +	dsubu	a7, ra, t0		/* a7 now has actual address of _start*/
> +
> +	/* Save actual _start address in s7.  This is where we
> +	 * are executing from, as opposed to where the code is
> +	 * linked.
> +	 */
> +	move	s7, a7
> +	move	s4, zero
> +
> +	/* s7 has actual address of _start.  If this is
> +	 * on the boot bus, it will be between 0xBFC000000 and 0xBFFFFFFF.
> +	 * If it is on the boot bus, use 0xBFC00000 as the physical address
> +	 * for the TLB mapping, as we will be adjusting the boot bus
> +	 * to make this adjustment.
> +	 * If we are running from DRAM (remote-boot), then we want to use the
> +	 * real address in DRAM.
> +	 */
> +
> +	/* Check to see if we are running from flash - we expect that to
> +	 * be 0xffffffffb0000000-0xffffffffbfffffff
> +	 * (0x10000000-0x1fffffff, unmapped/uncached)
> +	 */
> +	dli	t2, 0xffffffffb0000000
> +	dsubu	t2, s7
> +	slt	s4, s7, t2
> +	bltz	t2, uboot_in_flash
> +	 nop
> +
> +	/* If we're not core 0 then we don't care about cache */
> +	mfc0	t2, CP0_EBASE
> +	andi	t2, EBASE_CPUNUM
> +	bnez	t2, uboot_in_ram
> +	 nop
> +
> +	/* Find out if we're OCTEON I or OCTEON + which don't support running
> +	 * out of cache.
> +	 */
> +	mfc0	t2, CP0_PRID
> +	ext	t2, t2, 8, 8
> +	li	s4, 1
> +	blt	t2, 0x90, uboot_in_ram
> +	 nop
> +
> +	/* U-Boot can be executing either in RAM or L2 cache.  Now we need to
> +	 * check if DRAM is initialized.  The way we do that is to look at
> +	 * the reset bit of the LMC0_DDR_PLL_CTL register (bit 7)
> +	 */
> +	PTR_LI	t2, OCTEON_LMC0_DDR_PLL_CTL
> +	ld	t2, 0(t2)
> +	bbit1	t2, 7, uboot_in_ram
> +	 nop
> +
> +	/* We must be executing out of cache */
> +	b	uboot_in_ram
> +	 li	s4, 2
> +
> +uboot_in_flash:
> +	/* Set s4 to 4 to indicate we're running in FLASH */
> +	li	s4, 4
> +
> +#if defined(CONFIG_OCTEON_DISABLE_L2_CACHE_INDEX_ALIASING)
> +	/* By default, L2C index aliasing is enabled.  In some cases it may
> +	 * need to be disabled.  The L2C index aliasing can only be disabled
> +	 * if U-Boot is running out of L2 cache and the L2 cache has not been
> +	 * used to store anything.
> +	 */
> +	PTR_LI	t1, OCTEON_L2C_CTL
> +	ld	t2, 0(t1)
> +	ori	t2, 1
> +	sd	t2, 0(t1)
> +#endif
> +
> +	/* Use BFC00000 as physical address for TLB mappings when booting
> +	 * from flash, as we will adjust the boot bus mappings to make this
> +	 * mapping correct.
> +	 */
> +	dli	a7, 0xFFFFFFFFBFC00000
> +	dsubu	s6, s7, a7  /* Save flash offset in s6 */
> +
> +#if defined(CONFIG_OCTEON_COPY_FROM_FLASH_TO_L2)
> +	/* For OCTEON II we check to see if the L2 cache is big enough to hold
> +	 * U-Boot.  If it is big enough then we copy ourself from flash to the
> +	 * L2 cache in order to speed up execution.
> +	 */
> +
> +	/* Check for OCTEON 2 */
> +	mfc0	t1, CP0_PRID
> +	ext	t1, t1, 8, 8
> +	/* Get number of L2 cache sets */
> +	beq	t1, OCTEON_PRID_CNF71XX, got_l2_sets	/* CNF71XX */
> +	 li	t2, 1 << 9
> +	beq	t1, OCTEON_PRID_CN78XX, got_l2_sets	/* CN78XX */
> +	 li	t2, 1 << 13
> +	beq	t1, OCTEON_PRID_CN70XX, got_l2_sets	/* CN70XX */
> +	 li	t2, 1 << 10
> +	beq	t1, OCTEON_PRID_CN73XX, got_l2_sets	/* CN73XX */
> +	 li	t2, 1 << 11
> +	beq	t1, OCTEON_PRID_CNF75XX, got_l2_sets	/* CNF75XX */
> +	 li	t2, 1 << 11
> +	b	l2_cache_too_small	/* Unknown OCTEON model */
> +	 nop
> +
> +got_l2_sets:
> +	/* Get number of associations */
> +	PTR_LI	t0, OCTEON_MIO_FUSE_DAT3
> +	ld	t0, 0(t0)
> +	dext	t0, t0, 32, 3
> +
> +	beq	t1, OCTEON_PRID_CN70XX, process_70xx_l2sets
> +	 nop
> +	/* 0 = 16-way, 1 = 12-way, 2 = 8-way, 3 = 4-way, 4-7 reserved */
> +	beqz	t0, got_l2_ways
> +	 li	t3, 16
> +	beq	t0, 1, got_l2_ways
> +	 li	t3, 12
> +	beq	t0, 2, got_l2_ways
> +	 li	t3, 8
> +	beq	t0, 3, got_l2_ways
> +	 li	t3, 4
> +	b	l2_cache_too_small
> +	 nop
> +
> +process_70xx_l2sets:
> +	/* For 70XX, the number of ways is defined as:
> +	 * 0 - full cache (4-way) 512K
> +	 * 1 - 3/4 ways (3-way) 384K
> +	 * 2 - 1/2 ways (2-way) 256K
> +	 * 3 - 1/4 ways (1-way) 128K
> +	 * 4-7 illegal (aliased to 0-3)
> +	 */
> +	andi	t0, 3
> +	beqz	t0, got_l2_ways
> +	 li	t3, 4
> +	beq	t0, 1, got_l2_ways
> +	 li	t3, 3
> +	beq	t0, 2, got_l2_ways
> +	 li	t3, 2
> +	li	t3, 1
> +
> +got_l2_ways:
> +	dmul	a1, t2, t3		/* Calculate cache size */
> +	dsll	a1, 7			/* Ways * Sets * cache line sz (128) */
> +	daddiu	a1, a1, -128		/* Adjust cache size for copy code */
> +
> +	/* Calculate size of U-Boot image */
> +	/*
> +	 * "uboot_end - _start" is not correct, as the image also
> +	 * includes the DTB appended to the end (OF_EMBED is deprecated).
> +	 * Lets use a defined max for now here.
> +	 */
> +	PTR_LI	s5, CONFIG_BOARD_SIZE_LIMIT
> +
> +	daddu	t2, s5, s7	/* t2 = end address */
> +	daddiu	t2, t2, 127
> +	ins	t2, zero, 0, 7	/* Round up to cache line for memcpy */
> +
> +	slt	t1, a1, s5	/* See if we're bigger than the L2 cache */
> +	bnez	t1, l2_cache_too_small
> +	 nop
> +	/* Address we plan to load at in the L2 cache */
> +	PTR_LI	t9, CONFIG_OCTEON_L2_UBOOT_ADDR
> +# ifdef CONFIG_OCTEON_L2_MEMCPY_IN_CACHE
> +	/* Enable all ways for PP0.  Authentik ROM may have disabled these */
> +	PTR_LI	a1, OCTEON_L2C_WPAR_PP0
> +	sd	zero, 0(a1)
> +
> +	/* Address to place our memcpy code */
> +	PTR_LI	a0, CONFIG_OCTEON_L2_MEMCPY_ADDR
> +	/* The following code writes a simple memcpy routine into the cache
> +	 * to copy ourself from flash into the L2 cache.  This makes the
> +	 * memcpy routine a lot faster since each instruction can potentially
> +	 * require four read cycles to flash over the boot bus.
> +	 */
> +	/* Zero cache line in the L2 cache */
> +	zcb	(a0)
> +	synci	0(zero)
> +	dli	a1, 0xdd840000dd850008	/* ld a0, 0(t0);  ld a1, 8(t0) */
> +	sd	a1, 0(a0)
> +	dli	a1, 0xdd860010dd870018	/* ld a2, 16(t0); ld a3, 24(t0) */
> +	sd	a1, 8(a0)
> +	dli	a1, 0xfda40000fda50008	/* sd a0, 0(t1);  sd a1, 8(t1) */
> +	sd	a1, 16(a0)
> +	dli	a1, 0xfda60010fda70018	/* sd a2, 16(t1); sd a3, 24(t1) */
> +	sd	a1, 24(a0)
> +	dli	a1, 0x258c0020158efff6	/* addiu t0, 32; bne t0, t2, -40 */
> +	sd	a1, 32(a0)
> +	dli	a1, 0x25ad002003e00008	/* addiu t1, 32; jr ra */
> +	sd	a1, 40(a0)
> +	sd	zero, 48(a0)		/* nop; nop */
> +
> +	/* Synchronize the caches */
> +	sync
> +	synci	0(zero)
> +
> +	move	t0, s7
> +	move	t1, t9
> +
> +	/* Do the memcpy operation in L2 cache to copy ourself from flash
> +	 * to the L2 cache.
> +	 */
> +	jalr	a0
> +	 nop
> +
> +# else
> +	/* Copy ourself to the L2 cache from flash, 32 bytes at a time */
> +	/* This code is now written to the L2 cache using the code above */
> +1:
> +	ld	a0, 0(t0)
> +	ld	a1, 8(t0)
> +	ld	a2, 16(t0)
> +	ld	a3, 24(t0)
> +	sd	a0, 0(t1)
> +	sd	a1, 8(t1)
> +	sd	a2, 16(t1)
> +	sd	a3, 24(t1)
> +	addiu	t0, 32
> +	bne	t0, t2, 1b
> +	addiu	t1, 32
> +# endif	/* CONFIG_OCTEON_L2_MEMCPY_IN_CACHE */
> +
> +	/* Adjust the start address of U-Boot and the global pointer */
> +	subu	t0, s7, t9	/* t0 = address difference */
> +	move	s7, t9		/* Update physical address */
> +	move	s2, t9
> +	sync
> +	synci	0(zero)
> +
> +	/* Now we branch to the L2 cache.  We first get our PC then adjust it
> +	 */
> +	bal	3f
> +	 nop
> +3:
> +	/* Don't add any instructions here! */
> +	subu	t9, ra, t0
> +	/* Give ourself 16 bytes */
> +	addiu	t9, 0x10
> +
> +	jal	t9		/* Branch to address in L2 cache */
> +
> +	 nop
> +	nop
> +	/* Add instructions after here */
> +
> +	move	a7, s7
> +
> +	b	uboot_in_ram
> +	 ori	s4, 2		/* Running out of L2 cache */
> +
> +l2_cache_too_small:	/* We go here if we can't copy ourself to L2 */
> +#endif /* CONFIG_OCTEON_COPY_FROM_FLASH_TO_L2 */
> +
> +	/* This code is only executed if booting from flash. */
> +	/*  For flash boot (_not_ RAM boot), we do a workaround for
> +	 * an LLM errata on CN38XX and CN58XX parts.
> +	 */
> +
> +uboot_in_ram:
> +	/* U-boot address is now in reg a7, and is 4 MByte aligned.
> +	 * (boot bus addressing has been adjusted to make this happen for flash,
> +	 * and for DRAM this alignment must be provided by the remote boot
> +	 * utility.
> +	 */
> +	/* See if we're in KSEG0 range, if so set EBASE register to handle
> +	 * exceptions.
> +	 */
> +	dli	a1, 0x20000000
> +	bge	a7, a1, 1f
> +	 nop
> +	/* Convert our physical address to KSEG0 */
> +	PTR_LI	a1, 0xffffffff80000000
> +	or	a1, a1, a7
> +	mtc0	a1, CP0_EBASE
> +1:
> +	/* U-boot now starts at 0xBFC00000.  Use a single 4 MByte TLB mapping
> +	 * to map u-boot.
> +	 */
> +	move	a0, a6		/* Virtual addr in a0 */
> +	dins	a0, zero, 0, 16	/* Zero out offset bits */
> +	move	a1, a7		/* Physical addr in a1 */
> +
> +	/* Now we need to remove the MIPS address space bits.  For this we
> +	 * need to determine if it is a 32 bit compatibility address or not.
> +	 */
> +
> +	/* 'lowest' address in compatibility space */
> +	PTR_LI	t0, 0xffffffff80000000
> +	dsubu	t0, t0, a1
> +	bltz	t0, compat_space
> +	 nop
> +
> +	/* We have a xkphys address, so strip off top bit */
> +	b	addr_fixup_done
> +	 dins	a1, zero, 63, 1
> +
> +compat_space:
> +	PTR_LI	a2, 0x1fffffff
> +	and	a1, a1, a2  /* Mask phy addr to remove address space bits */
> +
> +addr_fixup_done:
> +	/* Currenty the u-boot image size is limited to 4 MBytes.  In order to
> +	 * support larger images the flash mapping will need to be changed to
> +	 * be able to access more than that before C code is run.  Until that
> +	 * is done, we just use a 4 MByte mapping for the secondary cores as
> +	 * well.
> +	 */
> +	/* page size (only support 4 Meg binary size for now for core 0)
> +	 * This limitation is due to the fact that the boot vector is
> +	 * 0xBFC00000 which only makes 4MB available.  Later more flash
> +	 * address space will be available after U-Boot has been copied to
> +	 * RAM.	 For now assume that it is in flash.
> +	 */
> +	li	a2, 2*1024*1024
> +
> +	mfc0	a4, CP0_EBASE
> +	andi	a4, EBASE_CPUNUM		/* get core */
> +	beqz	a4, core_0_tlb
> +	 nop
> +
> +	/* Now determine how big a mapping to use for secondary cores,
> +	 * which need to map all of u-boot + heap in DRAM
> +	 */
> +	/* Here we look at the alignment of the the physical address,
> +	 * and use the largest page size possible.  In some cases
> +	 * this can result in an oversize mapping, but for secondary cores
> +	 * this mapping is very short lived.
> +	 */
> +
> +	/* Physical address in a1 */
> +	li	a2, 1
> +1:
> +	sll	a2, 1
> +	and	a5, a1, a2
> +	beqz	a5, 1b
> +	 nop
> +
> +	/* a2 now contains largest page size we can use */
> +core_0_tlb:
> +	JAL(single_tlb_setup)
> +
> +	/* Check if we're running from cache */
> +	bbit1	s4, 1, uboot_in_cache
> +	 nop
> +
> +	/* If we are already running from ram, we don't need to muck
> +	 * with boot bus mappings.
> +	 */
> +	PTR_LI	t2, 0xffffffffb0000000
> +	dsubu	t2, s7
> +	/* See if our starting address is lower than the boot bus */
> +	bgez	t2, uboot_in_ram2	/* If yes, booting from RAM */
> +	 nop
> +
> +uboot_in_cache:
> +#if CONFIG_OCTEON_BIG_STACK_SIZE
> +	/* The large stack is only for core 0.  For all other cores we need to
> +	 * use the L1 cache otherwise the other cores will stomp on top of each
> +	 * other unless even more space is reserved for the stack space for
> +	 * each core.  With potentially 96 cores this gets excessive.
> +	 */
> +	mfc0	v0, CP0_EBASE
> +	andi	a0, EBASE_CPUNUM
> +	bnez	a0, no_big_stack
> +	 nop
> +	PTR_LA	sp, big_stack_start
> +	daddiu	sp, -16
> +
> +no_big_stack:
> +#endif
> +	/* We now have the TLB set up, so we need to remap the boot bus.
> +	 * This is tricky, as we are running from flash, and will be changing
> +	 * the addressing of the flash.
> +	 */
> +	/* Enable movable boot bus region 0, at address 0x10000000 */
> +	PTR_LI	a4, OCTEON_MIO_BOOT_BASE
> +	dli	a5, 0x81000000	/* EN + base address 0x11000000 */
> +	sd	a5, OCTEON_MIO_BOOT_LOC_CFG0_OFF(a4)
> +
> +	/* Copy code to that remaps the boot bus to movable region */
> +	sd	zero, OCTEON_MIO_BOOT_LOC_DAT_OFF(a4)
> +
> +	PTR_LA	a6, change_boot_mappings
> +	GETOFFSET(a5, change_boot_mappings);
> +	daddu	a5, a5, a6
> +
> +	/* The code is 16 bytes (2 DWORDS) */
> +	ld	a7, 0(a5)
> +	sd	a7, OCTEON_MIO_BOOT_LOC_DAT_OFF(a4)
> +	ld	a7, 8(a5)
> +	sd	a7, OCTEON_MIO_BOOT_LOC_DAT_OFF(a4)
> +
> +	/* Read from an RML register to ensure that the previous writes have
> +	 * completed before we branch to the movable region.
> +	 */
> +	ld	zero, OCTEON_MIO_BOOT_LOC_CFG0_OFF(a4)
> +
> +	/* Compute value for boot bus configuration register */
> +	/* Read region 0 config so we can _modify_ the base address field */
> +	PTR_LI	a4, OCTEON_MIO_BOOT_REG_CFG0	/* region 0 config */
> +	ld	a0, 0(a4)
> +	dli	a4, 0xf0000000		/* Mask off bits we want to save */
> +	and	a4, a4, a0
> +	dli	a0, 0x0fff0000		/* Force size to max */
> +	or	a4, a4, a0
> +
> +	move	a5, s6
> +	/* Convert to 64k blocks, as used by boot bus config */
> +	srl	a5, 16
> +	li	a6, 0x1fc0	/* 'normal' boot bus base config value */
> +	subu	a6, a6, a5	/* Subtract offset */
> +	/* combine into register value to pass to boot bus routine */
> +	or	a0, a4, a6
> +
> +	/* Branch there */
> +	PTR_LA	a1, __mapped_continue_label
> +	PTR_LI	a2, OCTEON_MIO_BOOT_REG_CFG0
> +	/* If region 0 is not enabled we can skip it */
> +	ld	a4, 0(a2)
> +	bbit0	a4, 31, __mapped_continue_label
> +	 nop
> +	li	a4, 0x10000000
> +	j	a4
> +	 synci	0(zero)
> +
> +	/* We never get here, as we go directly to __mapped_continue_label */
> +	break
> +
> +
> +uboot_in_ram2:
> +
> +	/* Now jump to address in TLB mapped memory to continue execution */
> +	PTR_LA	a4, __mapped_continue_label
> +	synci	0(a4)
> +	j	a4
> +	 nop
> +
> +__mapped_continue_label:
> +	/* Check if we are core 0, if we are not then we need
> +	 * to vector to code in DRAM to do application setup, and
> +	 * skip the rest of the bootloader.  Only core 0 runs the bootloader
> +	 * and sets up the tables that the other cores will use for
> +	 * configuration.
> +	 */
> +	mfc0	a0, CP0_EBASE
> +	andi	a0, EBASE_CPUNUM   /* get core */
> +	/* if (__all_cores_are_equal==0 && core==0),
> +	 * then jump to execute BL on core 0; else 'go to next line'
> +	 * (core_0_cont1 is executed ONLY when k0=a0=0(core0_ID))
> +	 */
> +	lw	t0, __all_cores_are_equal
> +	beq	a0, t0, core_0_cont1
> +	 nop
> +
> +	/* other cores look up addr from dram */
> +        /* DRAM controller already set up by first core */
> +        li      a1, (BOOT_VECTOR_NUM_WORDS * 4)
> +        mul     a0, a0, a1
> +
> +        /* Now find out the boot vector base address from the moveable boot
> +         * bus region.
> +         */
> +
> +        /* Get the address of the boot bus moveable region */
> +        PTR_LI     t8, OCTEON_MIO_BOOT_BASE
> +        ld      t9, OCTEON_MIO_BOOT_LOC_CFG0_OFF(t8)
> +        /* Make sure it's enabled */
> +        bbit0   t9, 31, invalid_boot_vector
> +         dext   t9, t9, 3, 24
> +        dsll    t9, t9, 7
> +        /* Make address XKPHYS */
> +	li	t0, 1
> +	dins	t9, t0, 63, 1
> +
> +        ld      t0, OCTEON_BOOT_MOVEABLE_MAGIC_OFFSET(t9)
> +        dli     t1, OCTEON_BOOT_MOVEABLE_MAGIC1
> +        bne     t0, t1, invalid_boot_vector
> +         nop
> +
> +        /* Load base address of boot vector table */
> +        ld      t0, OCTEON_BOOT_VECTOR_MOVEABLE_OFFSET(t9)
> +        /* Add offset for core */
> +        daddu   a1, t0, a0
> +
> +	mfc0	v0, CP0_STATUS
> +	move	v1, v0
> +	ins	v1, zero, 19, 1		/* Clear NMI bit */
> +	mtc0	v1, CP0_STATUS
> +
> +        /* Get app start function address */
> +        lw      t9, 8(a1)
> +        beqz    t9, invalid_boot_vector
> +         nop
> +
> +        j       t9
> +         lw      k0, 12(a1)      /* Load global data (deprecated) */
> +
> +invalid_boot_vector:
> +        wait
> +        b       invalid_boot_vector
> +         nop
> +
> +__all_cores_are_equal:
> +	/* The following .word tell if 'all_cores_are_equal' or core0 is special
> +	 * By default (for the first execution) the core0 should be special,
> +	 * in order to behave like the old(existing not-modified) bootloader
> +	 * and run the bootloader on core 0 to follow the existing design.
> +	 * However after that we make 'all_cores_equal' which allows to run SE
> +	 * applications on core0 like on any other core. NOTE that value written
> +	 * to '__all_cores_are_equal' should not match any core ID.
> +	 */
> +	.word 	0
> +
> +core_0_cont1:
> +	li	t0, 0xffffffff
> +	sw	t0, __all_cores_are_equal
> +	/* From here on, only core 0 runs, other cores have branched
> +	 * away.
> +	 */
> +#ifdef CONFIG_MIPS_INIT_STACK_IN_SRAM
> +	/* Set up initial stack and global data */
> +	setup_stack_gd
> +# ifdef CONFIG_DEBUG_UART
> +	PTR_LA	t9, debug_uart_init
> +	jalr	t9
> +	 nop
> +# endif
> +#endif
> +	move	a0, zero		# a0 <-- boot_flags = 0
> +	PTR_LA	t9, board_init_f
> +
> +	jr	t9
> +	 move	ra, zero
> +	END(_start)
> +
> +	.balign	8
> +	.globl	single_tlb_setup
> +	.ent	single_tlb_setup
> +	/* Sets up a single TLB entry.	Virtual/physical addresses
> +	 * must be properly aligned.
> +	 * a0  Virtual address
> +	 * a1  Physical address
> +	 * a2  page (_not_ mapping) size
> +	 */
> +single_tlb_setup:
> +	/* Determine the number of TLB entries available, and
> +	 * use the top one.
> +	 */
> +	mfc0	a3, CP0_CONFIG1
> +	dext	a3, a3, 25, 6		/* a3 now has the max mmu entry index */
> +	mfc0	a5, CP0_CONFIG3		/* Check if config4 reg present */
> +	bbit0	a5, 31, single_tlb_setup_cont
> +	 nop
> +	mfc0	a5, CP0_CONFIG4
> +	bbit0	a5, 14, single_tlb_setup_cont	/* check config4[MMUExtDef] */
> +	 nop
> +	/* append config4[MMUSizeExt] to most significant bit of
> +	 * config1[MMUSize-1]
> +	 */
> +	dins	a3, a5, 6, 8
> +	and	a3, a3, 0x3fff	/* a3 now includes max entries for cn6xxx */
> +
> +single_tlb_setup_cont:
> +
> +	/* Format physical address for entry low */
> +	nop
> +	dsrl	a1, a1, 12
> +	dsll	a1, a1, 6
> +	ori	a1, a1, 0x7	/* set DVG bits */
> +
> +	move	a4, a2
> +	daddu	a5, a4, a4	/* mapping size */
> +	dsll	a6, a4, 1
> +	daddiu	a6, a6, -1	/* pagemask */
> +	dsrl	a4, a4, 6	/* adjust for adding with entrylo */
> +
> +	/* Now set up mapping */
> +	mtc0	a6, CP0_PAGEMASK
> +	mtc0	a3, CP0_INDEX
> +
> +	dmtc0	a1, CP0_ENTRYLO0
> +	daddu	a1, a1, a4
> +
> +	dmtc0	a1, CP0_ENTRYLO1
> +	daddu	a1, a1, a4
> +
> +	dmtc0	a0, CP0_ENTRYHI
> +	daddu	a0, a0, a5
> +
> +	ehb
> +	tlbwi
> +	jr  ra
> +	 nop
> +	.end   single_tlb_setup
> +
> +
> +/**
> + * This code is moved to a movable boot bus region,
> + * and it is responsible for changing the flash mappings and
> + * jumping to run from the TLB mapped address.
> + *
> + * @param a0	New address for boot bus region 0
> + * @param a1	Address to branch to afterwards
> + * @param a2	Address of MIO_BOOT_REG_CFG0
> + */
> +	.balign	8
> +change_boot_mappings:
> +	sd	a0, 0(a2)
> +	sync
> +	j a1	    /* Jump to new TLB mapped location */
> +	 synci	0(zero)
> +
> +/* If we need a large stack, allocate it here. */
> +#if CONFIG_OCTEON_BIG_STACK_SIZE
> +	/* Allocate the stack here so it's in L2 cache or DRAM */
> +	.balign	16
> +big_stack_end:
> +	.skip	CONFIG_OCTEON_BIG_STACK_SIZE, 0
> +big_stack_start:
> +	.dword	0
> +#endif
> 

-- 
- Daniel


More information about the U-Boot mailing list