[U-Boot] [Patch v3 2/4] ARMv8/FSL_LSCH3: Add FSL_LSCH3 SoC

Rob Herring robherring2 at gmail.com
Thu May 29 15:19:02 CEST 2014


On Wed, May 28, 2014 at 6:46 PM, York Sun <yorksun at freescale.com> wrote:
> Freescale LayerScape with Chassis Generation 3 is a set of SoCs with
> ARMv8 cores and 3rd generation of Chassis. We use different MMU setup
> to support memory map and cache attribute for these SoCs. MMU and cache
> are enabled very early to boost performance, especially for early
> development on emulators. After u-boot relocates to DDR, a new MMU
> table with QBMan cache access is created in DDR. SMMU pagesize is set
> in SMMU_sACR register. Both DDR3 and DDR4 are supported.
>
> Signed-off-by: York Sun <yorksun at freescale.com>
> Signed-off-by: Varun Sethi <Varun.Sethi at freescale.com>
> Signed-off-by: Arnab Basu <arnab.basu at freescale.com>
> ---
> Change log:
>  v3: Remove blank lines at the end of files
>      Fix cluster PLL GSR register for accessing beyond array size
>      Update final MMU table to support QBMan memory with cache
>      Set SMMU pagesize in SMMU_sACR register in lowlevel init.
>      Add DDR4 support
>      Remove forcing L3 cache flushing
>      Update GICv3 redistributor base address
>
>  Some of these changes are caused by model change.
>
>  arch/arm/cpu/armv8/cache_v8.c                     |    7 +-
>  arch/arm/cpu/armv8/fsl-lsch3/Makefile             |   10 +
>  arch/arm/cpu/armv8/fsl-lsch3/README               |   10 +
>  arch/arm/cpu/armv8/fsl-lsch3/cpu.c                |  474 +++++++++++++++++++++
>  arch/arm/cpu/armv8/fsl-lsch3/cpu.h                |    7 +
>  arch/arm/cpu/armv8/fsl-lsch3/lowlevel.S           |   65 +++
>  arch/arm/cpu/armv8/fsl-lsch3/speed.c              |  176 ++++++++
>  arch/arm/cpu/armv8/fsl-lsch3/speed.h              |    7 +
>  arch/arm/cpu/armv8/fsl-lsch3/timer.c              |   62 +++
>  arch/arm/include/asm/arch-fsl-lsch3/clock.h       |   23 +
>  arch/arm/include/asm/arch-fsl-lsch3/config.h      |   65 +++
>  arch/arm/include/asm/arch-fsl-lsch3/gpio.h        |    9 +
>  arch/arm/include/asm/arch-fsl-lsch3/immap_lsch3.h |  116 +++++
>  arch/arm/include/asm/arch-fsl-lsch3/imx-regs.h    |   13 +
>  arch/arm/include/asm/arch-fsl-lsch3/mmu.h         |   10 +
>  arch/arm/include/asm/config.h                     |    4 +
>  arch/arm/include/asm/system.h                     |    2 +
>  drivers/i2c/mxc_i2c.c                             |    5 +
>  include/common.h                                  |    5 +-
>  19 files changed, 1066 insertions(+), 4 deletions(-)
>  create mode 100644 arch/arm/cpu/armv8/fsl-lsch3/Makefile
>  create mode 100644 arch/arm/cpu/armv8/fsl-lsch3/README
>  create mode 100644 arch/arm/cpu/armv8/fsl-lsch3/cpu.c
>  create mode 100644 arch/arm/cpu/armv8/fsl-lsch3/cpu.h
>  create mode 100644 arch/arm/cpu/armv8/fsl-lsch3/lowlevel.S
>  create mode 100644 arch/arm/cpu/armv8/fsl-lsch3/speed.c
>  create mode 100644 arch/arm/cpu/armv8/fsl-lsch3/speed.h
>  create mode 100644 arch/arm/cpu/armv8/fsl-lsch3/timer.c
>  create mode 100644 arch/arm/include/asm/arch-fsl-lsch3/clock.h
>  create mode 100644 arch/arm/include/asm/arch-fsl-lsch3/config.h
>  create mode 100644 arch/arm/include/asm/arch-fsl-lsch3/gpio.h
>  create mode 100644 arch/arm/include/asm/arch-fsl-lsch3/immap_lsch3.h
>  create mode 100644 arch/arm/include/asm/arch-fsl-lsch3/imx-regs.h
>  create mode 100644 arch/arm/include/asm/arch-fsl-lsch3/mmu.h
>
> diff --git a/arch/arm/cpu/armv8/cache_v8.c b/arch/arm/cpu/armv8/cache_v8.c
> index a96ecda..c47acba 100644
> --- a/arch/arm/cpu/armv8/cache_v8.c
> +++ b/arch/arm/cpu/armv8/cache_v8.c
> @@ -83,12 +83,17 @@ void invalidate_dcache_all(void)
>         __asm_invalidate_dcache_all();
>  }
>
> +void __weak flush_l3_cache(void)
> +{
> +}
> +
>  /*
>   * Performs a clean & invalidation of the entire data cache at all levels
>   */
>  void flush_dcache_all(void)
>  {
>         __asm_flush_dcache_all();
> +       flush_l3_cache();
>  }
>
>  /*
> @@ -221,7 +226,7 @@ void invalidate_icache_all(void)
>   * Enable dCache & iCache, whether cache is actually enabled
>   * depend on CONFIG_SYS_DCACHE_OFF and CONFIG_SYS_ICACHE_OFF
>   */
> -void enable_caches(void)
> +void __weak enable_caches(void)
>  {
>         icache_enable();
>         dcache_enable();
> diff --git a/arch/arm/cpu/armv8/fsl-lsch3/Makefile b/arch/arm/cpu/armv8/fsl-lsch3/Makefile
> new file mode 100644
> index 0000000..4b859cf
> --- /dev/null
> +++ b/arch/arm/cpu/armv8/fsl-lsch3/Makefile
> @@ -0,0 +1,10 @@
> +#
> +# Copyright 2014, Freescale Semiconductor
> +#
> +# SPDX-License-Identifier:     GPL-2.0+
> +#
> +
> +obj-y += cpu.o
> +obj-y += timer.o
> +obj-y += lowlevel.o
> +obj-y += speed.o
> diff --git a/arch/arm/cpu/armv8/fsl-lsch3/README b/arch/arm/cpu/armv8/fsl-lsch3/README
> new file mode 100644
> index 0000000..de34a91
> --- /dev/null
> +++ b/arch/arm/cpu/armv8/fsl-lsch3/README
> @@ -0,0 +1,10 @@
> +#
> +# Copyright 2014 Freescale Semiconductor
> +#
> +# SPDX-License-Identifier:      GPL-2.0+
> +#
> +
> +Freescale LayerScape with Chassis Generation 3
> +
> +This architecture supports Freescale ARMv8 SoCs with Chassis generation 3,
> +for example LS2100A.
> diff --git a/arch/arm/cpu/armv8/fsl-lsch3/cpu.c b/arch/arm/cpu/armv8/fsl-lsch3/cpu.c
> new file mode 100644
> index 0000000..2780390
> --- /dev/null
> +++ b/arch/arm/cpu/armv8/fsl-lsch3/cpu.c
> @@ -0,0 +1,474 @@
> +/*
> + * Copyright 2014 Freescale Semiconductor, Inc.
> + *
> + * SPDX-License-Identifier:    GPL-2.0+
> + */
> +
> +#include <common.h>
> +#include <asm/io.h>
> +#include <asm/system.h>
> +#include <asm/armv8/mmu.h>
> +#include <asm/io.h>
> +#include <asm/arch-fsl-lsch3/immap_lsch3.h>
> +#include "cpu.h"
> +#include "speed.h"
> +
> +DECLARE_GLOBAL_DATA_PTR;
> +
> +#ifndef CONFIG_SYS_DCACHE_OFF
> +/*
> + * To start MMU before DDR is available, we create MMU table in SRAM.
> + * The base address of SRAM is CONFIG_SYS_FSL_OCRAM_BASE. We use three
> + * levels of translation tables here to cover 40-bit address space.
> + * We use 4KB granule size, with 40 bits physical address, T0SZ=24
> + * Level 0 IA[39], table address @0
> + * Level 1 IA[31:30], table address @0x1000, 0x2000
> + * Level 2 IA[29:21], table address @0x3000
> + */
> +
> +#define EARLY_SECTION_SHIFT_L0 39UL
> +#define EARLY_SECTION_SHIFT_L1 30UL
> +#define EARLY_SECTION_SHIFT_L2 21UL
> +#define EARLY_BLOCK_SIZE_L0    0x8000000000UL
> +#define EARLY_BLOCK_SIZE_L1    (1 << EARLY_SECTION_SHIFT_L1)
> +#define EARLY_BLOCK_SIZE_L2    (1 << EARLY_SECTION_SHIFT_L2)
> +#define CONFIG_SYS_IFC_BASE    0x30000000
> +#define CONFIG_SYS_IFC_SIZE    0x10000000
> +#define CONFIG_SYS_IFC_BASE2   0x500000000
> +#define CONFIG_SYS_IFC_SIZE2   0x100000000
> +#define TCR_EL2_PS_40BIT       (2 << 16)
> +#define EARLY_VA_BITS          (40)
> +#define EARLY_TCR      (TCR_TG0_4K             | \
> +                       TCR_EL2_PS_40BIT        | \
> +                       TCR_SHARED_NON          | \
> +                       TCR_ORGN_NC             | \
> +                       TCR_IRGN_NC             | \
> +                       TCR_T0SZ(EARLY_VA_BITS))
> +
> +/*
> + * Final MMU
> + * Let's start from the same layout as early MMU and modify as needed.
> + * IFC regions will be cache-inhibit.
> + */
> +#define FINAL_SECTION_SHIFT_L0 39UL
> +#define FINAL_SECTION_SHIFT_L1 30UL
> +#define FINAL_SECTION_SHIFT_L2 21UL
> +#define FINAL_BLOCK_SIZE_L0    0x8000000000UL
> +#define FINAL_BLOCK_SIZE_L1    (1 << FINAL_SECTION_SHIFT_L1)
> +#define FINAL_BLOCK_SIZE_L2    (1 << FINAL_SECTION_SHIFT_L2)
> +#define FINAL_QBMAN_CACHED_MEM 0x818000000UL
> +#define FINAL_QBMAN_CACHED_SIZE        0x4000000
> +#define TCR_EL2_PS_40BIT       (2 << 16)
> +#define FINAL_VA_BITS          (40)
> +#define FINAL_TCR      (TCR_TG0_4K             | \
> +                       TCR_EL2_PS_40BIT        | \
> +                       TCR_SHARED_NON          | \
> +                       TCR_ORGN_NC             | \
> +                       TCR_IRGN_NC             | \
> +                       TCR_T0SZ(FINAL_VA_BITS))
> +
> +
> +static void set_pgtable_section(u64 *page_table, u64 index, u64 section,
> +                               u8 memory_type)
> +{
> +       u64 value;
> +
> +       value = section | PMD_TYPE_SECT | PMD_SECT_AF;
> +       value |= PMD_ATTRINDX(memory_type);
> +       page_table[index] = value;
> +}

This function looks like it should be common.

> +
> +static inline void early_mmu_setup(void)
> +{
> +       int el;
> +       u64 i;
> +       u64 section_l1t0, section_l1t1, section_l2;
> +       u64 *level0_table = (u64 *)CONFIG_SYS_FSL_OCRAM_BASE;
> +       u64 *level1_table_0 = (u64 *)(CONFIG_SYS_FSL_OCRAM_BASE + 0x1000);
> +       u64 *level1_table_1 = (u64 *)(CONFIG_SYS_FSL_OCRAM_BASE + 0x2000);
> +       u64 *level2_table = (u64 *)(CONFIG_SYS_FSL_OCRAM_BASE + 0x3000);
> +
> +
> +       level0_table[0] =
> +               (u64)level1_table_0 | PMD_TYPE_TABLE;
> +       level0_table[1] =
> +               (u64)level1_table_1 | PMD_TYPE_TABLE;
> +
> +       /*
> +        * set level 1 table 0 to cache_inhibit, covering 0 to 512GB
> +        * set level 1 table 1 to cache enabled, covering 512GB to 1TB
> +        * set level 2 table to cache-inhibit, covering 0 to 1GB
> +        */
> +       section_l1t0 = 0;
> +       section_l1t1 = EARLY_BLOCK_SIZE_L0;
> +       section_l2 = 0;
> +       for (i = 0; i < 512; i++) {
> +               set_pgtable_section(level1_table_0, i, section_l1t0,
> +                                   MT_DEVICE_NGNRNE);
> +               set_pgtable_section(level1_table_1, i, section_l1t1,
> +                                   MT_NORMAL);
> +               set_pgtable_section(level2_table, i, section_l2,
> +                                   MT_DEVICE_NGNRNE);
> +               section_l1t0 += EARLY_BLOCK_SIZE_L1;
> +               section_l1t1 += EARLY_BLOCK_SIZE_L1;
> +               section_l2 += EARLY_BLOCK_SIZE_L2;
> +       }
> +
> +       level1_table_0[0] =
> +               (u64)level2_table | PMD_TYPE_TABLE;
> +       level1_table_0[1] =
> +               0x40000000 | PMD_SECT_AF | PMD_TYPE_SECT |
> +               PMD_ATTRINDX(MT_DEVICE_NGNRNE);
> +       level1_table_0[2] =
> +               0x80000000 | PMD_SECT_AF | PMD_TYPE_SECT |
> +               PMD_ATTRINDX(MT_NORMAL);
> +       level1_table_0[3] =
> +               0xc0000000 | PMD_SECT_AF | PMD_TYPE_SECT |
> +               PMD_ATTRINDX(MT_NORMAL);
> +
> +       /* Rewrite table to enable cache */
> +       set_pgtable_section(level2_table,
> +                           CONFIG_SYS_FSL_OCRAM_BASE >> EARLY_SECTION_SHIFT_L2,
> +                           CONFIG_SYS_FSL_OCRAM_BASE,
> +                           MT_NORMAL);
> +       for (i = CONFIG_SYS_IFC_BASE >> EARLY_SECTION_SHIFT_L2;
> +            i < (CONFIG_SYS_IFC_BASE + CONFIG_SYS_IFC_SIZE)
> +            >> EARLY_SECTION_SHIFT_L2; i++) {
> +               section_l2 = i << EARLY_SECTION_SHIFT_L2;
> +               set_pgtable_section(level2_table, i,
> +                                   section_l2, MT_NORMAL);
> +       }
> +
> +       el = current_el();

We really can't have u-boot running at random ELs in v8 for different
platforms. It's a mess on v7. You should never be at EL3. u-boot could
be defined to run at EL1, but then you need to be able to go back to
EL2 to boot the kernel. So really u-boot should always run at EL2
unless you are running in a VM, but that would be a different
platform.

> +       if (el == 1) {
> +               asm volatile("dsb sy;isb");
> +               asm volatile("msr ttbr0_el1, %0"
> +                            : : "r" ((u64)level0_table) : "memory");
> +               asm volatile("msr tcr_el1, %0"
> +                            : : "r" (EARLY_TCR) : "memory");
> +               asm volatile("msr mair_el1, %0"
> +                            : : "r" (MEMORY_ATTRIBUTES) : "memory");

These should all be inline functions or macros.

> +       } else if (el == 2) {
> +               asm volatile("dsb sy;isb");
> +               asm volatile("msr ttbr0_el2, %0"
> +                            : : "r" ((u64)level0_table) : "memory");
> +               asm volatile("msr tcr_el2, %0"
> +                            : : "r" (EARLY_TCR) : "memory");
> +               asm volatile("msr mair_el2, %0"
> +                            : : "r" (MEMORY_ATTRIBUTES) : "memory");
> +       } else if (el == 3) {
> +               asm volatile("dsb sy;isb");
> +               asm volatile("msr ttbr0_el3, %0"
> +                            : : "r" ((u64)level0_table) : "memory");
> +               asm volatile("msr tcr_el3, %0"
> +                            : : "r" (EARLY_TCR) : "memory");
> +               asm volatile("msr mair_el3, %0"
> +                            : : "r" (MEMORY_ATTRIBUTES) : "memory");
> +       } else {
> +               hang();
> +       }
> +
> +       set_sctlr(get_sctlr() | CR_M);
> +}
> +
> +static inline void final_mmu_setup(void)

Looks like nearly the same code repeated...

> +{
> +       int el;
> +       u64 i, tbl_base, tbl_limit, section_base;
> +       u64 section_l1t0, section_l1t1, section_l2;
> +       u64 *level0_table = (u64 *)gd->arch.tlb_addr;
> +       u64 *level1_table_0 = (u64 *)(gd->arch.tlb_addr + 0x1000);
> +       u64 *level1_table_1 = (u64 *)(gd->arch.tlb_addr + 0x2000);
> +       u64 *level2_table_0 = (u64 *)(gd->arch.tlb_addr + 0x3000);
> +       u64 *level2_table_1 = (u64 *)(gd->arch.tlb_addr + 0x4000);
> +
> +
> +       level0_table[0] =
> +               (u64)level1_table_0 | PMD_TYPE_TABLE;
> +       level0_table[1] =
> +               (u64)level1_table_1 | PMD_TYPE_TABLE;
> +
> +       /*
> +        * set level 1 table 0 to cache_inhibit, covering 0 to 512GB
> +        * set level 1 table 1 to cache enabled, covering 512GB to 1TB
> +        * set level 2 table 0 to cache-inhibit, covering 0 to 1GB
> +        */
> +       section_l1t0 = 0;
> +       section_l1t1 = FINAL_BLOCK_SIZE_L0;
> +       section_l2 = 0;
> +       for (i = 0; i < 512; i++) {
> +               set_pgtable_section(level1_table_0, i, section_l1t0,
> +                                   MT_DEVICE_NGNRNE);
> +               set_pgtable_section(level1_table_1, i, section_l1t1,
> +                                   MT_NORMAL);
> +               set_pgtable_section(level2_table_0, i, section_l2,
> +                                   MT_DEVICE_NGNRNE);
> +               section_l1t0 += FINAL_BLOCK_SIZE_L1;
> +               section_l1t1 += FINAL_BLOCK_SIZE_L1;
> +               section_l2 += FINAL_BLOCK_SIZE_L2;
> +       }
> +
> +       level1_table_0[0] =
> +               (u64)level2_table_0 | PMD_TYPE_TABLE;
> +       level1_table_0[2] =
> +               0x80000000 | PMD_SECT_AF | PMD_TYPE_SECT |
> +               PMD_ATTRINDX(MT_NORMAL);
> +       level1_table_0[3] =
> +               0xc0000000 | PMD_SECT_AF | PMD_TYPE_SECT |
> +               PMD_ATTRINDX(MT_NORMAL);
> +
> +       /* Rewrite table to enable cache */
> +       set_pgtable_section(level2_table_0,
> +                           CONFIG_SYS_FSL_OCRAM_BASE >> FINAL_SECTION_SHIFT_L2,
> +                           CONFIG_SYS_FSL_OCRAM_BASE,
> +                           MT_NORMAL);
> +
> +       /*
> +        * Fill in other part of tables if cache is needed
> +        * If finer granularity than 1GB is needed, sub table
> +        * should be created.
> +        */
> +       section_base = FINAL_QBMAN_CACHED_MEM & ~(FINAL_BLOCK_SIZE_L1 - 1);
> +       i = section_base >> FINAL_SECTION_SHIFT_L1;
> +       level1_table_0[i] = (u64)level2_table_1 | PMD_TYPE_TABLE;
> +       section_l2 = section_base;
> +       for (i = 0; i < 512; i++) {
> +               set_pgtable_section(level2_table_1, i, section_l2,
> +                                   MT_DEVICE_NGNRNE);
> +               section_l2 += FINAL_BLOCK_SIZE_L2;
> +       }
> +       tbl_base = FINAL_QBMAN_CACHED_MEM & (FINAL_BLOCK_SIZE_L1 - 1);
> +       tbl_limit = (FINAL_QBMAN_CACHED_MEM + FINAL_QBMAN_CACHED_SIZE) &
> +                   (FINAL_BLOCK_SIZE_L1 - 1);
> +       for (i = tbl_base >> FINAL_SECTION_SHIFT_L2;
> +            i < tbl_limit >> FINAL_SECTION_SHIFT_L2; i++) {
> +               section_l2 = section_base + (i << FINAL_SECTION_SHIFT_L2);
> +               set_pgtable_section(level2_table_1, i,
> +                                   section_l2, MT_NORMAL);
> +       }
> +
> +       el = current_el();
> +       if (el == 1) {
> +               asm volatile("dsb sy;isb");
> +               asm volatile("msr ttbr0_el1, %0"
> +                            : : "r" ((u64)level0_table) : "memory");
> +               asm volatile("msr tcr_el1, %0"
> +                            : : "r" (FINAL_TCR) : "memory");
> +               asm volatile("msr mair_el1, %0"
> +                            : : "r" (MEMORY_ATTRIBUTES) : "memory");
> +       } else if (el == 2) {
> +               asm volatile("dsb sy;isb");
> +               asm volatile("msr ttbr0_el2, %0"
> +                            : : "r" ((u64)level0_table) : "memory");
> +               asm volatile("msr tcr_el2, %0"
> +                            : : "r" (FINAL_TCR) : "memory");
> +               asm volatile("msr mair_el2, %0"
> +                            : : "r" (MEMORY_ATTRIBUTES) : "memory");
> +       } else if (el == 3) {
> +               asm volatile("dsb sy;isb");
> +               asm volatile("msr ttbr0_el3, %0"
> +                            : : "r" ((u64)level0_table) : "memory");
> +               asm volatile("msr tcr_el3, %0"
> +                            : : "r" (FINAL_TCR) : "memory");
> +               asm volatile("msr mair_el3, %0"
> +                            : : "r" (MEMORY_ATTRIBUTES) : "memory");
> +       } else {
> +               hang();
> +       }
> +
> +       set_sctlr(get_sctlr() | CR_M);
> +}
> +
> +int arch_cpu_init(void)
> +{
> +       icache_enable();
> +       __asm_invalidate_dcache_all();
> +       __asm_invalidate_tlb_all();
> +       early_mmu_setup();
> +       set_sctlr(get_sctlr() | CR_C);
> +       return 0;
> +}
> +
> +/*
> + * flush_l3_cache
> + * Dickens L3 cache can be flushed by transitioning from FAM to SFONLY power
> + * state, by writing to HP-F P-state request register.

Other SOCs will have Dickens. Are these registers FSL specific? If
not, this should be common.

Also, I believe the proper way to flush Dickens is using the
architected cache flushing method where you walk the levels out to
level 3.

> + */
> +#define HNF0_PSTATE_REQ 0x04200010
> +#define HNF1_PSTATE_REQ 0x04210010
> +#define HNF2_PSTATE_REQ 0x04220010
> +#define HNF3_PSTATE_REQ 0x04230010
> +#define HNF4_PSTATE_REQ 0x04240010
> +#define HNF5_PSTATE_REQ 0x04250010
> +#define HNF6_PSTATE_REQ 0x04260010
> +#define HNF7_PSTATE_REQ 0x04270010
> +#define HNFPSTAT_MASK (0xFFFFFFFFFFFFFFFC)
> +#define HNFPSTAT_FAM   0x3
> +#define HNFPSTAT_SFONLY 0x01
> +
> +static void hnf_pstate_req(u64 *ptr, u64 state)
> +{
> +       int timeout = 1000;
> +       out_le64(ptr, (in_le64(ptr) & HNFPSTAT_MASK) | (state & 0x3));
> +       ptr++;
> +       /* checking if the transition is completed */
> +       while (timeout > 0) {
> +               if (((in_le64(ptr) & 0x0c) >> 2) == (state & 0x3))
> +                       break;
> +               udelay(100);
> +               timeout--;
> +       }
> +}
> +
> +void flush_l3_cache(void)
> +{
> +       hnf_pstate_req((u64 *)HNF0_PSTATE_REQ, HNFPSTAT_SFONLY);
> +       hnf_pstate_req((u64 *)HNF1_PSTATE_REQ, HNFPSTAT_SFONLY);
> +       hnf_pstate_req((u64 *)HNF2_PSTATE_REQ, HNFPSTAT_SFONLY);
> +       hnf_pstate_req((u64 *)HNF3_PSTATE_REQ, HNFPSTAT_SFONLY);
> +       hnf_pstate_req((u64 *)HNF4_PSTATE_REQ, HNFPSTAT_SFONLY);
> +       hnf_pstate_req((u64 *)HNF5_PSTATE_REQ, HNFPSTAT_SFONLY);
> +       hnf_pstate_req((u64 *)HNF6_PSTATE_REQ, HNFPSTAT_SFONLY);
> +       hnf_pstate_req((u64 *)HNF7_PSTATE_REQ, HNFPSTAT_SFONLY);
> +       hnf_pstate_req((u64 *)HNF0_PSTATE_REQ, HNFPSTAT_FAM);
> +       hnf_pstate_req((u64 *)HNF1_PSTATE_REQ, HNFPSTAT_FAM);
> +       hnf_pstate_req((u64 *)HNF2_PSTATE_REQ, HNFPSTAT_FAM);
> +       hnf_pstate_req((u64 *)HNF3_PSTATE_REQ, HNFPSTAT_FAM);
> +       hnf_pstate_req((u64 *)HNF4_PSTATE_REQ, HNFPSTAT_FAM);
> +       hnf_pstate_req((u64 *)HNF5_PSTATE_REQ, HNFPSTAT_FAM);
> +       hnf_pstate_req((u64 *)HNF6_PSTATE_REQ, HNFPSTAT_FAM);
> +       hnf_pstate_req((u64 *)HNF7_PSTATE_REQ, HNFPSTAT_FAM);
> +}
> +
> +/*
> + * This function is called from lib/board.c.
> + * It recreates MMU table in main memory. MMU and d-cache are enabled earlier.
> + * There is no need to disable d-cache for this operation.
> + */
> +void enable_caches(void)
> +{
> +       final_mmu_setup();
> +       flush_dcache_range(gd->arch.tlb_addr,
> +                          gd->arch.tlb_addr +  gd->arch.tlb_size);
> +       __asm_invalidate_tlb_all();
> +}
> +#endif
> +
> +static inline u32 init_type(u32 cluster, int init_id)

init_type? That's a great name.

> +{
> +       struct ccsr_gur *gur = (void *)(CONFIG_SYS_FSL_GUTS_ADDR);
> +       u32 idx = (cluster >> (init_id * 8)) & TP_CLUSTER_INIT_MASK;
> +       u32 type = in_le32(&gur->tp_ityp[idx]);
> +
> +       if (type & TP_ITYP_AV)
> +               return type;
> +
> +       return 0;
> +}
> +
> +u32 cpu_mask(void)
> +{
> +       struct ccsr_gur __iomem *gur = (void *)(CONFIG_SYS_FSL_GUTS_ADDR);
> +       int i = 0, count = 0;
> +       u32 cluster, type, mask = 0;
> +
> +       do {
> +               int j;
> +               cluster = in_le32(&gur->tp_cluster[i].lower);
> +               for (j = 0; j < TP_INIT_PER_CLUSTER; j++) {
> +                       type = init_type(cluster, j);
> +                       if (type) {
> +                               if (TP_ITYP_TYPE(type) == TP_ITYP_TYPE_ARM)
> +                                       mask |= 1 << count;
> +                               count++;
> +                       }
> +               }
> +               i++;
> +       } while ((cluster & TP_CLUSTER_EOC) != TP_CLUSTER_EOC);
> +
> +       return mask;
> +}
> +
> +/*
> + * Return the number of cores on this SOC.
> + */
> +int cpu_numcores(void)
> +{
> +       return hweight32(cpu_mask());
> +}
> +
> +int fsl_qoriq_core_to_cluster(unsigned int core)
> +{
> +       struct ccsr_gur __iomem *gur =
> +               (void __iomem *)(CONFIG_SYS_FSL_GUTS_ADDR);
> +       int i = 0, count = 0;
> +       u32 cluster;
> +
> +       do {
> +               int j;
> +               cluster = in_le32(&gur->tp_cluster[i].lower);
> +               for (j = 0; j < TP_INIT_PER_CLUSTER; j++) {
> +                       if (init_type(cluster, j)) {
> +                               if (count == core)
> +                                       return i;
> +                               count++;
> +                       }
> +               }
> +               i++;
> +       } while ((cluster & TP_CLUSTER_EOC) != TP_CLUSTER_EOC);
> +
> +       return -1;      /* cannot identify the cluster */
> +}
> +
> +u32 fsl_qoriq_core_to_type(unsigned int core)
> +{
> +       struct ccsr_gur __iomem *gur =
> +               (void __iomem *)(CONFIG_SYS_FSL_GUTS_ADDR);
> +       int i = 0, count = 0;
> +       u32 cluster, type;
> +
> +       do {
> +               int j;
> +               cluster = in_le32(&gur->tp_cluster[i].lower);
> +               for (j = 0; j < TP_INIT_PER_CLUSTER; j++) {
> +                       type = init_type(cluster, j);
> +                       if (type) {
> +                               if (count == core)
> +                                       return type;
> +                               count++;
> +                       }
> +               }
> +               i++;
> +       } while ((cluster & TP_CLUSTER_EOC) != TP_CLUSTER_EOC);
> +
> +       return -1;      /* cannot identify the cluster */
> +}

Do you plan on supporting PSCI because all this core and cluster stuff
belongs there.


> +
> +#ifdef CONFIG_DISPLAY_CPUINFO
> +int print_cpuinfo(void)
> +{
> +       struct sys_info sysinfo;
> +       char buf[32];
> +       unsigned int i, core;
> +       u32 type;
> +
> +       get_sys_info(&sysinfo);
> +       puts("Clock Configuration:");
> +       for_each_cpu(i, core, cpu_numcores(), cpu_mask()) {
> +               if (!(i % 3))
> +                       puts("\n       ");
> +               type = TP_ITYP_VER(fsl_qoriq_core_to_type(core));
> +               printf("CPU%d(%s):%-4s MHz  ", core,
> +                      type == TY_ITYP_VER_A7 ? "A7 " :
> +                      (type == TY_ITYP_VER_A53 ? "A53" :
> +                       (type == TY_ITYP_VER_A57 ? "A57" : "   ")),
> +                      strmhz(buf, sysinfo.freq_processor[core]));
> +       }
> +       printf("\n       Bus:      %-4s MHz  ",
> +              strmhz(buf, sysinfo.freq_systembus));
> +       printf("DDR:      %-4s MHz", strmhz(buf, sysinfo.freq_ddrbus));
> +       puts("\n");
> +
> +       return 0;
> +}
> +#endif
> diff --git a/arch/arm/cpu/armv8/fsl-lsch3/cpu.h b/arch/arm/cpu/armv8/fsl-lsch3/cpu.h
> new file mode 100644
> index 0000000..28544d7
> --- /dev/null
> +++ b/arch/arm/cpu/armv8/fsl-lsch3/cpu.h
> @@ -0,0 +1,7 @@
> +/*
> + * Copyright 2014, Freescale Semiconductor
> + *
> + * SPDX-License-Identifier:    GPL-2.0+
> + */
> +
> +int fsl_qoriq_core_to_cluster(unsigned int core);
> diff --git a/arch/arm/cpu/armv8/fsl-lsch3/lowlevel.S b/arch/arm/cpu/armv8/fsl-lsch3/lowlevel.S
> new file mode 100644
> index 0000000..087d5d1
> --- /dev/null
> +++ b/arch/arm/cpu/armv8/fsl-lsch3/lowlevel.S
> @@ -0,0 +1,65 @@
> +/*
> + * (C) Copyright 2014 Freescale Semiconductor
> + *
> + * SPDX-License-Identifier:    GPL-2.0+
> + *
> + * Extracted from armv8/start.S
> + */
> +
> +#include <config.h>
> +#include <linux/linkage.h>
> +#include <asm/macro.h>
> +
> +ENTRY(lowlevel_init)
> +       /* Initialize GIC Secure Bank Status */
> +       mov     x29, lr                 /* Save LR */
> +
> +       /* Set the SMMU page size in the sACR register */
> +       ldr     x1, =SMMU_BASE
> +       ldr     w0, [x1, #0x10]
> +       orr     w0, w0, #1 << 16  /* set sACR.pagesize to indicate 64K page */
> +       str     w0, [x1, #0x10]
> +
> +#if defined(CONFIG_GICV2) || defined(CONFIG_GICV3)

You can have either v2 or v3?

> +       branch_if_slave x0, 1f
> +       ldr     x0, =GICD_BASE
> +       bl      gic_init_secure
> +1:
> +#if defined(CONFIG_GICV3)
> +       ldr     x0, =GICR_BASE
> +       bl      gic_init_secure_percpu
> +#elif defined(CONFIG_GICV2)
> +       ldr     x0, =GICD_BASE
> +       ldr     x1, =GICC_BASE
> +       bl      gic_init_secure_percpu
> +#endif
> +#endif
> +
> +       branch_if_master x0, x1, 1f
> +
> +       /*
> +        * Slave should wait for master clearing spin table.
> +        * This sync prevents slaves observing incorrect
> +        * values in the spin table and jumping to the wrong place.
> +        */
> +#if defined(CONFIG_GICV2) || defined(CONFIG_GICV3)
> +#ifdef CONFIG_GICV2
> +       ldr     x0, =GICC_BASE
> +#endif
> +       bl      gic_wait_for_interrupt
> +#endif
> +
> +       /*
> +        * All processors will enter EL2 and optionally EL1.
> +        */
> +       bl      armv8_switch_to_el2
> +#ifdef CONFIG_ARMV8_SWITCH_TO_EL1
> +       bl      armv8_switch_to_el1
> +#endif
> +       b       2f

This all looks like cut and paste from existing startup code. Can't
you refactor things?

> +
> +1:
> +2:
> +       mov     lr, x29                 /* Restore LR */
> +       ret
> +ENDPROC(lowlevel_init)
> diff --git a/arch/arm/cpu/armv8/fsl-lsch3/speed.c b/arch/arm/cpu/armv8/fsl-lsch3/speed.c
> new file mode 100644
> index 0000000..dc4a34b
> --- /dev/null
> +++ b/arch/arm/cpu/armv8/fsl-lsch3/speed.c
> @@ -0,0 +1,176 @@
> +/*
> + * Copyright 2014, Freescale Semiconductor, Inc.
> + *
> + * SPDX-License-Identifier:    GPL-2.0+
> + *
> + * Derived from arch/power/cpu/mpc85xx/speed.c
> + */
> +
> +#include <common.h>
> +#include <linux/compiler.h>
> +#include <fsl_ifc.h>
> +#include <asm/processor.h>
> +#include <asm/io.h>
> +#include <asm/arch-fsl-lsch3/immap_lsch3.h>
> +#include <asm/arch/clock.h>
> +#include "cpu.h"
> +
> +DECLARE_GLOBAL_DATA_PTR;
> +
> +#ifndef CONFIG_SYS_FSL_NUM_CC_PLLS
> +#define CONFIG_SYS_FSL_NUM_CC_PLLS     6
> +#endif
> +
> +
> +void get_sys_info(struct sys_info *sys_info)
> +{
> +       struct ccsr_gur __iomem *gur = (void *)(CONFIG_SYS_FSL_GUTS_ADDR);
> +#ifdef CONFIG_FSL_IFC
> +       struct fsl_ifc *ifc_regs = (void *)CONFIG_SYS_IFC_ADDR;
> +       u32 ccr;
> +#endif
> +       struct ccsr_clk_cluster_group __iomem *clk_grp[2] = {
> +               (void *)(CONFIG_SYS_FSL_CH3_CLK_GRPA_ADDR),
> +               (void *)(CONFIG_SYS_FSL_CH3_CLK_GRPB_ADDR)
> +       };
> +       struct ccsr_clk_ctrl __iomem *clk_ctrl =
> +               (void *)(CONFIG_SYS_FSL_CH3_CLK_CTRL_ADDR);
> +       unsigned int cpu;
> +       const u8 core_cplx_pll[16] = {
> +               [0] = 0,        /* CC1 PPL / 1 */
> +               [1] = 0,        /* CC1 PPL / 2 */
> +               [2] = 0,        /* CC1 PPL / 4 */
> +               [4] = 1,        /* CC2 PPL / 1 */
> +               [5] = 1,        /* CC2 PPL / 2 */
> +               [6] = 1,        /* CC2 PPL / 4 */
> +               [8] = 2,        /* CC3 PPL / 1 */
> +               [9] = 2,        /* CC3 PPL / 2 */
> +               [10] = 2,       /* CC3 PPL / 4 */
> +               [12] = 3,       /* CC4 PPL / 1 */
> +               [13] = 3,       /* CC4 PPL / 2 */
> +               [14] = 3,       /* CC4 PPL / 4 */
> +       };
> +
> +       const u8 core_cplx_pll_div[16] = {
> +               [0] = 1,        /* CC1 PPL / 1 */
> +               [1] = 2,        /* CC1 PPL / 2 */
> +               [2] = 4,        /* CC1 PPL / 4 */
> +               [4] = 1,        /* CC2 PPL / 1 */
> +               [5] = 2,        /* CC2 PPL / 2 */
> +               [6] = 4,        /* CC2 PPL / 4 */
> +               [8] = 1,        /* CC3 PPL / 1 */
> +               [9] = 2,        /* CC3 PPL / 2 */
> +               [10] = 4,       /* CC3 PPL / 4 */
> +               [12] = 1,       /* CC4 PPL / 1 */
> +               [13] = 2,       /* CC4 PPL / 2 */
> +               [14] = 4,       /* CC4 PPL / 4 */
> +       };
> +
> +       uint i, cluster;
> +       uint freq_c_pll[CONFIG_SYS_FSL_NUM_CC_PLLS];
> +       uint ratio[CONFIG_SYS_FSL_NUM_CC_PLLS];
> +       unsigned long sysclk = CONFIG_SYS_CLK_FREQ;
> +       int cc_group[12] = CONFIG_SYS_FSL_CLUSTER_CLOCKS;
> +       u32 c_pll_sel, cplx_pll;
> +       void *offset;
> +
> +       sys_info->freq_systembus = sysclk;
> +#ifdef CONFIG_DDR_CLK_FREQ
> +       sys_info->freq_ddrbus = CONFIG_DDR_CLK_FREQ;
> +#else
> +       sys_info->freq_ddrbus = sysclk;
> +#endif
> +
> +       sys_info->freq_systembus *= (in_le32(&gur->rcwsr[0]) >>
> +                       FSL_CHASSIS3_RCWSR0_SYS_PLL_RAT_SHIFT) &
> +                       FSL_CHASSIS3_RCWSR0_SYS_PLL_RAT_MASK;
> +       sys_info->freq_ddrbus *= (in_le32(&gur->rcwsr[0]) >>
> +                       FSL_CHASSIS3_RCWSR0_MEM_PLL_RAT_SHIFT) &
> +                       FSL_CHASSIS3_RCWSR0_MEM_PLL_RAT_MASK;
> +
> +       for (i = 0; i < CONFIG_SYS_FSL_NUM_CC_PLLS; i++) {
> +               /*
> +                * fixme: prefer to combine the following into one line, but
> +                * cannot pass compiling without warning about in_le32.
> +                */
> +               offset = (void *)((size_t)clk_grp[i/3] +
> +                        offsetof(struct ccsr_clk_cluster_group,
> +                                 pllngsr[i%3].gsr));
> +               ratio[i] = (in_le32(offset) >> 1) & 0x3f;
> +               if (ratio[i] > 4)
> +                       freq_c_pll[i] = sysclk * ratio[i];
> +               else
> +                       freq_c_pll[i] = sys_info->freq_systembus * ratio[i];
> +       }
> +
> +       for_each_cpu(i, cpu, cpu_numcores(), cpu_mask()) {
> +               cluster = fsl_qoriq_core_to_cluster(cpu);
> +               c_pll_sel = (in_le32(&clk_ctrl->clkcncsr[cluster].csr) >> 27)
> +                           & 0xf;
> +               cplx_pll = core_cplx_pll[c_pll_sel];
> +               cplx_pll += cc_group[cluster] - 1;
> +               sys_info->freq_processor[cpu] =
> +                       freq_c_pll[cplx_pll] / core_cplx_pll_div[c_pll_sel];
> +       }
> +
> +#if defined(CONFIG_FSL_IFC)
> +       ccr = in_le32(&ifc_regs->ifc_ccr);
> +       ccr = ((ccr & IFC_CCR_CLK_DIV_MASK) >> IFC_CCR_CLK_DIV_SHIFT) + 1;
> +
> +       sys_info->freq_localbus = sys_info->freq_systembus / ccr;
> +#endif
> +}
> +
> +
> +int get_clocks(void)
> +{
> +       struct sys_info sys_info;
> +       get_sys_info(&sys_info);
> +       gd->cpu_clk = sys_info.freq_processor[0];
> +       gd->bus_clk = sys_info.freq_systembus;
> +       gd->mem_clk = sys_info.freq_ddrbus;
> +
> +#if defined(CONFIG_FSL_ESDHC)
> +       gd->arch.sdhc_clk = gd->bus_clk / 2;
> +#endif /* defined(CONFIG_FSL_ESDHC) */
> +
> +       if (gd->cpu_clk != 0)
> +               return 0;
> +       else
> +               return 1;
> +}
> +
> +/********************************************
> + * get_bus_freq
> + * return system bus freq in Hz
> + *********************************************/
> +ulong get_bus_freq(ulong dummy)
> +{
> +       if (!gd->bus_clk)
> +               get_clocks();
> +
> +       return gd->bus_clk;
> +}
> +
> +/********************************************
> + * get_ddr_freq
> + * return ddr bus freq in Hz
> + *********************************************/
> +ulong get_ddr_freq(ulong dummy)
> +{
> +       if (!gd->mem_clk)
> +               get_clocks();
> +
> +       return gd->mem_clk;
> +}
> +
> +unsigned int mxc_get_clock(enum mxc_clock clk)
> +{
> +       switch (clk) {
> +       case MXC_I2C_CLK:
> +               return get_bus_freq(0) / 2;
> +       default:
> +               printf("Unsupported clock\n");
> +       }
> +       return 0;
> +}
> diff --git a/arch/arm/cpu/armv8/fsl-lsch3/speed.h b/arch/arm/cpu/armv8/fsl-lsch3/speed.h
> new file mode 100644
> index 0000000..15af5b9
> --- /dev/null
> +++ b/arch/arm/cpu/armv8/fsl-lsch3/speed.h
> @@ -0,0 +1,7 @@
> +/*
> + * Copyright 2014, Freescale Semiconductor, Inc.
> + *
> + * SPDX-License-Identifier:    GPL-2.0+
> + */
> +
> +void get_sys_info(struct sys_info *sys_info);
> diff --git a/arch/arm/cpu/armv8/fsl-lsch3/timer.c b/arch/arm/cpu/armv8/fsl-lsch3/timer.c
> new file mode 100644
> index 0000000..3adfa41
> --- /dev/null
> +++ b/arch/arm/cpu/armv8/fsl-lsch3/timer.c
> @@ -0,0 +1,62 @@
> +/*
> + * Copyright 2014, Freescale Semiconductor
> + *
> + * SPDX-License-Identifier:    GPL-2.0+
> + */
> +
> +#include <common.h>
> +#include <div64.h>
> +#include <linux/compiler.h>
> +
> +static inline u64 get_cntfrq(void)
> +{
> +       u64 cntfrq;
> +       asm volatile("mrs %0, cntfrq_el0" : "=r" (cntfrq));
> +       return cntfrq;
> +}
> +
> +static inline u64 tick_to_time(u64 tick)
> +{
> +       tick *= CONFIG_SYS_HZ;
> +       do_div(tick, get_cntfrq());
> +       return tick;
> +}
> +
> +static inline u64 time_to_tick(u64 time)
> +{
> +       time *= get_cntfrq();
> +       do_div(time, CONFIG_SYS_HZ);
> +       return time;
> +}
> +
> +static inline u64 us_to_tick(unsigned long long usec)
> +{
> +       usec = usec * get_cntfrq() + 999999;
> +       do_div(usec, 1000000);
> +
> +       return usec;
> +}
> +
> +u64 get_ticks(void)
> +{
> +       u64 cval;
> +
> +       asm volatile("isb;mrs %0, cntpct_el0" : "=r" (cval));
> +
> +       return cval;
> +}
> +
> +ulong get_timer(ulong base)
> +{
> +       return tick_to_time(get_ticks()) - base;
> +}
> +
> +void __udelay(unsigned long usec)
> +{
> +       u64 start, tmo;
> +
> +       start = get_ticks();
> +       tmo = us_to_tick(usec);
> +       while (get_ticks() < (start + tmo))
> +               ;
> +}

What's wrong with the existing arch timer code?

Rob


More information about the U-Boot mailing list