[U-Boot] [Patch v2 3/5] armv8/fsl-lsch3: Release secondary cores from boot hold off with Boot Page

Thu Aug 21 15:47:24 CEST 2014

Hi York,

I have mostly minor comments this time; this is looking pretty good.

On Tue, Aug 19, 2014 at 09:28:00PM +0100, York Sun wrote:
> Secondary cores need to be released from holdoff by boot release
> registers. With GPP bootrom, they can boot from main memory
> directly. Individual spin table is used for each core. If a single
> release address is needed, defining macro CONFIG_FSL_SMP_RELEASE_ALL
> will use the CPU_RELEASE_ADDR. Spin table and the boot page is reserved
> in device tree so OS won't overwrite.
>
> Signed-off-by: York Sun <yorksun at freescale.com>
> Signed-off-by: Arnab Basu <arnab.basu at freescale.com>
> ---
>  v2: Removed copying boot page. Use u-boot image as is in memory.
>      Added dealing with different size of addr_cell in device tree.
>      Added dealing with big- and little-endian.
>      Added flushing spin table after cpu_release().
>
>  arch/arm/cpu/armv8/fsl-lsch3/Makefile             |    2 +
>  arch/arm/cpu/armv8/fsl-lsch3/cpu.c                |   13 ++
>  arch/arm/cpu/armv8/fsl-lsch3/cpu.h                |    1 +
>  arch/arm/cpu/armv8/fsl-lsch3/fdt.c                |   56 +++++++
>  arch/arm/cpu/armv8/fsl-lsch3/lowlevel.S           |  128 ++++++++++++---
>  arch/arm/cpu/armv8/fsl-lsch3/mp.c                 |  172 +++++++++++++++++++++
>  arch/arm/cpu/armv8/fsl-lsch3/mp.h                 |   36 +++++
>  arch/arm/cpu/armv8/transition.S                   |   63 +-------
>  arch/arm/include/asm/arch-fsl-lsch3/config.h      |    3 +-
>  arch/arm/include/asm/arch-fsl-lsch3/immap_lsch3.h |   35 +++++
>  arch/arm/include/asm/macro.h                      |   92 +++++++++++
>  arch/arm/lib/gic_64.S                             |   10 +-
>  common/board_f.c                                  |    2 +-
>  13 files changed, 525 insertions(+), 88 deletions(-)
>  create mode 100644 arch/arm/cpu/armv8/fsl-lsch3/fdt.c
>  create mode 100644 arch/arm/cpu/armv8/fsl-lsch3/mp.c
>  create mode 100644 arch/arm/cpu/armv8/fsl-lsch3/mp.h
>
> diff --git a/arch/arm/cpu/armv8/fsl-lsch3/Makefile b/arch/arm/cpu/armv8/fsl-lsch3/Makefile
> index 9249537..f920eeb 100644
> --- a/arch/arm/cpu/armv8/fsl-lsch3/Makefile
> +++ b/arch/arm/cpu/armv8/fsl-lsch3/Makefile
> @@ -7,3 +7,5 @@
>  obj-y += cpu.o
>  obj-y += lowlevel.o
>  obj-y += speed.o
> +obj-$(CONFIG_MP) += mp.o
> +obj-$(CONFIG_OF_LIBFDT) += fdt.o
> diff --git a/arch/arm/cpu/armv8/fsl-lsch3/cpu.c b/arch/arm/cpu/armv8/fsl-lsch3/cpu.c
> index c129d03..47b947f 100644
> --- a/arch/arm/cpu/armv8/fsl-lsch3/cpu.c
> +++ b/arch/arm/cpu/armv8/fsl-lsch3/cpu.c
> @@ -11,6 +11,7 @@
>  #include <asm/io.h>
>  #include <asm/arch-fsl-lsch3/immap_lsch3.h>
>  #include "cpu.h"
> +#include "mp.h"
>  #include "speed.h"
>  #include <fsl_mc.h>
>
> @@ -434,3 +435,15 @@ int cpu_eth_init(bd_t *bis)
>  #endif
>         return error;
>  }
> +
> +
> +int arch_early_init_r(void)
> +{
> +       int rv;
> +       rv = fsl_lsch3_wake_seconday_cores();
> +
> +       if (rv)
> +               printf("Did not wake secondary cores\n");
> +
> +       return 0;
> +}
> diff --git a/arch/arm/cpu/armv8/fsl-lsch3/cpu.h b/arch/arm/cpu/armv8/fsl-lsch3/cpu.h
> index 28544d7..2e3312b 100644
> --- a/arch/arm/cpu/armv8/fsl-lsch3/cpu.h
> +++ b/arch/arm/cpu/armv8/fsl-lsch3/cpu.h
> @@ -5,3 +5,4 @@
>   */
>
>  int fsl_qoriq_core_to_cluster(unsigned int core);
> +u32 cpu_mask(void);
> diff --git a/arch/arm/cpu/armv8/fsl-lsch3/fdt.c b/arch/arm/cpu/armv8/fsl-lsch3/fdt.c
> new file mode 100644
> index 0000000..2dbcdcb
> --- /dev/null
> +++ b/arch/arm/cpu/armv8/fsl-lsch3/fdt.c
> @@ -0,0 +1,56 @@
> +/*
> + * Copyright 2014 Freescale Semiconductor, Inc.
> + *
> + * SPDX-License-Identifier:    GPL-2.0+
> + */
> +
> +#include <common.h>
> +#include <libfdt.h>
> +#include <fdt_support.h>
> +#include "mp.h"
> +
> +#ifdef CONFIG_MP
> +void ft_fixup_cpu(void *blob)
> +{
> +       int off;
> +       __maybe_unused u64 spin_tbl_addr = (u64)get_spin_tbl_addr();
> +       fdt32_t *reg;
> +       int addr_cells;
> +       u64 val;
> +       size_t *boot_code_size = &(__secondary_boot_code_size);
> +
> +       off = fdt_node_offset_by_prop_value(blob, -1, "device_type", "cpus", 4);

I didn't think /cpus had device_type = "cpus". I can't see any
instances in any DTs I have to hand. Can we not find /cpus by path?

> +       of_bus_default_count_cells(blob, off, &addr_cells, NULL);
> +
> +       off = fdt_node_offset_by_prop_value(blob, -1, "device_type", "cpu", 4);
> +       while (off != -FDT_ERR_NOTFOUND) {
> +               reg = (fdt32_t *)fdt_getprop(blob, off, "reg", 0);
> +               if (reg) {
> +                       val = spin_tbl_addr;
> +#ifndef CONFIG_FSL_SMP_RELEASE_ALL
> +                       val += id_to_core(of_read_number(reg, addr_cells))
> +                               * SPIN_TABLE_ELEM_SIZE;
> +#endif
> +                       val = cpu_to_fdt64(val);
> +                       fdt_setprop_string(blob, off, "enable-method",
> +                                          "spin-table");
> +                       fdt_setprop(blob, off, "cpu-release-addr",
> +                                   &val, sizeof(val));
> +               } else {
> +                       puts("cpu NULL\n");

Could we change that to "Warning: found cpu node without reg property"
or something like that which makes the problem clear?

> +               }
> +               off = fdt_node_offset_by_prop_value(blob, off, "device_type",
> +                                                   "cpu", 4);
> +       }
> +
> +       fdt_add_mem_rsv(blob, (uintptr_t)&secondary_boot_code,
> +                       *boot_code_size);
> +}
> +#endif
> +
> +void ft_cpu_setup(void *blob, bd_t *bd)
> +{
> +#ifdef CONFIG_MP
> +       ft_fixup_cpu(blob);
> +#endif
> +}
> diff --git a/arch/arm/cpu/armv8/fsl-lsch3/lowlevel.S b/arch/arm/cpu/armv8/fsl-lsch3/lowlevel.S
> index ad32b6c..629e6d4 100644
> --- a/arch/arm/cpu/armv8/fsl-lsch3/lowlevel.S
> +++ b/arch/arm/cpu/armv8/fsl-lsch3/lowlevel.S
> @@ -8,7 +8,9 @@
>
>  #include <config.h>
>  #include <linux/linkage.h>
> +#include <asm/gic.h>
>  #include <asm/macro.h>
> +#include "mp.h"
>
>  ENTRY(lowlevel_init)
>         mov     x29, lr                 /* Save LR */
> @@ -37,29 +39,119 @@ ENTRY(lowlevel_init)
>
>         branch_if_master x0, x1, 1f
>
> +       ldr     x0, =secondary_boot_func
> +       blr     x0
> +
> +1:
> +2:

Isn't the '2' label redundant?

> +       mov     lr, x29                 /* Restore LR */
> +       ret
> +ENDPROC(lowlevel_init)
> +
> +       /* Keep literals not used by the secondary boot code outside it */
> +       .ltorg
> +
> +       /* Using 64 bit alignment since the spin table is accessed as data */
> +       .align 4
> +       .global secondary_boot_code
> +       /* Secondary Boot Code starts here */
> +secondary_boot_code:
> +       .global __spin_table
> +__spin_table:
> +       .space CONFIG_MAX_CPUS*SPIN_TABLE_ELEM_SIZE
> +
> +       .align 2
> +ENTRY(secondary_boot_func)
>         /*
> -        * Slave should wait for master clearing spin table.
> -        * This sync prevent salves observing incorrect
> -        * value of spin table and jumping to wrong place.
> +        * MPIDR_EL1 Fields:
> +        * MPIDR[1:0] = AFF0_CPUID <- Core ID (0,1)
> +        * MPIDR[7:2] = AFF0_RES
> +        * MPIDR[15:8] = AFF1_CLUSTERID <- Cluster ID (0,1,2,3)
> +        * MPIDR[23:16] = AFF2_CLUSTERID
> +        * MPIDR[24] = MT
> +        * MPIDR[29:25] = RES0
> +        * MPIDR[30] = U
> +        * MPIDR[31] = ME
> +        * MPIDR[39:32] = AFF3
> +        *
> +        * Linear Processor ID (LPID) calculation from MPIDR_EL1:
> +        * (We only use AFF0_CPUID and AFF1_CLUSTERID for now
> +        * until AFF2_CLUSTERID and AFF3 have non-zero values)
> +        *
> +        * LPID = MPIDR[15:8] | MPIDR[1:0]
>          */
> -#if defined(CONFIG_GICV2) || defined(CONFIG_GICV3)
> -#ifdef CONFIG_GICV2
> -       ldr     x0, =GICC_BASE
> +       mrs     x0, mpidr_el1
> +       ubfm    x1, x0, #8, #15
> +       ubfm    x2, x0, #0, #1
> +       orr     x10, x2, x1, lsl #2     /* x10 has LPID */
> +       ubfm    x9, x0, #0, #15         /* x9 contains MPIDR[15:0] */
> +       /*
> +        * offset of the spin table element for this core from start of spin
> +        * table (each elem is padded to 64 bytes)
> +        */
> +       lsl     x1, x10, #6
> +       ldr     x0, =__spin_table
> +       /* physical address of this cpus spin table element */
> +       add     x11, x1, x0
> +
> +       str     x9, [x11, #16]  /* LPID */
> +       mov     x4, #1
> +       str     x4, [x11, #8]   /* STATUS */
> +       dsb     sy
> +#if defined(CONFIG_GICV3)
> +       gic_wait_for_interrupt_m x0
>  #endif
> -       bl      gic_wait_for_interrupt

Why do we only need to wait for GICv3? We previously did so for GICv2.

> +
> +       bl secondary_switch_to_el2
> +#ifdef CONFIG_ARMV8_SWITCH_TO_EL1
> +       secondary_switch_to_el1
>  #endif

Missing bl?

Is there any reason to have the SWITCH_TO_EL1 option other than for
debugging?

EL2 is the preferred EL to boot at for Linux and Xen (it gives far more
flexibility), and if dropping to EL1 is necessary I think it would make
more sense as a run-time option than a compile-time option.

>
> -       /*
> -        * All processors will enter EL2 and optionally EL1.
> +slave_cpu:
> +       wfe
> +#ifdef CONFIG_FSL_SMP_RELEASE_ALL
> +       /* All cores are released from the address in the 1st spin table
> +        * element
>          */
> -       bl      armv8_switch_to_el2
> -#ifdef CONFIG_ARMV8_SWITCH_TO_EL1
> -       bl      armv8_switch_to_el1
> +       ldr     x1, =__spin_table
> +       ldr     x0, [x1]
> +#else
> +       ldr     x0, [x11]
> +#endif
> +       cbz     x0, slave_cpu

Similarly is there any reason to have the option of a single release
addr if we can support unique addresses?

[...]

> diff --git a/arch/arm/cpu/armv8/fsl-lsch3/mp.c b/arch/arm/cpu/armv8/fsl-lsch3/mp.c
> new file mode 100644
> index 0000000..b29eda9
> --- /dev/null
> +++ b/arch/arm/cpu/armv8/fsl-lsch3/mp.c
> @@ -0,0 +1,172 @@
> +/*
> + * Copyright 2014 Freescale Semiconductor, Inc.
> + *
> + * SPDX-License-Identifier:    GPL-2.0+
> + */
> +
> +#include <common.h>
> +#include <asm/io.h>
> +#include <asm/system.h>
> +#include <asm/io.h>
> +#include <asm/arch-fsl-lsch3/immap_lsch3.h>
> +#include "mp.h"
> +
> +DECLARE_GLOBAL_DATA_PTR;
> +
> +void *get_spin_tbl_addr(void)
> +{
> +       void *ptr = (void *)&__spin_table;
> +
> +       return ptr;
> +}

I don't think the cast is necessary here.  As far as I can see this
could be just:

void *get_spin_tbl_addr(void)
{
	return &__spin_table;
}

[...]

> diff --git a/arch/arm/include/asm/macro.h b/arch/arm/include/asm/macro.h
> index f77e4b8..0009c28 100644
> --- a/arch/arm/include/asm/macro.h
> +++ b/arch/arm/include/asm/macro.h
> @@ -105,6 +105,98 @@ lr .req    x30
>         cbz     \xreg1, \master_label
>  .endm
>
> +.macro armv8_switch_to_el2_m, xreg1
> +       mov     \xreg1, #0x5b1  /* Non-secure EL0/EL1 | HVC | 64bit EL2 */
> +       msr     scr_el3, \xreg1

When dropping to EL1 from EL2 we disable HVC via HCR_EL2; presumably due
to lack of a handler. Would it make sense to do similarly here and
disable SMC here until we have a user (e.g. PSCI)?

> +       msr     cptr_el3, xzr   /* Disable coprocessor traps to EL3 */
> +       mov     \xreg1, #0x33ff
> +       msr     cptr_el2, \xreg1        /* Disable coprocessor traps to EL2 */
> +
> +       /* Initialize SCTLR_EL2 */
> +       /*
> +        * setting RES1 bits (29,28,23,22,18,16,11,5,4) to 1
> +        * and RES0 bits (31,30,27,26,24,21,20,17,15-13,10-6) +
> +        * EE,WXN,I,SA,C,A,M to 0
> +        */
> +       mov     \xreg1, #0x0830
> +       movk    \xreg1, #0x30C5, lsl #16
> +       msr     sctlr_el2, \xreg1
> +
> +       /* Return to the EL2_SP2 mode from EL3 */
> +       mov     \xreg1, sp
> +       msr     sp_el2, \xreg1  /* Migrate SP */
> +       mrs     \xreg1, vbar_el3
> +       msr     vbar_el2, \xreg1        /* Migrate VBAR */
> +       mov     \xreg1, #0x3c9
> +       msr     spsr_el3, \xreg1        /* EL2_SP2 | D | A | I | F */
> +       msr     elr_el3, lr
> +       eret
> +.endm

Otherwise this looks fine to me.

> +.macro armv8_switch_to_el1_m, xreg1, xreg2
> +       /* Initialize Generic Timers */
> +       mrs     \xreg1, cnthctl_el2
> +       orr     \xreg1, \xreg1, #0x3    /* Enable EL1 access to timers */
> +       msr     cnthctl_el2, \xreg1
> +       msr     cntvoff_el2, xzr
> +
> +       /* Initilize MPID/MPIDR registers */
> +       mrs     \xreg1, midr_el1
> +       mrs     \xreg2, mpidr_el1
> +       msr     vpidr_el2, \xreg1
> +       msr     vmpidr_el2, \xreg2
> +
> +       /* Disable coprocessor traps */
> +       mov     \xreg1, #0x33ff
> +       msr     cptr_el2, \xreg1        /* Disable coprocessor traps to EL2 */
> +       msr     hstr_el2, xzr           /* Disable coprocessor traps to EL2 */
> +       mov     \xreg1, #3 << 20
> +       msr     cpacr_el1, \xreg1       /* Enable FP/SIMD at EL1 */
> +
> +       /* Initialize HCR_EL2 */
> +       mov     \xreg1, #(1 << 31)              /* 64bit EL1 */
> +       orr     \xreg1, \xreg1, #(1 << 29)      /* Disable HVC */
> +       msr     hcr_el2, \xreg1
> +
> +       /* SCTLR_EL1 initialization */
> +       /*
> +        * setting RES1 bits (29,28,23,22,20,11) to 1
> +        * and RES0 bits (31,30,27,21,17,13,10,6) +
> +        * UCI,EE,EOE,WXN,nTWE,nTWI,UCT,DZE,I,UMA,SED,ITD,
> +        * CP15BEN,SA0,SA,C,A,M to 0
> +        */
> +       mov     \xreg1, #0x0800
> +       movk    \xreg1, #0x30d0, lsl #16
> +       msr     sctlr_el1, \xreg1
> +
> +       /* Return to the EL1_SP1 mode from EL2 */
> +       mov     \xreg1, sp
> +       msr     sp_el1, \xreg1          /* Migrate SP */
> +       mrs     \xreg1, vbar_el2
> +       msr     vbar_el1, \xreg1        /* Migrate VBAR */
> +       mov     \xreg1, #0x3c5
> +       msr     spsr_el2, \xreg1        /* EL1_SP1 | D | A | I | F */
> +       msr     elr_el2, lr
> +       eret
> +.endm

This looks fine to me.

Thanks,
Mark.