[U-Boot] [Patch v1 2/4] armv8/fsl-lsch3: Release secondary cores from boot hold off with Boot Page

Fri Jul 4 14:31:01 CEST 2014

Hi York,

I spotted a couple of generic issues below. Most of these are issues
with the existing code that you happen to be moving around, rather than
with the new code this patch introduces.

There are a couple of gotchas around secondary startup that are painful
with the bootwrapper for arm64 at present, and I think that we can avoid
them by construction for U-Boot. More on that below.

On Fri, Jun 27, 2014 at 05:54:08PM +0100, York Sun wrote:
> Secondary cores need to be released from holdoff by boot release
> registers. With GPP bootrom, they can boot from main memory
> directly. Individual spin table is used for each core. If a single
> release address is needed, defining macro CONFIG_FSL_SMP_RELEASE_ALL
> will use the CPU_RELEASE_ADDR. Spin table and the boot page is reserved
> in device tree so OS won't overwrite.
> 
> Signed-off-by: York Sun <yorksun at freescale.com>
> Signed-off-by: Arnab Basu <arnab.basu at freescale.com>
> ---
> This set depends on this bundle http://patchwork.ozlabs.org/bundle/yorksun/armv8_fsl-lsch3/
> 
>  arch/arm/cpu/armv8/fsl-lsch3/Makefile             |    2 +
>  arch/arm/cpu/armv8/fsl-lsch3/cpu.c                |   13 ++
>  arch/arm/cpu/armv8/fsl-lsch3/cpu.h                |    1 +
>  arch/arm/cpu/armv8/fsl-lsch3/fdt.c                |   56 +++++++
>  arch/arm/cpu/armv8/fsl-lsch3/lowlevel.S           |  119 +++++++++++---
>  arch/arm/cpu/armv8/fsl-lsch3/mp.c                 |  171 +++++++++++++++++++++
>  arch/arm/cpu/armv8/fsl-lsch3/mp.h                 |   36 +++++
>  arch/arm/cpu/armv8/transition.S                   |   63 +-------
>  arch/arm/include/asm/arch-fsl-lsch3/config.h      |    3 +-
>  arch/arm/include/asm/arch-fsl-lsch3/immap_lsch3.h |   35 +++++
>  arch/arm/include/asm/macro.h                      |   81 ++++++++++
>  arch/arm/lib/gic_64.S                             |   10 +-
>  common/board_f.c                                  |    2 +-
>  13 files changed, 502 insertions(+), 90 deletions(-)
>  create mode 100644 arch/arm/cpu/armv8/fsl-lsch3/fdt.c
>  create mode 100644 arch/arm/cpu/armv8/fsl-lsch3/mp.c
>  create mode 100644 arch/arm/cpu/armv8/fsl-lsch3/mp.h

[...]

> diff --git a/arch/arm/cpu/armv8/fsl-lsch3/fdt.c b/arch/arm/cpu/armv8/fsl-lsch3/fdt.c
> new file mode 100644
> index 0000000..cd34e16
> --- /dev/null
> +++ b/arch/arm/cpu/armv8/fsl-lsch3/fdt.c
> @@ -0,0 +1,56 @@
> +/*
> + * Copyright 2014 Freescale Semiconductor, Inc.
> + *
> + * SPDX-License-Identifier:    GPL-2.0+
> + */
> +
> +#include <common.h>
> +#include <libfdt.h>
> +#include <fdt_support.h>
> +#include "mp.h"
> +
> +#ifdef CONFIG_MP
> +void ft_fixup_cpu(void *blob)
> +{
> +       int off;
> +       __maybe_unused u64 spin_tbl_addr = (u64)get_spin_tbl_addr();
> +       u64 *reg;
> +       u64 val;
> +
> +       off = fdt_node_offset_by_prop_value(blob, -1, "device_type", "cpu", 4);
> +       while (off != -FDT_ERR_NOTFOUND) {
> +               reg = (u64 *)fdt_getprop(blob, off, "reg", 0);
> +               if (reg) {
> +                       val = spin_tbl_addr;
> +#ifndef CONFIG_FSL_SMP_RELEASE_ALL
> +                       val += id_to_core(fdt64_to_cpu(*reg)) * SIZE_BOOT_ENTRY;

In Linux we read /cpus/#address-cells to determine the size of a
CPU's reg property (and have dts where this is 1 cell). Will the above
work for that?

> +#endif
> +                       val = cpu_to_fdt64(val);
> +                       fdt_setprop_string(blob, off, "enable-method",
> +                                          "spin-table");
> +                       fdt_setprop(blob, off, "cpu-release-addr",
> +                                   &val, sizeof(val));
> +               } else {
> +                       puts("cpu NULL\n");
> +               }
> +               off = fdt_node_offset_by_prop_value(blob, off, "device_type",
> +                                                   "cpu", 4);
> +       }
> +       /*
> +        * Boot page and spin table can be reserved here if not done staticlly
> +        * in device tree.
> +        *
> +        * fdt_add_mem_rsv(blob, bootpg,
> +        *                 *((u64 *)&(__secondary_boot_page_size)));
> +        * If defined CONFIG_FSL_SMP_RELEASE_ALL, the release address should
> +        * also be reserved.
> +        */

I think that this reservation should _always_ be added by U-Boot unless
specifically overridden.

A problem I had with the arm64 bootwrapper when adding PSCI support and
now (as I am moving stuff about) was that the DTS in the kernel tree had
a memreserve out-of-sync with what the wrapper actually needed. While I
can add a new reservation, I can't remove any in case they are for
something else, so I end up protecting too much, wasting memory.

Given that the reservation is to protect data which U-Boot is in control
of choosing the address for, I think the only sane thing to do is for
U-Boot to always add the reservation.

That way U-Boot can change and existing DTBs will just work. We won't
end up protecting too much or too little.

[...]

> @@ -119,3 +107,94 @@ ENTRY(lowlevel_init)
>         mov     lr, x29                 /* Restore LR */
>         ret
>  ENDPROC(lowlevel_init)
> +
> +       /* Keep literals not used by the secondary boot page outside it */
> +       .ltorg
> +
> +       .align 4

That looks like a small alignment for a page.

Should this be larger? Or is the "page" a misnomer here?

> +       .global secondary_boot_page
> +secondary_boot_page:
> +       .global __spin_table
> +__spin_table:
> +       .space CONFIG_MAX_CPUS*ENTRY_SIZE
> +
> +       .align 4
> +       /* Secondary Boot Page starts here */
> +ENTRY(secondary_boot_func)
> +       /*
> +        * PIR calculation from MPIDR_EL1

Sorry if I'm asking a stupid question, but what is "PIR"?

> +        * MPIDR[1:0] = AFF0_CPUID <- Core ID (0,1)
> +        * MPIDR[7:2] = AFF0_RES
> +        * MPIDR[15:8] = AFF1_CLUSTERID <- Cluster ID (0,1,2,3)
> +        * MPIDR[23:16] = AFF2_CLUSTERID
> +        * MPIDR[24] = MT
> +        * MPIDR[29:25] =RES

Could we say RES0 here? That would match the documentation in the ARM
ARM and make things a bit clearer.

Also, missing space after '='?

> +        * MPIDR[30] = U
> +        * MPIDR[31] = ME

My ARMv8 ARM ARM shows this as RES1, but appears to be
self-contradictory. I'll query this internally. I don't think that
matters here anyway.

> +        * MPIDR[39:32] = AFF3
> +        * We only use AFF0_CPUID and AFF1_CLUSTERID for now
> +        * until AFF2_CLUSTERID and AFF3 have non-zero values.
> +        */
> +       mrs     x0, mpidr_el1
> +       ubfm    x1, x0, #8, #15
> +       ubfm    x2, x0, #0, #1
> +       orr     x10, x2, x1, lsl #2     /* x10 has PIR */
> +       ubfm    x9, x0, #0, #15         /* w9 has 16-bit original PIR */
> +       lsl     x1, x10, #6     /* spin table is padded to 64 byte each core */
> +       ldr     x0, =(SECONDARY_CPU_BOOT_PAGE)
> +       ldr     x3, =__spin_table
> +       ldr     x4, =secondary_boot_page
> +       sub     x3, x3, x4
> +       add     x0, x0, x3
> +       add     x11, x1, x0
> +
> +       str     x9, [x11, #16]  /* ENTRY_PIR */
> +       mov     x4, #1
> +       str     x4, [x11]       /* ENTRY_ADDR */
> +       dsb     sy
> +       isb

What is the isb intended to synchronize?

Could we get comments on barriers? Even when coming back to code oneself
wrote it's easy to miss a subtlety as to why one is needed.

> +#if defined(CONFIG_GICV3)
> +       gic_wait_for_interrupt_m x0
> +#endif
> +
> +       bl secondary_switch_to_el2
> +#ifdef CONFIG_ARMV8_SWITCH_TO_EL1
> +       secondary_switch_to_el1
> +#endif
> +
> +slave_cpu:
> +       wfe
> +#ifdef CONFIG_FSL_SMP_RELEASE_ALL
> +       ldr     x1, =CPU_RELEASE_ADDR
> +       ldr     x0, [x1]
> +#else
> +       ldr     x0, [x11]
> +       tbnz    x0, #0, slave_cpu
> +#endif
> +       cbz     x0, slave_cpu
> +       br      x0                      /* branch to the given address */

Just to check, I take it CPUs won't ever be in a big-endian mode at this
point?

> +ENDPROC(secondary_boot_func)
> +
> +ENTRY(secondary_switch_to_el2)
> +       switch_el x0, 1f, 0f, 0f
> +0:     ret
> +1:     armv8_switch_to_el2_m x0
> +ENDPROC(secondary_switch_to_el2)
> +
> +ENTRY(secondary_switch_to_el1)
> +       switch_el x0, 0f, 1f, 0f
> +0:     ret
> +1:     armv8_switch_to_el1_m x0, x1
> +ENDPROC(secondary_switch_to_el1)
> +
> +       /* Ensure that the literals used by the secondary boot page are
> +        * assembled within it
> +        */
> +       .ltorg
> +
> +       .align 4

Similarly to above, this looks like a small alignment for a page.

> +       .globl __secondary_boot_page_size
> +       .type __secondary_boot_page_size, %object
> +       /* Secondary Boot Page ends here */
> +__secondary_boot_page_size:
> +       .quad .-secondary_boot_page

[...]

> +int fsl_lsch3_wake_seconday_cores(void)
> +{
> +       struct ccsr_gur __iomem *gur = (void *)(CONFIG_SYS_FSL_GUTS_ADDR);
> +       struct ccsr_reset __iomem *rst = (void *)(CONFIG_SYS_FSL_RST_ADDR);
> +       void *boot_loc = (void *)SECONDARY_CPU_BOOT_PAGE;
> +       size_t *boot_page_size = &(__secondary_boot_page_size);
> +       u32 cores, cpu_up_mask = 1;
> +       int i, timeout = 10;
> +       u64 *table = get_spin_tbl_addr();
> +
> +       cores = cpu_mask();
> +       memcpy(boot_loc, &secondary_boot_page, *boot_page_size);
> +       /* Clear spin table so that secondary processors
> +        * observe the correct value after waking up from wfe.
> +        */
> +       memset(table, 0, CONFIG_MAX_CPUS*ENTRY_SIZE);
> +       flush_dcache_range((unsigned long)boot_loc,
> +                          (unsigned long)boot_loc + *boot_page_size);
> +
> +       printf("Waking secondary cores to start from %lx\n", gd->relocaddr);
> +       out_le32(&gur->bootlocptrh, (u32)(gd->relocaddr >> 32));
> +       out_le32(&gur->bootlocptrl, (u32)gd->relocaddr);
> +       out_le32(&gur->scratchrw[6], 1);
> +       asm volatile("dsb st" : : : "memory");
> +       rst->brrl = cores;
> +       asm volatile("dsb st" : : : "memory");
> +
> +       /* fixme: this is only needed for the simulator because secnodary cores
> +        * start to run without waiting for boot release register, then enter
> +        * "wfe" before the scratch register is set above.
> +        */
> +       asm volatile("sev");

That feels a little dodgy; a number of things could generate an event
before we got here. Is there no way to block them until we've set that
up?

> +
> +       while (timeout--) {
> +               flush_dcache_range((unsigned long)table, (unsigned long)table +
> +                                  CONFIG_MAX_CPUS * 64);
> +               for (i = 1; i < CONFIG_MAX_CPUS; i++) {
> +                       if (table[i * NUM_BOOT_ENTRY + BOOT_ENTRY_ADDR])
> +                               cpu_up_mask |= 1 << i;
> +               }
> +               if (hweight32(cpu_up_mask) == hweight32(cores))
> +                       break;
> +               udelay(10);
> +       }

Surely we need this before we expect the CPUs to read the values in the
table?

Or have I misunderstood?

> +       if (timeout <= 0) {
> +               printf("Not all cores (0x%x) are up (0x%x)\n",
> +                      cores, cpu_up_mask);
> +               return 1;
> +       }
> +       printf("All (%d) cores are up.\n", hweight32(cores));
> +
> +       return 0;
> +}

[...]

> diff --git a/arch/arm/include/asm/macro.h b/arch/arm/include/asm/macro.h
> index f77e4b8..16ba76e 100644
> --- a/arch/arm/include/asm/macro.h
> +++ b/arch/arm/include/asm/macro.h
> @@ -105,6 +105,87 @@ lr .req    x30
>         cbz     \xreg1, \master_label
>  .endm
> 
> +.macro armv8_switch_to_el2_m, xreg1
> +       mov     \xreg1, #0x5b1  /* Non-secure EL0/EL1 | HVC | 64bit EL2 */
> +       msr     scr_el3, \xreg1
> +       msr     cptr_el3, xzr   /* Disable coprocessor traps to EL3 */
> +       mov     \xreg1, #0x33ff
> +       msr     cptr_el2, \xreg1        /* Disable coprocessor traps to EL2 */
> +
> +       /* Initialize SCTLR_EL2 */
> +       msr     sctlr_el2, xzr

What about the RES1 bits (e.g. bits 29 & 28)?

We don't seem to initialise them before the eret.

> +
> +       /* Return to the EL2_SP2 mode from EL3 */
> +       mov     \xreg1, sp
> +       msr     sp_el2, \xreg1  /* Migrate SP */
> +       mrs     \xreg1, vbar_el3
> +       msr     vbar_el2, \xreg1        /* Migrate VBAR */
> +       mov     x0, #0x3c9
> +       msr     spsr_el3, \xreg1        /* EL2_SP2 | D | A | I | F */
> +       msr     elr_el3, lr
> +       eret
> +.endm
> +
> +.macro armv8_switch_to_el1_m, xreg1, xreg2
> +       /* Initialize Generic Timers */
> +       mrs     \xreg1, cnthctl_el2
> +       orr     \xreg1, \xreg1, #0x3    /* Enable EL1 access to timers */
> +       msr     cnthctl_el2, \xreg1

Is there any reason this can't be set to a precise known value? This
currently leaves EVNTDIR and EVNTEN in UNKNOWN states (which could
differ across CPUs).

An EL1 OS can enable the event stream if it wants in CNTKCTL_EL1, so is
there any reason to enable it at EL2?

> +       msr     cntvoff_el2, \xreg1

Please initialize cntvoff_el2 using xzr. Due to the aforementioned
UNKNOWN bits, this could leave CPUs with differing view of time, and
there's no point differing in the first place.

An EL1 OS will not be able to fix this.

I fixed this elsewhere in  b924d586d70b (arm64: zero cntvoff_el2).

> +       mrs     \xreg1, cntkctl_el1
> +       orr     \xreg1, \xreg1, #0x3    /* Enable EL0 access to timers */
> +       msr     cntkctl_el1, \xreg1

Likewise this leaves many bits UNKNOWN and potentially differing across
CPUs, though the OS at EL1 should be able to fix this up (and Linux
will).

> +
> +       /* Initilize MPID/MPIDR registers */
> +       mrs     \xreg1, midr_el1
> +       mrs     \xreg2, mpidr_el1
> +       msr     vpidr_el2, \xreg1
> +       msr     vmpidr_el2, \xreg2
> +
> +       /* Disable coprocessor traps */
> +       mov     \xreg1, #0x33ff
> +       msr     cptr_el2, \xreg1        /* Disable coprocessor traps to EL2 */
> +       msr     hstr_el2, xzr           /* Disable coprocessor traps to EL2 */
> +       mov     \xreg1, #3 << 20
> +       msr     cpacr_el1, \xreg1       /* Enable FP/SIMD at EL1 */
> +
> +       /* Initialize HCR_EL2 */
> +       mov     \xreg1, #(1 << 31)              /* 64bit EL1 */
> +       orr     \xreg1, \xreg1, #(1 << 29)      /* Disable HVC */
> +       msr     hcr_el2, \xreg1
> +
> +       /* SCTLR_EL1 initialization */
> +       mov     \xreg1, #0x0800
> +       movk    \xreg1, #0x30d0, lsl #16
> +       msr     sctlr_el1, \xreg1

That doesn't seem to set up all the RES1 bits (e.g. bit 29).

> +
> +       /* Return to the EL1_SP1 mode from EL2 */
> +       mov     \xreg1, sp
> +       msr     sp_el1, \xreg1          /* Migrate SP */
> +       mrs     \xreg1, vbar_el2
> +       msr     vbar_el1, \xreg1        /* Migrate VBAR */
> +       mov     \xreg1, #0x3c5
> +       msr     spsr_el2, \xreg1        /* EL1_SP1 | D | A | I | F */
> +       msr     elr_el2, lr
> +       eret
> +.endm
> +
> +#if defined(CONFIG_GICV3)
> +.macro gic_wait_for_interrupt_m xreg1
> +0 :    wfi
> +       mrs     \xreg1, ICC_IAR1_EL1
> +       msr     ICC_EOIR1_EL1, \xreg1
> +       cbnz    \xreg1, 0b
> +.endm
> +#elif defined(CONFIG_GICV2)
> +.macro gic_wait_for_interrupt_m xreg1, wreg2
> +0 :    wfi
> +       ldr     \wreg2, [\xreg1, GICC_AIAR]
> +       str     \wreg2, [\xreg1, GICC_AEOIR]
> +       cbnz    \wreg2, 0b
> +.endm

Assuming I've understood correctly, here we block until we receive SGI 0
from the CPU with GIC ID 0? Do we have a guarantee that the boot CPU
will have GIC ID 0?

Thanks,
Mark.