[U-Boot] [PATCH v3] arm: build arch memset/memcpy in Thumb2 mode
Stefan Agner
stefan at agner.ch
Mon Jan 5 23:21:41 CET 2015
Albert,
I guess it is too late for that now. I thought it would make it into
2015.01, since your last comment on v2 suggested that you would treat
it as a bugfix...
--
Stefan
On 2014-12-18 18:10, Stefan Agner wrote:
> Resynchronize memcpy/memset with kernel 3.17 and build them in
> Thumb2 mode (unified syntax). The assembler files can also be built
> and linked in ARM mode; however, when they are called from
> Thumb2-built code, the stack gets corrupted and the copy does not
> succeed (the exact cause has not been traced back). The Linux kernel
> builds these files in Thumb2 mode, so U-Boot should build them in
> Thumb2 mode too when CONFIG_SYS_THUMB_BUILD is set.
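>
> As a minimal sketch (the board header name here is hypothetical),
> a board opts into such a Thumb2 build through its config header:
>
>     /* include/configs/myboard.h -- select a Thumb2 build */
>     #define CONFIG_SYS_THUMB_BUILD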
>
> To build the files without warnings, some assembler instructions
> had to be replaced with their UAL-compliant variants (thanks to
> Jeroen for this input).
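>
> For example (taken from the hunks below), UAL moves the condition
> code after the size/addressing-mode suffix instead of before it:
>
>     strneb  r1, [r0], #1    @ pre-UAL: "ne" before the "b" suffix
>     strbne  r1, [r0], #1    @ UAL: "strb" with "ne" appended
>
>     stmgeia r0!, {r1, r3}   @ pre-UAL: "ge" before "ia"
>     stmiage r0!, {r1, r3}   @ UAL: "stmia" with "ge" appended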
>
> To build the files in Thumb2 mode, the -mimplicit-it=always
> assembler option needs to be set to generate Thumb2-compliant IT
> instructions where needed. We add this option to the general AFLAGS
> when building for Thumb2.
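>
> As a brief illustration (not part of this patch): in Thumb2 a
> conditionally executed instruction must sit inside an IT block,
> which -mimplicit-it=always lets the assembler insert automatically:
>
>     cmp     r2, #0
>     it      ne              @ mandatory in Thumb2; generated for us
>     strbne  r1, [r0], #1    @ when -mimplicit-it=always is in AFLAGS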
>
> Reviewed-by: Simon Glass <sjg at chromium.org>
> Tested-by: Simon Glass <sjg at chromium.org>
> Signed-off-by: Stefan Agner <stefan at agner.ch>
> ---
> arch/arm/config.mk | 4 +-
> arch/arm/include/asm/assembler.h | 33 ++++++++++--
> arch/arm/lib/memcpy.S | 80 +++++++++++++++++++---------
> arch/arm/lib/memset.S | 112 ++++++++++++++++++++-------------------
> 4 files changed, 142 insertions(+), 87 deletions(-)
>
> diff --git a/arch/arm/config.mk b/arch/arm/config.mk
> index f0eafd6..9a4c270 100644
> --- a/arch/arm/config.mk
> +++ b/arch/arm/config.mk
> @@ -26,7 +26,9 @@ PLATFORM_CPPFLAGS += -D__ARM__
>
> # Choose between ARM/Thumb instruction sets
> ifeq ($(CONFIG_SYS_THUMB_BUILD),y)
> -PF_CPPFLAGS_ARM := $(call cc-option, -mthumb -mthumb-interwork,\
> +AFLAGS_IMPLICIT_IT := $(call as-option,-Wa$(comma)-mimplicit-it=always)
> +PF_CPPFLAGS_ARM := $(AFLAGS_IMPLICIT_IT) \
> + $(call cc-option, -mthumb -mthumb-interwork,\
> $(call cc-option,-marm,)\
> $(call cc-option,-mno-thumb-interwork,)\
> )
> diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h
> index 5e4789b..11b80fb 100644
> --- a/arch/arm/include/asm/assembler.h
> +++ b/arch/arm/include/asm/assembler.h
> @@ -14,12 +14,14 @@
> * assembler source.
> */
>
> +#include <config.h>
> +
> /*
> * Endian independent macros for shifting bytes within registers.
> */
> #ifndef __ARMEB__
> -#define pull lsr
> -#define push lsl
> +#define lspull lsr
> +#define lspush lsl
> #define get_byte_0 lsl #0
> #define get_byte_1 lsr #8
> #define get_byte_2 lsr #16
> @@ -29,8 +31,8 @@
> #define put_byte_2 lsl #16
> #define put_byte_3 lsl #24
> #else
> -#define pull lsl
> -#define push lsr
> +#define lspull lsl
> +#define lspush lsr
> #define get_byte_0 lsr #24
> #define get_byte_1 lsr #16
> #define get_byte_2 lsr #8
> @@ -54,7 +56,28 @@
> #define PLD(code...)
> #endif
>
> + .irp c,,eq,ne,cs,cc,mi,pl,vs,vc,hi,ls,ge,lt,gt,le,hs,lo
> + .macro ret\c, reg
> +#if defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__)
> + mov\c pc, \reg
> +#else
> + .ifeqs "\reg", "lr"
> + bx\c \reg
> + .else
> + mov\c pc, \reg
> + .endif
> +#endif
> + .endm
> + .endr
> +
> /*
> - * Cache alligned
> + * Cache aligned, used for optimized memcpy/memset
> + * In the kernel this is only enabled for Feroceon CPUs.
> + * We disable it for Thumb builds since those
> + * instructions are not written in a Thumb-ready way.
> */
> +#ifdef CONFIG_SYS_THUMB_BUILD
> +#define CALGN(code...)
> +#else
> #define CALGN(code...) code
> +#endif
> diff --git a/arch/arm/lib/memcpy.S b/arch/arm/lib/memcpy.S
> index f655256..eeaf003 100644
> --- a/arch/arm/lib/memcpy.S
> +++ b/arch/arm/lib/memcpy.S
> @@ -10,9 +10,14 @@
> * published by the Free Software Foundation.
> */
>
> +#include <linux/linkage.h>
> #include <asm/assembler.h>
>
> +#ifdef CONFIG_SYS_THUMB_BUILD
> +#define W(instr) instr.w
> +#else
> #define W(instr) instr
> +#endif
>
> #define LDR1W_SHIFT 0
> #define STR1W_SHIFT 0
> @@ -30,7 +35,7 @@
> .endm
>
> .macro ldr1b ptr reg cond=al abort
> - ldr\cond\()b \reg, [\ptr], #1
> + ldrb\cond\() \reg, [\ptr], #1
> .endm
>
> .macro str1w ptr reg abort
> @@ -42,7 +47,7 @@
> .endm
>
> .macro str1b ptr reg cond=al abort
> - str\cond\()b \reg, [\ptr], #1
> + strb\cond\() \reg, [\ptr], #1
> .endm
>
> .macro enter reg1 reg2
> @@ -56,10 +61,12 @@
> .text
>
> /* Prototype: void *memcpy(void *dest, const void *src, size_t n); */
> -
> -.globl memcpy
> -memcpy:
> -
> + .syntax unified
> +#ifdef CONFIG_SYS_THUMB_BUILD
> + .thumb
> + .thumb_func
> +#endif
> +ENTRY(memcpy)
> cmp r0, r1
> moveq pc, lr
>
> @@ -79,7 +86,7 @@ memcpy:
>
> CALGN( ands ip, r0, #31 )
> CALGN( rsb r3, ip, #32 )
> - CALGN( sbcnes r4, r3, r2 ) @ C is always set here
> + CALGN( sbcsne r4, r3, r2 ) @ C is always set here
> CALGN( bcs 2f )
> CALGN( adr r4, 6f )
> CALGN( subs r2, r2, r3 ) @ C gets set
> @@ -178,7 +185,7 @@ memcpy:
>
> CALGN( ands ip, r0, #31 )
> CALGN( rsb ip, ip, #32 )
> - CALGN( sbcnes r4, ip, r2 ) @ C is always set here
> + CALGN( sbcsne r4, ip, r2 ) @ C is always set here
> CALGN( subcc r2, r2, ip )
> CALGN( bcc 15f )
>
> @@ -193,24 +200,24 @@ memcpy:
>
> 12: PLD( pld [r1, #124] )
> 13: ldr4w r1, r4, r5, r6, r7, abort=19f
> - mov r3, lr, pull #\pull
> + mov r3, lr, lspull #\pull
> subs r2, r2, #32
> ldr4w r1, r8, r9, ip, lr, abort=19f
> - orr r3, r3, r4, push #\push
> - mov r4, r4, pull #\pull
> - orr r4, r4, r5, push #\push
> - mov r5, r5, pull #\pull
> - orr r5, r5, r6, push #\push
> - mov r6, r6, pull #\pull
> - orr r6, r6, r7, push #\push
> - mov r7, r7, pull #\pull
> - orr r7, r7, r8, push #\push
> - mov r8, r8, pull #\pull
> - orr r8, r8, r9, push #\push
> - mov r9, r9, pull #\pull
> - orr r9, r9, ip, push #\push
> - mov ip, ip, pull #\pull
> - orr ip, ip, lr, push #\push
> + orr r3, r3, r4, lspush #\push
> + mov r4, r4, lspull #\pull
> + orr r4, r4, r5, lspush #\push
> + mov r5, r5, lspull #\pull
> + orr r5, r5, r6, lspush #\push
> + mov r6, r6, lspull #\pull
> + orr r6, r6, r7, lspush #\push
> + mov r7, r7, lspull #\pull
> + orr r7, r7, r8, lspush #\push
> + mov r8, r8, lspull #\pull
> + orr r8, r8, r9, lspush #\push
> + mov r9, r9, lspull #\pull
> + orr r9, r9, ip, lspush #\push
> + mov ip, ip, lspull #\pull
> + orr ip, ip, lr, lspush #\push
> str8w r0, r3, r4, r5, r6, r7, r8, r9, ip, , abort=19f
> bge 12b
> PLD( cmn r2, #96 )
> @@ -221,10 +228,10 @@ memcpy:
> 14: ands ip, r2, #28
> beq 16f
>
> -15: mov r3, lr, pull #\pull
> +15: mov r3, lr, lspull #\pull
> ldr1w r1, lr, abort=21f
> subs ip, ip, #4
> - orr r3, r3, lr, push #\push
> + orr r3, r3, lr, lspush #\push
> str1w r0, r3, abort=21f
> bgt 15b
> CALGN( cmp r2, #0 )
> @@ -241,3 +248,24 @@ memcpy:
> 17: forward_copy_shift pull=16 push=16
>
> 18: forward_copy_shift pull=24 push=8
> +
> +
> +/*
> + * Abort preamble and completion macros.
> + * If a fixup handler is required then those macros must surround it.
> + * It is assumed that the fixup code will handle the private part of
> + * the exit macro.
> + */
> +
> + .macro copy_abort_preamble
> +19: ldmfd sp!, {r5 - r9}
> + b 21f
> +20: ldmfd sp!, {r5 - r8}
> +21:
> + .endm
> +
> + .macro copy_abort_end
> + ldmfd sp!, {r4, pc}
> + .endm
> +
> +ENDPROC(memcpy)
> diff --git a/arch/arm/lib/memset.S b/arch/arm/lib/memset.S
> index 0cdf895..7208f20 100644
> --- a/arch/arm/lib/memset.S
> +++ b/arch/arm/lib/memset.S
> @@ -9,32 +9,25 @@
> *
> * ASM optimised string functions
> */
> +#include <linux/linkage.h>
> #include <asm/assembler.h>
>
> .text
> .align 5
> - .word 0
>
> -1: subs r2, r2, #4 @ 1 do we have enough
> - blt 5f @ 1 bytes to align with?
> - cmp r3, #2 @ 1
> - strltb r1, [r0], #1 @ 1
> - strleb r1, [r0], #1 @ 1
> - strb r1, [r0], #1 @ 1
> - add r2, r2, r3 @ 1 (r2 = r2 - (4 - r3))
> -/*
> - * The pointer is now aligned and the length is adjusted. Try doing the
> - * memset again.
> - */
> -
> -.globl memset
> -memset:
> + .syntax unified
> +#ifdef CONFIG_SYS_THUMB_BUILD
> + .thumb
> + .thumb_func
> +#endif
> +ENTRY(memset)
> ands r3, r0, #3 @ 1 unaligned?
> - bne 1b @ 1
> + mov ip, r0 @ preserve r0 as return value
> + bne 6f @ 1
> /*
> - * we know that the pointer in r0 is aligned to a word boundary.
> + * we know that the pointer in ip is aligned to a word boundary.
> */
> - orr r1, r1, r1, lsl #8
> +1: orr r1, r1, r1, lsl #8
> orr r1, r1, r1, lsl #16
> mov r3, r1
> cmp r2, #16
> @@ -43,29 +36,28 @@ memset:
> #if ! CALGN(1)+0
>
> /*
> - * We need an extra register for this loop - save the return address and
> - * use the LR
> + * We need 2 extra registers for this loop - use r8 and the LR
> */
> - str lr, [sp, #-4]!
> - mov ip, r1
> + stmfd sp!, {r8, lr}
> + mov r8, r1
> mov lr, r1
>
> 2: subs r2, r2, #64
> - stmgeia r0!, {r1, r3, ip, lr} @ 64 bytes at a time.
> - stmgeia r0!, {r1, r3, ip, lr}
> - stmgeia r0!, {r1, r3, ip, lr}
> - stmgeia r0!, {r1, r3, ip, lr}
> + stmiage ip!, {r1, r3, r8, lr} @ 64 bytes at a time.
> + stmiage ip!, {r1, r3, r8, lr}
> + stmiage ip!, {r1, r3, r8, lr}
> + stmiage ip!, {r1, r3, r8, lr}
> bgt 2b
> - ldmeqfd sp!, {pc} @ Now <64 bytes to go.
> + ldmfdeq sp!, {r8, pc} @ Now <64 bytes to go.
> /*
> * No need to correct the count; we're only testing bits from now on
> */
> tst r2, #32
> - stmneia r0!, {r1, r3, ip, lr}
> - stmneia r0!, {r1, r3, ip, lr}
> + stmiane ip!, {r1, r3, r8, lr}
> + stmiane ip!, {r1, r3, r8, lr}
> tst r2, #16
> - stmneia r0!, {r1, r3, ip, lr}
> - ldr lr, [sp], #4
> + stmiane ip!, {r1, r3, r8, lr}
> + ldmfd sp!, {r8, lr}
>
> #else
>
> @@ -74,53 +66,63 @@ memset:
> * whole cache lines at once.
> */
>
> - stmfd sp!, {r4-r7, lr}
> + stmfd sp!, {r4-r8, lr}
> mov r4, r1
> mov r5, r1
> mov r6, r1
> mov r7, r1
> - mov ip, r1
> + mov r8, r1
> mov lr, r1
>
> cmp r2, #96
> - tstgt r0, #31
> + tstgt ip, #31
> ble 3f
>
> - and ip, r0, #31
> - rsb ip, ip, #32
> - sub r2, r2, ip
> - movs ip, ip, lsl #(32 - 4)
> - stmcsia r0!, {r4, r5, r6, r7}
> - stmmiia r0!, {r4, r5}
> - tst ip, #(1 << 30)
> - mov ip, r1
> - strne r1, [r0], #4
> + and r8, ip, #31
> + rsb r8, r8, #32
> + sub r2, r2, r8
> + movs r8, r8, lsl #(32 - 4)
> + stmiacs ip!, {r4, r5, r6, r7}
> + stmiami ip!, {r4, r5}
> + tst r8, #(1 << 30)
> + mov r8, r1
> + strne r1, [ip], #4
>
> 3: subs r2, r2, #64
> - stmgeia r0!, {r1, r3-r7, ip, lr}
> - stmgeia r0!, {r1, r3-r7, ip, lr}
> + stmiage ip!, {r1, r3-r8, lr}
> + stmiage ip!, {r1, r3-r8, lr}
> bgt 3b
> - ldmeqfd sp!, {r4-r7, pc}
> + ldmfdeq sp!, {r4-r8, pc}
>
> tst r2, #32
> - stmneia r0!, {r1, r3-r7, ip, lr}
> + stmiane ip!, {r1, r3-r8, lr}
> tst r2, #16
> - stmneia r0!, {r4-r7}
> - ldmfd sp!, {r4-r7, lr}
> + stmiane ip!, {r4-r7}
> + ldmfd sp!, {r4-r8, lr}
>
> #endif
>
> 4: tst r2, #8
> - stmneia r0!, {r1, r3}
> + stmiane ip!, {r1, r3}
> tst r2, #4
> - strne r1, [r0], #4
> + strne r1, [ip], #4
> /*
> * When we get here, we've got less than 4 bytes to zero. We
> * may have an unaligned pointer as well.
> */
> 5: tst r2, #2
> - strneb r1, [r0], #1
> - strneb r1, [r0], #1
> + strbne r1, [ip], #1
> + strbne r1, [ip], #1
> tst r2, #1
> - strneb r1, [r0], #1
> - mov pc, lr
> + strbne r1, [ip], #1
> + ret lr
> +
> +6: subs r2, r2, #4 @ 1 do we have enough
> + blt 5b @ 1 bytes to align with?
> + cmp r3, #2 @ 1
> + strblt r1, [ip], #1 @ 1
> + strble r1, [ip], #1 @ 1
> + strb r1, [ip], #1 @ 1
> + add r2, r2, r3 @ 1 (r2 = r2 - (4 - r3))
> + b 1b
> +ENDPROC(memset)
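
A side note on the new ret macro family in the assembler.h hunk above
(a sketch of its expansion, assuming a target where BX is available,
i.e. not ARMv5E/ARMv5TE):

    ret     lr      @ expands to "bx lr" (interworking-safe return)
    retne   lr      @ conditional variants come from the .irp loop
    ret     r3      @ a non-lr register expands to "mov pc, r3"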