[U-Boot] [PATCH V2] arm: Use optimized memcpy and memset from linux
Matthias Weisser
weisserm at arcor.de
Wed Jan 26 11:45:57 CET 2011
Using optimized versions of memset and memcpy from linux brings a quite
noticeable speed (x2 or better) improvement for these two functions.
Here are some numbers for test done with jadecpu
| HEAD(1)| HEAD(1)| HEAD(2)| HEAD(2)|
| | +patch | | +patch |
---------------------------+--------+--------+--------+--------+
Reset to prompt | 438ms | 330ms | 228ms | 120ms |
| | | | |
TFTP a 3MB img | 4782ms | 3428ms | 3245ms | 2820ms |
| | | | |
FATLOAD USB a 3MB img* | 8515ms | 8510ms | ------ | ------ |
| | | | |
BOOTM LZO img in RAM | 3473ms | 3168ms | 592ms | 592ms |
where CRC is | 615ms | 615ms | 54ms | 54ms |
uncompress | 2460ms | 2462ms | 450ms | 451ms |
final boot_elf | 376ms | 68ms | 65ms | 65ms |
| | | | |
BOOTM LZO img in FLASH | 3207ms | 2902ms | 1050ms | 1050ms |
where CRC is | 600ms | 600ms | 135ms | 135ms |
uncompress | 2209ms | 2211ms | 828ms | 828ms |
| | | | |
Copy 1.4MB from NOR to RAM | 134ms | 72ms | 120ms | 70ms |
(1) No dcache
(2) dcache enabled in board_init
*Does not work when dcache is on
Size impact:
C version:
text data bss dec hex filename
202862 18912 266456 488230 77326 u-boot
ASM version:
text data bss dec hex filename
203798 18912 266288 488998 77626 u-boot
222712 u-boot.bin
Changes since V1:
- Made the usage of these functions optional be CONFIG_USE_ARCH_MEM
- Usage of PLD instruction on all architectures supporting it
- Added a README entry
- Minor style fixes
Signed-off-by: Matthias Weisser <weisserm at arcor.de>
---
README | 6 +
arch/arm/include/asm/assembler.h | 60 ++++++++++
arch/arm/include/asm/string.h | 10 ++-
arch/arm/lib/Makefile | 2 +
arch/arm/lib/memcpy.S | 241 ++++++++++++++++++++++++++++++++++++++
arch/arm/lib/memset.S | 126 ++++++++++++++++++++
6 files changed, 443 insertions(+), 2 deletions(-)
create mode 100644 arch/arm/include/asm/assembler.h
create mode 100644 arch/arm/lib/memcpy.S
create mode 100644 arch/arm/lib/memset.S
diff --git a/README b/README
index 755d17c..5c610f2 100644
--- a/README
+++ b/README
@@ -2885,6 +2885,12 @@ Low Level (hardware related) configuration options:
that is executed before the actual U-Boot. E.g. when
compiling a NAND SPL.
+- CONFIG_USE_ARCH_MEMCPY
+ CONFIG_USE_ARCH_MEMSET
+ If these options are used a optimized version of memcpy/memset will
+ be used if available. These functions may be faster under some
+ conditions but may increase the binary size.
+
Building the Software:
======================
diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h
new file mode 100644
index 0000000..418ee94
--- /dev/null
+++ b/arch/arm/include/asm/assembler.h
@@ -0,0 +1,60 @@
+/*
+ * arch/arm/include/asm/assembler.h
+ *
+ * Copyright (C) 1996-2000 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This file contains arm architecture specific defines
+ * for the different processors.
+ *
+ * Do not include any C declarations in this file - it is included by
+ * assembler source.
+ */
+
+/*
+ * Endian independent macros for shifting bytes within registers.
+ */
+#ifndef __ARMEB__
+#define pull lsr
+#define push lsl
+#define get_byte_0 lsl #0
+#define get_byte_1 lsr #8
+#define get_byte_2 lsr #16
+#define get_byte_3 lsr #24
+#define put_byte_0 lsl #0
+#define put_byte_1 lsl #8
+#define put_byte_2 lsl #16
+#define put_byte_3 lsl #24
+#else
+#define pull lsl
+#define push lsr
+#define get_byte_0 lsr #24
+#define get_byte_1 lsr #16
+#define get_byte_2 lsr #8
+#define get_byte_3 lsl #0
+#define put_byte_0 lsl #24
+#define put_byte_1 lsl #16
+#define put_byte_2 lsl #8
+#define put_byte_3 lsl #0
+#endif
+
+/*
+ * Data preload for architectures that support it
+ */
+#if defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) || \
+ defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \
+ defined(__ARM_ARCH_6T2__) || defined(__ARM_ARCH_6Z__) || \
+ defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_7A__) || \
+ defined(__ARM_ARCH_7R__)
+#define PLD(code...) code
+#else
+#define PLD(code...)
+#endif
+
+/*
+ * Cache alligned
+ */
+#define CALGN(code...) code
diff --git a/arch/arm/include/asm/string.h b/arch/arm/include/asm/string.h
index c3ea582..c6dfb25 100644
--- a/arch/arm/include/asm/string.h
+++ b/arch/arm/include/asm/string.h
@@ -1,6 +1,8 @@
#ifndef __ASM_ARM_STRING_H
#define __ASM_ARM_STRING_H
+#include <config.h>
+
/*
* We don't do inline string functions, since the
* optimised inline asm versions are not small.
@@ -12,7 +14,9 @@ extern char * strrchr(const char * s, int c);
#undef __HAVE_ARCH_STRCHR
extern char * strchr(const char * s, int c);
-#undef __HAVE_ARCH_MEMCPY
+#ifdef CONFIG_USE_ARCH_MEMCPY
+#define __HAVE_ARCH_MEMCPY
+#endif
extern void * memcpy(void *, const void *, __kernel_size_t);
#undef __HAVE_ARCH_MEMMOVE
@@ -22,7 +26,9 @@ extern void * memmove(void *, const void *, __kernel_size_t);
extern void * memchr(const void *, int, __kernel_size_t);
#undef __HAVE_ARCH_MEMZERO
-#undef __HAVE_ARCH_MEMSET
+#ifdef CONFIG_USE_ARCH_MEMSET
+#define __HAVE_ARCH_MEMSET
+#endif
extern void * memset(void *, int, __kernel_size_t);
#if 0
diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile
index 454440c..03b1b5e 100644
--- a/arch/arm/lib/Makefile
+++ b/arch/arm/lib/Makefile
@@ -44,6 +44,8 @@ COBJS-y += cache-cp15.o
endif
COBJS-y += interrupts.o
COBJS-y += reset.o
+SOBJS-$(CONFIG_USE_ARCH_MEMSET) += memset.o
+SOBJS-$(CONFIG_USE_ARCH_MEMCPY) += memcpy.o
SRCS := $(GLSOBJS:.o=.S) $(GLCOBJS:.o=.c) \
$(SOBJS-y:.o=.S) $(COBJS-y:.o=.c)
diff --git a/arch/arm/lib/memcpy.S b/arch/arm/lib/memcpy.S
new file mode 100644
index 0000000..40db90e
--- /dev/null
+++ b/arch/arm/lib/memcpy.S
@@ -0,0 +1,241 @@
+/*
+ * linux/arch/arm/lib/memcpy.S
+ *
+ * Author: Nicolas Pitre
+ * Created: Sep 28, 2005
+ * Copyright: MontaVista Software, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <asm/assembler.h>
+
+#define W(instr) instr
+
+#define LDR1W_SHIFT 0
+#define STR1W_SHIFT 0
+
+ .macro ldr1w ptr reg abort
+ W(ldr) \reg, [\ptr], #4
+ .endm
+
+ .macro ldr4w ptr reg1 reg2 reg3 reg4 abort
+ ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4}
+ .endm
+
+ .macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
+ ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
+ .endm
+
+ .macro ldr1b ptr reg cond=al abort
+ ldr\cond\()b \reg, [\ptr], #1
+ .endm
+
+ .macro str1w ptr reg abort
+ W(str) \reg, [\ptr], #4
+ .endm
+
+ .macro str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
+ stmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
+ .endm
+
+ .macro str1b ptr reg cond=al abort
+ str\cond\()b \reg, [\ptr], #1
+ .endm
+
+ .macro enter reg1 reg2
+ stmdb sp!, {r0, \reg1, \reg2}
+ .endm
+
+ .macro exit reg1 reg2
+ ldmfd sp!, {r0, \reg1, \reg2}
+ .endm
+
+ .text
+
+/* Prototype: void *memcpy(void *dest, const void *src, size_t n); */
+
+.globl memcpy
+memcpy:
+
+ enter r4, lr
+
+ subs r2, r2, #4
+ blt 8f
+ ands ip, r0, #3
+ PLD( pld [r1, #0] )
+ bne 9f
+ ands ip, r1, #3
+ bne 10f
+
+1: subs r2, r2, #(28)
+ stmfd sp!, {r5 - r8}
+ blt 5f
+
+ CALGN( ands ip, r0, #31 )
+ CALGN( rsb r3, ip, #32 )
+ CALGN( sbcnes r4, r3, r2 ) @ C is always set here
+ CALGN( bcs 2f )
+ CALGN( adr r4, 6f )
+ CALGN( subs r2, r2, r3 ) @ C gets set
+ CALGN( add pc, r4, ip )
+
+ PLD( pld [r1, #0] )
+2: PLD( subs r2, r2, #96 )
+ PLD( pld [r1, #28] )
+ PLD( blt 4f )
+ PLD( pld [r1, #60] )
+ PLD( pld [r1, #92] )
+
+3: PLD( pld [r1, #124] )
+4: ldr8w r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
+ subs r2, r2, #32
+ str8w r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
+ bge 3b
+ PLD( cmn r2, #96 )
+ PLD( bge 4b )
+
+5: ands ip, r2, #28
+ rsb ip, ip, #32
+#if LDR1W_SHIFT > 0
+ lsl ip, ip, #LDR1W_SHIFT
+#endif
+ addne pc, pc, ip @ C is always clear here
+ b 7f
+6:
+ .rept (1 << LDR1W_SHIFT)
+ W(nop)
+ .endr
+ ldr1w r1, r3, abort=20f
+ ldr1w r1, r4, abort=20f
+ ldr1w r1, r5, abort=20f
+ ldr1w r1, r6, abort=20f
+ ldr1w r1, r7, abort=20f
+ ldr1w r1, r8, abort=20f
+ ldr1w r1, lr, abort=20f
+
+#if LDR1W_SHIFT < STR1W_SHIFT
+ lsl ip, ip, #STR1W_SHIFT - LDR1W_SHIFT
+#elif LDR1W_SHIFT > STR1W_SHIFT
+ lsr ip, ip, #LDR1W_SHIFT - STR1W_SHIFT
+#endif
+ add pc, pc, ip
+ nop
+ .rept (1 << STR1W_SHIFT)
+ W(nop)
+ .endr
+ str1w r0, r3, abort=20f
+ str1w r0, r4, abort=20f
+ str1w r0, r5, abort=20f
+ str1w r0, r6, abort=20f
+ str1w r0, r7, abort=20f
+ str1w r0, r8, abort=20f
+ str1w r0, lr, abort=20f
+
+ CALGN( bcs 2b )
+
+7: ldmfd sp!, {r5 - r8}
+
+8: movs r2, r2, lsl #31
+ ldr1b r1, r3, ne, abort=21f
+ ldr1b r1, r4, cs, abort=21f
+ ldr1b r1, ip, cs, abort=21f
+ str1b r0, r3, ne, abort=21f
+ str1b r0, r4, cs, abort=21f
+ str1b r0, ip, cs, abort=21f
+
+ exit r4, pc
+
+9: rsb ip, ip, #4
+ cmp ip, #2
+ ldr1b r1, r3, gt, abort=21f
+ ldr1b r1, r4, ge, abort=21f
+ ldr1b r1, lr, abort=21f
+ str1b r0, r3, gt, abort=21f
+ str1b r0, r4, ge, abort=21f
+ subs r2, r2, ip
+ str1b r0, lr, abort=21f
+ blt 8b
+ ands ip, r1, #3
+ beq 1b
+
+10: bic r1, r1, #3
+ cmp ip, #2
+ ldr1w r1, lr, abort=21f
+ beq 17f
+ bgt 18f
+
+
+ .macro forward_copy_shift pull push
+
+ subs r2, r2, #28
+ blt 14f
+
+ CALGN( ands ip, r0, #31 )
+ CALGN( rsb ip, ip, #32 )
+ CALGN( sbcnes r4, ip, r2 ) @ C is always set here
+ CALGN( subcc r2, r2, ip )
+ CALGN( bcc 15f )
+
+11: stmfd sp!, {r5 - r9}
+
+ PLD( pld [r1, #0] )
+ PLD( subs r2, r2, #96 )
+ PLD( pld [r1, #28] )
+ PLD( blt 13f )
+ PLD( pld [r1, #60] )
+ PLD( pld [r1, #92] )
+
+12: PLD( pld [r1, #124] )
+13: ldr4w r1, r4, r5, r6, r7, abort=19f
+ mov r3, lr, pull #\pull
+ subs r2, r2, #32
+ ldr4w r1, r8, r9, ip, lr, abort=19f
+ orr r3, r3, r4, push #\push
+ mov r4, r4, pull #\pull
+ orr r4, r4, r5, push #\push
+ mov r5, r5, pull #\pull
+ orr r5, r5, r6, push #\push
+ mov r6, r6, pull #\pull
+ orr r6, r6, r7, push #\push
+ mov r7, r7, pull #\pull
+ orr r7, r7, r8, push #\push
+ mov r8, r8, pull #\pull
+ orr r8, r8, r9, push #\push
+ mov r9, r9, pull #\pull
+ orr r9, r9, ip, push #\push
+ mov ip, ip, pull #\pull
+ orr ip, ip, lr, push #\push
+ str8w r0, r3, r4, r5, r6, r7, r8, r9, ip, , abort=19f
+ bge 12b
+ PLD( cmn r2, #96 )
+ PLD( bge 13b )
+
+ ldmfd sp!, {r5 - r9}
+
+14: ands ip, r2, #28
+ beq 16f
+
+15: mov r3, lr, pull #\pull
+ ldr1w r1, lr, abort=21f
+ subs ip, ip, #4
+ orr r3, r3, lr, push #\push
+ str1w r0, r3, abort=21f
+ bgt 15b
+ CALGN( cmp r2, #0 )
+ CALGN( bge 11b )
+
+16: sub r1, r1, #(\push / 8)
+ b 8b
+
+ .endm
+
+
+ forward_copy_shift pull=8 push=24
+
+17: forward_copy_shift pull=16 push=16
+
+18: forward_copy_shift pull=24 push=8
+
diff --git a/arch/arm/lib/memset.S b/arch/arm/lib/memset.S
new file mode 100644
index 0000000..0cdf895
--- /dev/null
+++ b/arch/arm/lib/memset.S
@@ -0,0 +1,126 @@
+/*
+ * linux/arch/arm/lib/memset.S
+ *
+ * Copyright (C) 1995-2000 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * ASM optimised string functions
+ */
+#include <asm/assembler.h>
+
+ .text
+ .align 5
+ .word 0
+
+1: subs r2, r2, #4 @ 1 do we have enough
+ blt 5f @ 1 bytes to align with?
+ cmp r3, #2 @ 1
+ strltb r1, [r0], #1 @ 1
+ strleb r1, [r0], #1 @ 1
+ strb r1, [r0], #1 @ 1
+ add r2, r2, r3 @ 1 (r2 = r2 - (4 - r3))
+/*
+ * The pointer is now aligned and the length is adjusted. Try doing the
+ * memset again.
+ */
+
+.globl memset
+memset:
+ ands r3, r0, #3 @ 1 unaligned?
+ bne 1b @ 1
+/*
+ * we know that the pointer in r0 is aligned to a word boundary.
+ */
+ orr r1, r1, r1, lsl #8
+ orr r1, r1, r1, lsl #16
+ mov r3, r1
+ cmp r2, #16
+ blt 4f
+
+#if ! CALGN(1)+0
+
+/*
+ * We need an extra register for this loop - save the return address and
+ * use the LR
+ */
+ str lr, [sp, #-4]!
+ mov ip, r1
+ mov lr, r1
+
+2: subs r2, r2, #64
+ stmgeia r0!, {r1, r3, ip, lr} @ 64 bytes at a time.
+ stmgeia r0!, {r1, r3, ip, lr}
+ stmgeia r0!, {r1, r3, ip, lr}
+ stmgeia r0!, {r1, r3, ip, lr}
+ bgt 2b
+ ldmeqfd sp!, {pc} @ Now <64 bytes to go.
+/*
+ * No need to correct the count; we're only testing bits from now on
+ */
+ tst r2, #32
+ stmneia r0!, {r1, r3, ip, lr}
+ stmneia r0!, {r1, r3, ip, lr}
+ tst r2, #16
+ stmneia r0!, {r1, r3, ip, lr}
+ ldr lr, [sp], #4
+
+#else
+
+/*
+ * This version aligns the destination pointer in order to write
+ * whole cache lines at once.
+ */
+
+ stmfd sp!, {r4-r7, lr}
+ mov r4, r1
+ mov r5, r1
+ mov r6, r1
+ mov r7, r1
+ mov ip, r1
+ mov lr, r1
+
+ cmp r2, #96
+ tstgt r0, #31
+ ble 3f
+
+ and ip, r0, #31
+ rsb ip, ip, #32
+ sub r2, r2, ip
+ movs ip, ip, lsl #(32 - 4)
+ stmcsia r0!, {r4, r5, r6, r7}
+ stmmiia r0!, {r4, r5}
+ tst ip, #(1 << 30)
+ mov ip, r1
+ strne r1, [r0], #4
+
+3: subs r2, r2, #64
+ stmgeia r0!, {r1, r3-r7, ip, lr}
+ stmgeia r0!, {r1, r3-r7, ip, lr}
+ bgt 3b
+ ldmeqfd sp!, {r4-r7, pc}
+
+ tst r2, #32
+ stmneia r0!, {r1, r3-r7, ip, lr}
+ tst r2, #16
+ stmneia r0!, {r4-r7}
+ ldmfd sp!, {r4-r7, lr}
+
+#endif
+
+4: tst r2, #8
+ stmneia r0!, {r1, r3}
+ tst r2, #4
+ strne r1, [r0], #4
+/*
+ * When we get here, we've got less than 4 bytes to zero. We
+ * may have an unaligned pointer as well.
+ */
+5: tst r2, #2
+ strneb r1, [r0], #1
+ strneb r1, [r0], #1
+ tst r2, #1
+ strneb r1, [r0], #1
+ mov pc, lr
--
1.7.0.4
More information about the U-Boot
mailing list