[U-Boot] [PATCH V3] ARM: Don't include libgcc anymore

Fri Aug 7 23:33:46 CEST 2009

This patch removes the inclusion of libgcc functions into U-Boot on the ARM
architecture. Only the functions that are really needed are provided in the
lib_arm directory. Those implementations are copied from Linux, where they
are well proven with regard to reliability and performance.

Signed-off-by: Jean-Christophe PLAGNIOL-VILLARD <plagnioj at jcrosoft.com>
---
 board/trab/u-boot.lds             |    2 +-
 include/asm-arm/assembler.h       |  112 ++++++++++++
 include/asm-arm/linkage.h         |   11 ++
 include/linux/linkage.h           |   95 ++++++++++
 lib_arm/Makefile                  |   19 +--
 lib_arm/_divsi3.S                 |  140 ---------------
 lib_arm/_modsi3.S                 |   99 -----------
 lib_arm/_udivsi3.S                |   77 --------
 lib_arm/_umodsi3.S                |   88 ----------
 lib_arm/{_ashldi3.S => ashldi3.S} |    9 +-
 lib_arm/{_ashrdi3.S => ashrdi3.S} |    9 +-
 lib_arm/config.mk                 |    2 +
 lib_arm/lib1funcs.S               |  344 +++++++++++++++++++++++++++++++++++++
 lib_arm/{_ashrdi3.S => lshrdi3.S} |   13 +-
 14 files changed, 596 insertions(+), 424 deletions(-)
 create mode 100644 include/asm-arm/assembler.h
 create mode 100644 include/asm-arm/linkage.h
 create mode 100644 include/linux/linkage.h
 delete mode 100644 lib_arm/_divsi3.S
 delete mode 100644 lib_arm/_modsi3.S
 delete mode 100644 lib_arm/_udivsi3.S
 delete mode 100644 lib_arm/_umodsi3.S
 rename lib_arm/{_ashldi3.S => ashldi3.S} (93%)
 copy lib_arm/{_ashrdi3.S => ashrdi3.S} (93%)
 create mode 100644 lib_arm/lib1funcs.S
 rename lib_arm/{_ashrdi3.S => lshrdi3.S} (90%)

diff --git a/board/trab/u-boot.lds b/board/trab/u-boot.lds
index d8bcfa4..a83853e 100644
--- a/board/trab/u-boot.lds
+++ b/board/trab/u-boot.lds
@@ -33,7 +33,7 @@ SECTIONS
 	.text      :
 	{
 	  cpu/arm920t/start.o	(.text)
-	  lib_arm/_umodsi3.o	(.text)
+	  lib_arm/lib1funcs.o	(.text)
 	  lib_generic/zlib.o	(.text)
 	  lib_generic/crc32.o	(.text)
 	  lib_generic/string.o	(.text)
diff --git a/include/asm-arm/assembler.h b/include/asm-arm/assembler.h
new file mode 100644
index 0000000..c7916d1
--- /dev/null
+++ b/include/asm-arm/assembler.h
@@ -0,0 +1,112 @@
+/*
+ *  arch/arm/include/asm/assembler.h
+ *
+ *  Copyright (C) 1996-2000 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ *  This file contains arm architecture specific defines
+ *  for the different processors.
+ *
+ *  Do not include any C declarations in this file - it is included by
+ *  assembler source.
+ */
+#include <asm/ptrace.h>
+
+/*
+ * Endian independent macros for shifting bytes within registers.
+ */
+#ifndef __ARMEB__
+#define pull            lsr
+#define push            lsl
+#define get_byte_0      lsl #0
+#define get_byte_1	lsr #8
+#define get_byte_2	lsr #16
+#define get_byte_3	lsr #24
+#define put_byte_0      lsl #0
+#define put_byte_1	lsl #8
+#define put_byte_2	lsl #16
+#define put_byte_3	lsl #24
+#else
+#define pull            lsl
+#define push            lsr
+#define get_byte_0	lsr #24
+#define get_byte_1	lsr #16
+#define get_byte_2	lsr #8
+#define get_byte_3      lsl #0
+#define put_byte_0	lsl #24
+#define put_byte_1	lsl #16
+#define put_byte_2	lsl #8
+#define put_byte_3      lsl #0
+#endif
+
+/*
+ * Data preload for architectures that support it
+ */
+#if __LINUX_ARM_ARCH__ >= 5
+#define PLD(code...)	code
+#else
+#define PLD(code...)
+#endif
+
+/*
+ * This can be used to enable code to cacheline align the destination
+ * pointer when bulk writing to memory.  Experiments on StrongARM and
+ * XScale didn't show this a worthwhile thing to do when the cache is not
+ * set to write-allocate (this would need further testing on XScale when WA
+ * is used).
+ *
+ * On Feroceon there is much to gain however, regardless of cache mode.
+ */
+#ifdef CONFIG_CPU_FEROCEON
+#define CALGN(code...) code
+#else
+#define CALGN(code...)
+#endif
+
+/*
+ * Enable and disable interrupts
+ */
+#if __LINUX_ARM_ARCH__ >= 6
+	.macro	disable_irq
+	cpsid	i				@ set the IRQ mask bit only
+	.endm
+
+	.macro	enable_irq
+	cpsie	i				@ clear the IRQ mask bit only
+	.endm
+#else	/* also taken when __LINUX_ARM_ARCH__ is not defined (cpp reads it as 0) */
+	.macro	disable_irq
+	msr	cpsr_c, #PSR_I_BIT | SVC_MODE	@ rewrites the whole control field
+	.endm
+
+	.macro	enable_irq
+	msr	cpsr_c, #SVC_MODE		@ rewrites the whole control field
+	.endm
+#endif
+
+/*
+ * Save the current IRQ state and disable IRQs.  Note that this macro
+ * assumes FIQs are enabled, and that the processor is in SVC mode.
+ */
+	.macro	save_and_disable_irqs, oldcpsr
+	mrs	\oldcpsr, cpsr			@ copy the current CPSR out first
+	disable_irq				@ then mask IRQs via the macro above
+	.endm
+
+/*
+ * Restore interrupt state previously stored in a register.  We don't
+ * guarantee that this will preserve the flags.
+ */
+	.macro	restore_irqs, oldcpsr
+	msr	cpsr_c, \oldcpsr		@ only the control field is written
+	.endm
+
+#define USER(x...)	/* wrap insn x with an __ex_table entry */	\
+9999:	x;		/* local label on the wrapped instruction */	\
+	.section __ex_table,"a";		\
+	.align	3;				\
+	.long	9999b,9001f;	/* 9001: must be supplied by the user of USER() */ \
+	.previous
diff --git a/include/asm-arm/linkage.h b/include/asm-arm/linkage.h
new file mode 100644
index 0000000..5a25632
--- /dev/null
+++ b/include/asm-arm/linkage.h
@@ -0,0 +1,11 @@
+#ifndef __ASM_LINKAGE_H
+#define __ASM_LINKAGE_H
+/* ARM overrides; <linux/linkage.h> only defines these when still unset. */
+#define __ALIGN .align 0
+#define __ALIGN_STR ".align 0"
+/* Like the generic ENDPROC, but spelled %function instead of @function. */
+#define ENDPROC(name) \
+  .type name, %function; \
+  END(name)
+
+#endif
diff --git a/include/linux/linkage.h b/include/linux/linkage.h
new file mode 100644
index 0000000..b25624e
--- /dev/null
+++ b/include/linux/linkage.h
@@ -0,0 +1,95 @@
+#ifndef _LINUX_LINKAGE_H
+#define _LINUX_LINKAGE_H
+
+#include <asm/linkage.h>
+
+#ifdef __cplusplus
+#define CPP_ASMLINKAGE extern "C"
+#else
+#define CPP_ASMLINKAGE
+#endif
+
+#ifndef asmlinkage
+#define asmlinkage CPP_ASMLINKAGE
+#endif
+
+#ifndef asmregparm
+# define asmregparm
+#endif
+
+#define __page_aligned_data	__section(.data.page_aligned) __aligned(PAGE_SIZE)
+#define __page_aligned_bss	__section(.bss.page_aligned) __aligned(PAGE_SIZE)
+
+/*
+ * This is used by architectures to keep arguments on the stack
+ * untouched by the compiler by keeping them live until the end.
+ * The argument stack may be owned by the assembly-language
+ * caller, not the callee, and gcc doesn't always understand
+ * that.
+ *
+ * We have the return value, and a maximum of six arguments.
+ *
+ * This should always be followed by a "return ret" for the
+ * protection to work (ie no more work that the compiler might
+ * end up needing stack temporaries for).
+ */
+/* Assembly files may be compiled with -traditional .. */
+#ifndef __ASSEMBLY__
+#ifndef asmlinkage_protect
+# define asmlinkage_protect(n, ret, args...)	do { } while (0)
+#endif
+#endif
+
+#ifndef __ALIGN
+#define __ALIGN		.align 4,0x90
+#define __ALIGN_STR	".align 4,0x90"
+#endif
+
+#ifdef __ASSEMBLY__
+/* Assembler-only helpers; the #ifndef-guarded ones may come from <asm/linkage.h>. */
+#define ALIGN __ALIGN
+#define ALIGN_STR __ALIGN_STR
+/* ENTRY: make 'name' global, apply ALIGN, then place the label. */
+#ifndef ENTRY
+#define ENTRY(name) \
+  .globl name; \
+  ALIGN; \
+  name:
+#endif
+/* WEAK: weak symbol plus label; unlike ENTRY, no ALIGN is emitted. */
+#ifndef WEAK
+#define WEAK(name)	   \
+	.weak name;	   \
+	name:
+#endif
+/* KPROBE_ENTRY/KPROBE_END bracket code that is placed in .kprobes.text. */
+#define KPROBE_ENTRY(name) \
+  .pushsection .kprobes.text, "ax"; \
+  ENTRY(name)
+/* END is defined further down; fine, since macros expand at the point of use. */
+#define KPROBE_END(name) \
+  END(name);		 \
+  .popsection
+/* END: record the symbol size as the distance from 'name' to this point. */
+#ifndef END
+#define END(name) \
+  .size name, .-name
+#endif
+
+/* If symbol 'name' is treated as a subroutine (gets called, and returns)
+ * then please use ENDPROC to mark 'name' as STT_FUNC for the benefit of
+ * static analysis tools such as stack depth analyzer.
+ */
+#ifndef ENDPROC
+#define ENDPROC(name) \
+  .type name, @function; \
+  END(name)
+#endif
+
+#endif /* __ASSEMBLY__ */
+
+#define NORET_TYPE    /**/
+#define ATTRIB_NORET  __attribute__((noreturn))
+#define NORET_AND     noreturn,
+
+#endif
diff --git a/lib_arm/Makefile b/lib_arm/Makefile
index c37e2e0..3fda6d2 100644
--- a/lib_arm/Makefile
+++ b/lib_arm/Makefile
@@ -26,15 +26,12 @@ include $(TOPDIR)/config.mk
 LIB	= $(obj)lib$(ARCH).a
 LIBGCC	= $(obj)libgcc.a
 
-GLSOBJS	+= _ashldi3.o
-GLSOBJS	+= _ashrdi3.o
-GLSOBJS	+= _divsi3.o
-GLSOBJS	+= _lshrdi3.o
-GLSOBJS	+= _modsi3.o
-GLSOBJS	+= _udivsi3.o
-GLSOBJS	+= _umodsi3.o
+GLCOBJS-y += div0.o
 
-GLCOBJS	+= div0.o
+GLSOBJS-y += lib1funcs.o
+GLSOBJS-y += ashldi3.o
+GLSOBJS-y += ashrdi3.o
+GLSOBJS-y += lshrdi3.o
 
 COBJS-y	+= board.o
 COBJS-y	+= bootm.o
@@ -45,11 +42,11 @@ endif
 COBJS-y	+= interrupts.o
 COBJS-y	+= reset.o
 
-SRCS	:= $(GLSOBJS:.o=.S) $(GLCOBJS:.o=.c) \
+SRCS	:= $(GLSOBJS-y:.o=.S) $(GLCOBJS-y:.o=.c) \
 	   $(SOBJS-y:.o=.S) $(COBJS-y:.o=.c)
 OBJS	:= $(addprefix $(obj),$(SOBJS-y) $(COBJS-y))
-LGOBJS	:= $(addprefix $(obj),$(GLSOBJS)) \
-	   $(addprefix $(obj),$(GLCOBJS))
+LGOBJS	:= $(addprefix $(obj),$(GLSOBJS-y)) \
+	   $(addprefix $(obj),$(GLCOBJS-y))
 
 ifdef USE_PRIVATE_LIBGCC
 all:	$(LIB) $(LIBGCC)
diff --git a/lib_arm/_divsi3.S b/lib_arm/_divsi3.S
deleted file mode 100644
index 9dc15f6..0000000
--- a/lib_arm/_divsi3.S
+++ /dev/null
@@ -1,140 +0,0 @@
-
-.macro ARM_DIV_BODY dividend, divisor, result, curbit
-
-#if __LINUX_ARM_ARCH__ >= 5
-
-	clz	\curbit, \divisor
-	clz	\result, \dividend
-	sub	\result, \curbit, \result
-	mov	\curbit, #1
-	mov	\divisor, \divisor, lsl \result
-	mov	\curbit, \curbit, lsl \result
-	mov	\result, #0
-
-#else
-
-	@ Initially shift the divisor left 3 bits if possible,
-	@ set curbit accordingly.  This allows for curbit to be located
-	@ at the left end of each 4 bit nibbles in the division loop
-	@ to save one loop in most cases.
-	tst	\divisor, #0xe0000000
-	moveq	\divisor, \divisor, lsl #3
-	moveq	\curbit, #8
-	movne	\curbit, #1
-
-	@ Unless the divisor is very big, shift it up in multiples of
-	@ four bits, since this is the amount of unwinding in the main
-	@ division loop.  Continue shifting until the divisor is
-	@ larger than the dividend.
-1:	cmp	\divisor, #0x10000000
-	cmplo	\divisor, \dividend
-	movlo	\divisor, \divisor, lsl #4
-	movlo	\curbit, \curbit, lsl #4
-	blo	1b
-
-	@ For very big divisors, we must shift it a bit at a time, or
-	@ we will be in danger of overflowing.
-1:	cmp	\divisor, #0x80000000
-	cmplo	\divisor, \dividend
-	movlo	\divisor, \divisor, lsl #1
-	movlo	\curbit, \curbit, lsl #1
-	blo	1b
-
-	mov	\result, #0
-
-#endif
-
-	@ Division loop
-1:	cmp	\dividend, \divisor
-	subhs	\dividend, \dividend, \divisor
-	orrhs	\result,   \result,   \curbit
-	cmp	\dividend, \divisor,  lsr #1
-	subhs	\dividend, \dividend, \divisor, lsr #1
-	orrhs	\result,   \result,   \curbit,  lsr #1
-	cmp	\dividend, \divisor,  lsr #2
-	subhs	\dividend, \dividend, \divisor, lsr #2
-	orrhs	\result,   \result,   \curbit,  lsr #2
-	cmp	\dividend, \divisor,  lsr #3
-	subhs	\dividend, \dividend, \divisor, lsr #3
-	orrhs	\result,   \result,   \curbit,  lsr #3
-	cmp	\dividend, #0			@ Early termination?
-	movnes	\curbit,   \curbit,  lsr #4	@ No, any more bits to do?
-	movne	\divisor,  \divisor, lsr #4
-	bne	1b
-
-.endm
-
-.macro ARM_DIV2_ORDER divisor, order
-
-#if __LINUX_ARM_ARCH__ >= 5
-
-	clz	\order, \divisor
-	rsb	\order, \order, #31
-
-#else
-
-	cmp	\divisor, #(1 << 16)
-	movhs	\divisor, \divisor, lsr #16
-	movhs	\order, #16
-	movlo	\order, #0
-
-	cmp	\divisor, #(1 << 8)
-	movhs	\divisor, \divisor, lsr #8
-	addhs	\order, \order, #8
-
-	cmp	\divisor, #(1 << 4)
-	movhs	\divisor, \divisor, lsr #4
-	addhs	\order, \order, #4
-
-	cmp	\divisor, #(1 << 2)
-	addhi	\order, \order, #3
-	addls	\order, \order, \divisor, lsr #1
-
-#endif
-
-.endm
-
-	.align	5
-.globl __divsi3
-__divsi3:
-	cmp	r1, #0
-	eor	ip, r0, r1			@ save the sign of the result.
-	beq	Ldiv0
-	rsbmi	r1, r1, #0			@ loops below use unsigned.
-	subs	r2, r1, #1			@ division by 1 or -1 ?
-	beq	10f
-	movs	r3, r0
-	rsbmi	r3, r0, #0			@ positive dividend value
-	cmp	r3, r1
-	bls	11f
-	tst	r1, r2				@ divisor is power of 2 ?
-	beq	12f
-
-	ARM_DIV_BODY r3, r1, r0, r2
-
-	cmp	ip, #0
-	rsbmi	r0, r0, #0
-	mov	pc, lr
-
-10:	teq	ip, r0				@ same sign ?
-	rsbmi	r0, r0, #0
-	mov	pc, lr
-
-11:	movlo	r0, #0
-	moveq	r0, ip, asr #31
-	orreq	r0, r0, #1
-	mov	pc, lr
-
-12:	ARM_DIV2_ORDER r1, r2
-
-	cmp	ip, #0
-	mov	r0, r3, lsr r2
-	rsbmi	r0, r0, #0
-	mov	pc, lr
-
-Ldiv0:
-
-	str	lr, [sp, #-4]!
-	bl	__div0
-	mov	r0, #0			@ About as wrong as it could be.
-	ldr	pc, [sp], #4
diff --git a/lib_arm/_modsi3.S b/lib_arm/_modsi3.S
deleted file mode 100644
index 539c584..0000000
--- a/lib_arm/_modsi3.S
+++ /dev/null
@@ -1,99 +0,0 @@
-
-.macro ARM_MOD_BODY dividend, divisor, order, spare
-
-#if __LINUX_ARM_ARCH__ >= 5
-
-	clz	\order, \divisor
-	clz	\spare, \dividend
-	sub	\order, \order, \spare
-	mov	\divisor, \divisor, lsl \order
-
-#else
-
-	mov	\order, #0
-
-	@ Unless the divisor is very big, shift it up in multiples of
-	@ four bits, since this is the amount of unwinding in the main
-	@ division loop.  Continue shifting until the divisor is
-	@ larger than the dividend.
-1:	cmp	\divisor, #0x10000000
-	cmplo	\divisor, \dividend
-	movlo	\divisor, \divisor, lsl #4
-	addlo	\order, \order, #4
-	blo	1b
-
-	@ For very big divisors, we must shift it a bit at a time, or
-	@ we will be in danger of overflowing.
-1:	cmp	\divisor, #0x80000000
-	cmplo	\divisor, \dividend
-	movlo	\divisor, \divisor, lsl #1
-	addlo	\order, \order, #1
-	blo	1b
-
-#endif
-
-	@ Perform all needed substractions to keep only the reminder.
-	@ Do comparisons in batch of 4 first.
-	subs	\order, \order, #3		@ yes, 3 is intended here
-	blt	2f
-
-1:	cmp	\dividend, \divisor
-	subhs	\dividend, \dividend, \divisor
-	cmp	\dividend, \divisor,  lsr #1
-	subhs	\dividend, \dividend, \divisor, lsr #1
-	cmp	\dividend, \divisor,  lsr #2
-	subhs	\dividend, \dividend, \divisor, lsr #2
-	cmp	\dividend, \divisor,  lsr #3
-	subhs	\dividend, \dividend, \divisor, lsr #3
-	cmp	\dividend, #1
-	mov	\divisor, \divisor, lsr #4
-	subges	\order, \order, #4
-	bge	1b
-
-	tst	\order, #3
-	teqne	\dividend, #0
-	beq	5f
-
-	@ Either 1, 2 or 3 comparison/substractions are left.
-2:	cmn	\order, #2
-	blt	4f
-	beq	3f
-	cmp	\dividend, \divisor
-	subhs	\dividend, \dividend, \divisor
-	mov	\divisor,  \divisor,  lsr #1
-3:	cmp	\dividend, \divisor
-	subhs	\dividend, \dividend, \divisor
-	mov	\divisor,  \divisor,  lsr #1
-4:	cmp	\dividend, \divisor
-	subhs	\dividend, \dividend, \divisor
-5:
-.endm
-
-	.align	5
-.globl __modsi3
-__modsi3:
-	cmp	r1, #0
-	beq	Ldiv0
-	rsbmi	r1, r1, #0			@ loops below use unsigned.
-	movs	ip, r0				@ preserve sign of dividend
-	rsbmi	r0, r0, #0			@ if negative make positive
-	subs	r2, r1, #1			@ compare divisor with 1
-	cmpne	r0, r1				@ compare dividend with divisor
-	moveq	r0, #0
-	tsthi	r1, r2				@ see if divisor is power of 2
-	andeq	r0, r0, r2
-	bls	10f
-
-	ARM_MOD_BODY r0, r1, r2, r3
-
-10:	cmp	ip, #0
-	rsbmi	r0, r0, #0
-	mov	pc, lr
-
-
-Ldiv0:
-
-	str	lr, [sp, #-4]!
-	bl	__div0
-	mov	r0, #0			@ About as wrong as it could be.
-	ldr	pc, [sp], #4
diff --git a/lib_arm/_udivsi3.S b/lib_arm/_udivsi3.S
deleted file mode 100644
index a3f9b59..0000000
--- a/lib_arm/_udivsi3.S
+++ /dev/null
@@ -1,77 +0,0 @@
-/* # 1 "libgcc1.S" */
-@ libgcc1 routines for ARM cpu.
-@ Division routines, written by Richard Earnshaw, (rearnsha at armltd.co.uk)
-dividend	.req	r0
-divisor		.req	r1
-result		.req	r2
-curbit		.req	r3
-/* ip		.req	r12	*/
-/* sp		.req	r13	*/
-/* lr		.req	r14	*/
-/* pc		.req	r15	*/
-	.text
-	.globl	 __udivsi3
-	.type  __udivsi3       ,function
-	.align	0
- __udivsi3      :
-	cmp	divisor, #0
-	beq	Ldiv0
-	mov	curbit, #1
-	mov	result, #0
-	cmp	dividend, divisor
-	bcc	Lgot_result
-Loop1:
-	@ Unless the divisor is very big, shift it up in multiples of
-	@ four bits, since this is the amount of unwinding in the main
-	@ division loop.  Continue shifting until the divisor is
-	@ larger than the dividend.
-	cmp	divisor, #0x10000000
-	cmpcc	divisor, dividend
-	movcc	divisor, divisor, lsl #4
-	movcc	curbit, curbit, lsl #4
-	bcc	Loop1
-Lbignum:
-	@ For very big divisors, we must shift it a bit at a time, or
-	@ we will be in danger of overflowing.
-	cmp	divisor, #0x80000000
-	cmpcc	divisor, dividend
-	movcc	divisor, divisor, lsl #1
-	movcc	curbit, curbit, lsl #1
-	bcc	Lbignum
-Loop3:
-	@ Test for possible subtractions, and note which bits
-	@ are done in the result.  On the final pass, this may subtract
-	@ too much from the dividend, but the result will be ok, since the
-	@ "bit" will have been shifted out at the bottom.
-	cmp	dividend, divisor
-	subcs	dividend, dividend, divisor
-	orrcs	result, result, curbit
-	cmp	dividend, divisor, lsr #1
-	subcs	dividend, dividend, divisor, lsr #1
-	orrcs	result, result, curbit, lsr #1
-	cmp	dividend, divisor, lsr #2
-	subcs	dividend, dividend, divisor, lsr #2
-	orrcs	result, result, curbit, lsr #2
-	cmp	dividend, divisor, lsr #3
-	subcs	dividend, dividend, divisor, lsr #3
-	orrcs	result, result, curbit, lsr #3
-	cmp	dividend, #0			@ Early termination?
-	movnes	curbit, curbit, lsr #4		@ No, any more bits to do?
-	movne	divisor, divisor, lsr #4
-	bne	Loop3
-Lgot_result:
-	mov	r0, result
-	mov	pc, lr
-Ldiv0:
-	str	lr, [sp, #-4]!
-	bl	 __div0       (PLT)
-	mov	r0, #0			@ about as wrong as it could be
-	ldmia	sp!, {pc}
-	.size  __udivsi3       , . -  __udivsi3
-/* # 235 "libgcc1.S" */
-/* # 320 "libgcc1.S" */
-/* # 421 "libgcc1.S" */
-/* # 433 "libgcc1.S" */
-/* # 456 "libgcc1.S" */
-/* # 500 "libgcc1.S" */
-/* # 580 "libgcc1.S" */
diff --git a/lib_arm/_umodsi3.S b/lib_arm/_umodsi3.S
deleted file mode 100644
index 8465ef0..0000000
--- a/lib_arm/_umodsi3.S
+++ /dev/null
@@ -1,88 +0,0 @@
-/* # 1 "libgcc1.S" */
-@ libgcc1 routines for ARM cpu.
-@ Division routines, written by Richard Earnshaw, (rearnsha at armltd.co.uk)
-/* # 145 "libgcc1.S" */
-dividend	.req	r0
-divisor		.req	r1
-overdone	.req	r2
-curbit		.req	r3
-/* ip		.req	r12	*/
-/* sp		.req	r13	*/
-/* lr		.req	r14	*/
-/* pc		.req	r15	*/
-	.text
-	.globl	 __umodsi3
-	.type  __umodsi3       ,function
-	.align 0
- __umodsi3      :
-	cmp	divisor, #0
-	beq	Ldiv0
-	mov	curbit, #1
-	cmp	dividend, divisor
-	movcc	pc, lr
-Loop1:
-	@ Unless the divisor is very big, shift it up in multiples of
-	@ four bits, since this is the amount of unwinding in the main
-	@ division loop.  Continue shifting until the divisor is
-	@ larger than the dividend.
-	cmp	divisor, #0x10000000
-	cmpcc	divisor, dividend
-	movcc	divisor, divisor, lsl #4
-	movcc	curbit, curbit, lsl #4
-	bcc	Loop1
-Lbignum:
-	@ For very big divisors, we must shift it a bit at a time, or
-	@ we will be in danger of overflowing.
-	cmp	divisor, #0x80000000
-	cmpcc	divisor, dividend
-	movcc	divisor, divisor, lsl #1
-	movcc	curbit, curbit, lsl #1
-	bcc	Lbignum
-Loop3:
-	@ Test for possible subtractions.  On the final pass, this may
-	@ subtract too much from the dividend, so keep track of which
-	@ subtractions are done, we can fix them up afterwards...
-	mov	overdone, #0
-	cmp	dividend, divisor
-	subcs	dividend, dividend, divisor
-	cmp	dividend, divisor, lsr #1
-	subcs	dividend, dividend, divisor, lsr #1
-	orrcs	overdone, overdone, curbit, ror #1
-	cmp	dividend, divisor, lsr #2
-	subcs	dividend, dividend, divisor, lsr #2
-	orrcs	overdone, overdone, curbit, ror #2
-	cmp	dividend, divisor, lsr #3
-	subcs	dividend, dividend, divisor, lsr #3
-	orrcs	overdone, overdone, curbit, ror #3
-	mov	ip, curbit
-	cmp	dividend, #0			@ Early termination?
-	movnes	curbit, curbit, lsr #4		@ No, any more bits to do?
-	movne	divisor, divisor, lsr #4
-	bne	Loop3
-	@ Any subtractions that we should not have done will be recorded in
-	@ the top three bits of "overdone".  Exactly which were not needed
-	@ are governed by the position of the bit, stored in ip.
-	@ If we terminated early, because dividend became zero,
-	@ then none of the below will match, since the bit in ip will not be
-	@ in the bottom nibble.
-	ands	overdone, overdone, #0xe0000000
-	moveq	pc, lr				@ No fixups needed
-	tst	overdone, ip, ror #3
-	addne	dividend, dividend, divisor, lsr #3
-	tst	overdone, ip, ror #2
-	addne	dividend, dividend, divisor, lsr #2
-	tst	overdone, ip, ror #1
-	addne	dividend, dividend, divisor, lsr #1
-	mov	pc, lr
-Ldiv0:
-	str	lr, [sp, #-4]!
-	bl	 __div0       (PLT)
-	mov	r0, #0			@ about as wrong as it could be
-	ldmia	sp!, {pc}
-	.size  __umodsi3       , . -  __umodsi3
-/* # 320 "libgcc1.S" */
-/* # 421 "libgcc1.S" */
-/* # 433 "libgcc1.S" */
-/* # 456 "libgcc1.S" */
-/* # 500 "libgcc1.S" */
-/* # 580 "libgcc1.S" */
diff --git a/lib_arm/_ashldi3.S b/lib_arm/ashldi3.S
similarity index 93%
rename from lib_arm/_ashldi3.S
rename to lib_arm/ashldi3.S
index de4403d..1154d92 100644
--- a/lib_arm/_ashldi3.S
+++ b/lib_arm/ashldi3.S
@@ -26,6 +26,8 @@ the Free Software Foundation, 51 Franklin Street, Fifth Floor,
 Boston, MA 02110-1301, USA.  */
 
 
+#include <linux/linkage.h>
+
 #ifdef __ARMEB__
 #define al r1
 #define ah r0
@@ -34,8 +36,8 @@ Boston, MA 02110-1301, USA.  */
 #define ah r1
 #endif
 
-.globl __ashldi3
-__ashldi3:
+ENTRY(__ashldi3)
+ENTRY(__aeabi_llsl)
 
 	subs	r3, r2, #32
 	rsb	ip, r2, #32
@@ -44,3 +46,6 @@ __ashldi3:
 	orrmi	ah, ah, al, lsr ip
 	mov	al, al, lsl r2
 	mov	pc, lr
+
+ENDPROC(__ashldi3)
+ENDPROC(__aeabi_llsl)
diff --git a/lib_arm/_ashrdi3.S b/lib_arm/ashrdi3.S
similarity index 93%
copy from lib_arm/_ashrdi3.S
copy to lib_arm/ashrdi3.S
index 5edbcb3..9f8b355 100644
--- a/lib_arm/_ashrdi3.S
+++ b/lib_arm/ashrdi3.S
@@ -26,6 +26,8 @@ the Free Software Foundation, 51 Franklin Street, Fifth Floor,
 Boston, MA 02110-1301, USA.  */
 
 
+#include <linux/linkage.h>
+
 #ifdef __ARMEB__
 #define al r1
 #define ah r0
@@ -34,8 +36,8 @@ Boston, MA 02110-1301, USA.  */
 #define ah r1
 #endif
 
-.globl __ashrdi3
-__ashrdi3:
+ENTRY(__ashrdi3)
+ENTRY(__aeabi_lasr)
 
 	subs	r3, r2, #32
 	rsb	ip, r2, #32
@@ -44,3 +46,6 @@ __ashrdi3:
 	orrmi	al, al, ah, lsl ip
 	mov	ah, ah, asr r2
 	mov	pc, lr
+
+ENDPROC(__ashrdi3)
+ENDPROC(__aeabi_lasr)
diff --git a/lib_arm/config.mk b/lib_arm/config.mk
index a13603e..3f85cfd 100644
--- a/lib_arm/config.mk
+++ b/lib_arm/config.mk
@@ -26,3 +26,5 @@ CROSS_COMPILE ?= arm-linux-
 PLATFORM_CPPFLAGS += -DCONFIG_ARM -D__ARM__
 
 LDSCRIPT := $(SRCTREE)/cpu/$(CPU)/u-boot.lds
+
+USE_PRIVATE_LIBGCC ?= yes
diff --git a/lib_arm/lib1funcs.S b/lib_arm/lib1funcs.S
new file mode 100644
index 0000000..97488ba
--- /dev/null
+++ b/lib_arm/lib1funcs.S
@@ -0,0 +1,344 @@
+/*
+ * linux/arch/arm/lib/lib1funcs.S: Optimized ARM division routines
+ *
+ * Author: Nicolas Pitre <nico at cam.org>
+ *   - contributed to gcc-3.4 on Sep 30, 2003
+ *   - adapted for the Linux kernel on Oct 2, 2003
+ */
+
+/* Copyright 1995, 1996, 1998, 1999, 2000, 2003 Free Software Foundation, Inc.
+
+This file is free software; you can redistribute it and/or modify it
+under the terms of the GNU General Public License as published by the
+Free Software Foundation; either version 2, or (at your option) any
+later version.
+
+In addition to the permissions in the GNU General Public License, the
+Free Software Foundation gives you unlimited permission to link the
+compiled version of this file into combinations with other programs,
+and to distribute those combinations without any restriction coming
+from the use of this file.  (The General Public License restrictions
+do apply in other respects; for example, they cover modification of
+the file, and distribution when not linked into a combine
+executable.)
+
+This file is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; see the file COPYING.  If not, write to
+the Free Software Foundation, 59 Temple Place - Suite 330,
+Boston, MA 02111-1307, USA.  */
+
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+
+.macro ARM_DIV_BODY dividend, divisor, result, curbit
+	@ Unsigned divide: quotient -> result; dividend, divisor, curbit are clobbered.
+#if __LINUX_ARM_ARCH__ >= 5
+	@ clz path: line the divisor up with the top set bit of the dividend.
+	clz	\curbit, \divisor
+	clz	\result, \dividend
+	sub	\result, \curbit, \result
+	mov	\curbit, #1
+	mov	\divisor, \divisor, lsl \result
+	mov	\curbit, \curbit, lsl \result
+	mov	\result, #0
+	@ curbit is now the quotient bit that matches the shifted divisor.
+#else
+
+	@ Initially shift the divisor left 3 bits if possible,
+	@ set curbit accordingly.  This allows for curbit to be located
+	@ at the left end of each 4 bit nibbles in the division loop
+	@ to save one loop in most cases.
+	tst	\divisor, #0xe0000000
+	moveq	\divisor, \divisor, lsl #3
+	moveq	\curbit, #8
+	movne	\curbit, #1
+
+	@ Unless the divisor is very big, shift it up in multiples of
+	@ four bits, since this is the amount of unwinding in the main
+	@ division loop.  Continue shifting until the divisor is
+	@ larger than the dividend.
+1:	cmp	\divisor, #0x10000000
+	cmplo	\divisor, \dividend
+	movlo	\divisor, \divisor, lsl #4
+	movlo	\curbit, \curbit, lsl #4
+	blo	1b
+
+	@ For very big divisors, we must shift it a bit at a time, or
+	@ we will be in danger of overflowing.
+1:	cmp	\divisor, #0x80000000
+	cmplo	\divisor, \dividend
+	movlo	\divisor, \divisor, lsl #1
+	movlo	\curbit, \curbit, lsl #1
+	blo	1b
+
+	mov	\result, #0
+
+#endif
+
+	@ Division loop: up to four quotient bits are produced per pass.
+1:	cmp	\dividend, \divisor
+	subhs	\dividend, \dividend, \divisor
+	orrhs	\result,   \result,   \curbit
+	cmp	\dividend, \divisor,  lsr #1
+	subhs	\dividend, \dividend, \divisor, lsr #1
+	orrhs	\result,   \result,   \curbit,  lsr #1
+	cmp	\dividend, \divisor,  lsr #2
+	subhs	\dividend, \dividend, \divisor, lsr #2
+	orrhs	\result,   \result,   \curbit,  lsr #2
+	cmp	\dividend, \divisor,  lsr #3
+	subhs	\dividend, \dividend, \divisor, lsr #3
+	orrhs	\result,   \result,   \curbit,  lsr #3
+	cmp	\dividend, #0			@ Early termination?
+	movnes	\curbit,   \curbit,  lsr #4	@ No, any more bits to do?
+	movne	\divisor,  \divisor, lsr #4
+	bne	1b
+
+.endm
+
+
+.macro ARM_DIV2_ORDER divisor, order
+	@ order = bit index of the single set bit of a power-of-two divisor.
+#if __LINUX_ARM_ARCH__ >= 5
+
+	clz	\order, \divisor
+	rsb	\order, \order, #31		@ 31 - leading zeros = bit index
+
+#else
+	@ Binary search on the bit position; this path clobbers divisor.
+	cmp	\divisor, #(1 << 16)
+	movhs	\divisor, \divisor, lsr #16
+	movhs	\order, #16
+	movlo	\order, #0
+
+	cmp	\divisor, #(1 << 8)
+	movhs	\divisor, \divisor, lsr #8
+	addhs	\order, \order, #8
+
+	cmp	\divisor, #(1 << 4)
+	movhs	\divisor, \divisor, lsr #4
+	addhs	\order, \order, #4
+
+	cmp	\divisor, #(1 << 2)
+	addhi	\order, \order, #3		@ remaining value 8 -> 3
+	addls	\order, \order, \divisor, lsr #1	@ remaining 1, 2, 4 -> 0, 1, 2
+
+#endif
+
+.endm
+
+
+.macro ARM_MOD_BODY dividend, divisor, order, spare
+	@ Unsigned modulo: dividend is reduced in place to the remainder.
+#if __LINUX_ARM_ARCH__ >= 5
+
+	clz	\order, \divisor
+	clz	\spare, \dividend
+	sub	\order, \order, \spare
+	mov	\divisor, \divisor, lsl \order
+
+#else
+
+	mov	\order, #0
+
+	@ Unless the divisor is very big, shift it up in multiples of
+	@ four bits, since this is the amount of unwinding in the main
+	@ division loop.  Continue shifting until the divisor is
+	@ larger than the dividend.
+1:	cmp	\divisor, #0x10000000
+	cmplo	\divisor, \dividend
+	movlo	\divisor, \divisor, lsl #4
+	addlo	\order, \order, #4
+	blo	1b
+
+	@ For very big divisors, we must shift it a bit at a time, or
+	@ we will be in danger of overflowing.
+1:	cmp	\divisor, #0x80000000
+	cmplo	\divisor, \dividend
+	movlo	\divisor, \divisor, lsl #1
+	addlo	\order, \order, #1
+	blo	1b
+
+#endif
+	@ Here order = how many bit positions the divisor was shifted left.
+	@ Perform all needed subtractions to keep only the remainder.
+	@ Do comparisons in batch of 4 first.
+	subs	\order, \order, #3		@ yes, 3 is intended here
+	blt	2f
+
+1:	cmp	\dividend, \divisor
+	subhs	\dividend, \dividend, \divisor
+	cmp	\dividend, \divisor,  lsr #1
+	subhs	\dividend, \dividend, \divisor, lsr #1
+	cmp	\dividend, \divisor,  lsr #2
+	subhs	\dividend, \dividend, \divisor, lsr #2
+	cmp	\dividend, \divisor,  lsr #3
+	subhs	\dividend, \dividend, \divisor, lsr #3
+	cmp	\dividend, #1
+	mov	\divisor, \divisor, lsr #4
+	subges	\order, \order, #4
+	bge	1b
+
+	tst	\order, #3
+	teqne	\dividend, #0
+	beq	5f
+
+	@ Either 1, 2 or 3 comparison/subtractions are left.
+2:	cmn	\order, #2
+	blt	4f
+	beq	3f
+	cmp	\dividend, \divisor
+	subhs	\dividend, \dividend, \divisor
+	mov	\divisor,  \divisor,  lsr #1
+3:	cmp	\dividend, \divisor
+	subhs	\dividend, \dividend, \divisor
+	mov	\divisor,  \divisor,  lsr #1
+4:	cmp	\dividend, \divisor
+	subhs	\dividend, \dividend, \divisor
+5:
+.endm
+
+
+ENTRY(__udivsi3)
+ENTRY(__aeabi_uidiv)
+	@ Unsigned divide: r0 = r0 / r1.  r1-r3 are used as scratch.
+	subs	r2, r1, #1			@ r2 = divisor - 1
+	moveq	pc, lr				@ divisor == 1: r0 is already the result
+	bcc	Ldiv0				@ borrow means divisor == 0
+	cmp	r0, r1
+	bls	11f				@ dividend <= divisor: quotient is 1 or 0
+	tst	r1, r2				@ divisor is power of 2 ?
+	beq	12f
+
+	ARM_DIV_BODY r0, r1, r2, r3
+
+	mov	r0, r2				@ quotient was accumulated in r2
+	mov	pc, lr
+
+11:	moveq	r0, #1				@ dividend == divisor
+	movne	r0, #0				@ dividend < divisor
+	mov	pc, lr
+
+12:	ARM_DIV2_ORDER r1, r2
+
+	mov	r0, r0, lsr r2			@ divide by 2^r2 with a shift
+	mov	pc, lr
+
+ENDPROC(__udivsi3)
+ENDPROC(__aeabi_uidiv)
+
+ENTRY(__umodsi3)
+	@ Unsigned modulo: r0 = r0 % r1.  r1-r3 are used as scratch.
+	subs	r2, r1, #1			@ compare divisor with 1
+	bcc	Ldiv0				@ borrow means divisor == 0
+	cmpne	r0, r1				@ compare dividend with divisor
+	moveq   r0, #0				@ divisor == 1 or equal operands
+	tsthi	r1, r2				@ see if divisor is power of 2
+	andeq	r0, r0, r2			@ r0 &= divisor - 1 (no-op if r0 is 0)
+	movls	pc, lr				@ return for every case settled above
+
+	ARM_MOD_BODY r0, r1, r2, r3
+
+	mov	pc, lr
+
+ENDPROC(__umodsi3)
+
+ENTRY(__divsi3)
+ENTRY(__aeabi_idiv)
+	@ Signed divide: r0 = r0 / r1.  ip keeps the sign, r1-r3 are scratch.
+	cmp	r1, #0
+	eor	ip, r0, r1			@ save the sign of the result.
+	beq	Ldiv0
+	rsbmi	r1, r1, #0			@ loops below use unsigned.
+	subs	r2, r1, #1			@ division by 1 or -1 ?
+	beq	10f
+	movs	r3, r0
+	rsbmi	r3, r0, #0			@ positive dividend value
+	cmp	r3, r1
+	bls	11f				@ magnitude of dividend <= divisor
+	tst	r1, r2				@ divisor is power of 2 ?
+	beq	12f
+
+	ARM_DIV_BODY r3, r1, r0, r2
+
+	cmp	ip, #0				@ negate if operand signs differed
+	rsbmi	r0, r0, #0
+	mov	pc, lr
+
+10:	teq	ip, r0				@ same sign ?
+	rsbmi	r0, r0, #0
+	mov	pc, lr
+
+11:	movlo	r0, #0				@ smaller magnitude: quotient 0
+	moveq	r0, ip, asr #31			@ equal magnitudes: 0 or -1 from sign
+	orreq	r0, r0, #1			@ ... which becomes +1 or -1
+	mov	pc, lr
+
+12:	ARM_DIV2_ORDER r1, r2
+
+	cmp	ip, #0
+	mov	r0, r3, lsr r2			@ shift the magnitude, flags survive
+	rsbmi	r0, r0, #0
+	mov	pc, lr
+
+ENDPROC(__divsi3)
+ENDPROC(__aeabi_idiv)
+
+ENTRY(__modsi3)
+	@ Signed modulo: r0 = r0 % r1; the result takes the sign of the dividend.
+	cmp	r1, #0
+	beq	Ldiv0
+	rsbmi	r1, r1, #0			@ loops below use unsigned.
+	movs	ip, r0				@ preserve sign of dividend
+	rsbmi	r0, r0, #0			@ if negative make positive
+	subs	r2, r1, #1			@ compare divisor with 1
+	cmpne	r0, r1				@ compare dividend with divisor
+	moveq	r0, #0
+	tsthi	r1, r2				@ see if divisor is power of 2
+	andeq	r0, r0, r2
+	bls	10f				@ easy cases skip the long loop
+
+	ARM_MOD_BODY r0, r1, r2, r3
+
+10:	cmp	ip, #0				@ reapply the sign saved in ip
+	rsbmi	r0, r0, #0
+	mov	pc, lr
+
+ENDPROC(__modsi3)
+
+ENTRY(__aeabi_uidivmod)
+	@ Unsigned divide with remainder: quotient in r0, remainder in r1.
+	stmfd	sp!, {r0, r1, ip, lr}		@ keep both operands across the call
+	bl	__aeabi_uidiv
+	ldmfd	sp!, {r1, r2, ip, lr}		@ r1 = dividend, r2 = divisor
+	mul	r3, r0, r2			@ r3 = quotient * divisor
+	sub	r1, r1, r3			@ r1 = dividend - r3 = remainder
+	mov	pc, lr
+
+ENDPROC(__aeabi_uidivmod)
+
+ENTRY(__aeabi_idivmod)
+	@ Signed divide with remainder: quotient in r0, remainder in r1.
+	stmfd	sp!, {r0, r1, ip, lr}		@ keep both operands across the call
+	bl	__aeabi_idiv
+	ldmfd	sp!, {r1, r2, ip, lr}		@ r1 = dividend, r2 = divisor
+	mul	r3, r0, r2			@ r3 = quotient * divisor
+	sub	r1, r1, r3			@ r1 = dividend - r3 = remainder
+	mov	pc, lr
+
+ENDPROC(__aeabi_idivmod)
+
+Ldiv0:
+	@ Shared divide-by-zero exit, reached by branch: call __div0, return 0.
+	str	lr, [sp, #-8]!			@ push lr; sp moves by 8, not 4
+	bl	__div0
+	mov	r0, #0			@ About as wrong as it could be.
+	ldr	pc, [sp], #8			@ pop the saved lr straight into pc
+
+
diff --git a/lib_arm/_ashrdi3.S b/lib_arm/lshrdi3.S
similarity index 90%
rename from lib_arm/_ashrdi3.S
rename to lib_arm/lshrdi3.S
index 5edbcb3..99ea338 100644
--- a/lib_arm/_ashrdi3.S
+++ b/lib_arm/lshrdi3.S
@@ -26,6 +26,8 @@ the Free Software Foundation, 51 Franklin Street, Fifth Floor,
 Boston, MA 02110-1301, USA.  */
 
 
+#include <linux/linkage.h>
+
 #ifdef __ARMEB__
 #define al r1
 #define ah r0
@@ -34,13 +36,16 @@ Boston, MA 02110-1301, USA.  */
 #define ah r1
 #endif
 
-.globl __ashrdi3
-__ashrdi3:
+ENTRY(__lshrdi3)
+ENTRY(__aeabi_llsr)
 
 	subs	r3, r2, #32
 	rsb	ip, r2, #32
 	movmi	al, al, lsr r2
-	movpl	al, ah, asr r3
+	movpl	al, ah, lsr r3
 	orrmi	al, al, ah, lsl ip
-	mov	ah, ah, asr r2
+	mov	ah, ah, lsr r2
 	mov	pc, lr
+
+ENDPROC(__lshrdi3)
+ENDPROC(__aeabi_llsr)
-- 
1.6.3.1