diff --git a/include/asm-arm/assembler.h b/include/asm-arm/assembler.h
new file mode 100644
index 0000000..fce8328
--- /dev/null
+++ b/include/asm-arm/assembler.h
@@ -0,0 +1,101 @@
+/*
+ *  linux/include/asm-arm/assembler.h
+ *
+ *  Copyright (C) 1996-2000 Russell King
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ *  This file contains arm architecture specific defines
+ *  for the different processors.
+ *
+ *  Do not include any C declarations in this file - it is included by
+ *  assembler source.
+ */
+#ifndef __ASSEMBLY__
+#error "Only include this from assembly code"
+#endif
+
+#include <asm/ptrace.h>
+
+/*
+ * Endian independent macros for shifting bytes within registers.
+ */
+#ifndef __ARMEB__
+#define pull            lsr
+#define push            lsl
+#define get_byte_0      lsl #0
+#define get_byte_1	lsr #8
+#define get_byte_2	lsr #16
+#define get_byte_3	lsr #24
+#define put_byte_0      lsl #0
+#define put_byte_1	lsl #8
+#define put_byte_2	lsl #16
+#define put_byte_3	lsl #24
+#else
+#define pull            lsl
+#define push            lsr
+#define get_byte_0	lsr #24
+#define get_byte_1	lsr #16
+#define get_byte_2	lsr #8
+#define get_byte_3      lsl #0
+#define put_byte_0	lsl #24
+#define put_byte_1	lsl #16
+#define put_byte_2	lsl #8
+#define put_byte_3      lsl #0
+#endif
+
+/*
+ * Data preload for architectures that support it
+ */
+#if __LINUX_ARM_ARCH__ >= 5
+#define PLD(code...)	code
+#else
+#define PLD(code...)
+#endif
+
+/*
+ * Enable and disable interrupts
+ */
+#if __LINUX_ARM_ARCH__ >= 6
+	.macro	disable_irq
+	cpsid	i
+	.endm
+
+	.macro	enable_irq
+	cpsie	i
+	.endm
+#else
+	.macro	disable_irq
+	msr	cpsr_c, #PSR_I_BIT | SVC_MODE
+	.endm
+
+	.macro	enable_irq
+	msr	cpsr_c, #SVC_MODE
+	.endm
+#endif
+
+/*
+ * Save the current IRQ state and disable IRQs.  Note that this macro
+ * assumes FIQs are enabled, and that the processor is in SVC mode.
+ */
+	.macro	save_and_disable_irqs, oldcpsr
+	mrs	\oldcpsr, cpsr
+	disable_irq
+	.endm
+
+/*
+ * Restore interrupt state previously stored in a register.  We don't
+ * guarantee that this will preserve the flags.
+ */
+	.macro	restore_irqs, oldcpsr
+	msr	cpsr_c, \oldcpsr
+	.endm
+
+#define USER(x...)				\
+9999:	x;					\
+	.section __ex_table,"a";		\
+	.align	3;				\
+	.long	9999b,9001f;			\
+	.previous
diff --git a/include/asm-arm/linkage.h b/include/asm-arm/linkage.h
new file mode 100644
index 0000000..dbe4b4e
--- /dev/null
+++ b/include/asm-arm/linkage.h
@@ -0,0 +1,7 @@
+#ifndef __ASM_LINKAGE_H
+#define __ASM_LINKAGE_H
+
+#define __ALIGN .align 0
+#define __ALIGN_STR ".align 0"
+
+#endif
diff --git a/include/asm-arm/string.h b/include/asm-arm/string.h
index c3ea582..3fb0ab4 100644
--- a/include/asm-arm/string.h
+++ b/include/asm-arm/string.h
@@ -12,10 +12,10 @@ extern char * strrchr(const char * s, int c);
 #undef __HAVE_ARCH_STRCHR
 extern char * strchr(const char * s, int c);
 
-#undef __HAVE_ARCH_MEMCPY
+#define __HAVE_ARCH_MEMCPY
 extern void * memcpy(void *, const void *, __kernel_size_t);
 
-#undef __HAVE_ARCH_MEMMOVE
+#define __HAVE_ARCH_MEMMOVE
 extern void * memmove(void *, const void *, __kernel_size_t);
 
 #undef __HAVE_ARCH_MEMCHR
diff --git a/include/linux/linkage.h b/include/linux/linkage.h
new file mode 100644
index 0000000..6c9873f
--- /dev/null
+++ b/include/linux/linkage.h
@@ -0,0 +1,67 @@
+#ifndef _LINUX_LINKAGE_H
+#define _LINUX_LINKAGE_H
+
+#include <asm/linkage.h>
+
+#ifdef __cplusplus
+#define CPP_ASMLINKAGE extern "C"
+#else
+#define CPP_ASMLINKAGE
+#endif
+
+#ifndef asmlinkage
+#define asmlinkage CPP_ASMLINKAGE
+#endif
+
+#ifndef prevent_tail_call
+# define prevent_tail_call(ret) do { } while (0)
+#endif
+
+#ifndef __ALIGN
+#define __ALIGN		.align 4,0x90
+#define __ALIGN_STR	".align 4,0x90"
+#endif
+
+#ifdef __ASSEMBLY__
+
+#define ALIGN __ALIGN
+#define ALIGN_STR __ALIGN_STR
+
+#ifndef ENTRY
+#define ENTRY(name) \
+  .globl name; \
+  ALIGN; \
+  name:
+#endif
+
+#define KPROBE_ENTRY(name) \
+  .pushsection .kprobes.text, "ax"; \
+  ENTRY(name)
+
+#define KPROBE_END(name) \
+  END(name);		 \
+  .popsection
+
+#ifndef END
+#define END(name) \
+  .size name, .-name
+#endif
+
+#ifndef ENDPROC
+#define ENDPROC(name) \
+  .type name, @function; \
+  END(name)
+#endif
+
+#endif
+
+#define NORET_TYPE    /**/
+#define ATTRIB_NORET  __attribute__((noreturn))
+#define NORET_AND     noreturn,
+
+#ifndef FASTCALL
+#define FASTCALL(x)	x
+#define fastcall
+#endif
+
+#endif
diff --git a/lib_arm/Makefile b/lib_arm/Makefile
index 037c475..8be5622 100644
--- a/lib_arm/Makefile
+++ b/lib_arm/Makefile
@@ -25,7 +25,8 @@ include $(TOPDIR)/config.mk
 
 LIB	= $(obj)lib$(ARCH).a
 
-SOBJS	= _ashldi3.o _ashrdi3.o _divsi3.o _modsi3.o _udivsi3.o _umodsi3.o
+SOBJS	= _ashldi3.o _ashrdi3.o _divsi3.o _modsi3.o _udivsi3.o _umodsi3.o \
+	  memcpy.o memmove.o
 
 COBJS	= armlinux.o board.o \
 	  cache.o div0.o
diff --git a/lib_arm/copy_template.S b/lib_arm/copy_template.S
new file mode 100644
index 0000000..cab355c
--- /dev/null
+++ b/lib_arm/copy_template.S
@@ -0,0 +1,255 @@
+/*
+ *  linux/arch/arm/lib/copy_template.s
+ *
+ *  Code template for optimized memory copy functions
+ *
+ *  Author:	Nicolas Pitre
+ *  Created:	Sep 28, 2005
+ *  Copyright:	MontaVista Software, Inc.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2 as
+ *  published by the Free Software Foundation.
+ */
+
+/*
+ * This can be used to enable code to cacheline align the source pointer.
+ * Experiments on tested architectures (StrongARM and XScale) didn't show
+ * this a worthwhile thing to do.  That might be different in the future.
+ */
+//#define CALGN(code...)	code
+#define CALGN(code...)
+
+/*
+ * Theory of operation
+ * -------------------
+ *
+ * This file provides the core code for a forward memory copy used in
+ * the implementation of memcopy(), copy_to_user() and copy_from_user().
+ *
+ * The including file must define the following accessor macros
+ * according to the need of the given function:
+ *
+ * ldr1w ptr reg abort
+ *
+ *	This loads one word from 'ptr', stores it in 'reg' and increments
+ *	'ptr' to the next word. The 'abort' argument is used for fixup tables.
+ *
+ * ldr4w ptr reg1 reg2 reg3 reg4 abort
+ * ldr8w ptr, reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
+ *
+ *	This loads four or eight words starting from 'ptr', stores them
+ *	in provided registers and increments 'ptr' past those words.
+ *	The'abort' argument is used for fixup tables.
+ *
+ * ldr1b ptr reg cond abort
+ *
+ *	Similar to ldr1w, but it loads a byte and increments 'ptr' one byte.
+ *	It also must apply the condition code if provided, otherwise the
+ *	"al" condition is assumed by default.
+ *
+ * str1w ptr reg abort
+ * str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
+ * str1b ptr reg cond abort
+ *
+ *	Same as their ldr* counterparts, but data is stored to 'ptr' location
+ *	rather than being loaded.
+ *
+ * enter reg1 reg2
+ *
+ *	Preserve the provided registers on the stack plus any additional
+ *	data as needed by the implementation including this code. Called
+ *	upon code entry.
+ *
+ * exit reg1 reg2
+ *
+ *	Restore registers with the values previously saved with the
+ *	'preserv' macro. Called upon code termination.
+ */
+
+
+		enter	r4, lr
+
+		subs	r2, r2, #4
+		blt	8f
+		ands	ip, r0, #3
+	PLD(	pld	[r1, #0]		)
+		bne	9f
+		ands	ip, r1, #3
+		bne	10f
+
+1:		subs	r2, r2, #(28)
+		stmfd	sp!, {r5 - r8}
+		blt	5f
+
+	CALGN(	ands	ip, r1, #31		)
+	CALGN(	rsb	r3, ip, #32		)
+	CALGN(	sbcnes	r4, r3, r2		)  @ C is always set here
+	CALGN(	bcs	2f			)
+	CALGN(	adr	r4, 6f			)
+	CALGN(	subs	r2, r2, r3		)  @ C gets set
+	CALGN(	add	pc, r4, ip		)
+
+	PLD(	pld	[r1, #0]		)
+2:	PLD(	subs	r2, r2, #96		)
+	PLD(	pld	[r1, #28]		)
+	PLD(	blt	4f			)
+	PLD(	pld	[r1, #60]		)
+	PLD(	pld	[r1, #92]		)
+
+3:	PLD(	pld	[r1, #124]		)
+4:		ldr8w	r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
+		subs	r2, r2, #32
+		str8w	r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
+		bge	3b
+	PLD(	cmn	r2, #96			)
+	PLD(	bge	4b			)
+
+5:		ands	ip, r2, #28
+		rsb	ip, ip, #32
+		addne	pc, pc, ip		@ C is always clear here
+		b	7f
+6:		nop
+		ldr1w	r1, r3, abort=20f
+		ldr1w	r1, r4, abort=20f
+		ldr1w	r1, r5, abort=20f
+		ldr1w	r1, r6, abort=20f
+		ldr1w	r1, r7, abort=20f
+		ldr1w	r1, r8, abort=20f
+		ldr1w	r1, lr, abort=20f
+
+		add	pc, pc, ip
+		nop
+		nop
+		str1w	r0, r3, abort=20f
+		str1w	r0, r4, abort=20f
+		str1w	r0, r5, abort=20f
+		str1w	r0, r6, abort=20f
+		str1w	r0, r7, abort=20f
+		str1w	r0, r8, abort=20f
+		str1w	r0, lr, abort=20f
+
+	CALGN(	bcs	2b			)
+
+7:		ldmfd	sp!, {r5 - r8}
+
+8:		movs	r2, r2, lsl #31
+		ldr1b	r1, r3, ne, abort=21f
+		ldr1b	r1, r4, cs, abort=21f
+		ldr1b	r1, ip, cs, abort=21f
+		str1b	r0, r3, ne, abort=21f
+		str1b	r0, r4, cs, abort=21f
+		str1b	r0, ip, cs, abort=21f
+
+		exit	r4, pc
+
+9:		rsb	ip, ip, #4
+		cmp	ip, #2
+		ldr1b	r1, r3, gt, abort=21f
+		ldr1b	r1, r4, ge, abort=21f
+		ldr1b	r1, lr, abort=21f
+		str1b	r0, r3, gt, abort=21f
+		str1b	r0, r4, ge, abort=21f
+		subs	r2, r2, ip
+		str1b	r0, lr, abort=21f
+		blt	8b
+		ands	ip, r1, #3
+		beq	1b
+
+10:		bic	r1, r1, #3
+		cmp	ip, #2
+		ldr1w	r1, lr, abort=21f
+		beq	17f
+		bgt	18f
+
+
+		.macro	forward_copy_shift pull push
+
+		subs	r2, r2, #28
+		blt	14f
+
+	CALGN(	ands	ip, r1, #31		)
+	CALGN(	rsb	ip, ip, #32		)
+	CALGN(	sbcnes	r4, ip, r2		)  @ C is always set here
+	CALGN(	subcc	r2, r2, ip		)
+	CALGN(	bcc	15f			)
+
+11:		stmfd	sp!, {r5 - r9}
+
+	PLD(	pld	[r1, #0]		)
+	PLD(	subs	r2, r2, #96		)
+	PLD(	pld	[r1, #28]		)
+	PLD(	blt	13f			)
+	PLD(	pld	[r1, #60]		)
+	PLD(	pld	[r1, #92]		)
+
+12:	PLD(	pld	[r1, #124]		)
+13:		ldr4w	r1, r4, r5, r6, r7, abort=19f
+		mov	r3, lr, pull #\pull
+		subs	r2, r2, #32
+		ldr4w	r1, r8, r9, ip, lr, abort=19f
+		orr	r3, r3, r4, push #\push
+		mov	r4, r4, pull #\pull
+		orr	r4, r4, r5, push #\push
+		mov	r5, r5, pull #\pull
+		orr	r5, r5, r6, push #\push
+		mov	r6, r6, pull #\pull
+		orr	r6, r6, r7, push #\push
+		mov	r7, r7, pull #\pull
+		orr	r7, r7, r8, push #\push
+		mov	r8, r8, pull #\pull
+		orr	r8, r8, r9, push #\push
+		mov	r9, r9, pull #\pull
+		orr	r9, r9, ip, push #\push
+		mov	ip, ip, pull #\pull
+		orr	ip, ip, lr, push #\push
+		str8w	r0, r3, r4, r5, r6, r7, r8, r9, ip, , abort=19f
+		bge	12b
+	PLD(	cmn	r2, #96			)
+	PLD(	bge	13b			)
+
+		ldmfd	sp!, {r5 - r9}
+
+14:		ands	ip, r2, #28
+		beq	16f
+
+15:		mov	r3, lr, pull #\pull
+		ldr1w	r1, lr, abort=21f
+		subs	ip, ip, #4
+		orr	r3, r3, lr, push #\push
+		str1w	r0, r3, abort=21f
+		bgt	15b
+	CALGN(	cmp	r2, #0			)
+	CALGN(	bge	11b			)
+
+16:		sub	r1, r1, #(\push / 8)
+		b	8b
+
+		.endm
+
+
+		forward_copy_shift	pull=8	push=24
+
+17:		forward_copy_shift	pull=16	push=16
+
+18:		forward_copy_shift	pull=24	push=8
+
+
+/*
+ * Abort preamble and completion macros.
+ * If a fixup handler is required then those macros must surround it.
+ * It is assumed that the fixup code will handle the private part of
+ * the exit macro.
+ */
+
+	.macro	copy_abort_preamble
+19:	ldmfd	sp!, {r5 - r9}
+	b	21f
+20:	ldmfd	sp!, {r5 - r8}
+21:
+	.endm
+
+	.macro	copy_abort_end
+	ldmfd	sp!, {r4, pc}
+	.endm
+
diff --git a/lib_arm/memcpy.S b/lib_arm/memcpy.S
new file mode 100644
index 0000000..7e71d67
--- /dev/null
+++ b/lib_arm/memcpy.S
@@ -0,0 +1,59 @@
+/*
+ *  linux/arch/arm/lib/memcpy.S
+ *
+ *  Author:	Nicolas Pitre
+ *  Created:	Sep 28, 2005
+ *  Copyright:	MontaVista Software, Inc.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2 as
+ *  published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+	.macro ldr1w ptr reg abort
+	ldr \reg, [\ptr], #4
+	.endm
+
+	.macro ldr4w ptr reg1 reg2 reg3 reg4 abort
+	ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4}
+	.endm
+
+	.macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
+	ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
+	.endm
+
+	.macro ldr1b ptr reg cond=al abort
+	ldr\cond\()b \reg, [\ptr], #1
+	.endm
+
+	.macro str1w ptr reg abort
+	str \reg, [\ptr], #4
+	.endm
+
+	.macro str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
+	stmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
+	.endm
+
+	.macro str1b ptr reg cond=al abort
+	str\cond\()b \reg, [\ptr], #1
+	.endm
+
+	.macro enter reg1 reg2
+	stmdb sp!, {r0, \reg1, \reg2}
+	.endm
+
+	.macro exit reg1 reg2
+	ldmfd sp!, {r0, \reg1, \reg2}
+	.endm
+
+	.text
+
+/* Prototype: void *memcpy(void *dest, const void *src, size_t n); */
+
+ENTRY(memcpy)
+
+#include "copy_template.S"
+
diff --git a/lib_arm/memmove.S b/lib_arm/memmove.S
new file mode 100644
index 0000000..ef7fddc
--- /dev/null
+++ b/lib_arm/memmove.S
@@ -0,0 +1,206 @@
+/*
+ *  linux/arch/arm/lib/memmove.S
+ *
+ *  Author:	Nicolas Pitre
+ *  Created:	Sep 28, 2005
+ *  Copyright:	(C) MontaVista Software Inc.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2 as
+ *  published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+/*
+ * This can be used to enable code to cacheline align the source pointer.
+ * Experiments on tested architectures (StrongARM and XScale) didn't show
+ * this a worthwhile thing to do.  That might be different in the future.
+ */
+//#define CALGN(code...)        code
+#define CALGN(code...)
+
+		.text
+
+/*
+ * Prototype: void *memmove(void *dest, const void *src, size_t n);
+ *
+ * Note:
+ *
+ * If the memory regions don't overlap, we simply branch to memcpy which is
+ * normally a bit faster. Otherwise the copy is done going downwards.  This
+ * is a transposition of the code from copy_template.S but with the copy
+ * occurring in the opposite direction.
+ */
+
+ENTRY(memmove)
+
+		subs	ip, r0, r1
+		cmphi	r2, ip
+		bls	memcpy
+
+		stmfd	sp!, {r0, r4, lr}
+		add	r1, r1, r2
+		add	r0, r0, r2
+		subs	r2, r2, #4
+		blt	8f
+		ands	ip, r0, #3
+	PLD(	pld	[r1, #-4]		)
+		bne	9f
+		ands	ip, r1, #3
+		bne	10f
+
+1:		subs	r2, r2, #(28)
+		stmfd	sp!, {r5 - r8}
+		blt	5f
+
+	CALGN(	ands	ip, r1, #31		)
+	CALGN(	sbcnes	r4, ip, r2		)  @ C is always set here
+	CALGN(	bcs	2f			)
+	CALGN(	adr	r4, 6f			)
+	CALGN(	subs	r2, r2, ip		)  @ C is set here
+	CALGN(	add	pc, r4, ip		)
+
+	PLD(	pld	[r1, #-4]		)
+2:	PLD(	subs	r2, r2, #96		)
+	PLD(	pld	[r1, #-32]		)
+	PLD(	blt	4f			)
+	PLD(	pld	[r1, #-64]		)
+	PLD(	pld	[r1, #-96]		)
+
+3:	PLD(	pld	[r1, #-128]		)
+4:		ldmdb	r1!, {r3, r4, r5, r6, r7, r8, ip, lr}
+		subs	r2, r2, #32
+		stmdb	r0!, {r3, r4, r5, r6, r7, r8, ip, lr}
+		bge	3b
+	PLD(	cmn	r2, #96			)
+	PLD(	bge	4b			)
+
+5:		ands	ip, r2, #28
+		rsb	ip, ip, #32
+		addne	pc, pc, ip		@ C is always clear here
+		b	7f
+6:		nop
+		ldr	r3, [r1, #-4]!
+		ldr	r4, [r1, #-4]!
+		ldr	r5, [r1, #-4]!
+		ldr	r6, [r1, #-4]!
+		ldr	r7, [r1, #-4]!
+		ldr	r8, [r1, #-4]!
+		ldr	lr, [r1, #-4]!
+
+		add	pc, pc, ip
+		nop
+		nop
+		str	r3, [r0, #-4]!
+		str	r4, [r0, #-4]!
+		str	r5, [r0, #-4]!
+		str	r6, [r0, #-4]!
+		str	r7, [r0, #-4]!
+		str	r8, [r0, #-4]!
+		str	lr, [r0, #-4]!
+
+	CALGN(	bcs	2b			)
+
+7:		ldmfd	sp!, {r5 - r8}
+
+8:		movs	r2, r2, lsl #31
+		ldrneb	r3, [r1, #-1]!
+		ldrcsb	r4, [r1, #-1]!
+		ldrcsb	ip, [r1, #-1]
+		strneb	r3, [r0, #-1]!
+		strcsb	r4, [r0, #-1]!
+		strcsb	ip, [r0, #-1]
+		ldmfd	sp!, {r0, r4, pc}
+
+9:		cmp	ip, #2
+		ldrgtb	r3, [r1, #-1]!
+		ldrgeb	r4, [r1, #-1]!
+		ldrb	lr, [r1, #-1]!
+		strgtb	r3, [r0, #-1]!
+		strgeb	r4, [r0, #-1]!
+		subs	r2, r2, ip
+		strb	lr, [r0, #-1]!
+		blt	8b
+		ands	ip, r1, #3
+		beq	1b
+
+10:		bic	r1, r1, #3
+		cmp	ip, #2
+		ldr	r3, [r1, #0]
+		beq	17f
+		blt	18f
+
+
+		.macro	backward_copy_shift push pull
+
+		subs	r2, r2, #28
+		blt	14f
+
+	CALGN(	ands	ip, r1, #31		)
+	CALGN(	rsb	ip, ip, #32		)
+	CALGN(	sbcnes	r4, ip, r2		)  @ C is always set here
+	CALGN(	subcc	r2, r2, ip		)
+	CALGN(	bcc	15f			)
+
+11:		stmfd	sp!, {r5 - r9}
+
+	PLD(	pld	[r1, #-4]		)
+	PLD(	subs	r2, r2, #96		)
+	PLD(	pld	[r1, #-32]		)
+	PLD(	blt	13f			)
+	PLD(	pld	[r1, #-64]		)
+	PLD(	pld	[r1, #-96]		)
+
+12:	PLD(	pld	[r1, #-128]		)
+13:		ldmdb   r1!, {r7, r8, r9, ip}
+		mov     lr, r3, push #\push
+		subs    r2, r2, #32
+		ldmdb   r1!, {r3, r4, r5, r6}
+		orr     lr, lr, ip, pull #\pull
+		mov     ip, ip, push #\push
+		orr     ip, ip, r9, pull #\pull
+		mov     r9, r9, push #\push
+		orr     r9, r9, r8, pull #\pull
+		mov     r8, r8, push #\push
+		orr     r8, r8, r7, pull #\pull
+		mov     r7, r7, push #\push
+		orr     r7, r7, r6, pull #\pull
+		mov     r6, r6, push #\push
+		orr     r6, r6, r5, pull #\pull
+		mov     r5, r5, push #\push
+		orr     r5, r5, r4, pull #\pull
+		mov     r4, r4, push #\push
+		orr     r4, r4, r3, pull #\pull
+		stmdb   r0!, {r4 - r9, ip, lr}
+		bge	12b
+	PLD(	cmn	r2, #96			)
+	PLD(	bge	13b			)
+
+		ldmfd	sp!, {r5 - r9}
+
+14:		ands	ip, r2, #28
+		beq	16f
+
+15:		mov     lr, r3, push #\push
+		ldr	r3, [r1, #-4]!
+		subs	ip, ip, #4
+		orr	lr, lr, r3, pull #\pull
+		str	lr, [r0, #-4]!
+		bgt	15b
+	CALGN(	cmp	r2, #0			)
+	CALGN(	bge	11b			)
+
+16:		add	r1, r1, #(\pull / 8)
+		b	8b
+
+		.endm
+
+
+		backward_copy_shift	push=8	pull=24
+
+17:		backward_copy_shift	push=16	pull=16
+
+18:		backward_copy_shift	push=24	pull=8
+