[U-Boot] [PATCH] arm: add 64-64 bit divider

Wed Aug 31 12:38:50 CEST 2011

This patch adds a 64-64 bit divider that supports ARMv4 and above.

Because the clz (count leading zeros) instruction was not added until ARMv5,
the divider implements a software clz function for ARMv4 targets.

The divider was tested with the following test driver code, run under
qemu-arm:

  int main(void)	/* test driver: reads hex tuples from stdin, prints a, b, a / b, a % b */
  {
    uint64_t a, b, q, r;	/* q and r are parsed but never used below */
    /* NOTE(review): q and r are presumably reference results to diff the output against - confirm */
    while (scanf("%llx %llx %llx %llx", &a, &b, &q, &r) > 0)
      printf("%016llx %016llx %016llx %016llx\n", a, b, a / b, a % b);
    return 0;
  }

Signed-off-by: Che-Liang Chiou <clchiou at chromium.org>
Cc: Albert Aribaud <albert.u.boot at aribaud.net>
---
This patch was also tested with `MAKEALL -a arm`

 arch/arm/lib/Makefile    |    1 +
 arch/arm/lib/_uldivmod.S |  266 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 267 insertions(+), 0 deletions(-)
 create mode 100644 arch/arm/lib/_uldivmod.S

diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile
index 300c8fa..31770dd 100644
--- a/arch/arm/lib/Makefile
+++ b/arch/arm/lib/Makefile
@@ -33,6 +33,7 @@ GLSOBJS	+= _divsi3.o
 GLSOBJS	+= _lshrdi3.o
 GLSOBJS	+= _modsi3.o
 GLSOBJS	+= _udivsi3.o
+GLSOBJS	+= _uldivmod.o
 GLSOBJS	+= _umodsi3.o
 
 GLCOBJS	+= div0.o
diff --git a/arch/arm/lib/_uldivmod.S b/arch/arm/lib/_uldivmod.S
new file mode 100644
index 0000000..9e3a5e6
--- /dev/null
+++ b/arch/arm/lib/_uldivmod.S
@@ -0,0 +1,266 @@
+/*
+ * Copyright (c) 2011 The Chromium OS Authors.
+ * See file CREDITS for list of people who contributed to this
+ * project.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ * MA 02111-1307 USA
+ */
+
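+/*
+ * In:  A = r0 + (r1 << 32) is the dividend, B = r2 + (r3 << 32) the divisor
+ * Out: Q = r0 + (r1 << 32) is the quotient, R = r2 + (r3 << 32) the remainder
+ * A / B = Q ... R, i.e. A = Q * B + R with R < B (for B != 0)
+ */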
+
+	.text				@ the divider is emitted into the text section
+	.global	__aeabi_uldivmod	@ entry point is visible to the linker
+	.type	__aeabi_uldivmod, function
+	.align	0
+
+/* ARMv4 has no clz (count leading zeros) instruction, so branch to a software helper there. */
+#if __LINUX_ARM_ARCH__ <= 4	/* also taken when the macro is undefined (evaluates as 0) */
+#  define CLZ(dst, src)		bl	L_clz_ ## dst ## _ ## src	/* one helper label per (dst, src) pair */
+#  define CLZEQ(dst, src)	bleq	L_clz_ ## dst ## _ ## src
+#else
+#  define CLZ(dst, src)		clz	dst, src	/* native instruction */
+#  define CLZEQ(dst, src)	clzeq	dst, src
+#endif
+
+A_0	.req	r0		@ dividend A, low word
+A_1	.req	r1		@ dividend A, high word
+B_0	.req	r2		@ divisor B, low word
+B_1	.req	r3		@ divisor B, high word
+C_0	.req	r4		@ scratch C, low word (B - 1 mask, then current quotient bit)
+C_1	.req	r5		@ scratch C, high word
+D_0	.req	r6		@ scratch D, low word (clz values, then quotient accumulator)
+D_1	.req	r7		@ scratch D, high word
+
+Q_0	.req	r0		@ quotient Q, low word (same register as A_0)
+Q_1	.req	r1		@ quotient Q, high word (same register as A_1)
+R_0	.req	r2		@ remainder R, low word (same register as B_0)
+R_1	.req	r3		@ remainder R, high word (same register as B_1)
+
+__aeabi_uldivmod:	@ In: A = r1:r0, B = r3:r2. Out: Q = A / B in r1:r0, R = A % B in r3:r2
+	stmfd	sp!, {r4, r5, r6, r7, r8, lr}	@ r8 is unused: saving 6 words (not 5) keeps sp 8-byte aligned for the bl calls
+	@ Test if B == 0
+	orrs	ip, B_0, B_1		@ Z set -> B == 0
+	beq	L_div_by_0
+	@ Test if B is power of 2: (B & (B - 1)) == 0
+	subs	C_0, B_0, #1		@ C = B - 1, the borrow travels through the carry flag
+	sbc	C_1, B_1, #0
+	tst	C_0, B_0
+	tsteq	B_1, C_1		@ high words are only tested if the low words share no bit
+	beq	L_pow2			@ L_pow2 reuses C = B - 1 as the remainder mask
+	@ Test if A_1 == B_1 == 0
+	orrs	ip, A_1, B_1
+	beq	L_div_32_32
+
+L_div_64_64:				@ general case: shift-and-subtract long division
+	mov	C_0, #1
+	mov	C_1, #0
+	@ D_0 = clz A
+	CLZ(D_0, A_1)
+	teq	A_1, #0
+	CLZEQ(ip, A_0)
+	teq	A_1, #0			@ test again: on ARMv4 the helper call clobbers the flags
+	addeq	D_0, D_0, ip		@ A_1 == 0: D_0 is 32 here, add clz of the low word
+	@ D_1 = clz B
+	CLZ(D_1, B_1)
+	teq	B_1, #0
+	CLZEQ(ip, B_0)
+	teq	B_1, #0			@ same re-test as for A above
+	addeq	D_1, D_1, ip
+	@ if clz B - clz A <= 0: goto L_done_shift
+	subs	D_0, D_1, D_0		@ D_0 = shift = clz B - clz A
+	bls	L_done_shift
+	subs	D_1, D_0, #32		@ D_1 = shift - 32; mi -> shift < 32, pl -> shift >= 32
+	rsb	ip, D_0, #32		@ ip = 32 - shift
+	@ B <<= (clz B - clz A)
+	movmi	B_1, B_1, lsl D_0
+	orrmi	B_1, B_1, B_0, lsr ip	@ bits crossing from the low into the high word
+	movpl	B_1, B_0, lsl D_1
+	mov	B_0, B_0, lsl D_0	@ becomes 0 when shift >= 32
+	@ C = 1 << (clz B - clz A)
+	movmi	C_1, C_1, lsl D_0
+	orrmi	C_1, C_1, C_0, lsr ip
+	movpl	C_1, C_0, lsl D_1
+	mov	C_0, C_0, lsl D_0
+L_done_shift:
+	mov	D_0, #0
+	mov	D_1, #0
+	@ C: current bit; D: result
+L_subtract:
+	@ if A >= B
+	cmp	A_1, B_1
+	cmpeq	A_0, B_0		@ low words decide only when the high words are equal
+	bcc	L_update		@ carry clear -> A < B (unsigned), skip the subtraction
+	@ A -= B
+	subs	A_0, A_0, B_0
+	sbc	A_1, A_1, B_1
+	@ D |= C
+	orr	D_0, D_0, C_0
+	orr	D_1, D_1, C_1
+L_update:
+	@ if A == 0: break
+	orrs	ip, A_1, A_0
+	beq	L_exit
+	@ C >>= 1
+	movs	C_1, C_1, lsr #1	@ the bit shifted out lands in the carry flag
+	movs	C_0, C_0, rrx		@ rrx moves that carry into bit 31 of the low word
+	@ if C == 0: break
+	orrs	ip, C_1, C_0
+	beq	L_exit
+	@ B >>= 1
+	movs	B_1, B_1, lsr #1
+	mov	B_0, B_0, rrx
+	b	L_subtract
+L_exit:
+	@ Note: A, B & Q, R are aliases
+	mov	R_0, A_0		@ what is left of A is the remainder; copy it before Q overwrites r0/r1
+	mov	R_1, A_1
+	mov	Q_0, D_0
+	mov	Q_1, D_1
+	ldmfd	sp!, {r4, r5, r6, r7, r8, pc}
+
+L_div_32_32:
+	@ Both high words are zero. Note: A_0 and r0 are aliases, as are
+	@ Q_1 and r1, so only the divisor has to be moved into place.
+	mov	r1, B_0
+	bl	__aeabi_uidivmod	@ expected to return the quotient in r0 and the remainder in r1
+	mov	R_0, r1
+	mov	R_1, #0
+	mov	Q_1, #0
+	ldmfd	sp!, {r4, r5, r6, r7, r8, pc}
+
+L_pow2:
+	@ Note: A, B and Q, R are aliases
+	@ R = A & (B - 1)
+	and	C_0, A_0, C_0
+	and	C_1, A_1, C_1
+	@ Q = A >> log2(B)
+	@ Note: B must not be 0 here!
+	CLZ(D_0, B_0)
+	add	D_1, D_0, #1		@ D_1 = 32 - log2(B_0)
+	rsbs	D_0, D_0, #31		@ D_0 = log2(B_0); negative (mi) when B_0 == 0
+	movpl	A_0, A_0, lsr D_0
+	orrpl	A_0, A_0, A_1, lsl D_1
+	bpl	L_1
+	CLZ(D_0, B_1)
+	rsb	D_0, D_0, #31		@ B_0 == 0, so the set bit is in B_1: D_0 = log2(B_1)
+	mov	A_0, A_1, lsr D_0
+	add	D_0, D_0, #32		@ a shift of 32 or more makes the lsr below clear A_1
+L_1:
+	mov	A_1, A_1, lsr D_0
+	@ Mov back C to R
+	mov	R_0, C_0
+	mov	R_1, C_1
+	ldmfd	sp!, {r4, r5, r6, r7, r8, pc}
+
+L_div_by_0:
+	bl	__div0
+	@ If __div0 returns, report Q = R = 0 (as wrong as it could be)
+	mov	Q_0, #0
+	mov	Q_1, #0
+	mov	R_0, #0
+	mov	R_1, #0
+	ldmfd	sp!, {r4, r5, r6, r7, r8, pc}
+
+#if __LINUX_ARM_ARCH__ <= 4
+/*
+ * L_clz: software count-leading-zeros, scanning one nibble at a time
+ *
+ * in       : r0 = 32-bit value
+ * out      : r0 = number of leading zero bits (32 when the input is 0)
+ * clobbers : r1, r2, r3, r4, r5 and the condition flags
+ */
+L_clz:
+	adr	r4, L_clz_nibble_lut	@ r4 = base of the per-nibble lookup table
+	mov	r3, #28			@ r3 = right shift that brings the current nibble to bits 3..0
+	mov	r2, #0xf0000000		@ r2 = mask of the current nibble, top nibble first
+	mov	r1, #0			@ r1 = running count of leading zeros
+L_clz_next_nibble:
+	teq	r2, #0			@ mask shifted out -> all eight nibbles were zero
+	beq	L_clz_return
+	ands	r5, r0, r2		@ Z set -> this nibble is zero (consumed by the beq at the loop end)
+	mov	r5, r5, lsr r3		@ r5 = nibble value, 0..15
+	ldrsb	r5, [r4, r5]		@ r5 = leading zeros inside that nibble
+	add	r1, r1, r5
+	sub	r3, r3, #4		@ step the shift amount down to the next nibble
+	mov	r2, r2, lsr #4		@ step the mask down to the next nibble
+	beq	L_clz_next_nibble	@ flags still come from ands: continue only after a zero nibble
+L_clz_return:
+	mov	r0, r1
+	mov	pc, lr
+L_clz_nibble_lut:
+	.byte	4	@ nibble 0000
+	.byte	3	@ nibble 0001
+	.byte	2	@ nibble 0010
+	.byte	2	@ nibble 0011
+	.byte	1	@ nibble 0100
+	.byte	1	@ nibble 0101
+	.byte	1	@ nibble 0110
+	.byte	1	@ nibble 0111
+	.byte	0	@ nibble 1000
+	.byte	0	@ nibble 1001
+	.byte	0	@ nibble 1010
+	.byte	0	@ nibble 1011
+	.byte	0	@ nibble 1100
+	.byte	0	@ nibble 1101
+	.byte	0	@ nibble 1110
+	.byte	0	@ nibble 1111
+
+L_clz_D_0_A_1:				@ D_0 = clz(A_1); r0-r5 are preserved, flags are not
+	stmfd	sp!, {r0-r5, lr}	@ save the argument register, the L_clz scratch registers and lr
+	mov	r0, A_1			@ L_clz takes its input in r0
+	bl	L_clz
+	mov	D_0, r0			@ D_0 is not in the restored set, so the result survives
+	ldmfd	sp!, {r0-r5, pc}	@ restore r0-r5 and return
+
+L_clz_ip_A_0:				@ ip = clz(A_0); r0-r5 are preserved, flags are not
+	stmfd	sp!, {r0-r5, lr}	@ save the argument register, the L_clz scratch registers and lr
+	mov	r0, A_0			@ A_0 already is r0; kept for symmetry with the other helpers
+	bl	L_clz
+	mov	ip, r0			@ ip is not in the restored set, so the result survives
+	ldmfd	sp!, {r0-r5, pc}	@ restore r0-r5 and return
+
+L_clz_D_1_B_1:				@ D_1 = clz(B_1); r0-r5 are preserved, flags are not
+	stmfd	sp!, {r0-r5, lr}	@ save the argument register, the L_clz scratch registers and lr
+	mov	r0, B_1			@ L_clz takes its input in r0
+	bl	L_clz
+	mov	D_1, r0			@ D_1 is not in the restored set, so the result survives
+	ldmfd	sp!, {r0-r5, pc}	@ restore r0-r5 and return
+
+L_clz_ip_B_0:				@ ip = clz(B_0); r0-r5 are preserved, flags are not
+	stmfd	sp!, {r0-r5, lr}	@ save the argument register, the L_clz scratch registers and lr
+	mov	r0, B_0			@ L_clz takes its input in r0
+	bl	L_clz
+	mov	ip, r0			@ ip is not in the restored set, so the result survives
+	ldmfd	sp!, {r0-r5, pc}	@ restore r0-r5 and return
+
+L_clz_D_0_B_0:				@ D_0 = clz(B_0); r0-r5 are preserved, flags are not
+	stmfd	sp!, {r0-r5, lr}	@ save the argument register, the L_clz scratch registers and lr
+	mov	r0, B_0			@ L_clz takes its input in r0
+	bl	L_clz
+	mov	D_0, r0			@ D_0 is not in the restored set, so the result survives
+	ldmfd	sp!, {r0-r5, pc}	@ restore r0-r5 and return
+
+L_clz_D_0_B_1:				@ D_0 = clz(B_1); r0-r5 are preserved, flags are not
+	stmfd	sp!, {r0-r5, lr}	@ save the argument register, the L_clz scratch registers and lr
+	mov	r0, B_1			@ L_clz takes its input in r0
+	bl	L_clz
+	mov	D_0, r0			@ D_0 is not in the restored set, so the result survives
+	ldmfd	sp!, {r0-r5, pc}	@ restore r0-r5 and return
+#endif /* __LINUX_ARM_ARCH__  */
-- 
1.7.3.1