[PATCH] arm64: memset: speed up cache disabled path
Josh Law
josh2 at disroot.org
Mon May 25 20:26:27 CEST 2026
AArch64 memset() skips dc zva when dcache is off. That fallback
was still storing one byte per loop.
This runs before the MMU comes on. One common caller is create_table(),
which clears 4 KiB page tables during MMU setup.
Build the repeated byte in x1, align the destination to 8 bytes, then
use four 64-bit stores per loop. The head and tail stay byte-based.
The cache-disabled path still avoids SIMD and dc instructions.
For qemu_arm64_defconfig, the object text grows from 412 bytes to
524 bytes.
A QEMU user-mode benchmark of this fallback loop gave a 4 KiB fill time
of 4970643109 ns before and 254278076 ns after, a 19.55x speedup.
Signed-off-by: Josh Law <josh2 at disroot.org>
---
arch/arm/lib/memset-arm64.S | 55 +++++++++++++++++++++++++++----------
1 file changed, 40 insertions(+), 15 deletions(-)
diff --git a/arch/arm/lib/memset-arm64.S b/arch/arm/lib/memset-arm64.S
index ee9f9a96cfe..97849e8b4be 100644
--- a/arch/arm/lib/memset-arm64.S
+++ b/arch/arm/lib/memset-arm64.S
@@ -28,9 +28,8 @@ ENTRY (memset)
/*
* The optimized memset uses the dc opcode, which causes problems
- * when the cache is disabled. Let's check if the cache is disabled
- * and use a very simple memset implementation in this case. Otherwise
- * jump to the optimized version.
+ * when the cache is disabled. Use the plain store path in that case;
+ * otherwise jump to the fast version.
*/
switch_el x6, 3f, 2f, 1f
3: mrs x6, sctlr_el3
@@ -40,21 +39,47 @@ ENTRY (memset)
1: mrs x6, sctlr_el1
0:
tst x6, #CR_C
- bne 9f
+ bne 10f
/*
- * A very "simple" memset implementation without the use of the
- * dc opcode. Can be run with caches disabled.
+ * Avoid dc zva while dcache is off. Build the byte pattern once,
+ * then use aligned stores for the main body.
*/
- mov x3, #0x0
- cmp count, x3 /* check for zero length */
- beq 8f
-4: strb valw, [dstin, x3]
- add x3, x3, #0x1
- cmp count, x3
- bne 4b
-8: ret
-9:
+ cbz count, 8f /* zero length: label 8 re-checks count and returns */
+ and val, val, #255 /* keep only the low byte of the fill value */
+ orr val, val, val, lsl #8 /* replicate the byte across 16 bits */
+ orr val, val, val, lsl #16 /* ... across 32 bits */
+ orr val, val, val, lsl #32 /* ... across all 64 bits */
+
+ mov dst, dstin /* walk a copy so dstin is left untouched */
+ tst dst, #7
+ b.eq 5f /* already 8-byte aligned: skip the head loop */
+4: strb valw, [dst], #1 /* head: byte stores up to 8-byte alignment */
+ subs count, count, #1
+ b.eq 8f /* ran out of bytes before reaching alignment */
+ tst dst, #7
+ b.ne 4b
+5: cmp count, #32
+ b.lo 7f
+6: str val, [dst], #8 /* body: four 64-bit stores, 32 bytes per pass */
+ str val, [dst], #8
+ str val, [dst], #8
+ str val, [dst], #8
+ sub count, count, #32 /* no flags needed: the cmp below sets them */
+ cmp count, #32
+ b.hs 6b
+7: cmp count, #8
+ b.lo 8f
+ str val, [dst], #8 /* one 64-bit store while >= 8 bytes remain */
+ sub count, count, #8 /* no flags needed: the cmp below sets them */
+ cmp count, #8
+ b.hs 7b
+8: cbz count, 9f /* tail: fewer than 8 bytes remain here */
+ strb valw, [dst], #1
+ subs count, count, #1
+ b.ne 8b
+9: ret
+10:
/* Here the optimized memset version starts */
dup v0.16B, valw
--
2.47.3
More information about the U-Boot
mailing list