[PATCH v4 2/5] lib/lz4: update LZ4 decompressor module
Jonathan Liu
net147 at gmail.com
Sun May 26 10:06:50 CEST 2024
Hi Gao,
On Sat, 25 May 2024 at 02:52, Gao Xiang <hsiangkao at linux.alibaba.com> wrote:
>
> Hi,
>
> On 2024/5/24 22:26, Jonathan Liu wrote:
> > Hi Jianan,
> >
> > On Sat, 26 Feb 2022 at 18:05, Huang Jianan <jnhuang95 at gmail.com> wrote:
> >>
> >> Update the LZ4 compression module based on LZ4 v1.8.3 in order to
> >> use the newest LZ4_decompress_safe_partial() which can now decode
> >> exactly the nb of bytes requested.
> >>
> >> Signed-off-by: Huang Jianan <jnhuang95 at gmail.com>
> >
> > I noticed after this commit LZ4 decompression is slower.
> > ulz4fn function call takes 1.209670 seconds with this commit.
> > After reverting this commit, the ulz4fn function call takes 0.587032 seconds.
> >
> > I am decompressing a LZ4 compressed kernel (compressed with lz4 v1.9.4
> > using -9 option for maximum compression) on RK3399.
> >
> > Any ideas why it is slower with this commit and how the performance
> > regression can be fixed?
>
> Just the quick glance, I think the issue may be due to memcpy/memmove
> since it seems the main difference between these two codebases
> (I'm not sure which LZ4 version the old codebase was based on) and
> the new version mainly relies on memcpy/memmove instead of its own
> versions.
>
> Would you mind to check the assembly how memcpy/memset is generated
> on your platform?
Here is the assembly (-mcpu=cortex-a72.cortex-a53 -march=armv8-a+crc+crypto):
000000000028220c <memset>:
#if !CONFIG_IS_ENABLED(TINY_MEMSET)
unsigned long cl = 0;
int i;
/* do it one word at a time (32 bits or 64 bits) while possible */
if ( ((ulong)s & (sizeof(*sl) - 1)) == 0) {
28220c: f2400803 ands x3, x0, #0x7
282210: 540002c1 b.ne 282268 <memset+0x5c> // b.any
for (i = 0; i < sizeof(*sl); i++) {
cl <<= 8;
cl |= c & 0xff;
282214: 92401c26 and x6, x1, #0xff
unsigned long cl = 0;
282218: d2800004 mov x4, #0x0 // #0
28221c: 52800105 mov w5, #0x8 // #8
cl |= c & 0xff;
282220: aa0420c4 orr x4, x6, x4, lsl #8
for (i = 0; i < sizeof(*sl); i++) {
282224: 710004a5 subs w5, w5, #0x1
282228: 54ffffc1 b.ne 282220 <memset+0x14> // b.any
}
while (count >= sizeof(*sl)) {
28222c: cb030045 sub x5, x2, x3
282230: f1001cbf cmp x5, #0x7
282234: 54000148 b.hi 28225c <memset+0x50> // b.pmore
282238: d343fc43 lsr x3, x2, #3
28223c: 928000e4 mov x4, #0xfffffffffffffff8 // #-8
282240: 9b047c63 mul x3, x3, x4
282244: 8b030042 add x2, x2, x3
282248: cb030003 sub x3, x0, x3
unsigned long *sl = (unsigned long *) s;
28224c: d2800004 mov x4, #0x0 // #0
count -= sizeof(*sl);
}
}
#endif /* fill 8 bits at a time */
s8 = (char *)sl;
while (count--)
282250: eb04005f cmp x2, x4
282254: 540000e1 b.ne 282270 <memset+0x64> // b.any
*s8++ = c;
return s;
}
282258: d65f03c0 ret
*sl++ = cl;
28225c: f8236804 str x4, [x0, x3]
count -= sizeof(*sl);
282260: 91002063 add x3, x3, #0x8
282264: 17fffff2 b 28222c <memset+0x20>
unsigned long *sl = (unsigned long *) s;
282268: aa0003e3 mov x3, x0
28226c: 17fffff8 b 28224c <memset+0x40>
*s8++ = c;
282270: 38246861 strb w1, [x3, x4]
282274: 91000484 add x4, x4, #0x1
282278: 17fffff6 b 282250 <memset+0x44>
000000000028227c <memcpy>:
__used void * memcpy(void *dest, const void *src, size_t count)
{
unsigned long *dl = (unsigned long *)dest, *sl = (unsigned long *)src;
char *d8, *s8;
if (src == dest)
28227c: eb01001f cmp x0, x1
282280: 54000100 b.eq 2822a0 <memcpy+0x24> // b.none
return dest;
/* while all data is aligned (common case), copy a word at a time */
if ( (((ulong)dest | (ulong)src) & (sizeof(*dl) - 1)) == 0) {
282284: aa010003 orr x3, x0, x1
282288: f2400863 ands x3, x3, #0x7
28228c: 54000120 b.eq 2822b0 <memcpy+0x34> // b.none
282290: aa0003e4 mov x4, x0
282294: d2800003 mov x3, #0x0 // #0
}
}
/* copy the reset one byte at a time */
d8 = (char *)dl;
s8 = (char *)sl;
while (count--)
282298: eb03005f cmp x2, x3
28229c: 540001e1 b.ne 2822d8 <memcpy+0x5c> // b.any
*d8++ = *s8++;
return dest;
}
2822a0: d65f03c0 ret
*dl++ = *sl++;
2822a4: f8636824 ldr x4, [x1, x3]
2822a8: f8236804 str x4, [x0, x3]
count -= sizeof(*dl);
2822ac: 91002063 add x3, x3, #0x8
while (count >= sizeof(*dl)) {
2822b0: cb030044 sub x4, x2, x3
2822b4: f1001c9f cmp x4, #0x7
2822b8: 54ffff68 b.hi 2822a4 <memcpy+0x28> // b.pmore
2822bc: d343fc43 lsr x3, x2, #3
2822c0: 928000e4 mov x4, #0xfffffffffffffff8 // #-8
2822c4: 9b047c63 mul x3, x3, x4
2822c8: 8b030042 add x2, x2, x3
2822cc: cb030004 sub x4, x0, x3
2822d0: cb030021 sub x1, x1, x3
2822d4: 17fffff0 b 282294 <memcpy+0x18>
*d8++ = *s8++;
2822d8: 38636825 ldrb w5, [x1, x3]
2822dc: 38236885 strb w5, [x4, x3]
2822e0: 91000463 add x3, x3, #0x1
2822e4: 17ffffed b 282298 <memcpy+0x1c>
I tried enabling CONFIG_USE_ARCH_MEMCPY=y, CONFIG_USE_ARCH_MEMSET=y in
the .config (but leaving it disabled in SPL/TPL) and it results in
Synchronous Abort:
U-Boot SPL 2024.04 (Apr 02 2024 - 10:58:58 +0000)
Trying to boot from MMC1
NOTICE: BL31: v1.3(release):8f40012ab
NOTICE: BL31: Built : 14:20:53, Feb 16 2023
NOTICE: BL31: Rockchip release version: v1.1
INFO: GICv3 with legacy support detected. ARM GICV3 driver initialized in EL3
INFO: Using opteed sec cpu_context!
INFO: boot cpu mask: 0
INFO: plat_rockchip_pmu_init(1203): pd status 3e
INFO: BL31: Initializing runtime services
WARNING: No OPTEE provided by BL2 boot loader, Booting device without
OPTEE initialization. SMC`s destined for OPTEE will return SMC_UNK
ERROR: Error initializing runtime service opteed_fast
INFO: BL31: Preparing for EL3 exit to normal world
INFO: Entry point address = 0x200000
INFO: SPSR = 0x3c9
"Synchronous Abort" handler, esr 0x96000021, far 0x2957e1
elr: 000000000020233c lr : 000000000026a388
x0 : 00000000002fbf38 x1 : 00000000002957e1
x2 : 0000000000000008 x3 : 0000000000000065
x4 : 00000000002957e9 x5 : 00000000002fbf40
x6 : 0000000000000065 x7 : 0000000000000003
x8 : 00000000002c7960 x9 : 000000000000ffd0
x10: 00000000002fbc5c x11: 00000000000132e8
x12: 00000000002fbce8 x13: 00000000002c7960
x14: 00000000002c7960 x15: 0000000000000000
x16: 0000000000000000 x17: 0000000000000000
x18: 00000000002fbe30 x19: 0000000000000007
x20: 00000000002957d8 x21: 0000000000000009
x22: 000000000029d189 x23: 0000000000000020
x24: 00000000002fbf38 x25: 00000000002957e7
x26: 00000000002957b2 x27: 0000000000007fff
x28: 0000000000000000 x29: 00000000002fbcc0
Code: a9001c06 a93f34ac d65f03c0 361800c2 (f9400026)
Resetting CPU ...
resetting ...
>
> Thanks,
> Gao Xiang
Regards,
Jonathan
More information about the U-Boot
mailing list