On 2/14/23 14:32, Kevin Brodsky wrote:
On 13/02/2023 10:23, Amit Daniel Kachhap wrote:
The GCC toolchain complains about a missing memcpy function when the -nostdlib linker option is used, so add a memcpy/memmove implementation.
This commit is similar to the earlier commit "arm64: morello: Use the Morello optimized routine for memcpy", which added an optimized memcpy routine to the kernel.

Signed-off-by: Amit Daniel Kachhap <amit.kachhap@arm.com>
---
 .../testing/selftests/arm64/morello/Makefile  |   3 +-
 .../selftests/arm64/morello/morello_memcpy.S  | 533 ++++++++++++++++++
 2 files changed, 535 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/arm64/morello/morello_memcpy.S
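
For context (an aside, not part of the patch): even in a freestanding build, GCC and Clang may emit calls to memcpy for aggregate copies, so linking with -nostdlib fails unless the selftests provide an implementation themselves. A minimal illustration, with a made-up struct name:

/* Illustration only (names invented): with -nostdlib there is no libc to
 * satisfy the memcpy call the compiler may emit for this assignment. */
struct regs_snapshot {
	unsigned long regs[32];
};

void save_regs(struct regs_snapshot *dst, const struct regs_snapshot *src)
{
	*dst = *src;	/* GCC/Clang are free to lower this to a memcpy() call */
}
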
diff --git a/tools/testing/selftests/arm64/morello/Makefile b/tools/testing/selftests/arm64/morello/Makefile
index 21906770f216..72f34426c7c0 100644
--- a/tools/testing/selftests/arm64/morello/Makefile
+++ b/tools/testing/selftests/arm64/morello/Makefile
@@ -33,7 +33,8 @@ $(OUTPUT)/%.o:%.S $(DEPS)
 $(OUTPUT)/%.o:%.c $(DEPS)
 	$(CC) $< -o $@ $(CFLAGS) -c

-$(OUTPUT)/%: $(OUTPUT)/%.o $(OUTPUT)/freestanding_start.o $(OUTPUT)/freestanding_init_globals.o $(OUTPUT)/freestanding.o
+$(OUTPUT)/%: $(OUTPUT)/%.o $(OUTPUT)/freestanding_start.o $(OUTPUT)/freestanding_init_globals.o \
+	   $(OUTPUT)/freestanding.o $(OUTPUT)/morello_memcpy.o
 	$(CC) $^ -o $@ $(LDFLAGS)

 $(OUTPUT)/signal: $(OUTPUT)/signal_common.o
diff --git a/tools/testing/selftests/arm64/morello/morello_memcpy.S b/tools/testing/selftests/arm64/morello/morello_memcpy.S
new file mode 100644
index 000000000000..5381dde92db0
--- /dev/null
+++ b/tools/testing/selftests/arm64/morello/morello_memcpy.S
@@ -0,0 +1,533 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2019-2023, Arm Limited.
+ *
+ * morello_memcpy - copy memory area
+ *
+ * Adapted from the original at:
+ */
+#include <asm/unistd.h>
Are we actually using this? I expect this is useful for __NR_* but we're not making any syscall here.
Yes, this should not be required. It remained after splitting this code out of freestanding_start.S.
+#define FUNCTION_START(name) \
+	.global name; \
+	.align 4; \
+	.type name STT_FUNC; \
+	name:
+#define FUNCTION_END(name) \
+	.size name, .-name
+#define FUNCTION_ALIAS(name) \
+	.global name; \
+	.type name STT_FUNC; \
+	name:
+#define L(label) .L ## label
+#define xdstin	x0
+#define xsrc	x1
+#define count	x2
+#define xdst	x3
+#define xsrcend	x4
+#define xdstend	x5
+#define auoff	x14
+#define cap_count	x15
+#define tmp1	x16
+#define tmp2	x17
+#if defined(__CHERI_PURE_CAPABILITY__)
+#define dstin	c0
+#define src	c1
+#define dst	c3
+#define srcend	c4
+#define dstend	c5
+#define tmp1_ptr	c16
+#else
+#define dstin	x0
+#define src	x1
+#define dst	x3
+#define srcend	x4
+#define dstend	x5
+#define tmp1_ptr	x16
+#endif
+#define A_l	x6
+#define B_l	x7
+#define C_l	x8
+#define D_l	x9
+#define E_l	x10
+#define F_l	x11
+#define G_l	x12
+#define H_l	x13
+#define A_cap	c6
+#define B_cap	c7
+#define C_cap	c8
+#define D_cap	c9
+#define E_cap	c10
+#define F_cap	c11
+#define G_cap	c12
+#define H_cap	c13
+/* This algorithm has not been benchmarked. It's derived
+ * from the base aarch64 one with small changes to account
+ * for copying tags.
+ *
+ * 1. We're copying less than 16 bytes, so no capabilities.
+ *    Use the traditional code path for these.
+ * 2. src mod 16 != dst mod 16. We're not copying capabilities,
+ *    so again use the traditional memcpy.
+ * 3. We're copying more than 8 capabilities plus the head and tail.
+ *    a. No overlap, use forward copy
+ *    b. Overlap, use backward copy
+ * 4. We're copying 0..8 capabilities
+ *    a. No capabilities to copy. This means we are copying 16..30 bytes.
+ *       Use the existing code path to do this from the original algorithm.
+ *    b. Copying 1..2 capabilities plus the head and tail
+ *       Use a branchless sequence.
+ *    c. Copying 3..4 capabilities plus the head and tail
+ *       Use a branchless sequence.
+ *    d. Copying 5..8 capabilities plus the head and tail
+ *       Use a branchless sequence.
+ */
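
To make the case split above easier to follow, here is a rough C rendering of the dispatch logic. This is a sketch written for this review, not code from the patch; the copy_*_path() helpers are hypothetical stand-ins for the labelled assembly paths, and only the conditions mirror the code:

#include <stddef.h>

/* All helpers below are hypothetical stand-ins for the labelled assembly
 * paths; only the dispatch conditions mirror the code. */
void *copy_lt16_path(void *d, const void *s, size_t n);
void *memcpy_nocap_path(void *d, const void *s, size_t n);
void *copy_long_cap_path(void *d, const void *s, size_t n);
void *copy_16_30_path(void *d, const void *s, size_t n);
void *copy_small_cap_path(void *d, const void *s, size_t n,
			  size_t auoff, size_t cap_count);

void *morello_memcpy_sketch(void *dstin, const void *src, size_t count)
{
	unsigned long s = (unsigned long)src;	/* address bits only */
	unsigned long d = (unsigned long)dstin;

	if (count < 16)				/* case 1: too small for capabilities */
		return copy_lt16_path(dstin, src, count);

	if ((s & 15) != (d & 15))		/* case 2: tags cannot be preserved */
		return memcpy_nocap_path(dstin, src, count);

	/* Bytes needed to reach the first 16-byte-aligned capability (auoff),
	 * then the number of whole capabilities covered by the copy. */
	size_t auoff = (16 - (s & 15)) & 15;
	size_t cap_count = (count - auoff) >> 4;

	if (cap_count > 8)			/* case 3: long copy, forward or backward */
		return copy_long_cap_path(dstin, src, count);
	if (cap_count == 0)			/* case 4a: 16..30 bytes, plain path */
		return copy_16_30_path(dstin, src, count);

	/* cases 4b-4d: 1..2, 3..4 or 5..8 capabilities, each branchless */
	return copy_small_cap_path(dstin, src, count, auoff, cap_count);
}
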
+.text
I don't think we need to add an explicit .text - there is one in freestanding_start.S simply because it places some globals in .data at the start.
Yes agreed.
Amit
Kevin
+FUNCTION_ALIAS(memmove)
+FUNCTION_START(memcpy)
+	add srcend, src, count
+	add dstend, dstin, count
+	/* Copies of less than 16 bytes don't use capabilities. */
+	cmp count, 16
+	b.lo L(copy16)
+	/* If src mod 16 != dst mod 16 we're not transferring tags. */
+	and tmp1, xsrc, 15
+	and tmp2, xdstin, 15
+	cmp tmp1, tmp2
+	b.ne L(memcpy_nocap)
+	/* Get the number of capabilities that we need to store. */
+	neg tmp2, tmp1
+	add tmp2, tmp2, 16
+	and auoff, tmp2, 15
+	sub cap_count, count, auoff
+	lsr cap_count, cap_count, 4
+	cmp cap_count, 8
+	b.hi L(copy_long_cap)
+	cmp cap_count, 2
+	b.hi L(copy32_128_cap)
+	/* Copy 0..2 capabilities using a branchless sequence. */
+	cbz cap_count, L(copy32)
+	ldp E_l, F_l, [src]
+	ldp C_l, D_l, [srcend, -16]
+	add src, src, auoff /* align up src to 16 bytes */
+#if defined(__CHERI_PURE_CAPABILITY__)
+	alignd srcend, srcend, 4
+#else
+	bic srcend, srcend, 15
+#endif
+	ldr A_cap, [src]
+	ldr B_cap, [srcend, -16]
+	stp E_l, F_l, [dstin]
+	stp C_l, D_l, [dstend, -16]
+	add tmp1_ptr, dstin, auoff /* align up dstin to 16 bytes */
+#if defined (__CHERI_PURE_CAPABILITY__)
+	alignd dstend, dstend, 4
+#else
+	bic dstend, dstend, 15
+#endif
+	str A_cap, [tmp1_ptr]
+	str B_cap, [dstend, -16]
+	ret
+	.p2align 4
+L(copy32_128_cap):
+	cmp cap_count, 4
+	b.hi L(copy128_cap)
+	/* Copy 3..4 capabilities using a branchless sequence. */
+	ldp E_l, F_l, [src]
+	ldp G_l, H_l, [srcend, -16]
+	add src, src, auoff /* align up src to 16 bytes */
+#if defined (__CHERI_PURE_CAPABILITY__)
+	alignd srcend, srcend, 4
+#else
+	bic srcend, srcend, 15
+#endif
+	ldp A_cap, B_cap, [src]
+	ldp C_cap, D_cap, [srcend, -32]
+	stp E_l, F_l, [dstin]
+	stp G_l, H_l, [dstend, -16]
+	add tmp1_ptr, dstin, auoff /* align up dstin to 16 bytes */
+#if defined (__CHERI_PURE_CAPABILITY__)
+	alignd dstend, dstend, 4
+#else
+	bic dstend, dstend, 15
+#endif
+	stp A_cap, B_cap, [tmp1_ptr]
+	stp C_cap, D_cap, [dstend, -32]
+	ret
+	.p2align 4
+L(copy128_cap):
+	/* Copy 5..8 capabilities using a branchless sequence. */
+	ldp count, tmp2, [src]
+	ldp tmp1, cap_count, [srcend, -16]
+	add src, src, auoff /* align up src to 16 bytes */
+#if defined (__CHERI_PURE_CAPABILITY__)
+	alignd srcend, srcend, 4
+#else
+	bic srcend, srcend, 15
+#endif
+	ldp A_cap, B_cap, [src]
+	ldp C_cap, D_cap, [src, 32]
+	ldp E_cap, F_cap, [srcend, -32]
+	ldp G_cap, H_cap, [srcend, -64]
+	stp count, tmp2, [dstin]
+	stp tmp1, cap_count, [dstend, -16]
+	add tmp1_ptr, dstin, auoff /* align up dstin to 16 bytes */
+#if defined (__CHERI_PURE_CAPABILITY__)
+	alignd dstend, dstend, 4
+#else
+	bic dstend, dstend, 15
+#endif
+	stp A_cap, B_cap, [tmp1_ptr]
+	stp C_cap, D_cap, [tmp1_ptr, 32]
+	stp E_cap, F_cap, [dstend, -32]
+	stp G_cap, H_cap, [dstend, -64]
+	ret
+L(copy_long_cap):
+	/* Use backwards copy if there is an overlap. */
+	sub tmp1, xdstin, xsrc
+	cmp tmp1, count
+	b.lo L(copy_long_backwards_cap)
+	/* Copy 16 bytes and then align src to 16-byte alignment. */
+	ldp C_l, D_l, [src]
+	ldr E_cap, [src, auoff]
+	and tmp1, xsrc, 15
+#if defined(__CHERI_PURE_CAPABILITY__)
+	alignd src, src, 4
+	neg tmp2, tmp1
+	add dst, dstin, tmp2
+#else
+	bic src, src, 15
+	sub dst, dstin, tmp1
+#endif
+	add count, count, tmp1 /* Count is now 16 too large. */
+	ldp A_cap, B_cap, [src, 16]
+	stp C_l, D_l, [dstin]
+	str E_cap, [dstin, auoff]
+	ldp C_cap, D_cap, [src, 48]
+	subs count, count, 128 + 16 /* Test and readjust count. */
+	b.ls L(copy64_from_end_cap)
+L(loop64_cap):
+	stp A_cap, B_cap, [dst, 16]
+	ldp A_cap, B_cap, [src, 80]
+	stp C_cap, D_cap, [dst, 48]
+	ldp C_cap, D_cap, [src, 112]
+	add src, src, 64
+	add dst, dst, 64
+	subs count, count, 64
+	b.hi L(loop64_cap)
+	/* Write the last iteration and copy the last 16-byte aligned 64 byte block
+	   from the end and the tail. */
+L(copy64_from_end_cap):
+	ldp G_l, H_l, [srcend, -16]
+#if defined(__CHERI_PURE_CAPABILITY__)
+	alignd srcend, srcend, 4
+	alignd tmp1_ptr, dstend, 4
+#else
+	bic srcend, srcend, 15
+	bic tmp1_ptr, dstend, 15
+#endif
+	ldp E_cap, F_cap, [srcend, -64]
+	stp A_cap, B_cap, [dst, 16]
+	ldp A_cap, B_cap, [srcend, -32]
+	stp C_cap, D_cap, [dst, 48]
+	stp E_cap, F_cap, [tmp1_ptr, -64]
+	stp G_l, H_l, [dstend, -16]
+	stp A_cap, B_cap, [tmp1_ptr, -32]
+	ret
+L(copy_long_backwards_cap):
+	cbz tmp1, L(copy0)
+	ldp E_l, F_l, [srcend, -16]
+	and tmp1, xsrcend, 15
+#if defined(__CHERI_PURE_CAPABILITY__)
+	alignd srcend, srcend, 4
+	neg tmp2, tmp1
+	add count, count, tmp2
+#else
+	bic srcend, srcend, 15
+	sub count, count, tmp1
+#endif
+	ldp A_cap, B_cap, [srcend, -32]
+	stp E_l, F_l, [dstend, -16]
+	ldp C_cap, D_cap, [srcend, -64]
+#if defined(__CHERI_PURE_CAPABILITY__)
+	add dstend, dstend, tmp2 /* tmp1 was negated above to tmp2. */
+#else
+	sub dstend, dstend, tmp1
+#endif
+	subs count, count, 128
+	b.ls L(copy64_from_start_cap)
+L(loop64_backwards_cap):
+	str B_cap, [dstend, -16]
+	str A_cap, [dstend, -32]
+	ldp A_cap, B_cap, [srcend, -96]
+	str D_cap, [dstend, -48]
+	str C_cap, [dstend, -64]!
+	ldp C_cap, D_cap, [srcend, -128]
+	sub srcend, srcend, 64
+	subs count, count, 64
+	b.hi L(loop64_backwards_cap)
+	/* Write the last iteration and copy 64 bytes from the start. */
+L(copy64_from_start_cap):
+	ldp G_l, H_l, [src]
+	add src, src, auoff /* align up src to 16 bytes */
+	add tmp1_ptr, dstin, auoff /* align up dstin to 16 bytes */
+	ldp E_cap, F_cap, [src, 32]
+	stp A_cap, B_cap, [dstend, -32]
+	ldp A_cap, B_cap, [src]
+	stp C_cap, D_cap, [dstend, -64]
+	stp E_cap, F_cap, [tmp1_ptr, 32]
+	stp G_l, H_l, [dstin]
+	stp A_cap, B_cap, [tmp1_ptr]
+	ret
+L(memcpy_nocap):
+	cmp count, 128
+	b.hi L(copy_long)
+	cmp count, 32
+	b.hi L(copy32_128)
+#undef A_l
+#undef B_l
+#undef C_l
+#undef D_l
+#undef E_l
+#undef F_l
+#undef G_l
+#undef H_l
+#undef tmp1
+#undef tmp1_ptr
+#undef tmp2
+#define A_l	x6
+#define A_lw	w6
+#define A_h	x7
+#define B_l	x8
+#define B_lw	w8
+#define B_h	x9
+#define C_l	x10
+#define C_lw	w10
+#define C_h	x11
+#define D_l	x12
+#define D_h	x13
+#define E_l	x14
+#define E_h	x15
+#define F_l	x16
+#define F_h	x17
+#define G_l	count
+#define G_h	xdst
+#define H_l	xsrc
+#define H_h	xsrcend
+#define tmp1	E_l
+#define tmp2	F_l
+L(copy32):
+	ldp A_l, A_h, [src]
+	ldp D_l, D_h, [srcend, -16]
+	stp A_l, A_h, [dstin]
+	stp D_l, D_h, [dstend, -16]
+	ret
+	/* Copy 8-15 bytes. */
+L(copy16):
+	tbz count, 3, L(copy8)
+	ldr A_l, [src]
+	ldr A_h, [srcend, -8]
+	str A_l, [dstin]
+	str A_h, [dstend, -8]
+	ret
+	.p2align 3
+	/* Copy 4-7 bytes. */
+L(copy8):
+	tbz count, 2, L(copy4)
+	ldr A_lw, [src]
+	ldr B_lw, [srcend, -4]
+	str A_lw, [dstin]
+	str B_lw, [dstend, -4]
+	ret
+	/* Copy 0..3 bytes using a branchless sequence. */
+L(copy4):
+	cbz count, L(copy0)
+	lsr tmp1, count, 1
+	ldrb A_lw, [src]
+	ldrb C_lw, [srcend, -1]
+	ldrb B_lw, [src, tmp1]
+	strb A_lw, [dstin]
+	strb B_lw, [dstin, tmp1]
+	strb C_lw, [dstend, -1]
+L(copy0):
+	ret
+	.p2align 4
+	/* Medium copies: 33..128 bytes. */
+L(copy32_128):
+	ldp A_l, A_h, [src]
+	ldp B_l, B_h, [src, 16]
+	ldp C_l, C_h, [srcend, -32]
+	ldp D_l, D_h, [srcend, -16]
+	cmp count, 64
+	b.hi L(copy128)
+	stp A_l, A_h, [dstin]
+	stp B_l, B_h, [dstin, 16]
+	stp C_l, C_h, [dstend, -32]
+	stp D_l, D_h, [dstend, -16]
+	ret
+	.p2align 4
+	/* Copy 65..128 bytes. */
+L(copy128):
+	ldp E_l, E_h, [src, 32]
+	ldp F_l, F_h, [src, 48]
+	cmp count, 96
+	b.ls L(copy96)
+	ldp G_l, G_h, [srcend, -64]
+	ldp H_l, H_h, [srcend, -48]
+	stp G_l, G_h, [dstend, -64]
+	stp H_l, H_h, [dstend, -48]
+L(copy96):
+	stp A_l, A_h, [dstin]
+	stp B_l, B_h, [dstin, 16]
+	stp E_l, E_h, [dstin, 32]
+	stp F_l, F_h, [dstin, 48]
+	stp C_l, C_h, [dstend, -32]
+	stp D_l, D_h, [dstend, -16]
+	ret
+	.p2align 4
+	/* Copy more than 128 bytes. */
+L(copy_long):
+	/* Use backwards copy if there is an overlap. */
+	sub tmp1, xdstin, xsrc
+	cbz tmp1, L(copy0)
+	cmp tmp1, count
+	b.lo L(copy_long_backwards)
+	/* Copy 16 bytes and then align dst to 16-byte alignment. */
+	ldp D_l, D_h, [src]
+	and tmp1, xdstin, 15
+#if defined(__CHERI_PURE_CAPABILITY__)
+	alignd dst, dstin, 4
+	neg tmp2, tmp1
+	add src, src, tmp2
+#else
+	bic dst, dstin, 15
+	sub src, src, tmp1
+#endif
+	add count, count, tmp1 /* Count is now 16 too large. */
+	ldp A_l, A_h, [src, 16]
+	stp D_l, D_h, [dstin]
+	ldp B_l, B_h, [src, 32]
+	ldp C_l, C_h, [src, 48]
+	ldp D_l, D_h, [src, 64]!
+	subs count, count, 128 + 16 /* Test and readjust count. */
+	b.ls L(copy64_from_end)
+L(loop64):
+	stp A_l, A_h, [dst, 16]
+	ldp A_l, A_h, [src, 16]
+	stp B_l, B_h, [dst, 32]
+	ldp B_l, B_h, [src, 32]
+	stp C_l, C_h, [dst, 48]
+	ldp C_l, C_h, [src, 48]
+	stp D_l, D_h, [dst, 64]!
+	ldp D_l, D_h, [src, 64]!
+	subs count, count, 64
+	b.hi L(loop64)
+	/* Write the last iteration and copy 64 bytes from the end. */
+L(copy64_from_end):
+	ldp E_l, E_h, [srcend, -64]
+	stp A_l, A_h, [dst, 16]
+	ldp A_l, A_h, [srcend, -48]
+	stp B_l, B_h, [dst, 32]
+	ldp B_l, B_h, [srcend, -32]
+	stp C_l, C_h, [dst, 48]
+	ldp C_l, C_h, [srcend, -16]
+	stp D_l, D_h, [dst, 64]
+	stp E_l, E_h, [dstend, -64]
+	stp A_l, A_h, [dstend, -48]
+	stp B_l, B_h, [dstend, -32]
+	stp C_l, C_h, [dstend, -16]
+	ret
+	.p2align 4
+	/* Large backwards copy for overlapping copies.
+	   Copy 16 bytes and then align dst to 16-byte alignment. */
+L(copy_long_backwards):
+	ldp D_l, D_h, [srcend, -16]
+	and tmp1, xdstend, 15
+#if defined(__CHERI_PURE_CAPABILITY__)
+	neg tmp2, tmp1
+	add srcend, srcend, tmp2
+#else
+	sub srcend, srcend, tmp1
+#endif
+	sub count, count, tmp1
+	ldp A_l, A_h, [srcend, -16]
+	stp D_l, D_h, [dstend, -16]
+	ldp B_l, B_h, [srcend, -32]
+	ldp C_l, C_h, [srcend, -48]
+	ldp D_l, D_h, [srcend, -64]!
+#if defined(__CHERI_PURE_CAPABILITY__)
+	add dstend, dstend, tmp2
+#else
+	sub dstend, dstend, tmp1
+#endif
+	subs count, count, 128
+	b.ls L(copy64_from_start)
+L(loop64_backwards):
+	stp A_l, A_h, [dstend, -16]
+	ldp A_l, A_h, [srcend, -16]
+	stp B_l, B_h, [dstend, -32]
+	ldp B_l, B_h, [srcend, -32]
+	stp C_l, C_h, [dstend, -48]
+	ldp C_l, C_h, [srcend, -48]
+	stp D_l, D_h, [dstend, -64]!
+	ldp D_l, D_h, [srcend, -64]!
+	subs count, count, 64
+	b.hi L(loop64_backwards)
+	/* Write the last iteration and copy 64 bytes from the start. */
+L(copy64_from_start):
+	ldp G_l, G_h, [src, 48]
+	stp A_l, A_h, [dstend, -16]
+	ldp A_l, A_h, [src, 32]
+	stp B_l, B_h, [dstend, -32]
+	ldp B_l, B_h, [src, 16]
+	stp C_l, C_h, [dstend, -48]
+	ldp C_l, C_h, [src]
+	stp D_l, D_h, [dstend, -64]
+	stp G_l, G_h, [dstin, 48]
+	stp A_l, A_h, [dstin, 32]
+	stp B_l, B_h, [dstin, 16]
+	stp C_l, C_h, [dstin]
+	ret
+FUNCTION_END(memcpy)
+FUNCTION_END(memmove)
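
A final note on the copy_long paths: memmove can simply alias memcpy because a single unsigned subtraction picks between the forward and backward loops. A small sketch of that check (illustration only, not code from the patch):

#include <stdbool.h>
#include <stddef.h>

/* Mirrors "sub tmp1, xdstin, xsrc; cmp tmp1, count; b.lo <backwards>":
 * if dst starts inside [src, src + count) a forward copy would overwrite
 * source bytes before reading them, so the backward loop is used. */
static bool needs_backward_copy(unsigned long dst, unsigned long src,
				size_t count)
{
	return dst - src < count;	/* unsigned wrap-around is intended */
}

If dst is below src or the regions are disjoint, the subtraction either wraps to a huge value or is at least count, so the forward loop runs; only a destination starting inside the source range takes the backward loop.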