On 2/14/23 14:32, Kevin Brodsky wrote:
On 13/02/2023 10:23, Amit Daniel Kachhap wrote:
The GCC toolchain complains about a missing memcpy function when the -nostdlib linker option is used, so add a memcpy/memmove implementation.
This commit is similar to the earlier commit "arm64: morello: Use the Morello optimized routine for memcpy", which added an optimized memcpy routine to the kernel.

Signed-off-by: Amit Daniel Kachhap <amit.kachhap@arm.com>
---
 .../testing/selftests/arm64/morello/Makefile  |   3 +-
 .../selftests/arm64/morello/morello_memcpy.S  | 533 ++++++++++++++++++
 2 files changed, 535 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/arm64/morello/morello_memcpy.S
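
For context (an aside, not part of the patch): even in a freestanding build, GCC and Clang may emit calls to memcpy for aggregate copies, so linking with -nostdlib fails unless the selftests provide an implementation themselves. A minimal illustration, with a made-up struct name:

/* Illustration only (names invented): with -nostdlib there is no libc to
 * satisfy the memcpy call the compiler may emit for this assignment. */
struct regs_snapshot {
	unsigned long regs[32];
};

void save_regs(struct regs_snapshot *dst, const struct regs_snapshot *src)
{
	*dst = *src;	/* GCC/Clang are free to lower this to a memcpy() call */
}
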
diff --git a/tools/testing/selftests/arm64/morello/Makefile b/tools/testing/selftests/arm64/morello/Makefile
index 21906770f216..72f34426c7c0 100644
--- a/tools/testing/selftests/arm64/morello/Makefile
+++ b/tools/testing/selftests/arm64/morello/Makefile
@@ -33,7 +33,8 @@ $(OUTPUT)/%.o:%.S $(DEPS)
 $(OUTPUT)/%.o:%.c $(DEPS)
 	$(CC) $< -o $@ $(CFLAGS) -c

-$(OUTPUT)/%: $(OUTPUT)/%.o $(OUTPUT)/freestanding_start.o $(OUTPUT)/freestanding_init_globals.o $(OUTPUT)/freestanding.o
+$(OUTPUT)/%: $(OUTPUT)/%.o $(OUTPUT)/freestanding_start.o $(OUTPUT)/freestanding_init_globals.o \
+	   $(OUTPUT)/freestanding.o $(OUTPUT)/morello_memcpy.o
 	$(CC) $^ -o $@ $(LDFLAGS)

 $(OUTPUT)/signal: $(OUTPUT)/signal_common.o
diff --git a/tools/testing/selftests/arm64/morello/morello_memcpy.S b/tools/testing/selftests/arm64/morello/morello_memcpy.S
new file mode 100644
index 000000000000..5381dde92db0
--- /dev/null
+++ b/tools/testing/selftests/arm64/morello/morello_memcpy.S
@@ -0,0 +1,533 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2019-2023, Arm Limited.
+ *
+ * morello_memcpy - copy memory area
+ *
+ * Adapted from the original at:
+ */
+#include <asm/unistd.h>
Are we actually using this? I expect this is useful for __NR_* but we're not making any syscall here.
Yes, this should not be required. It remained after splitting this code out of freestanding_start.S.
+#define FUNCTION_START(name) \
+	.global name; \
+	.align 4; \
+	.type name STT_FUNC; \
+	name:
+#define FUNCTION_END(name) \
+	.size name, .-name
+#define FUNCTION_ALIAS(name) \
+	.global name; \
+	.type name STT_FUNC; \
+	name:
+#define L(label) .L ## label
+#define xdstin	x0
+#define xsrc	x1
+#define count	x2
+#define xdst	x3
+#define xsrcend	x4
+#define xdstend	x5
+#define auoff	x14
+#define cap_count	x15
+#define tmp1	x16
+#define tmp2	x17
+#if defined(__CHERI_PURE_CAPABILITY__)
+#define dstin	c0
+#define src	c1
+#define dst	c3
+#define srcend	c4
+#define dstend	c5
+#define tmp1_ptr	c16
+#else
+#define dstin	x0
+#define src	x1
+#define dst	x3
+#define srcend	x4
+#define dstend	x5
+#define tmp1_ptr	x16
+#endif
+#define A_l	x6
+#define B_l	x7
+#define C_l	x8
+#define D_l	x9
+#define E_l	x10
+#define F_l	x11
+#define G_l	x12
+#define H_l	x13
+#define A_cap	c6
+#define B_cap	c7
+#define C_cap	c8
+#define D_cap	c9
+#define E_cap	c10
+#define F_cap	c11
+#define G_cap	c12
+#define H_cap	c13
+/* This algorithm has not been benchmarked. It's derived
+ * from the base aarch64 one with small changes to account
+ * for copying tags.
+ *
+ * 1. We're copying less than 16 bytes, so no capabilities.
+ *    Use the traditional code path for these.
+ * 2. src mod 16 != dst mod 16. We're not copying capabilities,
+ *    so again use the traditional memcpy.
+ * 3. We're copying more than 8 capabilities plus the head and tail.
+ *    a. No overlap, use forward copy
+ *    b. Overlap, use backward copy
+ * 4. We're copying 0..8 capabilities
+ *    a. No capabilities to copy. This means we are copying 16..30 bytes.
+ *       Use the existing code path to do this from the original algorithm.
+ *    b. Copying 1..2 capabilities plus the head and tail
+ *       Use a branchless sequence.
+ *    c. Copying 3..4 capabilities plus the head and tail
+ *       Use a branchless sequence.
+ *    d. Copying 5..8 capabilities plus the head and tail
+ *       Use a branchless sequence.
+ */
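
To make the case split above easier to follow, here is a rough C rendering of the dispatch logic. This is a sketch written for this review, not code from the patch; the copy_*_path() helpers are hypothetical stand-ins for the labelled assembly paths, and only the conditions mirror the code:

#include <stddef.h>

/* All helpers below are hypothetical stand-ins for the labelled assembly
 * paths; only the dispatch conditions mirror the code. */
void *copy_lt16_path(void *d, const void *s, size_t n);
void *memcpy_nocap_path(void *d, const void *s, size_t n);
void *copy_long_cap_path(void *d, const void *s, size_t n);
void *copy_16_30_path(void *d, const void *s, size_t n);
void *copy_small_cap_path(void *d, const void *s, size_t n,
			  size_t auoff, size_t cap_count);

void *morello_memcpy_sketch(void *dstin, const void *src, size_t count)
{
	unsigned long s = (unsigned long)src;	/* address bits only */
	unsigned long d = (unsigned long)dstin;

	if (count < 16)				/* case 1: too small for capabilities */
		return copy_lt16_path(dstin, src, count);

	if ((s & 15) != (d & 15))		/* case 2: tags cannot be preserved */
		return memcpy_nocap_path(dstin, src, count);

	/* Bytes needed to reach the first 16-byte-aligned capability (auoff),
	 * then the number of whole capabilities covered by the copy. */
	size_t auoff = (16 - (s & 15)) & 15;
	size_t cap_count = (count - auoff) >> 4;

	if (cap_count > 8)			/* case 3: long copy, forward or backward */
		return copy_long_cap_path(dstin, src, count);
	if (cap_count == 0)			/* case 4a: 16..30 bytes, plain path */
		return copy_16_30_path(dstin, src, count);

	/* cases 4b-4d: 1..2, 3..4 or 5..8 capabilities, each branchless */
	return copy_small_cap_path(dstin, src, count, auoff, cap_count);
}
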
+.text
I don't think we need to add an explicit .text - there is one in freestanding_start.S simply because it places some globals in .data at the start.
Yes agreed.
Amit
Kevin
+FUNCTION_ALIAS(memmove)
+FUNCTION_START(memcpy)
+	add srcend, src, count
+	add dstend, dstin, count
+	/* Copies of less than 16 bytes don't use capabilities. */
+	cmp count, 16
+	b.lo L(copy16)
+	/* If src mod 16 != dst mod 16 we're not transferring tags. */
+	and tmp1, xsrc, 15
+	and tmp2, xdstin, 15
+	cmp tmp1, tmp2
+	b.ne L(memcpy_nocap)
+	/* Get the number of capabilities that we need to store. */
+	neg tmp2, tmp1
+	add tmp2, tmp2, 16
+	and auoff, tmp2, 15
+	sub cap_count, count, auoff
+	lsr cap_count, cap_count, 4
+	cmp cap_count, 8
+	b.hi L(copy_long_cap)
+	cmp cap_count, 2
+	b.hi L(copy32_128_cap)
+	/* Copy 0..2 capabilities using a branchless sequence. */
+	cbz cap_count, L(copy32)
+	ldp E_l, F_l, [src]
+	ldp C_l, D_l, [srcend, -16]
+	add src, src, auoff /* align up src to 16 bytes */
+#if defined(__CHERI_PURE_CAPABILITY__)
+	alignd srcend, srcend, 4
+#else
+	bic srcend, srcend, 15
+#endif
+	ldr A_cap, [src]
+	ldr B_cap, [srcend, -16]
+	stp E_l, F_l, [dstin]
+	stp C_l, D_l, [dstend, -16]
+	add tmp1_ptr, dstin, auoff /* align up dstin to 16 bytes */
+#if defined (__CHERI_PURE_CAPABILITY__)
+	alignd dstend, dstend, 4
+#else
+	bic dstend, dstend, 15
+#endif
+	str A_cap, [tmp1_ptr]
+	str B_cap, [dstend, -16]
+	ret
+	.p2align 4
+L(copy32_128_cap):
+	cmp cap_count, 4
+	b.hi L(copy128_cap)
+	/* Copy 3..4 capabilities using a branchless sequence. */
+	ldp E_l, F_l, [src]
+	ldp G_l, H_l, [srcend, -16]
+	add src, src, auoff /* align up src to 16 bytes */
+#if defined (__CHERI_PURE_CAPABILITY__)
+	alignd srcend, srcend, 4
+#else
+	bic srcend, srcend, 15
+#endif
+	ldp A_cap, B_cap, [src]
+	ldp C_cap, D_cap, [srcend, -32]
+	stp E_l, F_l, [dstin]
+	stp G_l, H_l, [dstend, -16]
+	add tmp1_ptr, dstin, auoff /* align up dstin to 16 bytes */
+#if defined (__CHERI_PURE_CAPABILITY__)
+	alignd dstend, dstend, 4
+#else
+	bic dstend, dstend, 15
+#endif
+	stp A_cap, B_cap, [tmp1_ptr]
+	stp C_cap, D_cap, [dstend, -32]
+	ret
+	.p2align 4
+L(copy128_cap):
+	/* Copy 5..8 capabilities using a branchless sequence. */
+	ldp count, tmp2, [src]
+	ldp tmp1, cap_count, [srcend, -16]
+	add src, src, auoff /* align up src to 16 bytes */
+#if defined (__CHERI_PURE_CAPABILITY__)
+	alignd srcend, srcend, 4
+#else
+	bic srcend, srcend, 15
+#endif
+	ldp A_cap, B_cap, [src]
+	ldp C_cap, D_cap, [src, 32]
+	ldp E_cap, F_cap, [srcend, -32]
+	ldp G_cap, H_cap, [srcend, -64]
+	stp count, tmp2, [dstin]
+	stp tmp1, cap_count, [dstend, -16]
+	add tmp1_ptr, dstin, auoff /* align up dstin to 16 bytes */
+#if defined (__CHERI_PURE_CAPABILITY__)
+	alignd dstend, dstend, 4
+#else
+	bic dstend, dstend, 15
+#endif
+	stp A_cap, B_cap, [tmp1_ptr]
+	stp C_cap, D_cap, [tmp1_ptr, 32]
+	stp E_cap, F_cap, [dstend, -32]
+	stp G_cap, H_cap, [dstend, -64]
+	ret
+L(copy_long_cap):
+	/* Use backwards copy if there is an overlap. */
+	sub tmp1, xdstin, xsrc
+	cmp tmp1, count
+	b.lo L(copy_long_backwards_cap)
+	/* Copy 16 bytes and then align src to 16-byte alignment. */
+	ldp C_l, D_l, [src]
+	ldr E_cap, [src, auoff]
+	and tmp1, xsrc, 15
+#if defined(__CHERI_PURE_CAPABILITY__)
+	alignd src, src, 4
+	neg tmp2, tmp1
+	add dst, dstin, tmp2
+#else
+	bic src, src, 15
+	sub dst, dstin, tmp1
+#endif
+	add count, count, tmp1 /* Count is now 16 too large. */
+	ldp A_cap, B_cap, [src, 16]
+	stp C_l, D_l, [dstin]
+	str E_cap, [dstin, auoff]
+	ldp C_cap, D_cap, [src, 48]
+	subs count, count, 128 + 16 /* Test and readjust count. */
+	b.ls L(copy64_from_end_cap)
+L(loop64_cap):
+	stp A_cap, B_cap, [dst, 16]
+	ldp A_cap, B_cap, [src, 80]
+	stp C_cap, D_cap, [dst, 48]
+	ldp C_cap, D_cap, [src, 112]
+	add src, src, 64
+	add dst, dst, 64
+	subs count, count, 64
+	b.hi L(loop64_cap)
+	/* Write the last iteration and copy the last 16-byte aligned 64 byte block
+	   from the end and the tail. */
+L(copy64_from_end_cap):
+	ldp G_l, H_l, [srcend, -16]
+#if defined(__CHERI_PURE_CAPABILITY__)
+	alignd srcend, srcend, 4
+	alignd tmp1_ptr, dstend, 4
+#else
+	bic srcend, srcend, 15
+	bic tmp1_ptr, dstend, 15
+#endif
+	ldp E_cap, F_cap, [srcend, -64]
+	stp A_cap, B_cap, [dst, 16]
+	ldp A_cap, B_cap, [srcend, -32]
+	stp C_cap, D_cap, [dst, 48]
+	stp E_cap, F_cap, [tmp1_ptr, -64]
+	stp G_l, H_l, [dstend, -16]
+	stp A_cap, B_cap, [tmp1_ptr, -32]
+	ret
+L(copy_long_backwards_cap):
+	cbz tmp1, L(copy0)
+	ldp E_l, F_l, [srcend, -16]
+	and tmp1, xsrcend, 15
+#if defined(__CHERI_PURE_CAPABILITY__)
+	alignd srcend, srcend, 4
+	neg tmp2, tmp1
+	add count, count, tmp2
+#else
+	bic srcend, srcend, 15
+	sub count, count, tmp1
+#endif
+	ldp A_cap, B_cap, [srcend, -32]
+	stp E_l, F_l, [dstend, -16]
+	ldp C_cap, D_cap, [srcend, -64]
+#if defined(__CHERI_PURE_CAPABILITY__)
+	add dstend, dstend, tmp2 /* tmp1 was negated above to tmp2. */
+#else
+	sub dstend, dstend, tmp1
+#endif
+	subs count, count, 128
+	b.ls L(copy64_from_start_cap)
+L(loop64_backwards_cap):
+	str B_cap, [dstend, -16]
+	str A_cap, [dstend, -32]
+	ldp A_cap, B_cap, [srcend, -96]
+	str D_cap, [dstend, -48]
+	str C_cap, [dstend, -64]!
+	ldp C_cap, D_cap, [srcend, -128]
+	sub srcend, srcend, 64
+	subs count, count, 64
+	b.hi L(loop64_backwards_cap)
+	/* Write the last iteration and copy 64 bytes from the start. */
+L(copy64_from_start_cap):
+	ldp G_l, H_l, [src]
+	add src, src, auoff /* align up src to 16 bytes */
+	add tmp1_ptr, dstin, auoff /* align up dstin to 16 bytes */
+	ldp E_cap, F_cap, [src, 32]
+	stp A_cap, B_cap, [dstend, -32]
+	ldp A_cap, B_cap, [src]
+	stp C_cap, D_cap, [dstend, -64]
+	stp E_cap, F_cap, [tmp1_ptr, 32]
+	stp G_l, H_l, [dstin]
+	stp A_cap, B_cap, [tmp1_ptr]
+	ret
+L(memcpy_nocap):
+	cmp count, 128
+	b.hi L(copy_long)
+	cmp count, 32
+	b.hi L(copy32_128)
+#undef A_l
+#undef B_l
+#undef C_l
+#undef D_l
+#undef E_l
+#undef F_l
+#undef G_l
+#undef H_l
+#undef tmp1
+#undef tmp1_ptr
+#undef tmp2
+#define A_l	x6
+#define A_lw	w6
+#define A_h	x7
+#define B_l	x8
+#define B_lw	w8
+#define B_h	x9
+#define C_l	x10
+#define C_lw	w10
+#define C_h	x11
+#define D_l	x12
+#define D_h	x13
+#define E_l	x14
+#define E_h	x15
+#define F_l	x16
+#define F_h	x17
+#define G_l	count
+#define G_h	xdst
+#define H_l	xsrc
+#define H_h	xsrcend
+#define tmp1	E_l
+#define tmp2	F_l
+L(copy32):
+	ldp A_l, A_h, [src]
+	ldp D_l, D_h, [srcend, -16]
+	stp A_l, A_h, [dstin]
+	stp D_l, D_h, [dstend, -16]
+	ret
+	/* Copy 8-15 bytes. */
+L(copy16):
+	tbz count, 3, L(copy8)
+	ldr A_l, [src]
+	ldr A_h, [srcend, -8]
+	str A_l, [dstin]
+	str A_h, [dstend, -8]
+	ret
+	.p2align 3
+	/* Copy 4-7 bytes. */
+L(copy8):
+	tbz count, 2, L(copy4)
+	ldr A_lw, [src]
+	ldr B_lw, [srcend, -4]
+	str A_lw, [dstin]
+	str B_lw, [dstend, -4]
+	ret
+	/* Copy 0..3 bytes using a branchless sequence. */
+L(copy4):
+	cbz count, L(copy0)
+	lsr tmp1, count, 1
+	ldrb A_lw, [src]
+	ldrb C_lw, [srcend, -1]
+	ldrb B_lw, [src, tmp1]
+	strb A_lw, [dstin]
+	strb B_lw, [dstin, tmp1]
+	strb C_lw, [dstend, -1]
+L(copy0):
+	ret
+	.p2align 4
+	/* Medium copies: 33..128 bytes. */
+L(copy32_128):
+	ldp A_l, A_h, [src]
+	ldp B_l, B_h, [src, 16]
+	ldp C_l, C_h, [srcend, -32]
+	ldp D_l, D_h, [srcend, -16]
+	cmp count, 64
+	b.hi L(copy128)
+	stp A_l, A_h, [dstin]
+	stp B_l, B_h, [dstin, 16]
+	stp C_l, C_h, [dstend, -32]
+	stp D_l, D_h, [dstend, -16]
+	ret
+	.p2align 4
+	/* Copy 65..128 bytes. */
+L(copy128):
+	ldp E_l, E_h, [src, 32]
+	ldp F_l, F_h, [src, 48]
+	cmp count, 96
+	b.ls L(copy96)
+	ldp G_l, G_h, [srcend, -64]
+	ldp H_l, H_h, [srcend, -48]
+	stp G_l, G_h, [dstend, -64]
+	stp H_l, H_h, [dstend, -48]
+L(copy96):
+	stp A_l, A_h, [dstin]
+	stp B_l, B_h, [dstin, 16]
+	stp E_l, E_h, [dstin, 32]
+	stp F_l, F_h, [dstin, 48]
+	stp C_l, C_h, [dstend, -32]
+	stp D_l, D_h, [dstend, -16]
+	ret
+	.p2align 4
+	/* Copy more than 128 bytes. */
+L(copy_long):
+	/* Use backwards copy if there is an overlap. */
+	sub tmp1, xdstin, xsrc
+	cbz tmp1, L(copy0)
+	cmp tmp1, count
+	b.lo L(copy_long_backwards)
+	/* Copy 16 bytes and then align dst to 16-byte alignment. */
+	ldp D_l, D_h, [src]
+	and tmp1, xdstin, 15
+#if defined(__CHERI_PURE_CAPABILITY__)
+	alignd dst, dstin, 4
+	neg tmp2, tmp1
+	add src, src, tmp2
+#else
+	bic dst, dstin, 15
+	sub src, src, tmp1
+#endif
+	add count, count, tmp1 /* Count is now 16 too large. */
+	ldp A_l, A_h, [src, 16]
+	stp D_l, D_h, [dstin]
+	ldp B_l, B_h, [src, 32]
+	ldp C_l, C_h, [src, 48]
+	ldp D_l, D_h, [src, 64]!
+	subs count, count, 128 + 16 /* Test and readjust count. */
+	b.ls L(copy64_from_end)
+L(loop64):
+	stp A_l, A_h, [dst, 16]
+	ldp A_l, A_h, [src, 16]
+	stp B_l, B_h, [dst, 32]
+	ldp B_l, B_h, [src, 32]
+	stp C_l, C_h, [dst, 48]
+	ldp C_l, C_h, [src, 48]
+	stp D_l, D_h, [dst, 64]!
+	ldp D_l, D_h, [src, 64]!
+	subs count, count, 64
+	b.hi L(loop64)
+	/* Write the last iteration and copy 64 bytes from the end. */
+L(copy64_from_end):
+	ldp E_l, E_h, [srcend, -64]
+	stp A_l, A_h, [dst, 16]
+	ldp A_l, A_h, [srcend, -48]
+	stp B_l, B_h, [dst, 32]
+	ldp B_l, B_h, [srcend, -32]
+	stp C_l, C_h, [dst, 48]
+	ldp C_l, C_h, [srcend, -16]
+	stp D_l, D_h, [dst, 64]
+	stp E_l, E_h, [dstend, -64]
+	stp A_l, A_h, [dstend, -48]
+	stp B_l, B_h, [dstend, -32]
+	stp C_l, C_h, [dstend, -16]
+	ret
+	.p2align 4
+	/* Large backwards copy for overlapping copies.
+	   Copy 16 bytes and then align dst to 16-byte alignment. */
+L(copy_long_backwards):
+	ldp D_l, D_h, [srcend, -16]
+	and tmp1, xdstend, 15
+#if defined(__CHERI_PURE_CAPABILITY__)
+	neg tmp2, tmp1
+	add srcend, srcend, tmp2
+#else
+	sub srcend, srcend, tmp1
+#endif
+	sub count, count, tmp1
+	ldp A_l, A_h, [srcend, -16]
+	stp D_l, D_h, [dstend, -16]
+	ldp B_l, B_h, [srcend, -32]
+	ldp C_l, C_h, [srcend, -48]
+	ldp D_l, D_h, [srcend, -64]!
+#if defined(__CHERI_PURE_CAPABILITY__)
+	add dstend, dstend, tmp2
+#else
+	sub dstend, dstend, tmp1
+#endif
+	subs count, count, 128
+	b.ls L(copy64_from_start)
+L(loop64_backwards):
+	stp A_l, A_h, [dstend, -16]
+	ldp A_l, A_h, [srcend, -16]
+	stp B_l, B_h, [dstend, -32]
+	ldp B_l, B_h, [srcend, -32]
+	stp C_l, C_h, [dstend, -48]
+	ldp C_l, C_h, [srcend, -48]
+	stp D_l, D_h, [dstend, -64]!
+	ldp D_l, D_h, [srcend, -64]!
+	subs count, count, 64
+	b.hi L(loop64_backwards)
+	/* Write the last iteration and copy 64 bytes from the start. */
+L(copy64_from_start):
+	ldp G_l, G_h, [src, 48]
+	stp A_l, A_h, [dstend, -16]
+	ldp A_l, A_h, [src, 32]
+	stp B_l, B_h, [dstend, -32]
+	ldp B_l, B_h, [src, 16]
+	stp C_l, C_h, [dstend, -48]
+	ldp C_l, C_h, [src]
+	stp D_l, D_h, [dstend, -64]
+	stp G_l, G_h, [dstin, 48]
+	stp A_l, A_h, [dstin, 32]
+	stp B_l, B_h, [dstin, 16]
+	stp C_l, C_h, [dstin]
+	ret
+FUNCTION_END(memcpy)
+FUNCTION_END(memmove)
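
A final note on the copy_long paths: memmove can simply alias memcpy because a single unsigned subtraction picks between the forward and backward loops. A small sketch of that check (illustration only, not code from the patch):

#include <stdbool.h>
#include <stddef.h>

/* Mirrors "sub tmp1, xdstin, xsrc; cmp tmp1, count; b.lo <backwards>":
 * if dst starts inside [src, src + count) a forward copy would overwrite
 * source bytes before reading them, so the backward loop is used. */
static bool needs_backward_copy(unsigned long dst, unsigned long src,
				size_t count)
{
	return dst - src < count;	/* unsigned wrap-around is intended */
}

If dst is below src or the regions are disjoint, the subtraction either wraps to a huge value or is at least count, so the forward loop runs; only a destination starting inside the source range takes the backward loop.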