Diffstat (limited to 'arch/arm/lib')
-rw-r--r--  arch/arm/lib/Makefile        |   5
-rw-r--r--  arch/arm/lib/asm-offsets.c   |  88
-rw-r--r--  arch/arm/lib/asmdefs.h       |  98
-rw-r--r--  arch/arm/lib/bootm.c         |  45
-rw-r--r--  arch/arm/lib/ccn504.S        |   3
-rw-r--r--  arch/arm/lib/crt0.S          |  11
-rw-r--r--  arch/arm/lib/div64.S         |  10
-rw-r--r--  arch/arm/lib/lib1funcs.S     |   6
-rw-r--r--  arch/arm/lib/memcpy-arm64.S  | 242
-rw-r--r--  arch/arm/lib/memset-arm64.S  | 148
-rw-r--r--  arch/arm/lib/relocate.S      |  35
-rw-r--r--  arch/arm/lib/stack.c         |  14
12 files changed, 553 insertions(+), 152 deletions(-)
diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile index 7f66332715..c48e1f622d 100644 --- a/arch/arm/lib/Makefile +++ b/arch/arm/lib/Makefile @@ -39,8 +39,13 @@ obj-$(CONFIG_$(SPL_TPL_)FRAMEWORK) += spl.o obj-$(CONFIG_SPL_FRAMEWORK) += zimage.o obj-$(CONFIG_OF_LIBFDT) += bootm-fdt.o endif +ifdef CONFIG_ARM64 +obj-$(CONFIG_$(SPL_TPL_)USE_ARCH_MEMSET) += memset-arm64.o +obj-$(CONFIG_$(SPL_TPL_)USE_ARCH_MEMCPY) += memcpy-arm64.o +else obj-$(CONFIG_$(SPL_TPL_)USE_ARCH_MEMSET) += memset.o obj-$(CONFIG_$(SPL_TPL_)USE_ARCH_MEMCPY) += memcpy.o +endif obj-$(CONFIG_SEMIHOSTING) += semihosting.o obj-y += bdinfo.o diff --git a/arch/arm/lib/asm-offsets.c b/arch/arm/lib/asm-offsets.c index 1a306ec415..22fd541f9a 100644 --- a/arch/arm/lib/asm-offsets.c +++ b/arch/arm/lib/asm-offsets.c @@ -15,7 +15,7 @@ #include <linux/kbuild.h> #include <linux/arm-smccc.h> -#if defined(CONFIG_MX25) || defined(CONFIG_MX27) || defined(CONFIG_MX35) \ +#if defined(CONFIG_MX27) \ || defined(CONFIG_MX51) || defined(CONFIG_MX53) #include <asm/arch/imx-regs.h> #endif @@ -35,42 +35,6 @@ int main(void) * code. Is it better to define the macros directly in headers? */ -#if defined(CONFIG_MX25) - /* Clock Control Module */ - DEFINE(CCM_CCTL, offsetof(struct ccm_regs, cctl)); - DEFINE(CCM_CGCR0, offsetof(struct ccm_regs, cgr0)); - DEFINE(CCM_CGCR1, offsetof(struct ccm_regs, cgr1)); - DEFINE(CCM_CGCR2, offsetof(struct ccm_regs, cgr2)); - DEFINE(CCM_PCDR2, offsetof(struct ccm_regs, pcdr[2])); - DEFINE(CCM_MCR, offsetof(struct ccm_regs, mcr)); - - /* Enhanced SDRAM Controller */ - DEFINE(ESDRAMC_ESDCTL0, offsetof(struct esdramc_regs, ctl0)); - DEFINE(ESDRAMC_ESDCFG0, offsetof(struct esdramc_regs, cfg0)); - DEFINE(ESDRAMC_ESDMISC, offsetof(struct esdramc_regs, misc)); - - /* Multi-Layer AHB Crossbar Switch */ - DEFINE(MAX_MPR0, offsetof(struct max_regs, mpr0)); - DEFINE(MAX_SGPCR0, offsetof(struct max_regs, sgpcr0)); - DEFINE(MAX_MPR1, offsetof(struct max_regs, mpr1)); - DEFINE(MAX_SGPCR1, offsetof(struct max_regs, sgpcr1)); - DEFINE(MAX_MPR2, offsetof(struct max_regs, mpr2)); - DEFINE(MAX_SGPCR2, offsetof(struct max_regs, sgpcr2)); - DEFINE(MAX_MPR3, offsetof(struct max_regs, mpr3)); - DEFINE(MAX_SGPCR3, offsetof(struct max_regs, sgpcr3)); - DEFINE(MAX_MPR4, offsetof(struct max_regs, mpr4)); - DEFINE(MAX_SGPCR4, offsetof(struct max_regs, sgpcr4)); - DEFINE(MAX_MGPCR0, offsetof(struct max_regs, mgpcr0)); - DEFINE(MAX_MGPCR1, offsetof(struct max_regs, mgpcr1)); - DEFINE(MAX_MGPCR2, offsetof(struct max_regs, mgpcr2)); - DEFINE(MAX_MGPCR3, offsetof(struct max_regs, mgpcr3)); - DEFINE(MAX_MGPCR4, offsetof(struct max_regs, mgpcr4)); - - /* AHB <-> IP-Bus Interface */ - DEFINE(AIPS_MPR_0_7, offsetof(struct aips_regs, mpr_0_7)); - DEFINE(AIPS_MPR_8_15, offsetof(struct aips_regs, mpr_8_15)); -#endif - #if defined(CONFIG_MX27) DEFINE(AIPI1_PSR0, IMX_AIPI1_BASE + offsetof(struct aipi_regs, psr0)); DEFINE(AIPI1_PSR1, IMX_AIPI1_BASE + offsetof(struct aipi_regs, psr1)); @@ -97,56 +61,6 @@ int main(void) offsetof(struct system_control_regs, fmcr)); #endif -#if defined(CONFIG_MX35) - /* Round up to make sure size gives nice stack alignment */ - DEFINE(CLKCTL_CCMR, offsetof(struct ccm_regs, ccmr)); - DEFINE(CLKCTL_PDR0, offsetof(struct ccm_regs, pdr0)); - DEFINE(CLKCTL_PDR1, offsetof(struct ccm_regs, pdr1)); - DEFINE(CLKCTL_PDR2, offsetof(struct ccm_regs, pdr2)); - DEFINE(CLKCTL_PDR3, offsetof(struct ccm_regs, pdr3)); - DEFINE(CLKCTL_PDR4, offsetof(struct ccm_regs, pdr4)); - DEFINE(CLKCTL_RCSR, offsetof(struct ccm_regs, rcsr)); - 
DEFINE(CLKCTL_MPCTL, offsetof(struct ccm_regs, mpctl)); - DEFINE(CLKCTL_PPCTL, offsetof(struct ccm_regs, ppctl)); - DEFINE(CLKCTL_ACMR, offsetof(struct ccm_regs, acmr)); - DEFINE(CLKCTL_COSR, offsetof(struct ccm_regs, cosr)); - DEFINE(CLKCTL_CGR0, offsetof(struct ccm_regs, cgr0)); - DEFINE(CLKCTL_CGR1, offsetof(struct ccm_regs, cgr1)); - DEFINE(CLKCTL_CGR2, offsetof(struct ccm_regs, cgr2)); - DEFINE(CLKCTL_CGR3, offsetof(struct ccm_regs, cgr3)); - - /* Multi-Layer AHB Crossbar Switch */ - DEFINE(MAX_MPR0, offsetof(struct max_regs, mpr0)); - DEFINE(MAX_SGPCR0, offsetof(struct max_regs, sgpcr0)); - DEFINE(MAX_MPR1, offsetof(struct max_regs, mpr1)); - DEFINE(MAX_SGPCR1, offsetof(struct max_regs, sgpcr1)); - DEFINE(MAX_MPR2, offsetof(struct max_regs, mpr2)); - DEFINE(MAX_SGPCR2, offsetof(struct max_regs, sgpcr2)); - DEFINE(MAX_MPR3, offsetof(struct max_regs, mpr3)); - DEFINE(MAX_SGPCR3, offsetof(struct max_regs, sgpcr3)); - DEFINE(MAX_MPR4, offsetof(struct max_regs, mpr4)); - DEFINE(MAX_SGPCR4, offsetof(struct max_regs, sgpcr4)); - DEFINE(MAX_MGPCR0, offsetof(struct max_regs, mgpcr0)); - DEFINE(MAX_MGPCR1, offsetof(struct max_regs, mgpcr1)); - DEFINE(MAX_MGPCR2, offsetof(struct max_regs, mgpcr2)); - DEFINE(MAX_MGPCR3, offsetof(struct max_regs, mgpcr3)); - DEFINE(MAX_MGPCR4, offsetof(struct max_regs, mgpcr4)); - DEFINE(MAX_MGPCR5, offsetof(struct max_regs, mgpcr5)); - - /* AHB <-> IP-Bus Interface */ - DEFINE(AIPS_MPR_0_7, offsetof(struct aips_regs, mpr_0_7)); - DEFINE(AIPS_MPR_8_15, offsetof(struct aips_regs, mpr_8_15)); - DEFINE(AIPS_PACR_0_7, offsetof(struct aips_regs, pacr_0_7)); - DEFINE(AIPS_PACR_8_15, offsetof(struct aips_regs, pacr_8_15)); - DEFINE(AIPS_PACR_16_23, offsetof(struct aips_regs, pacr_16_23)); - DEFINE(AIPS_PACR_24_31, offsetof(struct aips_regs, pacr_24_31)); - DEFINE(AIPS_OPACR_0_7, offsetof(struct aips_regs, opacr_0_7)); - DEFINE(AIPS_OPACR_8_15, offsetof(struct aips_regs, opacr_8_15)); - DEFINE(AIPS_OPACR_16_23, offsetof(struct aips_regs, opacr_16_23)); - DEFINE(AIPS_OPACR_24_31, offsetof(struct aips_regs, opacr_24_31)); - DEFINE(AIPS_OPACR_32_39, offsetof(struct aips_regs, opacr_32_39)); -#endif - #if defined(CONFIG_MX51) || defined(CONFIG_MX53) /* Round up to make sure size gives nice stack alignment */ DEFINE(CLKCTL_CCMR, offsetof(struct clkctl, ccr)); diff --git a/arch/arm/lib/asmdefs.h b/arch/arm/lib/asmdefs.h new file mode 100644 index 0000000000..d307a3a8a2 --- /dev/null +++ b/arch/arm/lib/asmdefs.h @@ -0,0 +1,98 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Macros for asm code. + * + * Copyright (c) 2019, Arm Limited. + */ + +#ifndef _ASMDEFS_H +#define _ASMDEFS_H + +#if defined(__aarch64__) + +/* Branch Target Identitication support. */ +#define BTI_C hint 34 +#define BTI_J hint 36 +/* Return address signing support (pac-ret). */ +#define PACIASP hint 25; .cfi_window_save +#define AUTIASP hint 29; .cfi_window_save + +/* GNU_PROPERTY_AARCH64_* macros from elf.h. */ +#define FEATURE_1_AND 0xc0000000 +#define FEATURE_1_BTI 1 +#define FEATURE_1_PAC 2 + +/* Add a NT_GNU_PROPERTY_TYPE_0 note. */ +#define GNU_PROPERTY(type, value) \ + .section .note.gnu.property, "a"; \ + .p2align 3; \ + .word 4; \ + .word 16; \ + .word 5; \ + .asciz "GNU"; \ + .word type; \ + .word 4; \ + .word value; \ + .word 0; \ + .text + +/* If set then the GNU Property Note section will be added to + mark objects to support BTI and PAC-RET. */ +#ifndef WANT_GNU_PROPERTY +#define WANT_GNU_PROPERTY 1 +#endif + +#if WANT_GNU_PROPERTY +/* Add property note with supported features to all asm files. 
*/ +GNU_PROPERTY (FEATURE_1_AND, FEATURE_1_BTI|FEATURE_1_PAC) +#endif + +#define ENTRY_ALIGN(name, alignment) \ + .global name; \ + .type name,%function; \ + .align alignment; \ + name: \ + .cfi_startproc; \ + BTI_C; + +#else + +#define END_FILE + +#define ENTRY_ALIGN(name, alignment) \ + .global name; \ + .type name,%function; \ + .align alignment; \ + name: \ + .cfi_startproc; + +#endif + +#define ENTRY(name) ENTRY_ALIGN(name, 6) + +#define ENTRY_ALIAS(name) \ + .global name; \ + .type name,%function; \ + name: + +#define END(name) \ + .cfi_endproc; \ + .size name, .-name; + +#define L(l) .L ## l + +#ifdef __ILP32__ + /* Sanitize padding bits of pointer arguments as per aapcs64 */ +#define PTR_ARG(n) mov w##n, w##n +#else +#define PTR_ARG(n) +#endif + +#ifdef __ILP32__ + /* Sanitize padding bits of size arguments as per aapcs64 */ +#define SIZE_ARG(n) mov w##n, w##n +#else +#define SIZE_ARG(n) +#endif + +#endif diff --git a/arch/arm/lib/bootm.c b/arch/arm/lib/bootm.c index f60ee3a7e6..dd6a69315a 100644 --- a/arch/arm/lib/bootm.c +++ b/arch/arm/lib/bootm.c @@ -16,7 +16,6 @@ #include <command.h> #include <cpu_func.h> #include <dm.h> -#include <lmb.h> #include <log.h> #include <asm/global_data.h> #include <dm/root.h> @@ -43,50 +42,6 @@ DECLARE_GLOBAL_DATA_PTR; static struct tag *params; -static ulong get_sp(void) -{ - ulong ret; - - asm("mov %0, sp" : "=r"(ret) : ); - return ret; -} - -void arch_lmb_reserve(struct lmb *lmb) -{ - ulong sp, bank_end; - int bank; - - /* - * Booting a (Linux) kernel image - * - * Allocate space for command line and board info - the - * address should be as high as possible within the reach of - * the kernel (see CONFIG_SYS_BOOTMAPSZ settings), but in unused - * memory, which means far enough below the current stack - * pointer. - */ - sp = get_sp(); - debug("## Current stack ends at 0x%08lx ", sp); - - /* adjust sp by 4K to be safe */ - sp -= 4096; - for (bank = 0; bank < CONFIG_NR_DRAM_BANKS; bank++) { - if (!gd->bd->bi_dram[bank].size || - sp < gd->bd->bi_dram[bank].start) - continue; - /* Watch out for RAM at end of address space! 
*/ - bank_end = gd->bd->bi_dram[bank].start + - gd->bd->bi_dram[bank].size - 1; - if (sp > bank_end) - continue; - if (bank_end > gd->ram_top) - bank_end = gd->ram_top - 1; - - lmb_reserve(lmb, sp, bank_end - sp + 1); - break; - } -} - __weak void board_quiesce_devices(void) { } diff --git a/arch/arm/lib/ccn504.S b/arch/arm/lib/ccn504.S index 2c584095c3..c6ea3e3afc 100644 --- a/arch/arm/lib/ccn504.S +++ b/arch/arm/lib/ccn504.S @@ -12,7 +12,7 @@ /************************************************************************* * * void ccn504_add_masters_to_dvm(CCI_MN_BASE, CCI_MN_RNF_NODEID_LIST, - * CCI_MN_DVM_DOMAIN_CTL_SET); + * CCI_MN_DVM_DOMAIN_CTL_SET); * * Add fully-coherent masters to DVM domain * @@ -78,4 +78,3 @@ ENTRY(ccn504_set_aux) ret ENDPROC(ccn504_set_aux) - diff --git a/arch/arm/lib/crt0.S b/arch/arm/lib/crt0.S index 46b6be21a8..956d258c9d 100644 --- a/arch/arm/lib/crt0.S +++ b/arch/arm/lib/crt0.S @@ -130,6 +130,14 @@ ENTRY(_main) ldr r9, [r9, #GD_NEW_GD] /* r9 <- gd->new_gd */ adr lr, here +#if defined(CONFIG_POSITION_INDEPENDENT) + adr r0, _main + ldr r1, _start_ofs + add r0, r1 + ldr r1, =CONFIG_SYS_TEXT_BASE + sub r1, r0 + add lr, r1 +#endif ldr r0, [r9, #GD_RELOC_OFF] /* r0 = gd->reloc_off */ add lr, lr, r0 #if defined(CONFIG_CPU_V7M) @@ -180,3 +188,6 @@ here: #endif ENDPROC(_main) + +_start_ofs: + .word _start - _main diff --git a/arch/arm/lib/div64.S b/arch/arm/lib/div64.S index 3ef1ce1fff..a83e337214 100644 --- a/arch/arm/lib/div64.S +++ b/arch/arm/lib/div64.S @@ -34,12 +34,12 @@ * This is meant to be used by do_div() from include/asm/div64.h only. * * Input parameters: - * xh-xl = dividend (clobbered) - * r4 = divisor (preserved) + * xh-xl = dividend (clobbered) + * r4 = divisor (preserved) * * Output values: - * yh-yl = result - * xh = remainder + * yh-yl = result + * xh = remainder * * Clobbered regs: xl, ip */ @@ -85,7 +85,7 @@ UNWIND(.fnstart) #endif @ The division loop for needed upper bit positions. - @ Break out early if dividend reaches 0. + @ Break out early if dividend reaches 0. 2: cmp xh, yl orrcs yh, yh, ip subscs xh, xh, yl diff --git a/arch/arm/lib/lib1funcs.S b/arch/arm/lib/lib1funcs.S index 0798d098af..700eee5fbb 100644 --- a/arch/arm/lib/lib1funcs.S +++ b/arch/arm/lib/lib1funcs.S @@ -34,7 +34,7 @@ mov \divisor, \divisor, lsl \result mov \curbit, \curbit, lsl \result mov \result, #0 - + #else @ Initially shift the divisor left 3 bits if possible, @@ -48,7 +48,7 @@ @ Unless the divisor is very big, shift it up in multiples of @ four bits, since this is the amount of unwinding in the main - @ division loop. Continue shifting until the divisor is + @ division loop. Continue shifting until the divisor is @ larger than the dividend. 1: cmp \divisor, #0x10000000 cmplo \divisor, \dividend @@ -135,7 +135,7 @@ @ Unless the divisor is very big, shift it up in multiples of @ four bits, since this is the amount of unwinding in the main - @ division loop. Continue shifting until the divisor is + @ division loop. Continue shifting until the divisor is @ larger than the dividend. 1: cmp \divisor, #0x10000000 cmplo \divisor, \dividend diff --git a/arch/arm/lib/memcpy-arm64.S b/arch/arm/lib/memcpy-arm64.S new file mode 100644 index 0000000000..507054d847 --- /dev/null +++ b/arch/arm/lib/memcpy-arm64.S @@ -0,0 +1,242 @@ +/* SPDX-License-Identifier: MIT */ +/* + * memcpy - copy memory area + * + * Copyright (c) 2012-2020, Arm Limited. + */ + +/* Assumptions: + * + * ARMv8-a, AArch64, unaligned accesses. 
+ * + */ + +#include "asmdefs.h" + +#define dstin x0 +#define src x1 +#define count x2 +#define dst x3 +#define srcend x4 +#define dstend x5 +#define A_l x6 +#define A_lw w6 +#define A_h x7 +#define B_l x8 +#define B_lw w8 +#define B_h x9 +#define C_l x10 +#define C_lw w10 +#define C_h x11 +#define D_l x12 +#define D_h x13 +#define E_l x14 +#define E_h x15 +#define F_l x16 +#define F_h x17 +#define G_l count +#define G_h dst +#define H_l src +#define H_h srcend +#define tmp1 x14 + +/* This implementation handles overlaps and supports both memcpy and memmove + from a single entry point. It uses unaligned accesses and branchless + sequences to keep the code small, simple and improve performance. + + Copies are split into 3 main cases: small copies of up to 32 bytes, medium + copies of up to 128 bytes, and large copies. The overhead of the overlap + check is negligible since it is only required for large copies. + + Large copies use a software pipelined loop processing 64 bytes per iteration. + The destination pointer is 16-byte aligned to minimize unaligned accesses. + The loop tail is handled by always copying 64 bytes from the end. +*/ + +ENTRY_ALIAS (memmove) +ENTRY (memcpy) + PTR_ARG (0) + PTR_ARG (1) + SIZE_ARG (2) + add srcend, src, count + add dstend, dstin, count + cmp count, 128 + b.hi L(copy_long) + cmp count, 32 + b.hi L(copy32_128) + + /* Small copies: 0..32 bytes. */ + cmp count, 16 + b.lo L(copy16) + ldp A_l, A_h, [src] + ldp D_l, D_h, [srcend, -16] + stp A_l, A_h, [dstin] + stp D_l, D_h, [dstend, -16] + ret + + /* Copy 8-15 bytes. */ +L(copy16): + tbz count, 3, L(copy8) + ldr A_l, [src] + ldr A_h, [srcend, -8] + str A_l, [dstin] + str A_h, [dstend, -8] + ret + + .p2align 3 + /* Copy 4-7 bytes. */ +L(copy8): + tbz count, 2, L(copy4) + ldr A_lw, [src] + ldr B_lw, [srcend, -4] + str A_lw, [dstin] + str B_lw, [dstend, -4] + ret + + /* Copy 0..3 bytes using a branchless sequence. */ +L(copy4): + cbz count, L(copy0) + lsr tmp1, count, 1 + ldrb A_lw, [src] + ldrb C_lw, [srcend, -1] + ldrb B_lw, [src, tmp1] + strb A_lw, [dstin] + strb B_lw, [dstin, tmp1] + strb C_lw, [dstend, -1] +L(copy0): + ret + + .p2align 4 + /* Medium copies: 33..128 bytes. */ +L(copy32_128): + ldp A_l, A_h, [src] + ldp B_l, B_h, [src, 16] + ldp C_l, C_h, [srcend, -32] + ldp D_l, D_h, [srcend, -16] + cmp count, 64 + b.hi L(copy128) + stp A_l, A_h, [dstin] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstend, -32] + stp D_l, D_h, [dstend, -16] + ret + + .p2align 4 + /* Copy 65..128 bytes. */ +L(copy128): + ldp E_l, E_h, [src, 32] + ldp F_l, F_h, [src, 48] + cmp count, 96 + b.ls L(copy96) + ldp G_l, G_h, [srcend, -64] + ldp H_l, H_h, [srcend, -48] + stp G_l, G_h, [dstend, -64] + stp H_l, H_h, [dstend, -48] +L(copy96): + stp A_l, A_h, [dstin] + stp B_l, B_h, [dstin, 16] + stp E_l, E_h, [dstin, 32] + stp F_l, F_h, [dstin, 48] + stp C_l, C_h, [dstend, -32] + stp D_l, D_h, [dstend, -16] + ret + + .p2align 4 + /* Copy more than 128 bytes. */ +L(copy_long): + /* Use backwards copy if there is an overlap. */ + sub tmp1, dstin, src + cbz tmp1, L(copy0) + cmp tmp1, count + b.lo L(copy_long_backwards) + + /* Copy 16 bytes and then align dst to 16-byte alignment. */ + + ldp D_l, D_h, [src] + and tmp1, dstin, 15 + bic dst, dstin, 15 + sub src, src, tmp1 + add count, count, tmp1 /* Count is now 16 too large. */ + ldp A_l, A_h, [src, 16] + stp D_l, D_h, [dstin] + ldp B_l, B_h, [src, 32] + ldp C_l, C_h, [src, 48] + ldp D_l, D_h, [src, 64]! + subs count, count, 128 + 16 /* Test and readjust count. 
*/ + b.ls L(copy64_from_end) + +L(loop64): + stp A_l, A_h, [dst, 16] + ldp A_l, A_h, [src, 16] + stp B_l, B_h, [dst, 32] + ldp B_l, B_h, [src, 32] + stp C_l, C_h, [dst, 48] + ldp C_l, C_h, [src, 48] + stp D_l, D_h, [dst, 64]! + ldp D_l, D_h, [src, 64]! + subs count, count, 64 + b.hi L(loop64) + + /* Write the last iteration and copy 64 bytes from the end. */ +L(copy64_from_end): + ldp E_l, E_h, [srcend, -64] + stp A_l, A_h, [dst, 16] + ldp A_l, A_h, [srcend, -48] + stp B_l, B_h, [dst, 32] + ldp B_l, B_h, [srcend, -32] + stp C_l, C_h, [dst, 48] + ldp C_l, C_h, [srcend, -16] + stp D_l, D_h, [dst, 64] + stp E_l, E_h, [dstend, -64] + stp A_l, A_h, [dstend, -48] + stp B_l, B_h, [dstend, -32] + stp C_l, C_h, [dstend, -16] + ret + + .p2align 4 + + /* Large backwards copy for overlapping copies. + Copy 16 bytes and then align dst to 16-byte alignment. */ +L(copy_long_backwards): + ldp D_l, D_h, [srcend, -16] + and tmp1, dstend, 15 + sub srcend, srcend, tmp1 + sub count, count, tmp1 + ldp A_l, A_h, [srcend, -16] + stp D_l, D_h, [dstend, -16] + ldp B_l, B_h, [srcend, -32] + ldp C_l, C_h, [srcend, -48] + ldp D_l, D_h, [srcend, -64]! + sub dstend, dstend, tmp1 + subs count, count, 128 + b.ls L(copy64_from_start) + +L(loop64_backwards): + stp A_l, A_h, [dstend, -16] + ldp A_l, A_h, [srcend, -16] + stp B_l, B_h, [dstend, -32] + ldp B_l, B_h, [srcend, -32] + stp C_l, C_h, [dstend, -48] + ldp C_l, C_h, [srcend, -48] + stp D_l, D_h, [dstend, -64]! + ldp D_l, D_h, [srcend, -64]! + subs count, count, 64 + b.hi L(loop64_backwards) + + /* Write the last iteration and copy 64 bytes from the start. */ +L(copy64_from_start): + ldp G_l, G_h, [src, 48] + stp A_l, A_h, [dstend, -16] + ldp A_l, A_h, [src, 32] + stp B_l, B_h, [dstend, -32] + ldp B_l, B_h, [src, 16] + stp C_l, C_h, [dstend, -48] + ldp C_l, C_h, [src] + stp D_l, D_h, [dstend, -64] + stp G_l, G_h, [dstin, 48] + stp A_l, A_h, [dstin, 32] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstin] + ret + +END (memcpy) diff --git a/arch/arm/lib/memset-arm64.S b/arch/arm/lib/memset-arm64.S new file mode 100644 index 0000000000..ee9f9a96cf --- /dev/null +++ b/arch/arm/lib/memset-arm64.S @@ -0,0 +1,148 @@ +/* SPDX-License-Identifier: MIT */ +/* + * memset - fill memory with a constant byte + * + * Copyright (c) 2012-2021, Arm Limited. + */ + +/* Assumptions: + * + * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses. + * + */ + +#include <asm/macro.h> +#include "asmdefs.h" + +#define dstin x0 +#define val x1 +#define valw w1 +#define count x2 +#define dst x3 +#define dstend x4 +#define zva_val x5 + +ENTRY (memset) + PTR_ARG (0) + SIZE_ARG (2) + + /* + * The optimized memset uses the dc opcode, which causes problems + * when the cache is disabled. Let's check if the cache is disabled + * and use a very simple memset implementation in this case. Otherwise + * jump to the optimized version. + */ + switch_el x6, 3f, 2f, 1f +3: mrs x6, sctlr_el3 + b 0f +2: mrs x6, sctlr_el2 + b 0f +1: mrs x6, sctlr_el1 +0: + tst x6, #CR_C + bne 9f + + /* + * A very "simple" memset implementation without the use of the + * dc opcode. Can be run with caches disabled. + */ + mov x3, #0x0 + cmp count, x3 /* check for zero length */ + beq 8f +4: strb valw, [dstin, x3] + add x3, x3, #0x1 + cmp count, x3 + bne 4b +8: ret +9: + + /* Here the optimized memset version starts */ + dup v0.16B, valw + add dstend, dstin, count + + cmp count, 96 + b.hi L(set_long) + cmp count, 16 + b.hs L(set_medium) + mov val, v0.D[0] + + /* Set 0..15 bytes. 
*/ + tbz count, 3, 1f + str val, [dstin] + str val, [dstend, -8] + ret + .p2align 4 +1: tbz count, 2, 2f + str valw, [dstin] + str valw, [dstend, -4] + ret +2: cbz count, 3f + strb valw, [dstin] + tbz count, 1, 3f + strh valw, [dstend, -2] +3: ret + + /* Set 17..96 bytes. */ +L(set_medium): + str q0, [dstin] + tbnz count, 6, L(set96) + str q0, [dstend, -16] + tbz count, 5, 1f + str q0, [dstin, 16] + str q0, [dstend, -32] +1: ret + + .p2align 4 + /* Set 64..96 bytes. Write 64 bytes from the start and + 32 bytes from the end. */ +L(set96): + str q0, [dstin, 16] + stp q0, q0, [dstin, 32] + stp q0, q0, [dstend, -32] + ret + + .p2align 4 +L(set_long): + and valw, valw, 255 + bic dst, dstin, 15 + str q0, [dstin] + cmp count, 160 + ccmp valw, 0, 0, hs + b.ne L(no_zva) + +#ifndef SKIP_ZVA_CHECK + mrs zva_val, dczid_el0 + and zva_val, zva_val, 31 + cmp zva_val, 4 /* ZVA size is 64 bytes. */ + b.ne L(no_zva) +#endif + str q0, [dst, 16] + stp q0, q0, [dst, 32] + bic dst, dst, 63 + sub count, dstend, dst /* Count is now 64 too large. */ + sub count, count, 128 /* Adjust count and bias for loop. */ + + .p2align 4 +L(zva_loop): + add dst, dst, 64 + dc zva, dst + subs count, count, 64 + b.hi L(zva_loop) + stp q0, q0, [dstend, -64] + stp q0, q0, [dstend, -32] + ret + +L(no_zva): + sub count, dstend, dst /* Count is 16 too large. */ + sub dst, dst, 16 /* Dst is biased by -32. */ + sub count, count, 64 + 16 /* Adjust count and bias for loop. */ +L(no_zva_loop): + stp q0, q0, [dst, 32] + stp q0, q0, [dst, 64]! + subs count, count, 64 + b.hi L(no_zva_loop) + stp q0, q0, [dstend, -64] + stp q0, q0, [dstend, -32] + ret + +END (memset) diff --git a/arch/arm/lib/relocate.S b/arch/arm/lib/relocate.S index e5f7267be1..14b7f61c1a 100644 --- a/arch/arm/lib/relocate.S +++ b/arch/arm/lib/relocate.S @@ -78,22 +78,28 @@ ENDPROC(relocate_vectors) */ ENTRY(relocate_code) - ldr r1, =__image_copy_start /* r1 <- SRC &__image_copy_start */ - subs r4, r0, r1 /* r4 <- relocation offset */ - beq relocate_done /* skip relocation */ - ldr r2, =__image_copy_end /* r2 <- SRC &__image_copy_end */ - + adr r3, relocate_code + ldr r1, _image_copy_start_ofs + add r1, r3 /* r1 <- Run &__image_copy_start */ + subs r4, r0, r1 /* r4 <- Run to copy offset */ + beq relocate_done /* skip relocation */ + ldr r1, _image_copy_start_ofs + add r1, r3 /* r1 <- Run &__image_copy_start */ + ldr r2, _image_copy_end_ofs + add r2, r3 /* r2 <- Run &__image_copy_end */ copy_loop: - ldmia r1!, {r10-r11} /* copy from source address [r1] */ - stmia r0!, {r10-r11} /* copy to target address [r0] */ - cmp r1, r2 /* until source end address [r2] */ + ldmia r1!, {r10-r11} /* copy from source address [r1] */ + stmia r0!, {r10-r11} /* copy to target address [r0] */ + cmp r1, r2 /* until source end address [r2] */ blo copy_loop /* * fix .rel.dyn relocations */ - ldr r2, =__rel_dyn_start /* r2 <- SRC &__rel_dyn_start */ - ldr r3, =__rel_dyn_end /* r3 <- SRC &__rel_dyn_end */ + ldr r1, _rel_dyn_start_ofs + add r2, r1, r3 /* r2 <- Run &__rel_dyn_start */ + ldr r1, _rel_dyn_end_ofs + add r3, r1, r3 /* r3 <- Run &__rel_dyn_end */ fixloop: ldmia r2!, {r0-r1} /* (r0,r1) <- (SRC location,fixup) */ and r1, r1, #0xff @@ -129,3 +135,12 @@ relocate_done: #endif ENDPROC(relocate_code) + +_image_copy_start_ofs: + .word __image_copy_start - relocate_code +_image_copy_end_ofs: + .word __image_copy_end - relocate_code +_rel_dyn_start_ofs: + .word __rel_dyn_start - relocate_code +_rel_dyn_end_ofs: + .word __rel_dyn_end - relocate_code diff --git a/arch/arm/lib/stack.c 
b/arch/arm/lib/stack.c index b03e1cfc80..656084c7e5 100644 --- a/arch/arm/lib/stack.c +++ b/arch/arm/lib/stack.c @@ -12,6 +12,7 @@ */ #include <common.h> #include <init.h> +#include <lmb.h> #include <asm/global_data.h> DECLARE_GLOBAL_DATA_PTR; @@ -33,3 +34,16 @@ int arch_reserve_stacks(void) return 0; } + +static ulong get_sp(void) +{ + ulong ret; + + asm("mov %0, sp" : "=r"(ret) : ); + return ret; +} + +void arch_lmb_reserve(struct lmb *lmb) +{ + arch_lmb_reserve_generic(lmb, get_sp(), gd->ram_top, 16384); +} |
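
Note: the stack.c hunk above replaces the open-coded reservation deleted from bootm.c with a single call to arch_lmb_reserve_generic(lmb, get_sp(), gd->ram_top, 16384). That helper is not part of this diff; the sketch below is only an illustration of what it is assumed to do, reconstructed from the bank-scanning logic removed from bootm.c, with the hard-coded 4096-byte stack margin generalized to the 'align' argument and gd->ram_top passed in as 'end'. The real implementation lives in common LMB code and may differ in detail.

#include <common.h>
#include <lmb.h>
#include <asm/global_data.h>

DECLARE_GLOBAL_DATA_PTR;

/*
 * Illustrative sketch only: reserve the memory from just below the
 * current stack pointer up to the end of the DRAM bank that holds the
 * stack (clamped to 'end', i.e. ram_top), so that a booting kernel
 * image cannot overwrite the stack or the boot parameters placed there.
 */
void arch_lmb_reserve_generic(struct lmb *lmb, ulong sp, ulong end, ulong align)
{
	ulong bank_end;
	int bank;

	sp -= align;	/* keep a safety margin below the current stack pointer */

	for (bank = 0; bank < CONFIG_NR_DRAM_BANKS; bank++) {
		if (!gd->bd->bi_dram[bank].size ||
		    sp < gd->bd->bi_dram[bank].start)
			continue;
		/* watch out for RAM at the end of the address space */
		bank_end = gd->bd->bi_dram[bank].start +
			   gd->bd->bi_dram[bank].size - 1;
		if (sp > bank_end)
			continue;
		if (bank_end > end)
			bank_end = end - 1;

		lmb_reserve(lmb, sp, bank_end - sp + 1);
		break;
	}
}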