ARM: implement support for vmap'ed stacks
author     Ard Biesheuvel <ardb@kernel.org>
           Thu, 23 Sep 2021 07:15:53 +0000 (09:15 +0200)
committer  Ard Biesheuvel <ardb@kernel.org>
           Fri, 3 Dec 2021 14:11:33 +0000 (15:11 +0100)
Wire up the generic support for managing task stack allocations via vmalloc,
and implement the entry code that detects whether we faulted because of a
stack overrun (or a future stack overrun caused by pushing the pt_regs array).

While this adds a fair amount of tricky entry asm code, it should be
noted that it only adds a TST + branch to the svc_entry path. The code
implementing the non-trivial handling of the overflow stack is emitted
out-of-line into the .text section.
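
To make that check concrete: task and IRQ stacks are THREAD_SIZE bytes and,
with this patch, aligned to twice that, so a single bit of SP tells us whether
we have run off the bottom of the stack. A minimal C sketch of the idea (an
illustration only, not part of the patch -- the real test is the TST in the
do_overflow_check macro added below):

    /*
     * Illustrative sketch: with THREAD_SIZE-sized stacks aligned to
     * 2 * THREAD_SIZE, the bit at (THREAD_SIZE_ORDER + PAGE_SHIFT) is clear
     * for every SP that is still inside its stack, and becomes set as soon
     * as SP drops below the stack's base.
     */
    static inline bool sp_overflowed(unsigned long sp)
    {
            return sp & (1UL << (THREAD_SIZE_ORDER + PAGE_SHIFT));
    }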

Since on ARM, we rely on do_translation_fault() to keep PMD level page
table entries that cover the vmalloc region up to date, we need to
ensure that we don't hit such a stale PMD entry when accessing the
stack. So we do a dummy read from the new stack while still running from
the old one on the context switch path, and bump the vmalloc_seq counter
when PMD level entries in the vmalloc range are modified, so that the MM
switch fetches the latest version of the entries.
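
For reference, the consumer of that counter is the existing MM switch logic
rather than anything added here; in rough C, the check has the following shape
(the __sync_vmalloc_pmds() helper name is hypothetical and only stands in for
the existing ARM code that copies the kernel's PMD entries):

    /*
     * Rough sketch of the MM-switch side (not from this patch; the helper
     * name is hypothetical). If the per-mm counter lags behind init_mm's,
     * the PMD entries covering the vmalloc region are copied from init_mm
     * into the next mm before its vmap'ed stack is dereferenced.
     */
    static inline void check_vmalloc_seq(struct mm_struct *mm)
    {
            if (unlikely(mm->context.vmalloc_seq != init_mm.context.vmalloc_seq))
                    __sync_vmalloc_pmds(mm);
    }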

Note that we need to increase the per-mode stack by 1 word, to gain some
space to stash a GPR until we know it is safe to touch the stack.
However, due to the cacheline alignment of the struct, this does not
actually increase the memory footprint of the struct stack array at all.
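
(For the footprint claim: four per-mode arrays of four 32-bit words is 64
bytes, and the old 4 x 3 x 4 = 48-byte layout was already padded up to the
same boundary by the struct's ____cacheline_aligned attribute on the usual
32- or 64-byte line sizes, so the array of struct stack ends up exactly as
large as before.)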

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Tested-by: Keith Packard <keithpac@amazon.com>
Tested-by: Marc Zyngier <maz@kernel.org>
Tested-by: Vladimir Murzin <vladimir.murzin@arm.com> # ARMv7M
arch/arm/Kconfig
arch/arm/include/asm/page.h
arch/arm/include/asm/thread_info.h
arch/arm/kernel/entry-armv.S
arch/arm/kernel/entry-header.S
arch/arm/kernel/irq.c
arch/arm/kernel/setup.c
arch/arm/kernel/sleep.S
arch/arm/kernel/traps.c
arch/arm/kernel/unwind.c
arch/arm/kernel/vmlinux.lds.S

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 41849d5e0bc3baab2424f647906843c8138e915b..e2ab72f2bf4ac45ec0cb8a92d16997d683109cd3 100644
@@ -127,6 +127,7 @@ config ARM
        select RTC_LIB
        select SYS_SUPPORTS_APM_EMULATION
        select THREAD_INFO_IN_TASK if CURRENT_POINTER_IN_TPIDRURO
+       select HAVE_ARCH_VMAP_STACK if MMU && THREAD_INFO_IN_TASK && (!LD_IS_LLD || LLD_VERSION >= 140000)
        select TRACE_IRQFLAGS_SUPPORT if !CPU_V7M
        # Above selects are sorted alphabetically; please add new ones
        # according to that.  Thanks.
diff --git a/arch/arm/include/asm/page.h b/arch/arm/include/asm/page.h
index 11b058a72a5b839161ba2c654aba789221435ecc..7b871ed99ccf0a63a5b883913997abf78adcff91 100644
@@ -149,6 +149,10 @@ extern void copy_page(void *to, const void *from);
 #include <asm/pgtable-2level-types.h>
 #endif
 
+#ifdef CONFIG_VMAP_STACK
+#define ARCH_PAGE_TABLE_SYNC_MASK      PGTBL_PMD_MODIFIED
+#endif
+
 #endif /* CONFIG_MMU */
 
 typedef struct page *pgtable_t;
diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h
index 164e15f26485da42130bf0a8816bb994a9adb538..004b89d86224b15e875556f7ff14775da5d3c447 100644
 #define THREAD_SIZE            (PAGE_SIZE << THREAD_SIZE_ORDER)
 #define THREAD_START_SP                (THREAD_SIZE - 8)
 
+#ifdef CONFIG_VMAP_STACK
+#define THREAD_ALIGN           (2 * THREAD_SIZE)
+#else
+#define THREAD_ALIGN           THREAD_SIZE
+#endif
+
+#define OVERFLOW_STACK_SIZE    SZ_4K
+
 #ifndef __ASSEMBLY__
 
 struct task_struct;
diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
index 207875ac62ffdc04961daa67d57a1e2972aa7ef1..5fb7465d14d93449a338536fc9b6e0ebb2554ac9 100644
@@ -57,6 +57,10 @@ UNWIND(      .setfp  fpreg, sp               )
        @
        subs    r2, sp, r0              @ SP above bottom of IRQ stack?
        rsbscs  r2, r2, #THREAD_SIZE    @ ... and below the top?
+#ifdef CONFIG_VMAP_STACK
+       ldr_l   r2, high_memory, cc     @ End of the linear region
+       cmpcc   r2, r0                  @ Stack pointer was below it?
+#endif
        movcs   sp, r0                  @ If so, revert to incoming SP
 
 #ifndef CONFIG_UNWINDER_ARM
@@ -188,13 +192,18 @@ ENDPROC(__und_invalid)
 #define SPFIX(code...)
 #endif
 
-       .macro  svc_entry, stack_hole=0, trace=1, uaccess=1
+       .macro  svc_entry, stack_hole=0, trace=1, uaccess=1, overflow_check=1
  UNWIND(.fnstart               )
- UNWIND(.save {r0 - pc}                )
        sub     sp, sp, #(SVC_REGS_SIZE + \stack_hole)
+ THUMB(        add     sp, r1          )       @ get SP in a GPR without
+ THUMB(        sub     r1, sp, r1      )       @ using a temp register
+
+       .if     \overflow_check
+ UNWIND(.save  {r0 - pc}       )
+       do_overflow_check (SVC_REGS_SIZE + \stack_hole)
+       .endif
+
 #ifdef CONFIG_THUMB2_KERNEL
-       add     sp, r1                  @ get SP in a GPR without
-       sub     r1, sp, r1              @ using a temp register
        tst     r1, #4                  @ test stack pointer alignment
        sub     r1, sp, r1              @ restore original R1
        sub     sp, r1                  @ restore original SP
@@ -827,12 +836,20 @@ ENTRY(__switch_to)
        str     r7, [r8]
 #endif
        mov     r0, r5
-#if !defined(CONFIG_THUMB2_KERNEL)
+#if !defined(CONFIG_THUMB2_KERNEL) && !defined(CONFIG_VMAP_STACK)
        set_current r7
        ldmia   r4, {r4 - sl, fp, sp, pc}       @ Load all regs saved previously
 #else
        mov     r1, r7
        ldmia   r4, {r4 - sl, fp, ip, lr}       @ Load all regs saved previously
+#ifdef CONFIG_VMAP_STACK
+       @
+       @ Do a dummy read from the new stack while running from the old one so
+       @ that we can rely on do_translation_fault() to fix up any stale PMD
+       @ entries covering the vmalloc region.
+       @
+       ldr     r2, [ip]
+#endif
 
        @ When CONFIG_THREAD_INFO_IN_TASK=n, the update of SP itself is what
        @ effectuates the task switch, as that is what causes the observable
@@ -849,6 +866,76 @@ ENTRY(__switch_to)
  UNWIND(.fnend         )
 ENDPROC(__switch_to)
 
+#ifdef CONFIG_VMAP_STACK
+       .text
+       .align  2
+__bad_stack:
+       @
+       @ We've just detected an overflow. We need to load the address of this
+       @ CPU's overflow stack into the stack pointer register. We have only one
+       @ scratch register so let's use a sequence of ADDs including one
+       @ involving the PC, and decorate them with PC-relative group
+       @ relocations. As these are ARM only, switch to ARM mode first.
+       @
+       @ We enter here with IP clobbered and its value stashed on the mode
+       @ stack.
+       @
+THUMB( bx      pc              )
+THUMB( nop                     )
+THUMB( .arm                    )
+       mrc     p15, 0, ip, c13, c0, 4          @ Get per-CPU offset
+
+       .globl  overflow_stack_ptr
+       .reloc  0f, R_ARM_ALU_PC_G0_NC, overflow_stack_ptr
+       .reloc  1f, R_ARM_ALU_PC_G1_NC, overflow_stack_ptr
+       .reloc  2f, R_ARM_LDR_PC_G2, overflow_stack_ptr
+       add     ip, ip, pc
+0:     add     ip, ip, #-4
+1:     add     ip, ip, #0
+2:     ldr     ip, [ip, #4]
+
+       str     sp, [ip, #-4]!                  @ Preserve original SP value
+       mov     sp, ip                          @ Switch to overflow stack
+       pop     {ip}                            @ Original SP in IP
+
+#if defined(CONFIG_UNWINDER_FRAME_POINTER) && defined(CONFIG_CC_IS_GCC)
+       mov     ip, ip                          @ mov expected by unwinder
+       push    {fp, ip, lr, pc}                @ GCC flavor frame record
+#else
+       str     ip, [sp, #-8]!                  @ store original SP
+       push    {fpreg, lr}                     @ Clang flavor frame record
+#endif
+UNWIND( ldr    ip, [r0, #4]    )               @ load exception LR
+UNWIND( str    ip, [sp, #12]   )               @ store in the frame record
+       ldr     ip, [r0, #12]                   @ reload IP
+
+       @ Store the original GPRs to the new stack.
+       svc_entry uaccess=0, overflow_check=0
+
+UNWIND( .save   {sp, pc}       )
+UNWIND( .save   {fpreg, lr}    )
+UNWIND( .setfp  fpreg, sp      )
+
+       ldr     fpreg, [sp, #S_SP]              @ Add our frame record
+                                               @ to the linked list
+#if defined(CONFIG_UNWINDER_FRAME_POINTER) && defined(CONFIG_CC_IS_GCC)
+       ldr     r1, [fp, #4]                    @ reload SP at entry
+       add     fp, fp, #12
+#else
+       ldr     r1, [fpreg, #8]
+#endif
+       str     r1, [sp, #S_SP]                 @ store in pt_regs
+
+       @ Stash the regs for handle_bad_stack
+       mov     r0, sp
+
+       @ Time to die
+       bl      handle_bad_stack
+       nop
+UNWIND( .fnend                 )
+ENDPROC(__bad_stack)
+#endif
+
        __INIT
 
 /*
diff --git a/arch/arm/kernel/entry-header.S b/arch/arm/kernel/entry-header.S
index ae24dd54e9efbd247b7d5c875eaaedcef3a2d195..81df2a3561cac022ae3f8724baccbe974bfa1a1e 100644
@@ -423,3 +423,40 @@ scno       .req    r7              @ syscall number
 tbl    .req    r8              @ syscall table pointer
 why    .req    r8              @ Linux syscall (!= 0)
 tsk    .req    r9              @ current thread_info
+
+       .macro  do_overflow_check, frame_size:req
+#ifdef CONFIG_VMAP_STACK
+       @
+       @ Test whether the SP has overflowed. Task and IRQ stacks are aligned
+       @ so that SP & BIT(THREAD_SIZE_ORDER + PAGE_SHIFT) should always be
+       @ zero.
+       @
+ARM(   tst     sp, #1 << (THREAD_SIZE_ORDER + PAGE_SHIFT)      )
+THUMB( tst     r1, #1 << (THREAD_SIZE_ORDER + PAGE_SHIFT)      )
+THUMB( it      ne                                              )
+       bne     .Lstack_overflow_check\@
+
+       .pushsection    .text
+.Lstack_overflow_check\@:
+       @
+       @ The stack pointer is not pointing to a valid vmap'ed stack, but it
+       @ may be pointing into the linear map instead, which may happen if we
+       @ are already running from the overflow stack. We cannot detect overflow
+       @ in such cases so just carry on.
+       @
+       str     ip, [r0, #12]                   @ Stash IP on the mode stack
+       ldr_l   ip, high_memory                 @ Start of VMALLOC space
+ARM(   cmp     sp, ip                  )       @ SP in vmalloc space?
+THUMB( cmp     r1, ip                  )
+THUMB( itt     lo                      )
+       ldrlo   ip, [r0, #12]                   @ Restore IP
+       blo     .Lout\@                         @ Carry on
+
+THUMB( sub     r1, sp, r1              )       @ Restore original R1
+THUMB( sub     sp, r1                  )       @ Restore original SP
+       add     sp, sp, #\frame_size            @ Undo svc_entry's SP change
+       b       __bad_stack                     @ Handle VMAP stack overflow
+       .popsection
+.Lout\@:
+#endif
+       .endm
diff --git a/arch/arm/kernel/irq.c b/arch/arm/kernel/irq.c
index e05219bca21876046e338e652aedc5fad4806914..5deb40f39999cf9cb7bea730ab20d9181026e0b4 100644
@@ -56,7 +56,14 @@ static void __init init_irq_stacks(void)
        int cpu;
 
        for_each_possible_cpu(cpu) {
-               stack = (u8 *)__get_free_pages(GFP_KERNEL, THREAD_SIZE_ORDER);
+               if (!IS_ENABLED(CONFIG_VMAP_STACK))
+                       stack = (u8 *)__get_free_pages(GFP_KERNEL,
+                                                      THREAD_SIZE_ORDER);
+               else
+                       stack = __vmalloc_node(THREAD_SIZE, THREAD_ALIGN,
+                                              THREADINFO_GFP, NUMA_NO_NODE,
+                                              __builtin_return_address(0));
+
                if (WARN_ON(!stack))
                        break;
                per_cpu(irq_stack_ptr, cpu) = &stack[THREAD_SIZE];
diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index 284a80c0b6e1046ff320c98b013714f2b1bfa2d8..039feb7cd590b8714f97704f36fb7231ac2be9b2 100644
@@ -141,10 +141,10 @@ EXPORT_SYMBOL(outer_cache);
 int __cpu_architecture __read_mostly = CPU_ARCH_UNKNOWN;
 
 struct stack {
-       u32 irq[3];
-       u32 abt[3];
-       u32 und[3];
-       u32 fiq[3];
+       u32 irq[4];
+       u32 abt[4];
+       u32 und[4];
+       u32 fiq[4];
 } ____cacheline_aligned;
 
 #ifndef CONFIG_CPU_V7M
diff --git a/arch/arm/kernel/sleep.S b/arch/arm/kernel/sleep.S
index 43077e11dafdaaca954a078471d4096e17053864..803b51e5cba098a187d1a5ef37672db51601a682 100644
@@ -67,6 +67,14 @@ ENTRY(__cpu_suspend)
        ldr     r4, =cpu_suspend_size
 #endif
        mov     r5, sp                  @ current virtual SP
+#ifdef CONFIG_VMAP_STACK
+       @ Run the suspend code from the overflow stack so we don't have to rely
+       @ on vmalloc-to-phys conversions anywhere in the arch suspend code.
+       @ The original SP value captured in R5 will be restored on the way out.
+       mov_l   r6, overflow_stack_ptr  @ Base pointer
+       mrc     p15, 0, r7, c13, c0, 4  @ Get per-CPU offset
+       ldr     sp, [r6, r7]            @ Address of this CPU's overflow stack
+#endif
        add     r4, r4, #12             @ Space for pgd, virt sp, phys resume fn
        sub     sp, sp, r4              @ allocate CPU state on stack
        ldr     r3, =sleep_save_sp
diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c
index b42c446cec9af9dd59a74605ba33caacfae22ba6..b28a705c49cbbb91afb706c4362c0356d60b144b 100644
@@ -121,7 +121,8 @@ void dump_backtrace_stm(u32 *stack, u32 instruction, const char *loglvl)
 static int verify_stack(unsigned long sp)
 {
        if (sp < PAGE_OFFSET ||
-           (sp > (unsigned long)high_memory && high_memory != NULL))
+           (!IS_ENABLED(CONFIG_VMAP_STACK) &&
+            sp > (unsigned long)high_memory && high_memory != NULL))
                return -EFAULT;
 
        return 0;
@@ -291,7 +292,8 @@ static int __die(const char *str, int err, struct pt_regs *regs)
 
        if (!user_mode(regs) || in_interrupt()) {
                dump_mem(KERN_EMERG, "Stack: ", regs->ARM_sp,
-                        ALIGN(regs->ARM_sp, THREAD_SIZE));
+                        ALIGN(regs->ARM_sp - THREAD_SIZE, THREAD_ALIGN)
+                        + THREAD_SIZE);
                dump_backtrace(regs, tsk, KERN_EMERG);
                dump_instr(KERN_EMERG, regs);
        }
@@ -838,3 +840,77 @@ void __init early_trap_init(void *vectors_base)
         */
 #endif
 }
+
+#ifdef CONFIG_VMAP_STACK
+
+DECLARE_PER_CPU(u8 *, irq_stack_ptr);
+
+asmlinkage DEFINE_PER_CPU(u8 *, overflow_stack_ptr);
+
+static int __init allocate_overflow_stacks(void)
+{
+       u8 *stack;
+       int cpu;
+
+       for_each_possible_cpu(cpu) {
+               stack = (u8 *)__get_free_page(GFP_KERNEL);
+               if (WARN_ON(!stack))
+                       return -ENOMEM;
+               per_cpu(overflow_stack_ptr, cpu) = &stack[OVERFLOW_STACK_SIZE];
+       }
+       return 0;
+}
+early_initcall(allocate_overflow_stacks);
+
+asmlinkage void handle_bad_stack(struct pt_regs *regs)
+{
+       unsigned long tsk_stk = (unsigned long)current->stack;
+       unsigned long irq_stk = (unsigned long)this_cpu_read(irq_stack_ptr);
+       unsigned long ovf_stk = (unsigned long)this_cpu_read(overflow_stack_ptr);
+
+       console_verbose();
+       pr_emerg("Insufficient stack space to handle exception!");
+
+       pr_emerg("Task stack:     [0x%08lx..0x%08lx]\n",
+                tsk_stk, tsk_stk + THREAD_SIZE);
+       pr_emerg("IRQ stack:      [0x%08lx..0x%08lx]\n",
+                irq_stk - THREAD_SIZE, irq_stk);
+       pr_emerg("Overflow stack: [0x%08lx..0x%08lx]\n",
+                ovf_stk - OVERFLOW_STACK_SIZE, ovf_stk);
+
+       die("kernel stack overflow", regs, 0);
+}
+
+/*
+ * Normally, we rely on the logic in do_translation_fault() to update stale PMD
+ * entries covering the vmalloc space in a task's page tables when it first
+ * accesses the region in question. Unfortunately, this is not sufficient when
+ * the task stack resides in the vmalloc region, as do_translation_fault() is a
+ * C function that needs a stack to run.
+ *
+ * So we need to ensure that these PMD entries are up to date *before* the MM
+ * switch. As we already have some logic in the MM switch path that takes care
+ * of this, let's trigger it by bumping the counter every time the core vmalloc
+ * code modifies a PMD entry in the vmalloc region.
+ */
+void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
+{
+       if (start > VMALLOC_END || end < VMALLOC_START)
+               return;
+
+       /*
+        * This hooks into the core vmalloc code to receive notifications of
+        * any PMD level changes that have been made to the kernel page tables.
+        * This means it should only be triggered once for every MiB worth of
+        * vmalloc space, given that we don't support huge vmalloc/vmap on ARM,
+        * and that kernel PMD level table entries are rarely (if ever)
+        * updated.
+        *
+        * This means that the counter is going to max out at ~250 for the
+        * typical case. If it overflows, something entirely unexpected has
+        * occurred so let's throw a warning if that happens.
+        */
+       WARN_ON(++init_mm.context.vmalloc_seq == UINT_MAX);
+}
+
+#endif
diff --git a/arch/arm/kernel/unwind.c b/arch/arm/kernel/unwind.c
index e8d729975f12269d4f5074952686c0eecd9f0cfd..c5ea328c428d3bb256c567b91c900c26e95f9352 100644
@@ -389,7 +389,8 @@ int unwind_frame(struct stackframe *frame)
 
        /* store the highest address on the stack to avoid crossing it*/
        ctrl.sp_low = frame->sp;
-       ctrl.sp_high = ALIGN(ctrl.sp_low, THREAD_SIZE);
+       ctrl.sp_high = ALIGN(ctrl.sp_low - THREAD_SIZE, THREAD_ALIGN)
+                      + THREAD_SIZE;
 
        pr_debug("%s(pc = %08lx lr = %08lx sp = %08lx)\n", __func__,
                 frame->pc, frame->lr, frame->sp);
diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S
index f02d617e3359f11bbc759f19b479e6befb22763f..aa12b65a7fd6a8cc60463e9beec35698397daeee 100644
@@ -138,12 +138,12 @@ SECTIONS
 #ifdef CONFIG_STRICT_KERNEL_RWX
        . = ALIGN(1<<SECTION_SHIFT);
 #else
-       . = ALIGN(THREAD_SIZE);
+       . = ALIGN(THREAD_ALIGN);
 #endif
        __init_end = .;
 
        _sdata = .;
-       RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE)
+       RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_ALIGN)
        _edata = .;
 
        BSS_SECTION(0, 0, 0)