]> git.baikalelectronics.ru Git - kernel.git/commitdiff
s390/smp: rework absolute lowcore access
authorAlexander Gordeev <agordeev@linux.ibm.com>
Wed, 20 Jul 2022 06:22:01 +0000 (08:22 +0200)
committerAlexander Gordeev <agordeev@linux.ibm.com>
Thu, 28 Jul 2022 16:05:23 +0000 (18:05 +0200)
Temporarily unsetting the prefix page in the memcpy_absolute() routine
poses a risk of executing a code path with an unexpectedly disabled
prefix page. This rework avoids uninstalling the prefix page and
disabling normal and machine check interrupts when accessing the
absolute zero memory.

Although memcpy_absolute() routine can access the whole memory, it is
only used to update the absolute zero lowcore. This rework therefore
introduces a new mechanism for the absolute zero lowcore access and
scraps memcpy_absolute() routine for good.

Instead, an area is reserved in the virtual memory that is used for
the absolute lowcore access only. That area holds an array of 8KB
virtual mappings - one per CPU. Whenever a CPU is brought online, the
corresponding item is mapped to the real address of the previously
installed prefix page.

The absolute zero lowcore access works like this: a CPU calls the
new primitive get_abs_lowcore() to obtain its 8KB mapping as a
pointer to the struct lowcore. Virtual address references to that
pointer get translated to the real addresses of the prefix page,
which in turn gets swapped with the absolute zero memory addresses
due to prefixing. Once the pointer is no longer needed, it must be
released with the put_abs_lowcore() primitive:

struct lowcore *abs_lc;
unsigned long flags;

abs_lc = get_abs_lowcore(&flags);
abs_lc->... = ...;
put_abs_lowcore(abs_lc, flags);

To ensure the described mechanism works, large segment- and region-
table entries must be avoided for the 8KB mappings. Failure to do
so results in usage of Region-Frame Absolute Address (RFAA) or
Segment-Frame Absolute Address (SFAA) large page fields. In that
case absolute addresses would be used to address the prefix page
instead of the real ones and the prefixing would get bypassed.

Reviewed-by: Heiko Carstens <hca@linux.ibm.com>
Signed-off-by: Alexander Gordeev <agordeev@linux.ibm.com>
14 files changed:
arch/s390/boot/startup.c
arch/s390/include/asm/abs_lowcore.h [new file with mode: 0644]
arch/s390/include/asm/pgtable.h
arch/s390/include/asm/processor.h
arch/s390/kernel/Makefile
arch/s390/kernel/abs_lowcore.c [new file with mode: 0644]
arch/s390/kernel/ipl.c
arch/s390/kernel/machine_kexec.c
arch/s390/kernel/os_info.c
arch/s390/kernel/setup.c
arch/s390/kernel/smp.c
arch/s390/mm/init.c
arch/s390/mm/maccess.c
arch/s390/mm/vmem.c

index bc48fe82d949a4dc60ebb2dda2f73d2b308d36e1..41b7af7a936567fe6cf2345318ceed4c23702e82 100644 (file)
 #include <asm/sclp.h>
 #include <asm/diag.h>
 #include <asm/uv.h>
+#include <asm/abs_lowcore.h>
 #include "decompressor.h"
 #include "boot.h"
 #include "uv.h"
 
 unsigned long __bootdata_preserved(__kaslr_offset);
+unsigned long __bootdata_preserved(__abs_lowcore);
 unsigned long __bootdata(__amode31_base);
 unsigned long __bootdata_preserved(VMALLOC_START);
 unsigned long __bootdata_preserved(VMALLOC_END);
@@ -180,7 +182,8 @@ static void setup_kernel_memory_layout(void)
        /* force vmalloc and modules below kasan shadow */
        vmax = min(vmax, KASAN_SHADOW_START);
 #endif
-       MODULES_END = vmax;
+       __abs_lowcore = round_down(vmax - ABS_LOWCORE_MAP_SIZE, sizeof(struct lowcore));
+       MODULES_END = round_down(__abs_lowcore, _SEGMENT_SIZE);
        MODULES_VADDR = MODULES_END - MODULES_LEN;
        VMALLOC_END = MODULES_VADDR;
 
diff --git a/arch/s390/include/asm/abs_lowcore.h b/arch/s390/include/asm/abs_lowcore.h
new file mode 100644 (file)
index 0000000..bdef8d2
--- /dev/null
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_S390_ABS_LOWCORE_H
+#define _ASM_S390_ABS_LOWCORE_H
+
+#include <asm/lowcore.h>
+
+#define ABS_LOWCORE_MAP_SIZE   (NR_CPUS * sizeof(struct lowcore))
+
+extern unsigned long __abs_lowcore;
+extern bool abs_lowcore_mapped;
+
+struct lowcore *get_abs_lowcore(unsigned long *flags);
+void put_abs_lowcore(struct lowcore *lc, unsigned long flags);
+int abs_lowcore_map(int cpu, struct lowcore *lc);
+void abs_lowcore_unmap(int cpu);
+
+#endif /* _ASM_S390_ABS_LOWCORE_H */
index a397b072a580048d623dcb7c20edcbafb5844a44..82506ebd544b5e04cdaa0697f5f4b200f8cfc874 100644 (file)
@@ -1781,6 +1781,8 @@ static inline swp_entry_t __swp_entry(unsigned long type, unsigned long offset)
 
 extern int vmem_add_mapping(unsigned long start, unsigned long size);
 extern void vmem_remove_mapping(unsigned long start, unsigned long size);
+extern int vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot);
+extern void vmem_unmap_4k_page(unsigned long addr);
 extern int s390_enable_sie(void);
 extern int s390_enable_skey(void);
 extern void s390_reset_cmma(struct mm_struct *mm);
index bd66f8e349492f2e0b70f51a43a40732151fd703..93677ae89e7ea6d042de51fa7ed74fb95b5ff768 100644 (file)
@@ -307,21 +307,6 @@ static __always_inline void __noreturn disabled_wait(void)
 #define ARCH_LOW_ADDRESS_LIMIT 0x7fffffffUL
 
 extern int memcpy_real(void *, unsigned long, size_t);
-extern void memcpy_absolute(void *, void *, size_t);
-
-#define put_abs_lowcore(member, x) do {                                        \
-       unsigned long __abs_address = offsetof(struct lowcore, member); \
-       __typeof__(((struct lowcore *)0)->member) __tmp = (x);          \
-                                                                       \
-       memcpy_absolute(__va(__abs_address), &__tmp, sizeof(__tmp));    \
-} while (0)
-
-#define get_abs_lowcore(x, member) do {                                        \
-       unsigned long __abs_address = offsetof(struct lowcore, member); \
-       __typeof__(((struct lowcore *)0)->member) *__ptr = &(x);        \
-                                                                       \
-       memcpy_absolute(__ptr, __va(__abs_address), sizeof(*__ptr));    \
-} while (0)
 
 extern int s390_isolate_bp(void);
 extern int s390_isolate_bp_guest(void);
index 3cbfa9fddd9a924ffcfe875e3ae14ec0a31484eb..45e4b2f41e05af5362bfd59ec85bad79990b423f 100644 (file)
@@ -40,7 +40,7 @@ obj-y += sysinfo.o lgr.o os_info.o machine_kexec.o
 obj-y  += runtime_instr.o cache.o fpu.o dumpstack.o guarded_storage.o sthyi.o
 obj-y  += entry.o reipl.o relocate_kernel.o kdebugfs.o alternative.o
 obj-y  += nospec-branch.o ipl_vmparm.o machine_kexec_reloc.o unwind_bc.o
-obj-y  += smp.o text_amode31.o stacktrace.o
+obj-y  += smp.o text_amode31.o stacktrace.o abs_lowcore.o
 
 extra-y                                += head64.o vmlinux.lds
 
diff --git a/arch/s390/kernel/abs_lowcore.c b/arch/s390/kernel/abs_lowcore.c
new file mode 100644 (file)
index 0000000..dc9f0ec
--- /dev/null
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/pgtable.h>
+#include <asm/pgtable.h>
+#include <asm/abs_lowcore.h>
+
+#define ABS_LOWCORE_UNMAPPED   1
+#define ABS_LOWCORE_LAP_ON     2
+#define ABS_LOWCORE_IRQS_ON    4
+
+unsigned long __bootdata_preserved(__abs_lowcore);
+bool __ro_after_init abs_lowcore_mapped;
+
+int abs_lowcore_map(int cpu, struct lowcore *lc)
+{
+       unsigned long addr = __abs_lowcore + (cpu * sizeof(struct lowcore));
+       unsigned long phys = __pa(lc);
+       int rc, i;
+
+       for (i = 0; i < LC_PAGES; i++) {
+               rc = vmem_map_4k_page(addr, phys, PAGE_KERNEL);
+               if (rc) {
+                       for (--i; i >= 0; i--) {
+                               addr -= PAGE_SIZE;
+                               vmem_unmap_4k_page(addr);
+                       }
+                       return rc;
+               }
+               addr += PAGE_SIZE;
+               phys += PAGE_SIZE;
+       }
+       return 0;
+}
+
+void abs_lowcore_unmap(int cpu)
+{
+       unsigned long addr = __abs_lowcore + (cpu * sizeof(struct lowcore));
+       int i;
+
+       for (i = 0; i < LC_PAGES; i++) {
+               vmem_unmap_4k_page(addr);
+               addr += PAGE_SIZE;
+       }
+}
+
+struct lowcore *get_abs_lowcore(unsigned long *flags)
+{
+       unsigned long irq_flags;
+       union ctlreg0 cr0;
+       int cpu;
+
+       *flags = 0;
+       cpu = get_cpu();
+       if (abs_lowcore_mapped) {
+               return ((struct lowcore *)__abs_lowcore) + cpu;
+       } else {
+               if (cpu != 0)
+                       panic("Invalid unmapped absolute lowcore access\n");
+               local_irq_save(irq_flags);
+               if (!irqs_disabled_flags(irq_flags))
+                       *flags |= ABS_LOWCORE_IRQS_ON;
+               __ctl_store(cr0.val, 0, 0);
+               if (cr0.lap) {
+                       *flags |= ABS_LOWCORE_LAP_ON;
+                       __ctl_clear_bit(0, 28);
+               }
+               *flags |= ABS_LOWCORE_UNMAPPED;
+               return lowcore_ptr[0];
+       }
+}
+
+void put_abs_lowcore(struct lowcore *lc, unsigned long flags)
+{
+       if (abs_lowcore_mapped) {
+               if (flags)
+                       panic("Invalid mapped absolute lowcore release\n");
+       } else {
+               if (smp_processor_id() != 0)
+                       panic("Invalid mapped absolute lowcore access\n");
+               if (!(flags & ABS_LOWCORE_UNMAPPED))
+                       panic("Invalid unmapped absolute lowcore release\n");
+               if (flags & ABS_LOWCORE_LAP_ON)
+                       __ctl_set_bit(0, 28);
+               if (flags & ABS_LOWCORE_IRQS_ON)
+                       local_irq_enable();
+       }
+       put_cpu();
+}
index 1cc85b8ff42e5d30a055f5fb1d286941f4b78f08..325cbf69ebbde014bdb5744dbac50ba1ac8192b9 100644 (file)
@@ -29,6 +29,7 @@
 #include <asm/sclp.h>
 #include <asm/checksum.h>
 #include <asm/debug.h>
+#include <asm/abs_lowcore.h>
 #include <asm/os_info.h>
 #include <asm/sections.h>
 #include <asm/boot_data.h>
@@ -1642,12 +1643,16 @@ static struct shutdown_action __refdata dump_action = {
 static void dump_reipl_run(struct shutdown_trigger *trigger)
 {
        unsigned long ipib = (unsigned long) reipl_block_actual;
+       struct lowcore *abs_lc;
+       unsigned long flags;
        unsigned int csum;
 
        csum = (__force unsigned int)
               csum_partial(reipl_block_actual, reipl_block_actual->hdr.len, 0);
-       put_abs_lowcore(ipib, ipib);
-       put_abs_lowcore(ipib_checksum, csum);
+       abs_lc = get_abs_lowcore(&flags);
+       abs_lc->ipib = ipib;
+       abs_lc->ipib_checksum = csum;
+       put_abs_lowcore(abs_lc, flags);
        dump_run(trigger);
 }
 
index ab761c008f9813e8a4526b05e1947c244c1e3c6b..4579b42286d5fbd713f12dabbbfd654c7d94655e 100644 (file)
@@ -21,6 +21,7 @@
 #include <asm/elf.h>
 #include <asm/asm-offsets.h>
 #include <asm/cacheflush.h>
+#include <asm/abs_lowcore.h>
 #include <asm/os_info.h>
 #include <asm/set_memory.h>
 #include <asm/stacktrace.h>
@@ -222,13 +223,18 @@ void machine_kexec_cleanup(struct kimage *image)
 
 void arch_crash_save_vmcoreinfo(void)
 {
+       struct lowcore *abs_lc;
+       unsigned long flags;
+
        VMCOREINFO_SYMBOL(lowcore_ptr);
        VMCOREINFO_SYMBOL(high_memory);
        VMCOREINFO_LENGTH(lowcore_ptr, NR_CPUS);
        vmcoreinfo_append_str("SAMODE31=%lx\n", __samode31);
        vmcoreinfo_append_str("EAMODE31=%lx\n", __eamode31);
        vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset());
-       put_abs_lowcore(vmcore_info, paddr_vmcoreinfo_note());
+       abs_lc = get_abs_lowcore(&flags);
+       abs_lc->vmcore_info = paddr_vmcoreinfo_note();
+       put_abs_lowcore(abs_lc, flags);
 }
 
 void machine_shutdown(void)
index 1acc2e05d70f07bffebe98ea7605d0580596bf76..506ccb74d2d0bcd5b18279c364c0d14af94d080f 100644 (file)
@@ -13,7 +13,7 @@
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <asm/checksum.h>
-#include <asm/lowcore.h>
+#include <asm/abs_lowcore.h>
 #include <asm/os_info.h>
 #include <asm/asm-offsets.h>
 
@@ -57,13 +57,16 @@ void os_info_entry_add(int nr, void *ptr, u64 size)
  */
 void __init os_info_init(void)
 {
-       void *ptr = &os_info;
+       struct lowcore *abs_lc;
+       unsigned long flags;
 
        os_info.version_major = OS_INFO_VERSION_MAJOR;
        os_info.version_minor = OS_INFO_VERSION_MINOR;
        os_info.magic = OS_INFO_MAGIC;
        os_info.csum = os_info_csum(&os_info);
-       put_abs_lowcore(os_info, __pa(ptr));
+       abs_lc = get_abs_lowcore(&flags);
+       abs_lc->os_info = __pa(&os_info);
+       put_abs_lowcore(abs_lc, flags);
 }
 
 #ifdef CONFIG_CRASH_DUMP
index 8f483132901ede0e77190366635bf70d602a11b6..91139a16a44fb2da36ffa632af6f543ea07883af 100644 (file)
@@ -58,7 +58,7 @@
 #include <asm/smp.h>
 #include <asm/mmu_context.h>
 #include <asm/cpcmd.h>
-#include <asm/lowcore.h>
+#include <asm/abs_lowcore.h>
 #include <asm/nmi.h>
 #include <asm/irq.h>
 #include <asm/page.h>
@@ -411,8 +411,9 @@ void __init arch_call_rest_init(void)
 static void __init setup_lowcore_dat_off(void)
 {
        unsigned long int_psw_mask = PSW_KERNEL_BITS;
+       struct lowcore *abs_lc, *lc;
        unsigned long mcck_stack;
-       struct lowcore *lc;
+       unsigned long flags;
 
        if (IS_ENABLED(CONFIG_KASAN))
                int_psw_mask |= PSW_MASK_DAT;
@@ -474,11 +475,13 @@ static void __init setup_lowcore_dat_off(void)
        lc->restart_data = 0;
        lc->restart_source = -1U;
 
-       put_abs_lowcore(restart_stack, lc->restart_stack);
-       put_abs_lowcore(restart_fn, lc->restart_fn);
-       put_abs_lowcore(restart_data, lc->restart_data);
-       put_abs_lowcore(restart_source, lc->restart_source);
-       put_abs_lowcore(restart_psw, lc->restart_psw);
+       abs_lc = get_abs_lowcore(&flags);
+       abs_lc->restart_stack = lc->restart_stack;
+       abs_lc->restart_fn = lc->restart_fn;
+       abs_lc->restart_data = lc->restart_data;
+       abs_lc->restart_source = lc->restart_source;
+       abs_lc->restart_psw = lc->restart_psw;
+       put_abs_lowcore(abs_lc, flags);
 
        mcck_stack = (unsigned long)memblock_alloc(THREAD_SIZE, THREAD_SIZE);
        if (!mcck_stack)
@@ -499,8 +502,8 @@ static void __init setup_lowcore_dat_off(void)
 
 static void __init setup_lowcore_dat_on(void)
 {
-       struct lowcore *lc = lowcore_ptr[0];
-       int cr;
+       struct lowcore *abs_lc;
+       unsigned long flags;
 
        __ctl_clear_bit(0, 28);
        S390_lowcore.external_new_psw.mask |= PSW_MASK_DAT;
@@ -509,10 +512,15 @@ static void __init setup_lowcore_dat_on(void)
        S390_lowcore.io_new_psw.mask |= PSW_MASK_DAT;
        __ctl_set_bit(0, 28);
        __ctl_store(S390_lowcore.cregs_save_area, 0, 15);
-       put_abs_lowcore(restart_flags, RESTART_FLAG_CTLREGS);
-       put_abs_lowcore(program_new_psw, lc->program_new_psw);
-       for (cr = 0; cr < ARRAY_SIZE(lc->cregs_save_area); cr++)
-               put_abs_lowcore(cregs_save_area[cr], lc->cregs_save_area[cr]);
+       abs_lc = get_abs_lowcore(&flags);
+       abs_lc->restart_flags = RESTART_FLAG_CTLREGS;
+       abs_lc->program_new_psw = S390_lowcore.program_new_psw;
+       memcpy(abs_lc->cregs_save_area, S390_lowcore.cregs_save_area,
+              sizeof(abs_lc->cregs_save_area));
+       put_abs_lowcore(abs_lc, flags);
+       if (abs_lowcore_map(0, lowcore_ptr[0]))
+               panic("Couldn't setup absolute lowcore");
+       abs_lowcore_mapped = true;
 }
 
 static struct resource code_resource = {
index 30c91d56593374fcb8b43a1be5a5fe6eefa00a3c..40876d809ea69a00f85141ceb3a34970e1c4aea2 100644 (file)
@@ -45,7 +45,7 @@
 #include <asm/irq.h>
 #include <asm/tlbflush.h>
 #include <asm/vtimer.h>
-#include <asm/lowcore.h>
+#include <asm/abs_lowcore.h>
 #include <asm/sclp.h>
 #include <asm/debug.h>
 #include <asm/os_info.h>
@@ -212,10 +212,14 @@ static int pcpu_alloc_lowcore(struct pcpu *pcpu, int cpu)
        lc->preempt_count = PREEMPT_DISABLED;
        if (nmi_alloc_mcesa(&lc->mcesad))
                goto out;
+       if (abs_lowcore_map(cpu, lc))
+               goto out_mcesa;
        lowcore_ptr[cpu] = lc;
        pcpu_sigp_retry(pcpu, SIGP_SET_PREFIX, __pa(lc));
        return 0;
 
+out_mcesa:
+       nmi_free_mcesa(&lc->mcesad);
 out:
        stack_free(mcck_stack);
        stack_free(async_stack);
@@ -237,6 +241,7 @@ static void pcpu_free_lowcore(struct pcpu *pcpu)
        mcck_stack = lc->mcck_stack - STACK_INIT_OFFSET;
        pcpu_sigp_retry(pcpu, SIGP_SET_PREFIX, 0);
        lowcore_ptr[cpu] = NULL;
+       abs_lowcore_unmap(cpu);
        nmi_free_mcesa(&lc->mcesad);
        stack_free(async_stack);
        stack_free(mcck_stack);
@@ -315,9 +320,12 @@ static void pcpu_delegate(struct pcpu *pcpu,
                          pcpu_delegate_fn *func,
                          void *data, unsigned long stack)
 {
-       struct lowcore *lc = lowcore_ptr[pcpu - pcpu_devices];
-       unsigned int source_cpu = stap();
+       struct lowcore *lc, *abs_lc;
+       unsigned int source_cpu;
+       unsigned long flags;
 
+       lc = lowcore_ptr[pcpu - pcpu_devices];
+       source_cpu = stap();
        __load_psw_mask(PSW_KERNEL_BITS | PSW_MASK_DAT);
        if (pcpu->address == source_cpu) {
                call_on_stack(2, stack, void, __pcpu_delegate,
@@ -332,10 +340,12 @@ static void pcpu_delegate(struct pcpu *pcpu,
                lc->restart_data = (unsigned long)data;
                lc->restart_source = source_cpu;
        } else {
-               put_abs_lowcore(restart_stack, stack);
-               put_abs_lowcore(restart_fn, (unsigned long)func);
-               put_abs_lowcore(restart_data, (unsigned long)data);
-               put_abs_lowcore(restart_source, source_cpu);
+               abs_lc = get_abs_lowcore(&flags);
+               abs_lc->restart_stack = stack;
+               abs_lc->restart_fn = (unsigned long)func;
+               abs_lc->restart_data = (unsigned long)data;
+               abs_lc->restart_source = source_cpu;
+               put_abs_lowcore(abs_lc, flags);
        }
        __bpon();
        asm volatile(
@@ -581,6 +591,8 @@ static DEFINE_SPINLOCK(ctl_lock);
 void smp_ctl_set_clear_bit(int cr, int bit, bool set)
 {
        struct ec_creg_mask_parms parms = { .cr = cr, };
+       struct lowcore *abs_lc;
+       unsigned long flags;
        u64 ctlreg;
 
        if (set) {
@@ -591,9 +603,11 @@ void smp_ctl_set_clear_bit(int cr, int bit, bool set)
                parms.andval = ~(1UL << bit);
        }
        spin_lock(&ctl_lock);
-       get_abs_lowcore(ctlreg, cregs_save_area[cr]);
+       abs_lc = get_abs_lowcore(&flags);
+       ctlreg = abs_lc->cregs_save_area[cr];
        ctlreg = (ctlreg & parms.andval) | parms.orval;
-       put_abs_lowcore(cregs_save_area[cr], ctlreg);
+       abs_lc->cregs_save_area[cr] = ctlreg;
+       put_abs_lowcore(abs_lc, flags);
        spin_unlock(&ctl_lock);
        on_each_cpu(smp_ctl_bit_callback, &parms, 1);
 }
@@ -1281,6 +1295,8 @@ static int __init smp_reinit_ipl_cpu(void)
        __ctl_clear_bit(0, 28); /* disable lowcore protection */
        S390_lowcore.mcesad = mcesad;
        __ctl_load(cr0, 0, 0);
+       if (abs_lowcore_map(0, lc))
+               panic("Couldn't remap absolute lowcore");
        lowcore_ptr[0] = lc;
        local_mcck_enable();
        local_irq_restore(flags);
index 6a0ac00d5a42b45a26c0c29946a2758a02401c15..7b6873ac99d1e5cfc2bcfc3b810cf6fb8065d7ab 100644 (file)
@@ -38,7 +38,7 @@
 #include <asm/kfence.h>
 #include <asm/ptdump.h>
 #include <asm/dma.h>
-#include <asm/lowcore.h>
+#include <asm/abs_lowcore.h>
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
 #include <asm/sections.h>
index d6d84e02f35ab2cb6ac20c35ccea6c6be4faabb1..b8451ddbb3d6b5998281cac9e83312d86b697667 100644 (file)
@@ -15,6 +15,7 @@
 #include <asm/asm-extable.h>
 #include <asm/ctl_reg.h>
 #include <asm/io.h>
+#include <asm/abs_lowcore.h>
 #include <asm/stacktrace.h>
 
 static notrace long s390_kernel_write_odd(void *dst, const void *src, size_t size)
@@ -148,46 +149,20 @@ int memcpy_real(void *dest, unsigned long src, size_t count)
 }
 
 /*
- * Copy memory in absolute mode (kernel to kernel)
+ * Find CPU that owns swapped prefix page
  */
-void memcpy_absolute(void *dest, void *src, size_t count)
-{
-       unsigned long cr0, flags, prefix;
-
-       flags = arch_local_irq_save();
-       __ctl_store(cr0, 0, 0);
-       __ctl_clear_bit(0, 28); /* disable lowcore protection */
-       prefix = store_prefix();
-       if (prefix) {
-               local_mcck_disable();
-               set_prefix(0);
-               memcpy(dest, src, count);
-               set_prefix(prefix);
-               local_mcck_enable();
-       } else {
-               memcpy(dest, src, count);
-       }
-       __ctl_load(cr0, 0, 0);
-       arch_local_irq_restore(flags);
-}
-
-/*
- * Check if physical address is within prefix or zero page
- */
-static int is_swapped(phys_addr_t addr)
+static int get_swapped_owner(phys_addr_t addr)
 {
        phys_addr_t lc;
        int cpu;
 
-       if (addr < sizeof(struct lowcore))
-               return 1;
        for_each_online_cpu(cpu) {
                lc = virt_to_phys(lowcore_ptr[cpu]);
                if (addr > lc + sizeof(struct lowcore) - 1 || addr < lc)
                        continue;
-               return 1;
+               return cpu;
        }
-       return 0;
+       return -1;
 }
 
 /*
@@ -200,17 +175,35 @@ void *xlate_dev_mem_ptr(phys_addr_t addr)
 {
        void *ptr = phys_to_virt(addr);
        void *bounce = ptr;
+       struct lowcore *abs_lc;
+       unsigned long flags;
        unsigned long size;
+       int this_cpu, cpu;
 
        cpus_read_lock();
-       preempt_disable();
-       if (is_swapped(addr)) {
-               size = PAGE_SIZE - (addr & ~PAGE_MASK);
-               bounce = (void *) __get_free_page(GFP_ATOMIC);
-               if (bounce)
-                       memcpy_absolute(bounce, ptr, size);
+       this_cpu = get_cpu();
+       if (addr >= sizeof(struct lowcore)) {
+               cpu = get_swapped_owner(addr);
+               if (cpu < 0)
+                       goto out;
+       }
+       bounce = (void *)__get_free_page(GFP_ATOMIC);
+       if (!bounce)
+               goto out;
+       size = PAGE_SIZE - (addr & ~PAGE_MASK);
+       if (addr < sizeof(struct lowcore)) {
+               abs_lc = get_abs_lowcore(&flags);
+               ptr = (void *)abs_lc + addr;
+               memcpy(bounce, ptr, size);
+               put_abs_lowcore(abs_lc, flags);
+       } else if (cpu == this_cpu) {
+               ptr = (void *)(addr - virt_to_phys(lowcore_ptr[cpu]));
+               memcpy(bounce, ptr, size);
+       } else {
+               memcpy(bounce, ptr, size);
        }
-       preempt_enable();
+out:
+       put_cpu();
        cpus_read_unlock();
        return bounce;
 }
index c2583f921ca8d100251079ab701a4ce11ea1ac95..203ba2bfea59c3c5beacfba890c0b3c02f0d1342 100644 (file)
@@ -560,6 +560,91 @@ int vmem_add_mapping(unsigned long start, unsigned long size)
        return ret;
 }
 
+/*
+ * Allocate a new or return the existing page-table entry, but do not map
+ * it to any physical address. If missing, allocate segment- and region-
+ * table entries along the way. Meeting a large segment- or region-table
+ * entry while traversing is an error, since the function is expected to
+ * be called against virtual regions reserved for 4KB mappings only.
+ */
+static pte_t *vmem_get_alloc_pte(unsigned long addr)
+{
+       pte_t *ptep = NULL;
+       pgd_t *pgd;
+       p4d_t *p4d;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+
+       pgd = pgd_offset_k(addr);
+       if (pgd_none(*pgd)) {
+               p4d = vmem_crst_alloc(_REGION2_ENTRY_EMPTY);
+               if (!p4d)
+                       goto out;
+               pgd_populate(&init_mm, pgd, p4d);
+       }
+       p4d = p4d_offset(pgd, addr);
+       if (p4d_none(*p4d)) {
+               pud = vmem_crst_alloc(_REGION3_ENTRY_EMPTY);
+               if (!pud)
+                       goto out;
+               p4d_populate(&init_mm, p4d, pud);
+       }
+       pud = pud_offset(p4d, addr);
+       if (pud_none(*pud)) {
+               pmd = vmem_crst_alloc(_SEGMENT_ENTRY_EMPTY);
+               if (!pmd)
+                       goto out;
+               pud_populate(&init_mm, pud, pmd);
+       } else if (WARN_ON_ONCE(pud_large(*pud))) {
+               goto out;
+       }
+       pmd = pmd_offset(pud, addr);
+       if (pmd_none(*pmd)) {
+               pte = vmem_pte_alloc();
+               if (!pte)
+                       goto out;
+               pmd_populate(&init_mm, pmd, pte);
+       } else if (WARN_ON_ONCE(pmd_large(*pmd))) {
+               goto out;
+       }
+       ptep = pte_offset_kernel(pmd, addr);
+out:
+       return ptep;
+}
+
+int vmem_map_4k_page(unsigned long addr, unsigned long phys, pgprot_t prot)
+{
+       pte_t *ptep, pte;
+       int rc = 0;
+
+       if (!IS_ALIGNED(addr, PAGE_SIZE))
+               return -EINVAL;
+       mutex_lock(&vmem_mutex);
+       ptep = vmem_get_alloc_pte(addr);
+       if (!ptep) {
+               rc = -ENOMEM;
+               goto out;
+       }
+       __ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL);
+       pte = mk_pte_phys(phys, prot);
+       set_pte(ptep, pte);
+out:
+       mutex_unlock(&vmem_mutex);
+       return rc;
+}
+
+void vmem_unmap_4k_page(unsigned long addr)
+{
+       pte_t *ptep;
+
+       mutex_lock(&vmem_mutex);
+       ptep = virt_to_kpte(addr);
+       __ptep_ipte(addr, ptep, 0, 0, IPTE_GLOBAL);
+       pte_clear(&init_mm, addr, ptep);
+       mutex_unlock(&vmem_mutex);
+}
+
 /*
  * map whole physical memory to virtual memory (identity mapping)
  * we reserve enough space in the vmalloc area for vmemmap to hotplug