mm: teach core mm about pte markers
author Peter Xu <peterx@redhat.com>
Fri, 13 May 2022 03:22:52 +0000 (20:22 -0700)
committer Andrew Morton <akpm@linux-foundation.org>
Fri, 13 May 2022 14:20:09 +0000 (07:20 -0700)
This patch still does not use pte markers in any way; however, it teaches
the core mm about the pte marker idea.

For example, handle_pte_marker() is introduced to parse and handle all the
pte marker faults.

In many of the places the change is only a comment, so that we know a pte
marker may show up there and why no special code is needed for that case.

[peterx@redhat.com: userfaultfd.c needs swapops.h]
Link: https://lkml.kernel.org/r/YmRlVj3cdizYJsr0@xz-m1.local
Link: https://lkml.kernel.org/r/20220405014833.14015-1-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: "Kirill A . Shutemov" <kirill@shutemov.name>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Nadav Amit <nadav.amit@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
fs/userfaultfd.c
mm/filemap.c
mm/hmm.c
mm/memcontrol.c
mm/memory.c
mm/mincore.c
mm/mprotect.c

index aa0c47cb0d165ac37c77d4942564213b47623dc2..78b68e0f9774c53c1e27e0c2b83f372fb1c87a2d 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -29,6 +29,7 @@
 #include <linux/ioctl.h>
 #include <linux/security.h>
 #include <linux/hugetlb.h>
+#include <linux/swapops.h>
 
 int sysctl_unprivileged_userfaultfd __read_mostly;
 
@@ -249,9 +250,10 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
 
        /*
         * Lockless access: we're in a wait_event so it's ok if it
-        * changes under us.
+        * changes under us.  PTE markers should be handled the same as none
+        * ptes here.
         */
-       if (huge_pte_none(pte))
+       if (huge_pte_none_mostly(pte))
                ret = true;
        if (!huge_pte_write(pte) && (reason & VM_UFFD_WP))
                ret = true;
@@ -330,9 +332,10 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
        pte = pte_offset_map(pmd, address);
        /*
         * Lockless access: we're in a wait_event so it's ok if it
-        * changes under us.
+        * changes under us.  PTE markers should be handled the same as none
+        * ptes here.
         */
-       if (pte_none(*pte))
+       if (pte_none_mostly(*pte))
                ret = true;
        if (!pte_write(*pte) && (reason & VM_UFFD_WP))
                ret = true;
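
Note: pte_none_mostly() and huge_pte_none_mostly() are not defined in this
diff; they come from the companion patches of the series that introduce the
PTE_MARKER swap entry. The sketch below (placement in <linux/swapops.h> and
<asm-generic/hugetlb.h> assumed from those patches, not part of this commit)
shows the idea: a pte that is either truly none or carries only a marker is
treated as "none mostly".

/*
 * Reference sketch only: these helpers are introduced elsewhere in the
 * pte marker series, not by this commit.  A pte marker is a special swap
 * entry stored in a non-present pte, so "none mostly" means "none, or
 * nothing but a marker".
 */
static inline bool pte_none_mostly(pte_t pte)
{
	/* Either a true none pte, or a pte carrying only a marker. */
	return pte_none(pte) || is_pte_marker(pte);
}

static inline bool huge_pte_none_mostly(pte_t pte)
{
	/* Same idea for hugetlb ptes. */
	return huge_pte_none(pte) || is_pte_marker(pte);
}
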
index 9a1eef6c5d350e6fadb77be23a48c34818d6310e..de34c6c10384148e988b07223757e7891a25c45a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3376,6 +3376,11 @@ again:
                vmf->pte += xas.xa_index - last_pgoff;
                last_pgoff = xas.xa_index;
 
+               /*
+                * NOTE: If there're PTE markers, we'll leave them to be
+                * handled in the specific fault path, and it'll prohibit the
+                * fault-around logic.
+                */
                if (!pte_none(*vmf->pte))
                        goto unlock;
 
index af71aac3140e4b3f7f0af4645a5c3ae47d81e8b2..3fd3242c5e50fef17f4832ad196458207f579954 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -239,7 +239,7 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
        pte_t pte = *ptep;
        uint64_t pfn_req_flags = *hmm_pfn;
 
-       if (pte_none(pte)) {
+       if (pte_none_mostly(pte)) {
                required_fault =
                        hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0);
                if (required_fault)
index 235c85e17c2d1038da5896730770a0b44bc81f1e..6e74ca98c86241b636437025e709c776addd4db6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5640,10 +5640,14 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
 
        if (pte_present(ptent))
                page = mc_handle_present_pte(vma, addr, ptent);
+       else if (pte_none_mostly(ptent))
+               /*
+                * PTE markers should be treated as a none pte here, separated
+                * from other swap handling below.
+                */
+               page = mc_handle_file_pte(vma, addr, ptent);
        else if (is_swap_pte(ptent))
                page = mc_handle_swap_pte(vma, ptent, &ent);
-       else if (pte_none(ptent))
-               page = mc_handle_file_pte(vma, addr, ptent);
 
        if (!page && !ent.val)
                return ret;
index 90212057a546e15a6a8b968cee767b3a642b8ca5..9743c8b74bf2e777572ff78cb9ff85609c08c2ad 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -100,6 +100,8 @@ struct page *mem_map;
 EXPORT_SYMBOL(mem_map);
 #endif
 
+static vm_fault_t do_fault(struct vm_fault *vmf);
+
 /*
  * A number of key systems in x86 including ioremap() rely on the assumption
  * that high_memory defines the upper bound on direct map memory, then end
@@ -1415,6 +1417,8 @@ again:
                        if (!should_zap_page(details, page))
                                continue;
                        rss[mm_counter(page)]--;
+               } else if (is_pte_marker_entry(entry)) {
+                       /* By default, simply drop all pte markers when zap */
                } else if (is_hwpoison_entry(entry)) {
                        if (!should_zap_cows(details))
                                continue;
@@ -3555,6 +3559,23 @@ static inline bool should_try_to_free_swap(struct page *page,
                page_count(page) == 2;
 }
 
+static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
+{
+       swp_entry_t entry = pte_to_swp_entry(vmf->orig_pte);
+       unsigned long marker = pte_marker_get(entry);
+
+       /*
+        * PTE markers should always be with file-backed memories, and the
+        * marker should never be empty.  If anything weird happened, the best
+        * thing to do is to kill the process along with its mm.
+        */
+       if (WARN_ON_ONCE(vma_is_anonymous(vmf->vma) || !marker))
+               return VM_FAULT_SIGBUS;
+
+       /* TODO: handle pte markers */
+       return 0;
+}
+
 /*
  * We enter with non-exclusive mmap_lock (to exclude vma changes,
  * but allow concurrent faults), and pte mapped but not yet locked.
@@ -3592,6 +3613,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
                        ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
                } else if (is_hwpoison_entry(entry)) {
                        ret = VM_FAULT_HWPOISON;
+               } else if (is_pte_marker_entry(entry)) {
+                       ret = handle_pte_marker(vmf);
                } else {
                        print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL);
                        ret = VM_FAULT_SIGBUS;
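
For reference, the marker helpers used in the zap and swap-fault paths above
(is_pte_marker_entry(), pte_marker_get()) also come from the companion
PTE_MARKER patch rather than from this commit. A rough sketch follows; the
exact marker bits behind PTE_MARKER_MASK are left to the rest of the series.

/*
 * Reference sketch of the marker helpers used above.  A marker is encoded
 * in the swap offset of a swap entry of the dedicated SWP_PTE_MARKER type
 * (both added elsewhere in the series).
 */
typedef unsigned long pte_marker;

static inline swp_entry_t make_pte_marker_entry(pte_marker marker)
{
	return swp_entry(SWP_PTE_MARKER, marker);
}

static inline bool is_pte_marker_entry(swp_entry_t entry)
{
	return swp_type(entry) == SWP_PTE_MARKER;
}

static inline pte_marker pte_marker_get(swp_entry_t entry)
{
	/* The low bits of the swap offset hold the marker flags. */
	return swp_offset(entry) & PTE_MARKER_MASK;
}

static inline bool is_pte_marker(pte_t pte)
{
	return is_swap_pte(pte) && is_pte_marker_entry(pte_to_swp_entry(pte));
}

handle_pte_marker() above deliberately returns 0 for now; actual handling of
specific markers (e.g. userfaultfd write protection) is left to later patches
in the series.
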
index f4f627325e12d0cee8fecf74ca6e5adc13caf76b..fa200c14185fc8a5c2733f8f2b7cc771d858f011 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -122,7 +122,8 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
        for (; addr != end; ptep++, addr += PAGE_SIZE) {
                pte_t pte = *ptep;
 
-               if (pte_none(pte))
+               /* We need to do cache lookup too for pte markers */
+               if (pte_none_mostly(pte))
                        __mincore_unmapped_range(addr, addr + PAGE_SIZE,
                                                 vma, vec);
                else if (pte_present(pte))
index 20a46f21cca8403fefcc666d591dc3c758a8b7fe..e84694267b0fac85daa5b4cc405fdc299286dcc4 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -193,6 +193,9 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
                                        newpte = pte_swp_mksoft_dirty(newpte);
                                if (pte_swp_uffd_wp(oldpte))
                                        newpte = pte_swp_mkuffd_wp(newpte);
+                       } else if (is_pte_marker_entry(entry)) {
+                               /* Skip it, the same as none pte */
+                               continue;
                        } else {
                                newpte = oldpte;
                        }