mm, hugetlb: fix racy resv_huge_pages underflow on UFFDIO_COPY
author    Mina Almasry <almasrymina@google.com>
          Thu, 1 Jul 2021 01:48:19 +0000 (18:48 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
          Thu, 1 Jul 2021 03:47:26 +0000 (20:47 -0700)
On UFFDIO_COPY, if we fail to copy the page contents while holding the
hugetlb_fault_mutex, we drop the mutex and return to the caller after
allocating a page that consumed a reservation.  In this case a
subsequent fault may double-consume the reservation.  To handle this,
we free the allocated page, fix the reservations, allocate a temporary
hugetlb page, and return that to the caller.  When the caller does the
copy outside of the lock, we check the page cache again, allocate a
page that consumes the reservation, and copy the contents over.

Test:
Hacked the code locally so that resv_huge_pages underflows produce a
warning and copy_huge_page_from_user() always fails, then ran:

./tools/testing/selftests/vm/userfaultfd hugetlb_shared 10 2 \
        /tmp/kokonut_test/huge/userfaultfd_test && echo test success
./tools/testing/selftests/vm/userfaultfd hugetlb 10 2 \
        /tmp/kokonut_test/huge/userfaultfd_test && echo test success

Both tests succeed and produce no warnings.  After the tests run, the
number of free and reserved hugepages (HugePages_Free and HugePages_Rsvd
in /proc/meminfo) is correct.

[yuehaibing@huawei.com: remove set but not used variable 'vm_alloc_shared']
Link: https://lkml.kernel.org/r/20210601141610.28332-1-yuehaibing@huawei.com
[almasrymina@google.com: fix allocation error check and copy func name]
Link: https://lkml.kernel.org/r/20210605010626.1459873-1-almasrymina@google.com
Link: https://lkml.kernel.org/r/20210528005029.88088-1-almasrymina@google.com
Signed-off-by: Mina Almasry <almasrymina@google.com>
Signed-off-by: YueHaibing <yuehaibing@huawei.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
include/linux/migrate.h
mm/hugetlb.c
mm/migrate.c
mm/userfaultfd.c

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 4bb4e519e3f56db06fc0237754bf82b923a4fd87..7b7b7397727851d9ee573ba2bea79519eff8d1b0 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -51,6 +51,7 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping,
                                  struct page *newpage, struct page *page);
 extern int migrate_page_move_mapping(struct address_space *mapping,
                struct page *newpage, struct page *page, int extra_count);
+extern void copy_huge_page(struct page *dst, struct page *src);
 #else
 
 static inline void putback_movable_pages(struct list_head *l) {}
@@ -77,6 +78,9 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
        return -ENOSYS;
 }
 
+static inline void copy_huge_page(struct page *dst, struct page *src)
+{
+}
 #endif /* CONFIG_MIGRATION */
 
 #ifdef CONFIG_COMPACTION
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 88f2178ad7c992921aea02dd93471761d111b8ce..b14f4d1749b215fa55ec4d4bcf5b5c43acb6ed17 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -30,6 +30,7 @@
 #include <linux/numa.h>
 #include <linux/llist.h>
 #include <linux/cma.h>
+#include <linux/migrate.h>
 
 #include <asm/page.h>
 #include <asm/pgalloc.h>
@@ -5076,20 +5077,17 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
                            struct page **pagep)
 {
        bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
-       struct address_space *mapping;
-       pgoff_t idx;
+       struct hstate *h = hstate_vma(dst_vma);
+       struct address_space *mapping = dst_vma->vm_file->f_mapping;
+       pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr);
        unsigned long size;
        int vm_shared = dst_vma->vm_flags & VM_SHARED;
-       struct hstate *h = hstate_vma(dst_vma);
        pte_t _dst_pte;
        spinlock_t *ptl;
-       int ret;
+       int ret = -ENOMEM;
        struct page *page;
        int writable;
 
-       mapping = dst_vma->vm_file->f_mapping;
-       idx = vma_hugecache_offset(h, dst_vma, dst_addr);
-
        if (is_continue) {
                ret = -EFAULT;
                page = find_lock_page(mapping, idx);
@@ -5118,12 +5116,44 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
                /* fallback to copy_from_user outside mmap_lock */
                if (unlikely(ret)) {
                        ret = -ENOENT;
+                       /* Free the allocated page which may have
+                        * consumed a reservation.
+                        */
+                       restore_reserve_on_error(h, dst_vma, dst_addr, page);
+                       put_page(page);
+
+                       /* Allocate a temporary page to hold the copied
+                        * contents.
+                        */
+                       page = alloc_huge_page_vma(h, dst_vma, dst_addr);
+                       if (!page) {
+                               ret = -ENOMEM;
+                               goto out;
+                       }
                        *pagep = page;
-                       /* don't free the page */
+                       /* Set the outparam pagep and return to the caller to
+                        * copy the contents outside the lock. Don't free the
+                        * page.
+                        */
                        goto out;
                }
        } else {
-               page = *pagep;
+               if (vm_shared &&
+                   hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
+                       put_page(*pagep);
+                       ret = -EEXIST;
+                       *pagep = NULL;
+                       goto out;
+               }
+
+               page = alloc_huge_page(dst_vma, dst_addr, 0);
+               if (IS_ERR(page)) {
+                       ret = -ENOMEM;
+                       *pagep = NULL;
+                       goto out;
+               }
+               copy_huge_page(page, *pagep);
+               put_page(*pagep);
                *pagep = NULL;
        }
 
diff --git a/mm/migrate.c b/mm/migrate.c
index 75a15f0a2698767afe9dbffdef6ab9e78af3830b..8fc766e52e527e127d17f0e1c1d6a8f54d0b15ee 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -553,7 +553,7 @@ static void __copy_gigantic_page(struct page *dst, struct page *src,
        }
 }
 
-static void copy_huge_page(struct page *dst, struct page *src)
+void copy_huge_page(struct page *dst, struct page *src)
 {
        int i;
        int nr_pages;
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 63a73e164d5510fba964eec777b7486b634a1343..da5535d2d99033965df01c1c81d046be3029177d 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -209,7 +209,6 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
                                              unsigned long len,
                                              enum mcopy_atomic_mode mode)
 {
-       int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED;
        int vm_shared = dst_vma->vm_flags & VM_SHARED;
        ssize_t err;
        pte_t *dst_pte;
@@ -308,7 +307,6 @@ retry:
 
                mutex_unlock(&hugetlb_fault_mutex_table[hash]);
                i_mmap_unlock_read(mapping);
-               vm_alloc_shared = vm_shared;
 
                cond_resched();
 
@@ -346,54 +344,8 @@ retry:
 out_unlock:
        mmap_read_unlock(dst_mm);
 out:
-       if (page) {
-               /*
-                * We encountered an error and are about to free a newly
-                * allocated huge page.
-                *
-                * Reservation handling is very subtle, and is different for
-                * private and shared mappings.  See the routine
-                * restore_reserve_on_error for details.  Unfortunately, we
-                * can not call restore_reserve_on_error now as it would
-                * require holding mmap_lock.
-                *
-                * If a reservation for the page existed in the reservation
-                * map of a private mapping, the map was modified to indicate
-                * the reservation was consumed when the page was allocated.
-                * We clear the HPageRestoreReserve flag now so that the global
-                * reserve count will not be incremented in free_huge_page.
-                * The reservation map will still indicate the reservation
-                * was consumed and possibly prevent later page allocation.
-                * This is better than leaking a global reservation.  If no
-                * reservation existed, it is still safe to clear
-                * HPageRestoreReserve as no adjustments to reservation counts
-                * were made during allocation.
-                *
-                * The reservation map for shared mappings indicates which
-                * pages have reservations.  When a huge page is allocated
-                * for an address with a reservation, no change is made to
-                * the reserve map.  In this case HPageRestoreReserve will be
-                * set to indicate that the global reservation count should be
-                * incremented when the page is freed.  This is the desired
-                * behavior.  However, when a huge page is allocated for an
-                * address without a reservation a reservation entry is added
-                * to the reservation map, and HPageRestoreReserve will not be
-                * set. When the page is freed, the global reserve count will
-                * NOT be incremented and it will appear as though we have
-                * leaked reserved page.  In this case, set HPageRestoreReserve
-                * so that the global reserve count will be incremented to
-                * match the reservation map entry which was created.
-                *
-                * Note that vm_alloc_shared is based on the flags of the vma
-                * for which the page was originally allocated.  dst_vma could
-                * be different or NULL on error.
-                */
-               if (vm_alloc_shared)
-                       SetHPageRestoreReserve(page);
-               else
-                       ClearHPageRestoreReserve(page);
+       if (page)
                put_page(page);
-       }
        BUG_ON(copied < 0);
        BUG_ON(err > 0);
        BUG_ON(!copied && !err);