mm/hugetlb: only drop uffd-wp special pte if required
author Peter Xu <peterx@redhat.com>
Fri, 13 May 2022 03:22:55 +0000 (20:22 -0700)
committer Andrew Morton <akpm@linux-foundation.org>
Fri, 13 May 2022 14:20:11 +0000 (07:20 -0700)
As with shmem uffd-wp special ptes, only drop the uffd-wp special swap pte
when unmapping an entire vma, or when the unmap is synchronized such that
faults cannot race with it.  This requires passing zap_flags all the way
down to the lowest-level hugetlb unmap routine: __unmap_hugepage_range.

In general, unmap calls originating in hugetlbfs code will pass the
ZAP_FLAG_DROP_MARKER flag, as synchronization is in place to prevent
faults.  The exception is hole punch, which first unmaps without any
synchronization.  Later, when hole punch actually removes the page from
the file, it checks whether there was a subsequent fault and, if so, takes
the hugetlb fault mutex while unmapping again.  This second unmap passes
ZAP_FLAG_DROP_MARKER.
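
Reduced to the two hugetlb_vmdelete_list() call sites changed below (a
minimal sketch; locking, the lookup loop and error handling are elided),
the hole-punch flow is roughly:

	/* Pass 1: hugetlbfs_punch_hole() unmaps with no fault
	 * synchronization, so uffd-wp markers must be kept
	 * (zap_flags == 0). */
	hugetlb_vmdelete_list(&mapping->i_mmap,
			      hole_start >> PAGE_SHIFT,
			      hole_end >> PAGE_SHIFT, 0);

	/* Pass 2: remove_inode_hugepages() finds a racing fault, takes
	 * the hugetlb fault mutex and unmaps again; markers may now be
	 * dropped. */
	mutex_lock(&hugetlb_fault_mutex_table[hash]);
	hugetlb_vmdelete_list(&mapping->i_mmap,
			      index * pages_per_huge_page(h),
			      (index + 1) * pages_per_huge_page(h),
			      ZAP_FLAG_DROP_MARKER);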

The rationale for whether to apply ZAP_FLAG_DROP_MARKER when unmapping a
hugetlb range is (IMHO): we should never reach a state in which a page
fault could erroneously fault in a wr-protected page-cache page as
writable, even for an extremely short period.  That could happen if, for
example, we passed ZAP_FLAG_DROP_MARKER when hugetlbfs_punch_hole() calls
hugetlb_vmdelete_list(): if a page faults after that call and before
remove_inode_hugepages() executes, the page cache can be mapped writable
again in that small racy window, which can cause data to be unexpectedly
overwritten.
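
At the pte level this policy reduces to the check added to
__unmap_hugepage_range() (see the mm/hugetlb.c hunk below): when the pte
carries a uffd-wp bit in any swap form and the caller did not ask to drop
markers, install a pte marker instead of clearing the entry:

	if (pte_swp_uffd_wp_any(pte) &&
	    !(zap_flags & ZAP_FLAG_DROP_MARKER))
		/* preserve the wr-protect information across the zap */
		set_huge_pte_at(mm, address, ptep,
				make_pte_marker(PTE_MARKER_UFFD_WP));
	else
		huge_pte_clear(mm, address, ptep, sz);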

[peterx@redhat.com: fix sparse warning]
Link: https://lkml.kernel.org/r/Ylcdw8I1L5iAoWhb@xz-m1.local
[akpm@linux-foundation.org: move zap_flags_t from mm.h to mm_types.h to fix build issues]
Link: https://lkml.kernel.org/r/20220405014915.14873-1-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Jerome Glisse <jglisse@redhat.com>
Cc: "Kirill A . Shutemov" <kirill@shutemov.name>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Nadav Amit <nadav.amit@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
fs/hugetlbfs/inode.c
include/linux/hugetlb.h
include/linux/mm.h
include/linux/mm_types.h
mm/hugetlb.c
mm/memory.c

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 591599829e2a6659e355d5ca7e2da1e9474e6007..5945caccf003ef96e91574ded30ef1fa86359185 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -405,7 +405,8 @@ static void remove_huge_page(struct page *page)
 }
 
 static void
-hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end)
+hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end,
+                     zap_flags_t zap_flags)
 {
        struct vm_area_struct *vma;
 
@@ -439,7 +440,7 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end)
                }
 
                unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end,
-                                                                       NULL);
+                                    NULL, zap_flags);
        }
 }
 
@@ -517,7 +518,8 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
                                mutex_lock(&hugetlb_fault_mutex_table[hash]);
                                hugetlb_vmdelete_list(&mapping->i_mmap,
                                        index * pages_per_huge_page(h),
-                                       (index + 1) * pages_per_huge_page(h));
+                                       (index + 1) * pages_per_huge_page(h),
+                                       ZAP_FLAG_DROP_MARKER);
                                i_mmap_unlock_write(mapping);
                        }
 
@@ -583,7 +585,8 @@ static void hugetlb_vmtruncate(struct inode *inode, loff_t offset)
        i_mmap_lock_write(mapping);
        i_size_write(inode, offset);
        if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
-               hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0);
+               hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0,
+                                     ZAP_FLAG_DROP_MARKER);
        i_mmap_unlock_write(mapping);
        remove_inode_hugepages(inode, offset, LLONG_MAX);
 }
@@ -616,8 +619,8 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
                i_mmap_lock_write(mapping);
                if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
                        hugetlb_vmdelete_list(&mapping->i_mmap,
-                                               hole_start >> PAGE_SHIFT,
-                                               hole_end  >> PAGE_SHIFT);
+                                             hole_start >> PAGE_SHIFT,
+                                             hole_end >> PAGE_SHIFT, 0);
                i_mmap_unlock_write(mapping);
                remove_inode_hugepages(inode, hole_start, hole_end);
                inode_unlock(inode);
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index f1143f1fb4443bedf4615b1406843f7f87b45e40..19cec415f54685191f8e809de3928ff6462d832a 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -143,11 +143,12 @@ long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
                         unsigned long *, unsigned long *, long, unsigned int,
                         int *);
 void unmap_hugepage_range(struct vm_area_struct *,
-                         unsigned long, unsigned long, struct page *);
+                         unsigned long, unsigned long, struct page *,
+                         zap_flags_t);
 void __unmap_hugepage_range_final(struct mmu_gather *tlb,
                          struct vm_area_struct *vma,
                          unsigned long start, unsigned long end,
-                         struct page *ref_page);
+                         struct page *ref_page, zap_flags_t zap_flags);
 void hugetlb_report_meminfo(struct seq_file *);
 int hugetlb_report_node_meminfo(char *buf, int len, int nid);
 void hugetlb_show_meminfo(void);
@@ -406,7 +407,8 @@ static inline unsigned long hugetlb_change_protection(
 
 static inline void __unmap_hugepage_range_final(struct mmu_gather *tlb,
                        struct vm_area_struct *vma, unsigned long start,
-                       unsigned long end, struct page *ref_page)
+                       unsigned long end, struct page *ref_page,
+                       zap_flags_t zap_flags)
 {
        BUG();
 }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 61786259e52a617f4d506908437f28cc56489cbe..de32c038338751bbc155793dff7a65e369d5a996 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3428,8 +3428,6 @@ madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
 }
 #endif
 
-typedef unsigned int __bitwise zap_flags_t;
-
 /*
  * Whether to drop the pte markers, for example, the uffd-wp information for
  * file-backed memory.  This should only be specified when we will completely
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index dd382270ae409f1fa0a61b5a8aea0e2c13f6d5d9..b34ff2cdbc4face79515b1881188c3e89f36034f 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -863,4 +863,6 @@ enum fault_flag {
        FAULT_FLAG_ORIG_PTE_VALID =     1 << 11,
 };
 
+typedef unsigned int __bitwise zap_flags_t;
+
 #endif /* _LINUX_MM_TYPES_H */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ec9774ed84c018ae67994d9e62e485682a476559..99281aecbd289a3495822c8e38288c7e9c7a8e34 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4973,7 +4973,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
 
 static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
                                   unsigned long start, unsigned long end,
-                                  struct page *ref_page)
+                                  struct page *ref_page, zap_flags_t zap_flags)
 {
        struct mm_struct *mm = vma->vm_mm;
        unsigned long address;
@@ -5029,7 +5029,18 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
                 * unmapped and its refcount is dropped, so just clear pte here.
                 */
                if (unlikely(!pte_present(pte))) {
-                       huge_pte_clear(mm, address, ptep, sz);
+                       /*
+                        * If the pte was wr-protected by uffd-wp in any of the
+                        * swap forms, meanwhile the caller does not want to
+                        * drop the uffd-wp bit in this zap, then replace the
+                        * pte with a marker.
+                        */
+                       if (pte_swp_uffd_wp_any(pte) &&
+                           !(zap_flags & ZAP_FLAG_DROP_MARKER))
+                               set_huge_pte_at(mm, address, ptep,
+                                               make_pte_marker(PTE_MARKER_UFFD_WP));
+                       else
+                               huge_pte_clear(mm, address, ptep, sz);
                        spin_unlock(ptl);
                        continue;
                }
@@ -5057,7 +5068,11 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
                tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
                if (huge_pte_dirty(pte))
                        set_page_dirty(page);
-
+               /* Leave a uffd-wp pte marker if needed */
+               if (huge_pte_uffd_wp(pte) &&
+                   !(zap_flags & ZAP_FLAG_DROP_MARKER))
+                       set_huge_pte_at(mm, address, ptep,
+                                       make_pte_marker(PTE_MARKER_UFFD_WP));
                hugetlb_count_sub(pages_per_huge_page(h), mm);
                page_remove_rmap(page, vma, true);
 
@@ -5091,9 +5106,10 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
 
 void __unmap_hugepage_range_final(struct mmu_gather *tlb,
                          struct vm_area_struct *vma, unsigned long start,
-                         unsigned long end, struct page *ref_page)
+                         unsigned long end, struct page *ref_page,
+                         zap_flags_t zap_flags)
 {
-       __unmap_hugepage_range(tlb, vma, start, end, ref_page);
+       __unmap_hugepage_range(tlb, vma, start, end, ref_page, zap_flags);
 
        /*
         * Clear this flag so that x86's huge_pmd_share page_table_shareable
@@ -5109,12 +5125,13 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb,
 }
 
 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
-                         unsigned long end, struct page *ref_page)
+                         unsigned long end, struct page *ref_page,
+                         zap_flags_t zap_flags)
 {
        struct mmu_gather tlb;
 
        tlb_gather_mmu(&tlb, vma->vm_mm);
-       __unmap_hugepage_range(&tlb, vma, start, end, ref_page);
+       __unmap_hugepage_range(&tlb, vma, start, end, ref_page, zap_flags);
        tlb_finish_mmu(&tlb);
 }
 
@@ -5169,7 +5186,7 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
                 */
                if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
                        unmap_hugepage_range(iter_vma, address,
-                                            address + huge_page_size(h), page);
+                                            address + huge_page_size(h), page, 0);
        }
        i_mmap_unlock_write(mapping);
 }
diff --git a/mm/memory.c b/mm/memory.c
index 8827157cf3928d76894c49035c95557a8888e4c8..82adda885605c83c80dc450fdaaf70f5c5d58ba9 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1675,8 +1675,11 @@ static void unmap_single_vma(struct mmu_gather *tlb,
                         * safe to do nothing in this case.
                         */
                        if (vma->vm_file) {
+                               zap_flags_t zap_flags = details ?
+                                   details->zap_flags : 0;
                                i_mmap_lock_write(vma->vm_file->f_mapping);
-                               __unmap_hugepage_range_final(tlb, vma, start, end, NULL);
+                               __unmap_hugepage_range_final(tlb, vma, start, end,
+                                                            NULL, zap_flags);
                                i_mmap_unlock_write(vma->vm_file->f_mapping);
                        }
                } else