mm/mmu_notifier: avoid double notification when it is useless

author Jérôme Glisse <jglisse@redhat.com>

Thu, 16 Nov 2017 01:34:07 +0000 (17:34 -0800)

committer Linus Torvalds <torvalds@linux-foundation.org>

Thu, 16 Nov 2017 02:21:03 +0000 (18:21 -0800)
author Jérôme Glisse <jglisse@redhat.com>
Thu, 16 Nov 2017 01:34:07 +0000 (17:34 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 16 Nov 2017 02:21:03 +0000 (18:21 -0800)
diff --git a/Documentation/vm/mmu_notifier.txt b/Documentation/vm/mmu_notifier.txt

new file mode 100644 (file)

index 0000000..23b4625
--- /dev/null
+++ b/Documentation/vm/mmu_notifier.txt
@@ -0,0 +1,93 @@
+When do you need to notify inside page table lock ?
+
+When clearing a pte/pmd we are given a choice to notify the event under the
+page table lock (the notify version of *_clear_flush calls
+mmu_notifier_invalidate_range). That notification is not always necessary.
+
+For secondary TLB (non CPU TLB) like IOMMU TLB or device TLB (when device uses
+things like ATS/PASID to get the IOMMU to walk the CPU page table to access a
+process virtual address space), there are only 2 cases when you need to notify
+those secondary TLB while holding page table lock when clearing a pte/pmd:
+
+  A) page backing address is freed before mmu_notifier_invalidate_range_end()
+  B) a page table entry is updated to point to a new page (COW, write fault
+     on zero page, __replace_page(), ...)
+
+Case A is obvious: you do not want to take the risk for the device to write to
+a page that might now be used by some completely different task.
+
+Case B is more subtle. For correctness it requires the following sequence to
+happen:
+  - take page table lock
+  - clear page table entry and notify ([pmd/pte]p_huge_clear_flush_notify())
+  - set page table entry to point to new page
+
+If clearing the page table entry is not followed by a notify before setting
+the new pte/pmd value then you can break memory model like C11 or C++11 for
+the device.
+
+Consider the following scenario (device uses a feature similar to ATS/PASID):
+
+Two addresses addrA and addrB such that |addrA - addrB| >= PAGE_SIZE; we
+assume they are write protected for COW (other cases of B apply too).
+
+[Time N] --------------------------------------------------------------------
+CPU-thread-0  {try to write to addrA}
+CPU-thread-1  {try to write to addrB}
+CPU-thread-2  {}
+CPU-thread-3  {}
+DEV-thread-0  {read addrA and populate device TLB}
+DEV-thread-2  {read addrB and populate device TLB}
+[Time N+1] ------------------------------------------------------------------
+CPU-thread-0  {COW_step0: {mmu_notifier_invalidate_range_start(addrA)}}
+CPU-thread-1  {COW_step0: {mmu_notifier_invalidate_range_start(addrB)}}
+CPU-thread-2  {}
+CPU-thread-3  {}
+DEV-thread-0  {}
+DEV-thread-2  {}
+[Time N+2] ------------------------------------------------------------------
+CPU-thread-0  {COW_step1: {update page table to point to new page for addrA}}
+CPU-thread-1  {COW_step1: {update page table to point to new page for addrB}}
+CPU-thread-2  {}
+CPU-thread-3  {}
+DEV-thread-0  {}
+DEV-thread-2  {}
+[Time N+3] ------------------------------------------------------------------
+CPU-thread-0  {preempted}
+CPU-thread-1  {preempted}
+CPU-thread-2  {write to addrA which is a write to new page}
+CPU-thread-3  {}
+DEV-thread-0  {}
+DEV-thread-2  {}
+[Time N+4] ------------------------------------------------------------------
+CPU-thread-0  {preempted}
+CPU-thread-1  {preempted}
+CPU-thread-2  {}
+CPU-thread-3  {write to addrB which is a write to new page}
+DEV-thread-0  {}
+DEV-thread-2  {}
+[Time N+5] ------------------------------------------------------------------
+CPU-thread-0  {preempted}
+CPU-thread-1  {COW_step3: {mmu_notifier_invalidate_range_end(addrB)}}
+CPU-thread-2  {}
+CPU-thread-3  {}
+DEV-thread-0  {}
+DEV-thread-2  {}
+[Time N+6] ------------------------------------------------------------------
+CPU-thread-0  {preempted}
+CPU-thread-1  {}
+CPU-thread-2  {}
+CPU-thread-3  {}
+DEV-thread-0  {read addrA from old page}
+DEV-thread-2  {read addrB from new page}
+
+So here because at time N+2 the clear page table entry was not paired with a
+notification to invalidate the secondary TLB, the device sees the new value
+for addrB before seeing the new value for addrA. This breaks total memory
+ordering for the device.
+
+When changing a pte to write protect or to point to a new write protected page
+with same content (KSM) it is fine to delay the mmu_notifier_invalidate_range
+call to mmu_notifier_invalidate_range_end() outside the page table lock. This
+is true even if the thread doing the page table update is preempted right
+after releasing page table lock but before mmu_notifier_invalidate_range_end().
diff --git a/fs/dax.c b/fs/dax.c

index f3a44a7c14b35f674c34867af97d6a7681adb1eb..9ec797424e4f9ff293a341417f7b53aa46652e1e 100644 (file)
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -614,6 +614,13 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping,
                 if (follow_pte_pmd(vma->vm_mm, address, &start, &end, &ptep, &pmdp, &ptl))
                         continue;
  
+               /*
+                * No need to call mmu_notifier_invalidate_range() as we are
+                * downgrading page table protection not changing it to point
+                * to a new page.
+                *
+                * See Documentation/vm/mmu_notifier.txt
+                */
                 if (pmdp) {
  #ifdef CONFIG_FS_DAX_PMD
                         pmd_t pmd;
@@ -628,7 +635,6 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping,
                         pmd = pmd_wrprotect(pmd);
                         pmd = pmd_mkclean(pmd);
                         set_pmd_at(vma->vm_mm, address, pmdp, pmd);
-                       mmu_notifier_invalidate_range(vma->vm_mm, start, end);
  unlock_pmd:
                         spin_unlock(ptl);
  #endif
@@ -643,7 +649,6 @@ unlock_pmd:
                         pte = pte_wrprotect(pte);
                         pte = pte_mkclean(pte);
                         set_pte_at(vma->vm_mm, address, ptep, pte);
-                       mmu_notifier_invalidate_range(vma->vm_mm, start, end);
  unlock_pte:
                         pte_unmap_unlock(ptep, ptl);
                 }
diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h

index 2cf1c3c807f6500375bec36667d95db3704f3716..130831718e95d736d0e4f8b73bf2e3b32d7f76de 100644 (file)
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -156,7 +156,8 @@ struct mmu_notifier_ops {
          * shared page-tables, it not necessary to implement the
          * invalidate_range_start()/end() notifiers, as
          * invalidate_range() alread catches the points in time when an
-        * external TLB range needs to be flushed.
+        * external TLB range needs to be flushed. For more in depth
+        * discussion on this see Documentation/vm/mmu_notifier.txt
          *
          * The invalidate_range() function is called under the ptl
          * spin-lock and not allowed to sleep.
diff --git a/mm/huge_memory.c b/mm/huge_memory.c

index 003f7bcd0952cf7a415824f8c76c0fac74cfbdae..07ae73f4ef9149eb6b71b1ed9aca9b31f5a9d185 100644 (file)
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1189,8 +1189,15 @@ static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd,
                 goto out_free_pages;
         VM_BUG_ON_PAGE(!PageHead(page), page);
  
+       /*
+        * Leave pmd empty until pte is filled; note we must notify here as
+        * concurrent CPU thread might write to new page before the call to
+        * mmu_notifier_invalidate_range_end() happens which can lead to a
+        * device seeing memory write in different order than CPU.
+        *
+        * See Documentation/vm/mmu_notifier.txt
+        */
         pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
-       /* leave pmd empty until pte is filled */
  
         pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd);
         pmd_populate(vma->vm_mm, &_pmd, pgtable);
@@ -2029,8 +2036,15 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
         pmd_t _pmd;
         int i;
  
-       /* leave pmd empty until pte is filled */
-       pmdp_huge_clear_flush_notify(vma, haddr, pmd);
+       /*
+        * Leave pmd empty until pte is filled; note that it is fine to delay
+        * notification until mmu_notifier_invalidate_range_end() as we are
+        * replacing a zero pmd write protected page with a zero pte write
+        * protected page.
+        *
+        * See Documentation/vm/mmu_notifier.txt
+        */
+       pmdp_huge_clear_flush(vma, haddr, pmd);
  
         pgtable = pgtable_trans_huge_withdraw(mm, pmd);
         pmd_populate(mm, &_pmd, pgtable);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c

index 2d2ff5e8bf2bc035eb300ee16dbdaadcdb0279dd..681b300185c0c0383bb240d6a898849bf777f46b 100644 (file)
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3256,9 +3256,14 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                         set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz);
                 } else {
                         if (cow) {
+                               /*
+                                * No need to notify as we are downgrading page
+                                * table protection not changing it to point
+                                * to a new page.
+                                *
+                                * See Documentation/vm/mmu_notifier.txt
+                                */
                                 huge_ptep_set_wrprotect(src, addr, src_pte);
-                               mmu_notifier_invalidate_range(src, mmun_start,
-                                                                  mmun_end);
                         }
                         entry = huge_ptep_get(src_pte);
                         ptepage = pte_page(entry);
@@ -4318,7 +4323,12 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
          * and that page table be reused and filled with junk.
          */
         flush_hugetlb_tlb_range(vma, start, end);
-       mmu_notifier_invalidate_range(mm, start, end);
+       /*
+        * No need to call mmu_notifier_invalidate_range() we are downgrading
+        * page table protection not changing it to point to a new page.
+        *
+        * See Documentation/vm/mmu_notifier.txt
+        */
         i_mmap_unlock_write(vma->vm_file->f_mapping);
         mmu_notifier_invalidate_range_end(mm, start, end);
  
diff --git a/mm/ksm.c b/mm/ksm.c

index 6cb60f46cce55761b0ff9d3523be69a706523972..be8f4576f84211499e269f4c69f993a975a8e0a9 100644 (file)
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1052,8 +1052,13 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
                  * So we clear the pte and flush the tlb before the check
                  * this assure us that no O_DIRECT can happen after the check
                  * or in the middle of the check.
+                *
+                * No need to notify as we are downgrading page table to read
+                * only not changing it to point to a new page.
+                *
+                * See Documentation/vm/mmu_notifier.txt
                  */
-               entry = ptep_clear_flush_notify(vma, pvmw.address, pvmw.pte);
+               entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte);
                 /*
                  * Check that no O_DIRECT or similar I/O is in progress on the
                  * page
@@ -1136,7 +1141,13 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
         }
  
         flush_cache_page(vma, addr, pte_pfn(*ptep));
-       ptep_clear_flush_notify(vma, addr, ptep);
+       /*
+        * No need to notify as we are replacing a read only page with another
+        * read only page with the same content.
+        *
+        * See Documentation/vm/mmu_notifier.txt
+        */
+       ptep_clear_flush(vma, addr, ptep);
         set_pte_at_notify(mm, addr, ptep, newpte);
  
         page_remove_rmap(page, false);
diff --git a/mm/rmap.c b/mm/rmap.c

index b874c4761e8422829610d9a1173c56139db5823b..7dfc0975de4b925e27f0e5afb485a692358e84b3 100644 (file)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -939,10 +939,15 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma,
  #endif
                 }
  
-               if (ret) {
-                       mmu_notifier_invalidate_range(vma->vm_mm, cstart, cend);
+               /*
+                * No need to call mmu_notifier_invalidate_range() as we are
+                * downgrading page table protection not changing it to point
+                * to a new page.
+                *
+                * See Documentation/vm/mmu_notifier.txt
+                */
+               if (ret)
                         (*cleaned)++;
-               }
         }
  
         mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);
@@ -1426,6 +1431,10 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                         if (pte_soft_dirty(pteval))
                                 swp_pte = pte_swp_mksoft_dirty(swp_pte);
                         set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
+                       /*
+                        * No need to invalidate here, it will synchronize
+                        * against the special swap migration pte.
+                        */
                         goto discard;
                 }
  
@@ -1483,6 +1492,9 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                          * will take care of the rest.
                          */
                         dec_mm_counter(mm, mm_counter(page));
+                       /* We have to invalidate as we cleared the pte */
+                       mmu_notifier_invalidate_range(mm, address,
+                                                     address + PAGE_SIZE);
                 } else if (IS_ENABLED(CONFIG_MIGRATION) &&
                                 (flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))) {
                         swp_entry_t entry;
@@ -1498,6 +1510,10 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                         if (pte_soft_dirty(pteval))
                                 swp_pte = pte_swp_mksoft_dirty(swp_pte);
                         set_pte_at(mm, address, pvmw.pte, swp_pte);
+                       /*
+                        * No need to invalidate here, it will synchronize
+                        * against the special swap migration pte.
+                        */
                 } else if (PageAnon(page)) {
                         swp_entry_t entry = { .val = page_private(subpage) };
                         pte_t swp_pte;
@@ -1509,6 +1525,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                                 WARN_ON_ONCE(1);
                                 ret = false;
                                 /* We have to invalidate as we cleared the pte */
+                               mmu_notifier_invalidate_range(mm, address,
+                                                       address + PAGE_SIZE);
                                 page_vma_mapped_walk_done(&pvmw);
                                 break;
                         }
@@ -1516,6 +1534,9 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                         /* MADV_FREE page check */
                         if (!PageSwapBacked(page)) {
                                 if (!PageDirty(page)) {
+                                       /* Invalidate as we cleared the pte */
+                                       mmu_notifier_invalidate_range(mm,
+                                               address, address + PAGE_SIZE);
                                         dec_mm_counter(mm, MM_ANONPAGES);
                                         goto discard;
                                 }
@@ -1549,13 +1570,39 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
                         if (pte_soft_dirty(pteval))
                                 swp_pte = pte_swp_mksoft_dirty(swp_pte);
                         set_pte_at(mm, address, pvmw.pte, swp_pte);
-               } else
+                       /* Invalidate as we cleared the pte */
+                       mmu_notifier_invalidate_range(mm, address,
+                                                     address + PAGE_SIZE);
+               } else {
+                       /*
+                        * We should not need to notify here as we reach this
+                        * case only from freeze_page(), itself only called from
+                        * split_huge_page_to_list() so everything below must
+                        * be true:
+                        *   - page is not anonymous
+                        *   - page is locked
+                        *
+                        * So as it is a locked file backed page it can not
+                        * be removed from the page cache and replaced by a new
+                        * page before mmu_notifier_invalidate_range_end so no
+                        * concurrent thread might update its page table to
+                        * point at new page while a device still is using this
+                        * page.
+                        *
+                        * See Documentation/vm/mmu_notifier.txt
+                        */
                         dec_mm_counter(mm, mm_counter_file(page));
+               }
  discard:
+               /*
+                * No need to call mmu_notifier_invalidate_range(), it has been
+                * done above for all cases requiring it to happen under page
+                * table lock before mmu_notifier_invalidate_range_end()
+                *
+                * See Documentation/vm/mmu_notifier.txt
+                */
                 page_remove_rmap(subpage, PageHuge(page));
                 put_page(page);
-               mmu_notifier_invalidate_range(mm, address,
-                                             address + PAGE_SIZE);
         }
  
         mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);
author	Jérôme Glisse <jglisse@redhat.com>
	Thu, 16 Nov 2017 01:34:07 +0000 (17:34 -0800)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 16 Nov 2017 02:21:03 +0000 (18:21 -0800)
Documentation/vm/mmu_notifier.txt	[new file with mode: 0644]	patch \| blob
fs/dax.c		patch \| blob \| history
include/linux/mmu_notifier.h		patch \| blob \| history
mm/huge_memory.c		patch \| blob \| history
mm/hugetlb.c		patch \| blob \| history
mm/ksm.c		patch \| blob \| history
mm/rmap.c		patch \| blob \| history