memcg, oom: move out_of_memory back to the charge path

author Michal Hocko <mhocko@suse.com>
Fri, 17 Aug 2018 22:47:11 +0000 (15:47 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 17 Aug 2018 23:20:30 +0000 (16:20 -0700)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h

index 50e3e807b4278c603d414a4daef3b256aed35d9a..57a202f316830cfd6729709f7e3540e1fd3e2587 100644 (file)
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -507,16 +507,16 @@ unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg);
  void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
                                 struct task_struct *p);
  
-static inline void mem_cgroup_oom_enable(void)
+static inline void mem_cgroup_enter_user_fault(void)
  {
-       WARN_ON(current->memcg_may_oom);
-       current->memcg_may_oom = 1;
+       WARN_ON(current->in_user_fault);
+       current->in_user_fault = 1;
  }
  
-static inline void mem_cgroup_oom_disable(void)
+static inline void mem_cgroup_exit_user_fault(void)
  {
-       WARN_ON(!current->memcg_may_oom);
-       current->memcg_may_oom = 0;
+       WARN_ON(!current->in_user_fault);
+       current->in_user_fault = 0;
  }
  
  static inline bool task_in_memcg_oom(struct task_struct *p)
@@ -961,11 +961,11 @@ static inline void mem_cgroup_handle_over_high(void)
  {
  }
  
-static inline void mem_cgroup_oom_enable(void)
+static inline void mem_cgroup_enter_user_fault(void)
  {
  }
  
-static inline void mem_cgroup_oom_disable(void)
+static inline void mem_cgroup_exit_user_fault(void)
  {
  }
  
diff --git a/include/linux/sched.h b/include/linux/sched.h

index 1827f4a7a6de07501313dd12233c31e06792f638..066a2c328653b232d2599fef844e57a55010cee7 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -722,7 +722,7 @@ struct task_struct {
         unsigned                        restore_sigmask:1;
  #endif
  #ifdef CONFIG_MEMCG
-       unsigned                        memcg_may_oom:1;
+       unsigned                        in_user_fault:1;
  #ifndef CONFIG_SLOB
         unsigned                        memcg_kmem_skip_account:1;
  #endif
diff --git a/mm/memcontrol.c b/mm/memcontrol.c

index c071af193986972868f7e304b275ee9eff02c51a..d6724bed57d85569d9902f418e842e081ac2a474 100644 (file)
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1534,28 +1534,53 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
                 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
  }
  
-static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
+enum oom_status {
+       OOM_SUCCESS,
+       OOM_FAILED,
+       OOM_ASYNC,
+       OOM_SKIPPED
+};
+
+static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
  {
-       if (!current->memcg_may_oom || order > PAGE_ALLOC_COSTLY_ORDER)
-               return;
+       if (order > PAGE_ALLOC_COSTLY_ORDER)
+               return OOM_SKIPPED;
+
         /*
          * We are in the middle of the charge context here, so we
          * don't want to block when potentially sitting on a callstack
          * that holds all kinds of filesystem and mm locks.
          *
-        * Also, the caller may handle a failed allocation gracefully
-        * (like optional page cache readahead) and so an OOM killer
-        * invocation might not even be necessary.
+        * cgroup1 allows disabling the OOM killer and waiting for outside
+        * handling until the charge can succeed; remember the context and put
+        * the task to sleep at the end of the page fault when all locks are
+        * released.
+        *
+        * On the other hand, the in-kernel OOM killer allows for async victim
+        * memory reclaim (oom_reaper), which means that we are not solely
+        * relying on the oom victim to make forward progress and we can
+        * invoke the oom killer here.
          *
-        * That's why we don't do anything here except remember the
-        * OOM context and then deal with it at the end of the page
-        * fault when the stack is unwound, the locks are released,
-        * and when we know whether the fault was overall successful.
+        * Please note that mem_cgroup_out_of_memory might fail to find a
+        * victim and then we have to bail out from the charge path.
          */
-       css_get(&memcg->css);
-       current->memcg_in_oom = memcg;
-       current->memcg_oom_gfp_mask = mask;
-       current->memcg_oom_order = order;
+       if (memcg->oom_kill_disable) {
+               if (!current->in_user_fault)
+                       return OOM_SKIPPED;
+               css_get(&memcg->css);
+               current->memcg_in_oom = memcg;
+               current->memcg_oom_gfp_mask = mask;
+               current->memcg_oom_order = order;
+
+               return OOM_ASYNC;
+       }
+
+       if (mem_cgroup_out_of_memory(memcg, mask, order))
+               return OOM_SUCCESS;
+
+       WARN(1,"Memory cgroup charge failed because of no reclaimable memory! "
+               "This looks like a misconfiguration or a kernel bug.");
+       return OOM_FAILED;
  }
  
  /**
@@ -1950,6 +1975,8 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
         unsigned long nr_reclaimed;
         bool may_swap = true;
         bool drained = false;
+       bool oomed = false;
+       enum oom_status oom_status;
  
         if (mem_cgroup_is_root(memcg))
                 return 0;
@@ -2037,6 +2064,9 @@ retry:
         if (nr_retries--)
                 goto retry;
  
+       if (gfp_mask & __GFP_RETRY_MAYFAIL && oomed)
+               goto nomem;
+
         if (gfp_mask & __GFP_NOFAIL)
                 goto force;
  
@@ -2045,8 +2075,23 @@ retry:
  
         memcg_memory_event(mem_over_limit, MEMCG_OOM);
  
-       mem_cgroup_oom(mem_over_limit, gfp_mask,
+       /*
+        * Keep retrying as long as the memcg oom killer is able to make
+        * forward progress, or bypass the charge if the oom killer
+        * couldn't make any progress.
+        */
+       oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask,
                        get_order(nr_pages * PAGE_SIZE));
+       switch (oom_status) {
+       case OOM_SUCCESS:
+               nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+               oomed = true;
+               goto retry;
+       case OOM_FAILED:
+               goto force;
+       default:
+               goto nomem;
+       }
  nomem:
         if (!(gfp_mask & __GFP_NOFAIL))
                 return -ENOMEM;
diff --git a/mm/memory.c b/mm/memory.c

index 175f344e152368080a754e11eb8f9e731068eaa5..ae2ec887508b44a4d7ec2366388944f39853a3f0 100644 (file)
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4153,7 +4153,7 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
          * space.  Kernel faults are handled more gracefully.
          */
         if (flags & FAULT_FLAG_USER)
-               mem_cgroup_oom_enable();
+               mem_cgroup_enter_user_fault();
  
         if (unlikely(is_vm_hugetlb_page(vma)))
                 ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
@@ -4161,7 +4161,7 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
                 ret = __handle_mm_fault(vma, address, flags);
  
         if (flags & FAULT_FLAG_USER) {
-               mem_cgroup_oom_disable();
+               mem_cgroup_exit_user_fault();
                 /*
                  * The task may have entered a memcg OOM situation but
                  * if the allocation error was handled gracefully (no
author	Michal Hocko <mhocko@suse.com>
	Fri, 17 Aug 2018 22:47:11 +0000 (15:47 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 17 Aug 2018 23:20:30 +0000 (16:20 -0700)
include/linux/memcontrol.h		patch | blob | history
include/linux/sched.h		patch | blob | history
mm/memcontrol.c		patch | blob | history
mm/memory.c		patch | blob | history