psi: cgroup support

author Johannes Weiner <hannes@cmpxchg.org>

Fri, 26 Oct 2018 22:06:31 +0000 (15:06 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Fri, 26 Oct 2018 23:26:32 +0000 (16:26 -0700)
author Johannes Weiner <hannes@cmpxchg.org>
Fri, 26 Oct 2018 22:06:31 +0000 (15:06 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 26 Oct 2018 23:26:32 +0000 (16:26 -0700)
diff --git a/Documentation/accounting/psi.txt b/Documentation/accounting/psi.txt

index 3753a82f1cf5fcfe3914fe12ac0cbbe65a02d19b..b8ca28b60215a48f1ee99cfae36f20e0b8d0e8da 100644 (file)
--- a/Documentation/accounting/psi.txt
+++ b/Documentation/accounting/psi.txt
@@ -62,3 +62,12 @@ well as medium and long term trends. The total absolute stall time is
  tracked and exported as well, to allow detection of latency spikes
  which wouldn't necessarily make a dent in the time averages, or to
  average trends over custom time frames.
+
+Cgroup2 interface
+=================
+
+In a system with a CONFIG_CGROUPS=y kernel and the cgroup2 filesystem
+mounted, pressure stall information is also tracked for tasks grouped
+into cgroups. Each subdirectory in the cgroupfs mountpoint contains
+cpu.pressure, memory.pressure, and io.pressure files; the format is
+the same as the /proc/pressure/ files.
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst

index caf36105a1c7b50b3b76d53afcb2d75d094fee77..8389d6f72a77ce50cecd2d0e862a10a84e5c3972 100644 (file)
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -966,6 +966,12 @@ All time durations are in microseconds.
         $PERIOD duration.  "max" for $MAX indicates no limit.  If only
         one number is written, $MAX is updated.
  
+  cpu.pressure
+       A read-only nested-key file which exists on non-root cgroups.
+
+       Shows pressure stall information for CPU. See
+       Documentation/accounting/psi.txt for details.
+
  
  Memory
  ------
@@ -1271,6 +1277,12 @@ PAGE_SIZE multiple when read back.
         higher than the limit for an extended period of time.  This
         reduces the impact on the workload and memory management.
  
+  memory.pressure
+       A read-only nested-key file which exists on non-root cgroups.
+
+       Shows pressure stall information for memory. See
+       Documentation/accounting/psi.txt for details.
+
  
  Usage Guidelines
  ~~~~~~~~~~~~~~~~
@@ -1408,6 +1420,12 @@ IO Interface Files
  
           8:16 rbps=2097152 wbps=max riops=max wiops=max
  
+  io.pressure
+       A read-only nested-key file which exists on non-root cgroups.
+
+       Shows pressure stall information for IO. See
+       Documentation/accounting/psi.txt for details.
+
  
  Writeback
  ~~~~~~~~~
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h

index 22254c1fe1c5c9dfd8444fec171f55764bbf1030..5e1694fe035b918922e6b0d287380e0f5f7a9816 100644 (file)
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -20,6 +20,7 @@
  #include <linux/u64_stats_sync.h>
  #include <linux/workqueue.h>
  #include <linux/bpf-cgroup.h>
+#include <linux/psi_types.h>
  
  #ifdef CONFIG_CGROUPS
  
@@ -436,6 +437,9 @@ struct cgroup {
         /* used to schedule release agent */
         struct work_struct release_agent_work;
  
+       /* used to track pressure stalls */
+       struct psi_group psi;
+
         /* used to store eBPF programs */
         struct cgroup_bpf bpf;
  
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h

index b622d660860509d266008ebcf75175af7839c496..9968332cceed0e64e5fc9bdb814507b0bf67451b 100644 (file)
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -650,6 +650,11 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
         pr_cont_kernfs_path(cgrp->kn);
  }
  
+static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
+{
+       return &cgrp->psi;
+}
+
  static inline void cgroup_init_kthreadd(void)
  {
         /*
@@ -703,6 +708,16 @@ static inline union kernfs_node_id *cgroup_get_kernfs_id(struct cgroup *cgrp)
         return NULL;
  }
  
+static inline struct cgroup *cgroup_parent(struct cgroup *cgrp)
+{
+       return NULL;
+}
+
+static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
+{
+       return NULL;
+}
+
  static inline bool task_under_cgroup_hierarchy(struct task_struct *task,
                                                struct cgroup *ancestor)
  {
diff --git a/include/linux/psi.h b/include/linux/psi.h

index b0daf050de58f33f0d6a4156d636eb3a334722b3..8e0725aac0aa82fc45997ca21bddfb40a431a9ab 100644 (file)
--- a/include/linux/psi.h
+++ b/include/linux/psi.h
@@ -4,6 +4,9 @@
  #include <linux/psi_types.h>
  #include <linux/sched.h>
  
+struct seq_file;
+struct css_set;
+
  #ifdef CONFIG_PSI
  
  extern bool psi_disabled;
@@ -16,6 +19,14 @@ void psi_memstall_tick(struct task_struct *task, int cpu);
  void psi_memstall_enter(unsigned long *flags);
  void psi_memstall_leave(unsigned long *flags);
  
+int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res);
+
+#ifdef CONFIG_CGROUPS
+int psi_cgroup_alloc(struct cgroup *cgrp);
+void psi_cgroup_free(struct cgroup *cgrp);
+void cgroup_move_task(struct task_struct *p, struct css_set *to);
+#endif
+
  #else /* CONFIG_PSI */
  
  static inline void psi_init(void) {}
@@ -23,6 +34,20 @@ static inline void psi_init(void) {}
  static inline void psi_memstall_enter(unsigned long *flags) {}
  static inline void psi_memstall_leave(unsigned long *flags) {}
  
+#ifdef CONFIG_CGROUPS
+static inline int psi_cgroup_alloc(struct cgroup *cgrp)
+{
+       return 0;
+}
+static inline void psi_cgroup_free(struct cgroup *cgrp)
+{
+}
+static inline void cgroup_move_task(struct task_struct *p, struct css_set *to)
+{
+       rcu_assign_pointer(p->cgroups, to);
+}
+#endif
+
  #endif /* CONFIG_PSI */
  
  #endif /* _LINUX_PSI_H */
diff --git a/init/Kconfig b/init/Kconfig

index 26e639df55174a9a44310acca3126b81b3a2ae07..a4112e95724a05e85cc5a1d5e7c651f02ecfafeb 100644 (file)
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -501,6 +501,10 @@ config PSI
           the share of walltime in which some or all tasks in the system are
           delayed due to contention of the respective resource.
  
+         In kernels with cgroup support, cgroups (cgroup2 only) will
+         have cpu.pressure, memory.pressure, and io.pressure files,
+         which aggregate pressure stalls for the grouped tasks only.
+
           For more details see Documentation/accounting/psi.txt.
  
           Say N if unsure.
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c

index 4c1cf0969a80e9c63d111ab60e23b180f402cde0..8b79318810ad5c63d9e70cd634f6d6bc928659ef 100644 (file)
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -55,6 +55,7 @@
  #include <linux/nsproxy.h>
  #include <linux/file.h>
  #include <linux/sched/cputime.h>
+#include <linux/psi.h>
  #include <net/sock.h>
  
  #define CREATE_TRACE_POINTS
@@ -862,7 +863,7 @@ static void css_set_move_task(struct task_struct *task,
                  */
                 WARN_ON_ONCE(task->flags & PF_EXITING);
  
-               rcu_assign_pointer(task->cgroups, to_cset);
+               cgroup_move_task(task, to_cset);
                 list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
                                                              &to_cset->tasks);
         }
@@ -3446,6 +3447,21 @@ static int cpu_stat_show(struct seq_file *seq, void *v)
         return ret;
  }
  
+#ifdef CONFIG_PSI
+static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
+{
+       return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_IO);
+}
+static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
+{
+       return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_MEM);
+}
+static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
+{
+       return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_CPU);
+}
+#endif
+
  static int cgroup_file_open(struct kernfs_open_file *of)
  {
         struct cftype *cft = of->kn->priv;
@@ -4576,6 +4592,23 @@ static struct cftype cgroup_base_files[] = {
                 .flags = CFTYPE_NOT_ON_ROOT,
                 .seq_show = cpu_stat_show,
         },
+#ifdef CONFIG_PSI
+       {
+               .name = "io.pressure",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .seq_show = cgroup_io_pressure_show,
+       },
+       {
+               .name = "memory.pressure",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .seq_show = cgroup_memory_pressure_show,
+       },
+       {
+               .name = "cpu.pressure",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .seq_show = cgroup_cpu_pressure_show,
+       },
+#endif
         { }     /* terminate */
  };
  
@@ -4636,6 +4669,7 @@ static void css_free_rwork_fn(struct work_struct *work)
                          */
                         cgroup_put(cgroup_parent(cgrp));
                         kernfs_put(cgrp->kn);
+                       psi_cgroup_free(cgrp);
                         if (cgroup_on_dfl(cgrp))
                                 cgroup_rstat_exit(cgrp);
                         kfree(cgrp);
@@ -4892,10 +4926,15 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
         cgrp->self.parent = &parent->self;
         cgrp->root = root;
         cgrp->level = level;
-       ret = cgroup_bpf_inherit(cgrp);
+
+       ret = psi_cgroup_alloc(cgrp);
         if (ret)
                 goto out_idr_free;
  
+       ret = cgroup_bpf_inherit(cgrp);
+       if (ret)
+               goto out_psi_free;
+
         for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
                 cgrp->ancestor_ids[tcgrp->level] = tcgrp->id;
  
@@ -4933,6 +4972,8 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
  
         return cgrp;
  
+out_psi_free:
+       psi_cgroup_free(cgrp);
  out_idr_free:
         cgroup_idr_remove(&root->cgroup_idr, cgrp->id);
  out_stat_exit:
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c

index 595414599b988c9ce6aefd16ac1fe98fadda0481..7cdecfc010af83f1f5d8679536433f288aa847d7 100644 (file)
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -473,9 +473,35 @@ static void psi_group_change(struct psi_group *group, int cpu,
                 schedule_delayed_work(&group->clock_work, PSI_FREQ);
  }
  
+static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
+{
+#ifdef CONFIG_CGROUPS
+       struct cgroup *cgroup = NULL;
+
+       if (!*iter)
+               cgroup = task->cgroups->dfl_cgrp;
+       else if (*iter == &psi_system)
+               return NULL;
+       else
+               cgroup = cgroup_parent(*iter);
+
+       if (cgroup && cgroup_parent(cgroup)) {
+               *iter = cgroup;
+               return cgroup_psi(cgroup);
+       }
+#else
+       if (*iter)
+               return NULL;
+#endif
+       *iter = &psi_system;
+       return &psi_system;
+}
+
  void psi_task_change(struct task_struct *task, int clear, int set)
  {
         int cpu = task_cpu(task);
+       struct psi_group *group;
+       void *iter = NULL;
  
         if (!task->pid)
                 return;
@@ -492,17 +518,23 @@ void psi_task_change(struct task_struct *task, int clear, int set)
         task->psi_flags &= ~clear;
         task->psi_flags |= set;
  
-       psi_group_change(&psi_system, cpu, clear, set);
+       while ((group = iterate_groups(task, &iter)))
+               psi_group_change(group, cpu, clear, set);
  }
  
  void psi_memstall_tick(struct task_struct *task, int cpu)
  {
-       struct psi_group_cpu *groupc;
+       struct psi_group *group;
+       void *iter = NULL;
  
-       groupc = per_cpu_ptr(psi_system.pcpu, cpu);
-       write_seqcount_begin(&groupc->seq);
-       record_times(groupc, cpu, true);
-       write_seqcount_end(&groupc->seq);
+       while ((group = iterate_groups(task, &iter))) {
+               struct psi_group_cpu *groupc;
+
+               groupc = per_cpu_ptr(group->pcpu, cpu);
+               write_seqcount_begin(&groupc->seq);
+               record_times(groupc, cpu, true);
+               write_seqcount_end(&groupc->seq);
+       }
  }
  
  /**
@@ -565,8 +597,78 @@ void psi_memstall_leave(unsigned long *flags)
         rq_unlock_irq(rq, &rf);
  }
  
-static int psi_show(struct seq_file *m, struct psi_group *group,
-                   enum psi_res res)
+#ifdef CONFIG_CGROUPS
+int psi_cgroup_alloc(struct cgroup *cgroup)
+{
+       if (psi_disabled)
+               return 0;
+
+       cgroup->psi.pcpu = alloc_percpu(struct psi_group_cpu);
+       if (!cgroup->psi.pcpu)
+               return -ENOMEM;
+       group_init(&cgroup->psi);
+       return 0;
+}
+
+void psi_cgroup_free(struct cgroup *cgroup)
+{
+       if (psi_disabled)
+               return;
+
+       cancel_delayed_work_sync(&cgroup->psi.clock_work);
+       free_percpu(cgroup->psi.pcpu);
+}
+
+/**
+ * cgroup_move_task - move task to a different cgroup
+ * @task: the task
+ * @to: the target css_set
+ *
+ * Move task to a new cgroup and safely migrate its associated stall
+ * state between the different groups.
+ *
+ * This function acquires the task's rq lock to lock out concurrent
+ * changes to the task's scheduling state and - in case the task is
+ * running - concurrent changes to its stall state.
+ */
+void cgroup_move_task(struct task_struct *task, struct css_set *to)
+{
+       bool move_psi = !psi_disabled;
+       unsigned int task_flags = 0;
+       struct rq_flags rf;
+       struct rq *rq;
+
+       if (move_psi) {
+               rq = task_rq_lock(task, &rf);
+
+               if (task_on_rq_queued(task))
+                       task_flags = TSK_RUNNING;
+               else if (task->in_iowait)
+                       task_flags = TSK_IOWAIT;
+
+               if (task->flags & PF_MEMSTALL)
+                       task_flags |= TSK_MEMSTALL;
+
+               if (task_flags)
+                       psi_task_change(task, task_flags, 0);
+       }
+
+       /*
+        * Lame to do this here, but the scheduler cannot be locked
+        * from the outside, so we move cgroups from inside sched/.
+        */
+       rcu_assign_pointer(task->cgroups, to);
+
+       if (move_psi) {
+               if (task_flags)
+                       psi_task_change(task, 0, task_flags);
+
+               task_rq_unlock(rq, task, &rf);
+       }
+}
+#endif /* CONFIG_CGROUPS */
+
+int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
  {
         int full;
author	Johannes Weiner <hannes@cmpxchg.org>
	Fri, 26 Oct 2018 22:06:31 +0000 (15:06 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 26 Oct 2018 23:26:32 +0000 (16:26 -0700)
Documentation/accounting/psi.txt		patch | blob | history
Documentation/admin-guide/cgroup-v2.rst		patch | blob | history
include/linux/cgroup-defs.h		patch | blob | history
include/linux/cgroup.h		patch | blob | history
include/linux/psi.h		patch | blob | history
init/Kconfig		patch | blob | history
kernel/cgroup/cgroup.c		patch | blob | history
kernel/sched/psi.c		patch | blob | history