sched/core: add forced idle accounting for cgroups

author Josh Don <joshdon@google.com>

Wed, 29 Jun 2022 21:14:26 +0000 (14:14 -0700)

committer Peter Zijlstra <peterz@infradead.org>

Mon, 4 Jul 2022 07:23:07 +0000 (09:23 +0200)
author Josh Don <joshdon@google.com>
Wed, 29 Jun 2022 21:14:26 +0000 (14:14 -0700)
committer Peter Zijlstra <peterz@infradead.org>
Mon, 4 Jul 2022 07:23:07 +0000 (09:23 +0200)
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h

index 1bfcfb1af3524f46388d240a4246218a6b042f6f..025fd0e84a316489e9ec11b0dde908a40d4c19b4 100644 (file)
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -287,6 +287,10 @@ struct css_set {
  
  struct cgroup_base_stat {
         struct task_cputime cputime;
+
+#ifdef CONFIG_SCHED_CORE
+       u64 forceidle_sum;      /* forced idle time, in nanoseconds */
+#endif
  };
  
  /*
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h

index 69ae6b27846452392b9beacb2ba2fbb7278c731c..ddb5a358fd829f453d20f0c702f84cda168eef25 100644 (file)
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -28,6 +28,9 @@ enum cpu_usage_stat {
         CPUTIME_STEAL,
         CPUTIME_GUEST,
         CPUTIME_GUEST_NICE,
+#ifdef CONFIG_SCHED_CORE
+       CPUTIME_FORCEIDLE,
+#endif
         NR_STATS,
  };
  
@@ -115,4 +118,8 @@ extern void account_process_tick(struct task_struct *, int user);
  
  extern void account_idle_ticks(unsigned long ticks);
  
+#ifdef CONFIG_SCHED_CORE
+extern void __account_forceidle_time(struct task_struct *tsk, u64 delta);
+#endif
+
  #endif /* _LINUX_KERNEL_STAT_H */
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c

index 24b5c2ab55983abb4321d4e0ec792d5c8b2e1b7b..feb59380c89627e30dbe197722ff2c5fc2f830ba 100644 (file)
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -310,6 +310,9 @@ static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
         dst_bstat->cputime.utime += src_bstat->cputime.utime;
         dst_bstat->cputime.stime += src_bstat->cputime.stime;
         dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime;
+#ifdef CONFIG_SCHED_CORE
+       dst_bstat->forceidle_sum += src_bstat->forceidle_sum;
+#endif
  }
  
  static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
@@ -318,6 +321,9 @@ static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
         dst_bstat->cputime.utime -= src_bstat->cputime.utime;
         dst_bstat->cputime.stime -= src_bstat->cputime.stime;
         dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime;
+#ifdef CONFIG_SCHED_CORE
+       dst_bstat->forceidle_sum -= src_bstat->forceidle_sum;
+#endif
  }
  
  static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
@@ -398,6 +404,11 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp,
         case CPUTIME_SOFTIRQ:
                 rstatc->bstat.cputime.stime += delta_exec;
                 break;
+#ifdef CONFIG_SCHED_CORE
+       case CPUTIME_FORCEIDLE:
+               rstatc->bstat.forceidle_sum += delta_exec;
+               break;
+#endif
         default:
                 break;
         }
@@ -411,8 +422,9 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp,
   * with how it is done by __cgroup_account_cputime_field for each bit of
   * cpu time attributed to a cgroup.
   */
-static void root_cgroup_cputime(struct task_cputime *cputime)
+static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
  {
+       struct task_cputime *cputime = &bstat->cputime;
         int i;
  
         cputime->stime = 0;
@@ -438,6 +450,10 @@ static void root_cgroup_cputime(struct task_cputime *cputime)
                 cputime->sum_exec_runtime += user;
                 cputime->sum_exec_runtime += sys;
                 cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL];
+
+#ifdef CONFIG_SCHED_CORE
+               bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE];
+#endif
         }
  }
  
@@ -445,27 +461,47 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
  {
         struct cgroup *cgrp = seq_css(seq)->cgroup;
         u64 usage, utime, stime;
-       struct task_cputime cputime;
+       /*
+        * Start zeroed: root_cgroup_cputime() only adds to forceidle_sum
+        * and never resets it, so stack garbage would otherwise be reported.
+        */
+       struct cgroup_base_stat bstat = {};
+#ifdef CONFIG_SCHED_CORE
+       u64 forceidle_time;
+#endif
  
         if (cgroup_parent(cgrp)) {
                 cgroup_rstat_flush_hold(cgrp);
                 usage = cgrp->bstat.cputime.sum_exec_runtime;
                 cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
                                &utime, &stime);
+#ifdef CONFIG_SCHED_CORE
+               forceidle_time = cgrp->bstat.forceidle_sum;
+#endif
                 cgroup_rstat_flush_release();
         } else {
-               root_cgroup_cputime(&cputime);
-               usage = cputime.sum_exec_runtime;
-               utime = cputime.utime;
-               stime = cputime.stime;
+               root_cgroup_cputime(&bstat);
+               usage = bstat.cputime.sum_exec_runtime;
+               utime = bstat.cputime.utime;
+               stime = bstat.cputime.stime;
+#ifdef CONFIG_SCHED_CORE
+               forceidle_time = bstat.forceidle_sum;
+#endif
         }
  
         do_div(usage, NSEC_PER_USEC);
         do_div(utime, NSEC_PER_USEC);
         do_div(stime, NSEC_PER_USEC);
+#ifdef CONFIG_SCHED_CORE
+       do_div(forceidle_time, NSEC_PER_USEC);
+#endif
  
         seq_printf(seq, "usage_usec %llu\n"
                    "user_usec %llu\n"
                    "system_usec %llu\n",
                    usage, utime, stime);
+
+#ifdef CONFIG_SCHED_CORE
+       seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time);
+#endif
  }
diff --git a/kernel/sched/core_sched.c b/kernel/sched/core_sched.c

index 38a2cec21014d8805f433cdb87e71553546fe600..5103502da7baa651ef9ad330e7145328a1cb5773 100644 (file)
--- a/kernel/sched/core_sched.c
+++ b/kernel/sched/core_sched.c
@@ -277,7 +277,11 @@ void __sched_core_account_forceidle(struct rq *rq)
                 if (p == rq_i->idle)
                         continue;
  
-               __schedstat_add(p->stats.core_forceidle_sum, delta);
+               /*
+                * Note: this will account forceidle to the current cpu, even
+                * if it comes from our SMT sibling.
+                */
+               __account_forceidle_time(p, delta);
         }
  }
  
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c

index 78a233d43757fca50aa693092790f8e41bdd649f..95fc778537434da8f1689cc2a788450da8fc2b1d 100644 (file)
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -226,6 +226,21 @@ void account_idle_time(u64 cputime)
                 cpustat[CPUTIME_IDLE] += cputime;
  }
  
+
+#ifdef CONFIG_SCHED_CORE
+/*
+ * Account @delta of core-scheduling forceidle time against @p: added to
+ * p->stats.core_forceidle_sum and passed on as CPUTIME_FORCEIDLE.
+ * REQUIRES: schedstat is enabled.
+ */
+void __account_forceidle_time(struct task_struct *p, u64 delta)
+{
+       __schedstat_add(p->stats.core_forceidle_sum, delta);
+
+       task_group_account_field(p, CPUTIME_FORCEIDLE, delta);
+}
+#endif
+
  /*
   * When a guest is interrupted for a longer amount of time, missed clock
   * ticks are not redelivered later. Due to that, this function may on
author	Josh Don <joshdon@google.com>
	Wed, 29 Jun 2022 21:14:26 +0000 (14:14 -0700)
committer	Peter Zijlstra <peterz@infradead.org>
	Mon, 4 Jul 2022 07:23:07 +0000 (09:23 +0200)
include/linux/cgroup-defs.h		patch \| blob \| history
include/linux/kernel_stat.h		patch \| blob \| history
kernel/cgroup/rstat.c		patch \| blob \| history
kernel/sched/core_sched.c		patch \| blob \| history
kernel/sched/cputime.c		patch \| blob \| history