xfs: throttle inode inactivation queuing on memory reclaim
author     Darrick J. Wong <djwong@kernel.org>
           Fri, 6 Aug 2021 18:05:43 +0000 (11:05 -0700)
committer  Darrick J. Wong <djwong@kernel.org>
           Mon, 9 Aug 2021 18:13:17 +0000 (11:13 -0700)
Now that we defer inode inactivation, we've decoupled the process of
unlinking or closing an inode from the process of inactivating it.  In
theory this should lead to better throughput since we now inactivate the
queued inodes in batches instead of one at a time.

Unfortunately, one of the primary risks with this decoupling is the loss
of rate control feedback between the frontend and background threads.
In other words, a rm -rf /* thread can run the system out of memory if
it can queue inodes for inactivation and jump to a new CPU faster than
the background threads can actually clear the deferred work.  The
workers can get scheduled off the CPU if they have to do IO, etc.

To solve this problem, we configure a shrinker so that it will activate
the /second/ time the shrinkers are called.  The custom shrinker will
queue all percpu deferred inactivation workers immediately and set a
flag to force frontend callers who are releasing a vfs inode to wait for
the inactivation workers.
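
As a rough illustration of how the shrinker's count, seeks, and batch values
are crafted so that the scan callback only fires on the second call (this is
not part of the patch), here is a minimal userspace sketch.  It assumes the
shrinker core of this era treats a seeks == 0 shrinker as wanting
freeable / 2 objects scanned per pass and carries any remainder that falls
short of the batch size forward to the next pass:

	#include <stdio.h>

	#define DEF_PRIORITY			12	/* default reclaim priority */
	#define XFS_INODEGC_SHRINKER_COUNT	(1UL << DEF_PRIORITY)
	#define XFS_INODEGC_SHRINKER_BATCH	((XFS_INODEGC_SHRINKER_COUNT / 2) + 1)

	int main(void)
	{
		unsigned long freeable = XFS_INODEGC_SHRINKER_COUNT; /* count_objects() */
		unsigned long batch = XFS_INODEGC_SHRINKER_BATCH;
		unsigned long deferred = 0;	/* work carried between calls */

		for (int pass = 1; pass <= 2; pass++) {
			unsigned long delta = freeable / 2;	/* assumed seeks == 0 behavior */
			unsigned long total_scan = deferred + delta;

			if (total_scan >= batch) {
				printf("pass %d: %lu >= batch %lu, scan_objects() runs\n",
				       pass, total_scan, batch);
				deferred = 0;
			} else {
				printf("pass %d: %lu < batch %lu, work deferred\n",
				       pass, total_scan, batch);
				deferred = total_scan;
			}
		}
		return 0;
	}

With DEF_PRIORITY = 12 this prints that the first call defers 2048 units and
the second crosses the 2049 batch threshold, which is when
xfs_inodegc_shrinker_scan() kicks the per-cpu workers and bumps shrinker_hits.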

On my test VM with 560M of RAM and a 2TB filesystem, this seems to solve
most of the OOMing problem when deleting 10 million inodes.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
fs/xfs/xfs_icache.c
fs/xfs/xfs_icache.h
fs/xfs/xfs_mount.c
fs/xfs/xfs_mount.h
fs/xfs/xfs_trace.h

diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 93ab83dfa36eddd6d4a9e7a4b100aeb78742953b..e7e69e55b7680ab50c5a96ed141a8439d2a0fda6 100644
@@ -1893,8 +1893,9 @@ xfs_inodegc_worker(
                return;
 
        ip = llist_entry(node, struct xfs_inode, i_gclist);
-       trace_xfs_inodegc_worker(ip->i_mount, __return_address);
+       trace_xfs_inodegc_worker(ip->i_mount, READ_ONCE(gc->shrinker_hits));
 
+       WRITE_ONCE(gc->shrinker_hits, 0);
        llist_for_each_entry_safe(ip, n, node, i_gclist) {
                xfs_iflags_set(ip, XFS_INACTIVATING);
                xfs_inodegc_inactivate(ip);
@@ -2028,6 +2029,7 @@ xfs_inodegc_want_queue_work(
 /*
  * Make the frontend wait for inactivations when:
  *
+ *  - Memory shrinkers queued the inactivation worker and it hasn't finished.
  *  - The queue depth exceeds the maximum allowable percpu backlog.
  *
  * Note: If the current thread is running a transaction, we don't ever want to
@@ -2036,11 +2038,15 @@ xfs_inodegc_want_queue_work(
 static inline bool
 xfs_inodegc_want_flush_work(
        struct xfs_inode        *ip,
-       unsigned int            items)
+       unsigned int            items,
+       unsigned int            shrinker_hits)
 {
        if (current->journal_info)
                return false;
 
+       if (shrinker_hits > 0)
+               return true;
+
        if (items > XFS_INODEGC_MAX_BACKLOG)
                return true;
 
@@ -2059,6 +2065,7 @@ xfs_inodegc_queue(
        struct xfs_mount        *mp = ip->i_mount;
        struct xfs_inodegc      *gc;
        int                     items;
+       unsigned int            shrinker_hits;
 
        trace_xfs_inode_set_need_inactive(ip);
        spin_lock(&ip->i_flags_lock);
@@ -2069,6 +2076,7 @@ xfs_inodegc_queue(
        llist_add(&ip->i_gclist, &gc->list);
        items = READ_ONCE(gc->items);
        WRITE_ONCE(gc->items, items + 1);
+       shrinker_hits = READ_ONCE(gc->shrinker_hits);
        put_cpu_ptr(gc);
 
        if (!xfs_is_inodegc_enabled(mp))
@@ -2079,7 +2087,7 @@ xfs_inodegc_queue(
                queue_work(mp->m_inodegc_wq, &gc->work);
        }
 
-       if (xfs_inodegc_want_flush_work(ip, items)) {
+       if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) {
                trace_xfs_inodegc_throttle(mp, __return_address);
                flush_work(&gc->work);
        }
@@ -2159,3 +2167,91 @@ xfs_inode_mark_reclaimable(
        xfs_qm_dqdetach(ip);
        xfs_inodegc_set_reclaimable(ip);
 }
+
+/*
+ * Register a phony shrinker so that we can run background inodegc sooner when
+ * there's memory pressure.  Inactivation does not itself free any memory but
+ * it does make inodes reclaimable, which eventually frees memory.
+ *
+ * The count function, seek value, and batch value are crafted to trigger the
+ * scan function during the second round of scanning.  Hopefully this means
+ * that we reclaimed enough memory that initiating metadata transactions won't
+ * make things worse.
+ */
+#define XFS_INODEGC_SHRINKER_COUNT     (1UL << DEF_PRIORITY)
+#define XFS_INODEGC_SHRINKER_BATCH     ((XFS_INODEGC_SHRINKER_COUNT / 2) + 1)
+
+static unsigned long
+xfs_inodegc_shrinker_count(
+       struct shrinker         *shrink,
+       struct shrink_control   *sc)
+{
+       struct xfs_mount        *mp = container_of(shrink, struct xfs_mount,
+                                                  m_inodegc_shrinker);
+       struct xfs_inodegc      *gc;
+       int                     cpu;
+
+       if (!xfs_is_inodegc_enabled(mp))
+               return 0;
+
+       for_each_online_cpu(cpu) {
+               gc = per_cpu_ptr(mp->m_inodegc, cpu);
+               if (!llist_empty(&gc->list))
+                       return XFS_INODEGC_SHRINKER_COUNT;
+       }
+
+       return 0;
+}
+
+static unsigned long
+xfs_inodegc_shrinker_scan(
+       struct shrinker         *shrink,
+       struct shrink_control   *sc)
+{
+       struct xfs_mount        *mp = container_of(shrink, struct xfs_mount,
+                                                  m_inodegc_shrinker);
+       struct xfs_inodegc      *gc;
+       int                     cpu;
+       bool                    no_items = true;
+
+       if (!xfs_is_inodegc_enabled(mp))
+               return SHRINK_STOP;
+
+       trace_xfs_inodegc_shrinker_scan(mp, sc, __return_address);
+
+       for_each_online_cpu(cpu) {
+               gc = per_cpu_ptr(mp->m_inodegc, cpu);
+               if (!llist_empty(&gc->list)) {
+                       unsigned int    h = READ_ONCE(gc->shrinker_hits);
+
+                       WRITE_ONCE(gc->shrinker_hits, h + 1);
+                       queue_work_on(cpu, mp->m_inodegc_wq, &gc->work);
+                       no_items = false;
+               }
+       }
+
+       /*
+        * If there are no inodes to inactivate, we don't want the shrinker
+        * to think there's deferred work to call us back about.
+        */
+       if (no_items)
+               return LONG_MAX;
+
+       return SHRINK_STOP;
+}
+
+/* Register a shrinker so we can accelerate inodegc and throttle queuing. */
+int
+xfs_inodegc_register_shrinker(
+       struct xfs_mount        *mp)
+{
+       struct shrinker         *shrink = &mp->m_inodegc_shrinker;
+
+       shrink->count_objects = xfs_inodegc_shrinker_count;
+       shrink->scan_objects = xfs_inodegc_shrinker_scan;
+       shrink->seeks = 0;
+       shrink->flags = SHRINKER_NONSLAB;
+       shrink->batch = XFS_INODEGC_SHRINKER_BATCH;
+
+       return register_shrinker(shrink);
+}
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 18c2d224aa78b7585272d489b3e09e61d329775f..2e4cfddf8b8ed8faeb49bd5c54e0c501fb2b4b8c 100644
@@ -80,5 +80,6 @@ void xfs_inodegc_flush(struct xfs_mount *mp);
 void xfs_inodegc_stop(struct xfs_mount *mp);
 void xfs_inodegc_start(struct xfs_mount *mp);
 void xfs_inodegc_cpu_dead(struct xfs_mount *mp, unsigned int cpu);
+int xfs_inodegc_register_shrinker(struct xfs_mount *mp);
 
 #endif
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index b81f2fc734bd21b152f99dc07eab84bf4510b953..ff08192d8d2ad063840c5dee16b8b757f0999d68 100644
@@ -769,6 +769,10 @@ xfs_mountfs(
                goto out_free_perag;
        }
 
+       error = xfs_inodegc_register_shrinker(mp);
+       if (error)
+               goto out_fail_wait;
+
        /*
         * Log's mount-time initialization. The first part of recovery can place
         * some items on the AIL, to be handled when recovery is finished or
@@ -779,7 +783,7 @@ xfs_mountfs(
                              XFS_FSB_TO_BB(mp, sbp->sb_logblocks));
        if (error) {
                xfs_warn(mp, "log mount failed");
-               goto out_fail_wait;
+               goto out_inodegc_shrinker;
        }
 
        /* Make sure the summary counts are ok. */
@@ -974,6 +978,8 @@ xfs_mountfs(
        xfs_unmount_flush_inodes(mp);
  out_log_dealloc:
        xfs_log_mount_cancel(mp);
+ out_inodegc_shrinker:
+       unregister_shrinker(&mp->m_inodegc_shrinker);
  out_fail_wait:
        if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
                xfs_buftarg_drain(mp->m_logdev_targp);
@@ -1054,6 +1060,7 @@ xfs_unmountfs(
 #if defined(DEBUG)
        xfs_errortag_clearall(mp);
 #endif
+       unregister_shrinker(&mp->m_inodegc_shrinker);
        xfs_free_perag(mp);
 
        xfs_errortag_del(mp);
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 4b3ce6109e7a022abeecec4055bb5ca941dea7e9..91a10233d04c3d57fd901b243c137990f4f20c1d 100644
@@ -65,6 +65,7 @@ struct xfs_inodegc {
 
        /* approximate count of inodes in the list */
        unsigned int            items;
+       unsigned int            shrinker_hits;
 };
 
 /*
@@ -210,6 +211,8 @@ typedef struct xfs_mount {
        xfs_agnumber_t          m_agirotor;     /* last ag dir inode alloced */
        spinlock_t              m_agirotor_lock;/* .. and lock protecting it */
 
+       /* Memory shrinker to throttle and reprioritize inodegc */
+       struct shrinker         m_inodegc_shrinker;
        /*
         * Workqueue item so that we can coalesce multiple inode flush attempts
         * into a single flush.
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 4a6616490315b905576b790cd276aa40b8947478..57ce91dcc0a66eca2d83b53f94b02bfdc5399cc5 100644
@@ -157,6 +157,22 @@ DEFINE_PERAG_REF_EVENT(xfs_perag_put);
 DEFINE_PERAG_REF_EVENT(xfs_perag_set_inode_tag);
 DEFINE_PERAG_REF_EVENT(xfs_perag_clear_inode_tag);
 
+TRACE_EVENT(xfs_inodegc_worker,
+       TP_PROTO(struct xfs_mount *mp, unsigned int shrinker_hits),
+       TP_ARGS(mp, shrinker_hits),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(unsigned int, shrinker_hits)
+       ),
+       TP_fast_assign(
+               __entry->dev = mp->m_super->s_dev;
+               __entry->shrinker_hits = shrinker_hits;
+       ),
+       TP_printk("dev %d:%d shrinker_hits %u",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->shrinker_hits)
+);
+
 DECLARE_EVENT_CLASS(xfs_fs_class,
        TP_PROTO(struct xfs_mount *mp, void *caller_ip),
        TP_ARGS(mp, caller_ip),
@@ -191,7 +207,6 @@ DEFINE_EVENT(xfs_fs_class, name,                                    \
 DEFINE_FS_EVENT(xfs_inodegc_flush);
 DEFINE_FS_EVENT(xfs_inodegc_start);
 DEFINE_FS_EVENT(xfs_inodegc_stop);
-DEFINE_FS_EVENT(xfs_inodegc_worker);
 DEFINE_FS_EVENT(xfs_inodegc_queue);
 DEFINE_FS_EVENT(xfs_inodegc_throttle);
 DEFINE_FS_EVENT(xfs_fs_sync_fs);
@@ -200,6 +215,26 @@ DEFINE_FS_EVENT(xfs_blockgc_stop);
 DEFINE_FS_EVENT(xfs_blockgc_worker);
 DEFINE_FS_EVENT(xfs_blockgc_flush_all);
 
+TRACE_EVENT(xfs_inodegc_shrinker_scan,
+       TP_PROTO(struct xfs_mount *mp, struct shrink_control *sc,
+                void *caller_ip),
+       TP_ARGS(mp, sc, caller_ip),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(unsigned long, nr_to_scan)
+               __field(void *, caller_ip)
+       ),
+       TP_fast_assign(
+               __entry->dev = mp->m_super->s_dev;
+               __entry->nr_to_scan = sc->nr_to_scan;
+               __entry->caller_ip = caller_ip;
+       ),
+       TP_printk("dev %d:%d nr_to_scan %lu caller %pS",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->nr_to_scan,
+                 __entry->caller_ip)
+);
+
 DECLARE_EVENT_CLASS(xfs_ag_class,
        TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno),
        TP_ARGS(mp, agno),