]> git.baikalelectronics.ru Git - kernel.git/commitdiff
drm/i915: Fix request ref counting during error capture & debugfs dump
author John Harrison <John.C.Harrison@Intel.com>
Fri, 27 Jan 2023 00:28:36 +0000 (16:28 -0800)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Thu, 9 Feb 2023 10:28:07 +0000 (11:28 +0100)
[ Upstream commit 86d8ddc74124c3fdfc139f246ba6da15e45e86e3 ]

When GuC support was added to error capture, the reference counting
around the request object was broken. Fix it up.

The context-based search manages the spinlocking around the search
internally, so it needs to grab the reference count internally as
well. The execlist-only, request-based search relies on external
locking, so it needs an external reference count, taken within the
spinlock rather than outside it.

The only other caller of the context based search is the code for
dumping engine state to debugfs. That code wasn't previously getting
an explicit reference at all as it does everything while holding the
execlist specific spinlock. So, that needs updating as well, as that
spinlock doesn't help when using GuC submission. Rather than trying to
conditionally get/put depending on submission model, just change it to
always do the get/put.

v2: Explicitly document adding an extra blank line in some dense code
(Andy Shevchenko). Fix multiple potential null pointer derefs in case
of no request found (some spotted by Tvrtko, but there were more!).
Also fix a leaked request in case of !started and another in
__guc_reset_context now that intel_context_find_active_request is
actually reference counting the returned request.
v3: Add a _get suffix to intel_context_find_active_request now that it
grabs a reference (Daniele).
v4: Split the intel_guc_find_hung_context change to a separate patch
and rename intel_context_find_active_request_get to
intel_context_get_active_request (Tvrtko).
v5: s/locking/reference counting/ in commit message (Tvrtko)

Fixes: ffb7fdae4c72 ("drm/i915/guc: Fix for error capture after full GPU reset with GuC")
Fixes: 674632970f3d ("drm/i915/guc: Capture error state on context reset")
Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
Reviewed-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Acked-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Andrzej Hajda <andrzej.hajda@intel.com>
Cc: Matthew Auld <matthew.auld@intel.com>
Cc: Matt Roper <matthew.d.roper@intel.com>
Cc: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
Cc: Michael Cheng <michael.cheng@intel.com>
Cc: Lucas De Marchi <lucas.demarchi@intel.com>
Cc: Tejas Upadhyay <tejaskumarx.surendrakumar.upadhyay@intel.com>
Cc: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Cc: Aravind Iddamsetty <aravind.iddamsetty@intel.com>
Cc: Alan Previn <alan.previn.teres.alexis@intel.com>
Cc: Bruce Chang <yu.bruce.chang@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20230127002842.3169194-3-John.C.Harrison@Intel.com
(cherry picked from commit 3700e353781e27f1bc7222f51f2cc36cbeb9b4ec)
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
drivers/gpu/drm/i915/gt/intel_context.c
drivers/gpu/drm/i915/gt/intel_context.h
drivers/gpu/drm/i915/gt/intel_engine_cs.c
drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
drivers/gpu/drm/i915/i915_gpu_error.c

index e94365b08f1efc133c8b6e88f2ba43156f414fa9..2aa63ec521b89844df7bdea1dd399e04f8de7bce 100644 (file)
@@ -528,7 +528,7 @@ retry:
        return rq;
 }
 
-struct i915_request *intel_context_find_active_request(struct intel_context *ce)
+struct i915_request *intel_context_get_active_request(struct intel_context *ce)
 {
        struct intel_context *parent = intel_context_to_parent(ce);
        struct i915_request *rq, *active = NULL;
@@ -552,6 +552,8 @@ struct i915_request *intel_context_find_active_request(struct intel_context *ce)
 
                active = rq;
        }
+       if (active)
+               active = i915_request_get_rcu(active);
        spin_unlock_irqrestore(&parent->guc_state.lock, flags);
 
        return active;
index be09fb2e883a547d8e132d7f858d10ff7d5e6dd0..4ab6c8ddd6ecca6c8d1b69999b67e97765225f0d 100644 (file)
@@ -268,8 +268,7 @@ int intel_context_prepare_remote_request(struct intel_context *ce,
 
 struct i915_request *intel_context_create_request(struct intel_context *ce);
 
-struct i915_request *
-intel_context_find_active_request(struct intel_context *ce);
+struct i915_request *intel_context_get_active_request(struct intel_context *ce);
 
 static inline bool intel_context_is_barrier(const struct intel_context *ce)
 {
index fcbccd8d244e937cc201dcae3f0a96188cdb2597..4327c6d91ce94f380b6fb3b16517a1f084708508 100644 (file)
@@ -2201,9 +2201,11 @@ static void engine_dump_active_requests(struct intel_engine_cs *engine, struct d
        if (guc) {
                ce = intel_engine_get_hung_context(engine);
                if (ce)
-                       hung_rq = intel_context_find_active_request(ce);
+                       hung_rq = intel_context_get_active_request(ce);
        } else {
                hung_rq = intel_engine_execlist_find_hung_request(engine);
+               if (hung_rq)
+                       hung_rq = i915_request_get_rcu(hung_rq);
        }
 
        if (hung_rq)
@@ -2214,6 +2216,8 @@ static void engine_dump_active_requests(struct intel_engine_cs *engine, struct d
        else
                intel_engine_dump_active_requests(&engine->sched_engine->requests,
                                                  hung_rq, m);
+       if (hung_rq)
+               i915_request_put(hung_rq);
 }
 
 void intel_engine_dump(struct intel_engine_cs *engine,
index 259162002c3ace24a83e12fc9559e072a3d9fbfe..0ec07dad1dcf15e8c518e8483b1e1d12e150f2e2 100644 (file)
@@ -1685,7 +1685,7 @@ static void __guc_reset_context(struct intel_context *ce, intel_engine_mask_t st
                        goto next_context;
 
                guilty = false;
-               rq = intel_context_find_active_request(ce);
+               rq = intel_context_get_active_request(ce);
                if (!rq) {
                        head = ce->ring->tail;
                        goto out_replay;
@@ -1698,6 +1698,7 @@ static void __guc_reset_context(struct intel_context *ce, intel_engine_mask_t st
                head = intel_ring_wrap(ce->ring, rq->head);
 
                __i915_request_reset(rq, guilty);
+               i915_request_put(rq);
 out_replay:
                guc_reset_state(ce, head, guilty);
 next_context:
index 9ea2fe34e7d307f6a40c979ea5a9860a02f9dbdc..a8ee4cd2ff164a0d4e798220a545ae4366b5d2d2 100644 (file)
@@ -1603,7 +1603,7 @@ capture_engine(struct intel_engine_cs *engine,
        ce = intel_engine_get_hung_context(engine);
        if (ce) {
                intel_engine_clear_hung_context(engine);
-               rq = intel_context_find_active_request(ce);
+               rq = intel_context_get_active_request(ce);
                if (!rq || !i915_request_started(rq))
                        goto no_request_capture;
        } else {
@@ -1614,21 +1614,18 @@ capture_engine(struct intel_engine_cs *engine,
                if (!intel_uc_uses_guc_submission(&engine->gt->uc)) {
                        spin_lock_irqsave(&engine->sched_engine->lock, flags);
                        rq = intel_engine_execlist_find_hung_request(engine);
+                       if (rq)
+                               rq = i915_request_get_rcu(rq);
                        spin_unlock_irqrestore(&engine->sched_engine->lock,
                                               flags);
                }
        }
-       if (rq)
-               rq = i915_request_get_rcu(rq);
-
        if (!rq)
                goto no_request_capture;
 
        capture = intel_engine_coredump_add_request(ee, rq, ATOMIC_MAYFAIL);
-       if (!capture) {
-               i915_request_put(rq);
+       if (!capture)
                goto no_request_capture;
-       }
        if (dump_flags & CORE_DUMP_FLAG_IS_GUC_CAPTURE)
                intel_guc_capture_get_matching_node(engine->gt, ee, ce);
 
@@ -1638,6 +1635,8 @@ capture_engine(struct intel_engine_cs *engine,
        return ee;
 
 no_request_capture:
+       if (rq)
+               i915_request_put(rq);
        kfree(ee);
        return NULL;
 }