drm/i915/guc: Capture error state on context reset
author    Matthew Brost <matthew.brost@intel.com>
          Tue, 27 Jul 2021 00:23:33 +0000 (17:23 -0700)
committer John Harrison <John.C.Harrison@Intel.com>
          Wed, 28 Jul 2021 00:31:59 +0000 (17:31 -0700)
We receive notification of an engine reset from GuC only at its
completion, meaning GuC has potentially already cleared any HW state
we may have been interested in capturing. GuC resumes scheduling
on the engine post-reset, as the resets are meant to be transparent,
further muddling our error state.

There is ongoing work to define an API for a GuC debug state dump. The
suggestion for now is to manually disable FW-initiated resets in cases
where debug state is needed.

Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Reviewed-by: John Harrison <John.C.Harrison@Intel.com>
Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20210727002348.97202-19-matthew.brost@intel.com
drivers/gpu/drm/i915/gt/intel_context.c
drivers/gpu/drm/i915/gt/intel_context.h
drivers/gpu/drm/i915/gt/intel_engine.h
drivers/gpu/drm/i915/gt/intel_engine_cs.c
drivers/gpu/drm/i915/gt/intel_engine_types.h
drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
drivers/gpu/drm/i915/i915_gpu_error.c

diff --git a/drivers/gpu/drm/i915/gt/intel_context.c b/drivers/gpu/drm/i915/gt/intel_context.c
index 0bf4a13e975960ede13b17ed58d0db48ce3db626..237b70e9874445e2d2a9d766bc49f36cbd43d479 100644
@@ -509,6 +509,26 @@ retry:
        return rq;
 }
 
+struct i915_request *intel_context_find_active_request(struct intel_context *ce)
+{
+       struct i915_request *rq, *active = NULL;
+       unsigned long flags;
+
+       GEM_BUG_ON(!intel_engine_uses_guc(ce->engine));
+
+       spin_lock_irqsave(&ce->guc_active.lock, flags);
+       list_for_each_entry_reverse(rq, &ce->guc_active.requests,
+                                   sched.link) {
+               if (i915_request_completed(rq))
+                       break;
+
+               active = rq;
+       }
+       spin_unlock_irqrestore(&ce->guc_active.lock, flags);
+
+       return active;
+}
+
 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
 #include "selftest_context.c"
 #endif
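
For illustration, a caller in GuC submission mode might use the new
helper like this (hypothetical function and log message, shown only to
demonstrate the API; the real in-tree caller is __guc_reset_context()
in the intel_guc_submission.c hunk below):

	/* Hypothetical example: log the presumed-hung request on a context. */
	static void example_report_hung_request(struct intel_context *ce)
	{
		struct i915_request *rq;

		/* Oldest incomplete request on the context, if any. */
		rq = intel_context_find_active_request(ce);
		if (rq)
			drm_notice(&ce->engine->i915->drm,
				   "hung request %llx:%lld on %s\n",
				   rq->fence.context, rq->fence.seqno,
				   ce->engine->name);
	}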
diff --git a/drivers/gpu/drm/i915/gt/intel_context.h b/drivers/gpu/drm/i915/gt/intel_context.h
index 974ef85320c28835463f323334b36f08b0311143..2ed9bf5f91a59acb9e926759241462ac47cc2779 100644
@@ -200,6 +200,9 @@ int intel_context_prepare_remote_request(struct intel_context *ce,
 
 struct i915_request *intel_context_create_request(struct intel_context *ce);
 
+struct i915_request *
+intel_context_find_active_request(struct intel_context *ce);
+
 static inline bool intel_context_is_barrier(const struct intel_context *ce)
 {
        return test_bit(CONTEXT_BARRIER_BIT, &ce->flags);
diff --git a/drivers/gpu/drm/i915/gt/intel_engine.h b/drivers/gpu/drm/i915/gt/intel_engine.h
index 8fc76dc8bf9855bfeed31acb56b00bbc53b635d8..1db2d3efc71f99dff55e538049a9a65d9ab8dd14 100644
@@ -245,7 +245,7 @@ ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine,
                                   ktime_t *now);
 
 struct i915_request *
-intel_engine_find_active_request(struct intel_engine_cs *engine);
+intel_engine_execlist_find_hung_request(struct intel_engine_cs *engine);
 
 u32 intel_engine_context_size(struct intel_gt *gt, u8 class);
 struct intel_context *
@@ -313,4 +313,23 @@ intel_engine_get_sibling(struct intel_engine_cs *engine, unsigned int sibling)
        return engine->cops->get_sibling(engine, sibling);
 }
 
+static inline void
+intel_engine_set_hung_context(struct intel_engine_cs *engine,
+                             struct intel_context *ce)
+{
+       engine->hung_ce = ce;
+}
+
+static inline void
+intel_engine_clear_hung_context(struct intel_engine_cs *engine)
+{
+       intel_engine_set_hung_context(engine, NULL);
+}
+
+static inline struct intel_context *
+intel_engine_get_hung_context(struct intel_engine_cs *engine)
+{
+       return engine->hung_ce;
+}
+
 #endif /* _INTEL_RINGBUFFER_H_ */
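
The three inline helpers above implement a small producer/consumer
handshake: the GuC reset path publishes the hung context on the engine,
and the error capture path consumes and clears it. A condensed pairing
(illustrative fragment, not compilable on its own; engine, ce, gt and
rq stand for the usual pointers):

	/* Producer: GuC reset notification (see capture_error_state() below) */
	intel_engine_set_hung_context(engine, ce);
	i915_capture_error_state(gt, engine->mask);

	/* Consumer: error capture (see capture_engine() below) */
	ce = intel_engine_get_hung_context(engine);
	if (ce) {
		intel_engine_clear_hung_context(engine);
		rq = intel_context_find_active_request(ce);
	}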
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index 1eaa658507e1ce6af54846de28f7530fa6e1170a..0da7868c5a13e53629e4b6e0c71a6130338637d1 100644
@@ -1731,7 +1731,7 @@ void intel_engine_dump(struct intel_engine_cs *engine,
        drm_printf(m, "\tRequests:\n");
 
        spin_lock_irqsave(&engine->sched_engine->lock, flags);
-       rq = intel_engine_find_active_request(engine);
+       rq = intel_engine_execlist_find_hung_request(engine);
        if (rq) {
                struct intel_timeline *tl = get_timeline(rq);
 
@@ -1842,10 +1842,17 @@ static bool match_ring(struct i915_request *rq)
 }
 
 struct i915_request *
-intel_engine_find_active_request(struct intel_engine_cs *engine)
+intel_engine_execlist_find_hung_request(struct intel_engine_cs *engine)
 {
        struct i915_request *request, *active = NULL;
 
+       /*
+        * This search does not work in GuC submission mode. However, the GuC
+        * will report the hanging context directly to the driver itself. So
+        * the driver should never get here when in GuC mode.
+        */
+       GEM_BUG_ON(intel_uc_uses_guc_submission(&engine->gt->uc));
+
        /*
         * We are called by the error capture, reset and to dump engine
         * state at random points in time. In particular, note that neither is
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
index 1c7e2724cdae1ffef99031491fb7b768e2c25540..260cce15cb626dcd1e87b34b4491e717b659dbb8 100644
@@ -298,6 +298,8 @@ struct intel_engine_cs {
        /* keep a request in reserve for a [pm] barrier under oom */
        struct i915_request *request_pool;
 
+       struct intel_context *hung_ce;
+
        struct llist_head barrier_tasks;
 
        struct intel_context *kernel_context; /* pinned */
diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
index d76494e3e271e4b954e088e8b908fcd60ef3f93e..7d6ca0d54f9e5cda40b87e3861cd8fbcef026240 100644
@@ -726,24 +726,6 @@ __unwind_incomplete_requests(struct intel_context *ce)
        spin_unlock_irqrestore(&sched_engine->lock, flags);
 }
 
-static struct i915_request *context_find_active_request(struct intel_context *ce)
-{
-       struct i915_request *rq, *active = NULL;
-       unsigned long flags;
-
-       spin_lock_irqsave(&ce->guc_active.lock, flags);
-       list_for_each_entry_reverse(rq, &ce->guc_active.requests,
-                                   sched.link) {
-               if (i915_request_completed(rq))
-                       break;
-
-               active = rq;
-       }
-       spin_unlock_irqrestore(&ce->guc_active.lock, flags);
-
-       return active;
-}
-
 static void __guc_reset_context(struct intel_context *ce, bool stalled)
 {
        struct i915_request *rq;
@@ -757,7 +739,7 @@ static void __guc_reset_context(struct intel_context *ce, bool stalled)
         */
        clr_context_enabled(ce);
 
-       rq = context_find_active_request(ce);
+       rq = intel_context_find_active_request(ce);
        if (!rq) {
                head = ce->ring->tail;
                stalled = false;
@@ -2201,6 +2183,20 @@ int intel_guc_sched_done_process_msg(struct intel_guc *guc,
        return 0;
 }
 
+static void capture_error_state(struct intel_guc *guc,
+                               struct intel_context *ce)
+{
+       struct intel_gt *gt = guc_to_gt(guc);
+       struct drm_i915_private *i915 = gt->i915;
+       struct intel_engine_cs *engine = __context_to_physical_engine(ce);
+       intel_wakeref_t wakeref;
+
+       intel_engine_set_hung_context(engine, ce);
+       with_intel_runtime_pm(&i915->runtime_pm, wakeref)
+               i915_capture_error_state(gt, engine->mask);
+       atomic_inc(&i915->gpu_error.reset_engine_count[engine->uabi_class]);
+}
+
 static void guc_context_replay(struct intel_context *ce)
 {
        struct i915_sched_engine *sched_engine = ce->engine->sched_engine;
@@ -2213,6 +2209,7 @@ static void guc_handle_context_reset(struct intel_guc *guc,
                                     struct intel_context *ce)
 {
        trace_intel_context_reset(ce);
+       capture_error_state(guc, ce);
        guc_context_replay(ce);
 }
 
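Taken together, the GuC-side hunks wire error capture into the reset
notification before the context is replayed. A condensed sketch of the
resulting call flow (an outline, not verbatim driver code; names match
the hunks above and below):

	/*
	 * G2H context-reset notification from GuC
	 *   -> guc_handle_context_reset(guc, ce)
	 *        -> capture_error_state(guc, ce)
	 *             -> intel_engine_set_hung_context(engine, ce)
	 *             -> i915_capture_error_state(gt, engine->mask)
	 *                  -> capture_engine() reads back the hung context
	 *        -> guc_context_replay(ce), after which scheduling resumes
	 */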
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index a2c58b54a59282278ad575ddda8f30dab9c0b039..0f08bcfbe9641559d3184f6e66e7c460d0563420 100644
@@ -1429,20 +1429,37 @@ capture_engine(struct intel_engine_cs *engine,
 {
        struct intel_engine_capture_vma *capture = NULL;
        struct intel_engine_coredump *ee;
-       struct i915_request *rq;
+       struct intel_context *ce;
+       struct i915_request *rq = NULL;
        unsigned long flags;
 
        ee = intel_engine_coredump_alloc(engine, GFP_KERNEL);
        if (!ee)
                return NULL;
 
-       spin_lock_irqsave(&engine->sched_engine->lock, flags);
-       rq = intel_engine_find_active_request(engine);
+       ce = intel_engine_get_hung_context(engine);
+       if (ce) {
+               intel_engine_clear_hung_context(engine);
+               rq = intel_context_find_active_request(ce);
+               if (!rq || !i915_request_started(rq))
+                       goto no_request_capture;
+       } else {
+               /*
+                * Getting here with GuC enabled means it is a forced error capture
+                * with no actual hang. So, no need to attempt the execlist search.
+                */
+               if (!intel_uc_uses_guc_submission(&engine->gt->uc)) {
+                       spin_lock_irqsave(&engine->sched_engine->lock, flags);
+                       rq = intel_engine_execlist_find_hung_request(engine);
+                       spin_unlock_irqrestore(&engine->sched_engine->lock,
+                                              flags);
+               }
+       }
        if (rq)
                capture = intel_engine_coredump_add_request(ee, rq,
                                                            ATOMIC_MAYFAIL);
-       spin_unlock_irqrestore(&engine->sched_engine->lock, flags);
        if (!capture) {
+no_request_capture:
                kfree(ee);
                return NULL;
        }
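
The restructured capture_engine() above reduces to a three-way decision
(condensed fragment; locking, the started-request check and the
no_request_capture bail-out are elided):

	ce = intel_engine_get_hung_context(engine);
	if (ce) {
		/* GuC reported this context hung: capture its oldest
		 * incomplete request. */
		rq = intel_context_find_active_request(ce);
	} else if (!intel_uc_uses_guc_submission(&engine->gt->uc)) {
		/* Execlists: engine-wide search under the scheduler
		 * lock, as before this patch. */
		rq = intel_engine_execlist_find_hung_request(engine);
	}
	/* GuC enabled with no hung context means a forced error
	 * capture with no actual hang: no request is attached. */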