]> git.baikalelectronics.ru Git - kernel.git/commitdiff
drm/i915/guc: Fix for error capture after full GPU reset with GuC
author John Harrison <John.C.Harrison@Intel.com>
Tue, 27 Jul 2021 00:23:34 +0000 (17:23 -0700)
committer John Harrison <John.C.Harrison@Intel.com>
Wed, 28 Jul 2021 00:32:02 +0000 (17:32 -0700)
In the case of a full GPU reset (e.g. because GuC has died or because
GuC's hang detection has been disabled), the driver can't rely on GuC
reporting the guilty context. Instead, the driver needs to scan all
active contexts and find one that is currently executing, as per the
execlist mode behaviour. In GuC mode, this scan is different to
execlist mode as the active request list is handled very differently.

Similarly, the request state dump in debugfs needs to be handled
differently when in GuC submission mode.

Also refactored some of the request scanning code to avoid duplication
across the multiple code paths that are now replicating it.

Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
Signed-off-by: Matthew Brost <matthew.brost@intel.com>
Reviewed-by: Matthew Brost <matthew.brost@intel.com>
Signed-off-by: John Harrison <John.C.Harrison@Intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20210727002348.97202-20-matthew.brost@intel.com
drivers/gpu/drm/i915/gt/intel_engine.h
drivers/gpu/drm/i915/gt/intel_engine_cs.c
drivers/gpu/drm/i915/gt/intel_engine_heartbeat.c
drivers/gpu/drm/i915/gt/intel_reset.c
drivers/gpu/drm/i915/gt/uc/intel_guc.h
drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c
drivers/gpu/drm/i915/gt/uc/intel_guc_submission.h
drivers/gpu/drm/i915/i915_request.c
drivers/gpu/drm/i915/i915_request.h

index 1db2d3efc71f99dff55e538049a9a65d9ab8dd14..c2a5640ae05597e689cef42ca60c86866e91ffc4 100644 (file)
@@ -240,6 +240,9 @@ __printf(3, 4)
 void intel_engine_dump(struct intel_engine_cs *engine,
                       struct drm_printer *m,
                       const char *header, ...);
+void intel_engine_dump_active_requests(struct list_head *requests,
+                                      struct i915_request *hung_rq,
+                                      struct drm_printer *m);
 
 ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine,
                                   ktime_t *now);
index 0da7868c5a13e53629e4b6e0c71a6130338637d1..dea0e522c5c727d2881ff2cdd54b2f66c19f1cf5 100644 (file)
@@ -1684,6 +1684,98 @@ static void print_properties(struct intel_engine_cs *engine,
                           read_ul(&engine->defaults, p->offset));
 }
 
+static void engine_dump_request(struct i915_request *rq, struct drm_printer *m, const char *msg)
+{
+       struct intel_timeline *tl = get_timeline(rq);
+
+       i915_request_show(m, rq, msg, 0);
+
+       drm_printf(m, "\t\tring->start:  0x%08x\n",
+                  i915_ggtt_offset(rq->ring->vma));
+       drm_printf(m, "\t\tring->head:   0x%08x\n",
+                  rq->ring->head);
+       drm_printf(m, "\t\tring->tail:   0x%08x\n",
+                  rq->ring->tail);
+       drm_printf(m, "\t\tring->emit:   0x%08x\n",
+                  rq->ring->emit);
+       drm_printf(m, "\t\tring->space:  0x%08x\n",
+                  rq->ring->space);
+
+       if (tl) {
+               drm_printf(m, "\t\tring->hwsp:   0x%08x\n",
+                          tl->hwsp_offset);
+               intel_timeline_put(tl);
+       }
+
+       print_request_ring(m, rq);
+
+       if (rq->context->lrc_reg_state) {
+               drm_printf(m, "Logical Ring Context:\n");
+               hexdump(m, rq->context->lrc_reg_state, PAGE_SIZE);
+       }
+}
+
+void intel_engine_dump_active_requests(struct list_head *requests,
+                                      struct i915_request *hung_rq,
+                                      struct drm_printer *m)
+{
+       struct i915_request *rq;
+       const char *msg;
+       enum i915_request_state state;
+
+       list_for_each_entry(rq, requests, sched.link) {
+               if (rq == hung_rq)
+                       continue;
+
+               state = i915_test_request_state(rq);
+               if (state < I915_REQUEST_QUEUED)
+                       continue;
+
+               if (state == I915_REQUEST_ACTIVE)
+                       msg = "\t\tactive on engine";
+               else
+                       msg = "\t\tactive in queue";
+
+               engine_dump_request(rq, m, msg);
+       }
+}
+
+static void engine_dump_active_requests(struct intel_engine_cs *engine, struct drm_printer *m)
+{
+       struct i915_request *hung_rq = NULL;
+       struct intel_context *ce;
+       bool guc;
+
+       /*
+        * No need for an engine->irq_seqno_barrier() before the seqno reads.
+        * The GPU is still running so requests are still executing and any
+        * hardware reads will be out of date by the time they are reported.
+        * But the intention here is just to report an instantaneous snapshot
+        * so that's fine.
+        */
+       lockdep_assert_held(&engine->sched_engine->lock);
+
+       drm_printf(m, "\tRequests:\n");
+
+       guc = intel_uc_uses_guc_submission(&engine->gt->uc);
+       if (guc) {
+               ce = intel_engine_get_hung_context(engine);
+               if (ce)
+                       hung_rq = intel_context_find_active_request(ce);
+       } else {
+               hung_rq = intel_engine_execlist_find_hung_request(engine);
+       }
+
+       if (hung_rq)
+               engine_dump_request(hung_rq, m, "\t\thung");
+
+       if (guc)
+               intel_guc_dump_active_requests(engine, hung_rq, m);
+       else
+               intel_engine_dump_active_requests(&engine->sched_engine->requests,
+                                                 hung_rq, m);
+}
+
 void intel_engine_dump(struct intel_engine_cs *engine,
                       struct drm_printer *m,
                       const char *header, ...)
@@ -1728,39 +1820,9 @@ void intel_engine_dump(struct intel_engine_cs *engine,
                   i915_reset_count(error));
        print_properties(engine, m);
 
-       drm_printf(m, "\tRequests:\n");
-
        spin_lock_irqsave(&engine->sched_engine->lock, flags);
-       rq = intel_engine_execlist_find_hung_request(engine);
-       if (rq) {
-               struct intel_timeline *tl = get_timeline(rq);
+       engine_dump_active_requests(engine, m);
 
-               i915_request_show(m, rq, "\t\tactive ", 0);
-
-               drm_printf(m, "\t\tring->start:  0x%08x\n",
-                          i915_ggtt_offset(rq->ring->vma));
-               drm_printf(m, "\t\tring->head:   0x%08x\n",
-                          rq->ring->head);
-               drm_printf(m, "\t\tring->tail:   0x%08x\n",
-                          rq->ring->tail);
-               drm_printf(m, "\t\tring->emit:   0x%08x\n",
-                          rq->ring->emit);
-               drm_printf(m, "\t\tring->space:  0x%08x\n",
-                          rq->ring->space);
-
-               if (tl) {
-                       drm_printf(m, "\t\tring->hwsp:   0x%08x\n",
-                                  tl->hwsp_offset);
-                       intel_timeline_put(tl);
-               }
-
-               print_request_ring(m, rq);
-
-               if (rq->context->lrc_reg_state) {
-                       drm_printf(m, "Logical Ring Context:\n");
-                       hexdump(m, rq->context->lrc_reg_state, PAGE_SIZE);
-               }
-       }
        drm_printf(m, "\tOn hold?: %lu\n",
                   list_count(&engine->sched_engine->hold));
        spin_unlock_irqrestore(&engine->sched_engine->lock, flags);
@@ -1834,13 +1896,6 @@ intel_engine_create_virtual(struct intel_engine_cs **siblings,
        return siblings[0]->cops->create_virtual(siblings, count);
 }
 
-static bool match_ring(struct i915_request *rq)
-{
-       u32 ring = ENGINE_READ(rq->engine, RING_START);
-
-       return ring == i915_ggtt_offset(rq->ring->vma);
-}
-
 struct i915_request *
 intel_engine_execlist_find_hung_request(struct intel_engine_cs *engine)
 {
@@ -1884,14 +1939,7 @@ intel_engine_execlist_find_hung_request(struct intel_engine_cs *engine)
 
        list_for_each_entry(request, &engine->sched_engine->requests,
                            sched.link) {
-               if (__i915_request_is_complete(request))
-                       continue;
-
-               if (!__i915_request_has_started(request))
-                       continue;
-
-               /* More than one preemptible request may match! */
-               if (!match_ring(request))
+               if (i915_test_request_state(request) != I915_REQUEST_ACTIVE)
                        continue;
 
                active = request;
index 0b16f19c384eb22b7aa8c17221c3728e3a154929..74775ae961b2baf4ffcb10cb3961b60813c806be 100644 (file)
@@ -90,6 +90,14 @@ reset_engine(struct intel_engine_cs *engine, struct i915_request *rq)
        if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
                show_heartbeat(rq, engine);
 
+       if (intel_engine_uses_guc(engine))
+               /*
+                * GuC itself is toast or GuC's hang detection
+                * is disabled. Either way, need to find the
+                * hang culprit manually.
+                */
+               intel_guc_find_hung_context(engine);
+
        intel_gt_handle_error(engine->gt, engine->mask,
                              I915_ERROR_CAPTURE,
                              "stopped heartbeat on %s",
index 721a10e2215e35e0987a226e127b9eaefaf80ca5..4d281bc8a38ce028c5aabf590abe32c020406bb4 100644 (file)
@@ -156,7 +156,7 @@ void __i915_request_reset(struct i915_request *rq, bool guilty)
        if (guilty) {
                i915_request_set_error_once(rq, -EIO);
                __i915_request_skip(rq);
-               if (mark_guilty(rq))
+               if (mark_guilty(rq) && !intel_engine_uses_guc(rq->engine))
                        skip_context(rq);
        } else {
                i915_request_set_error_once(rq, -EAGAIN);
index f3c69160cb7e1977b007351c6682c2e229ed3468..f355a70bbec45713992512adfc1dcf9c08f08937 100644 (file)
@@ -275,6 +275,8 @@ int intel_guc_context_reset_process_msg(struct intel_guc *guc,
 int intel_guc_engine_failure_process_msg(struct intel_guc *guc,
                                         const u32 *msg, u32 len);
 
+void intel_guc_find_hung_context(struct intel_engine_cs *engine);
+
 void intel_guc_submission_reset_prepare(struct intel_guc *guc);
 void intel_guc_submission_reset(struct intel_guc *guc, bool stalled);
 void intel_guc_submission_reset_finish(struct intel_guc *guc);
index 7d6ca0d54f9e5cda40b87e3861cd8fbcef026240..76c2d927e1c3490bc6d3193393df54efcaa27d7d 100644 (file)
@@ -2277,6 +2277,73 @@ int intel_guc_engine_failure_process_msg(struct intel_guc *guc,
        return 0;
 }
 
+void intel_guc_find_hung_context(struct intel_engine_cs *engine)
+{
+       struct intel_guc *guc = &engine->gt->uc.guc;
+       struct intel_context *ce;
+       struct i915_request *rq;
+       unsigned long index;
+
+       /* Reset called during driver load? GuC not yet initialised! */
+       if (unlikely(!guc_submission_initialized(guc)))
+               return;
+
+       xa_for_each(&guc->context_lookup, index, ce) {
+               if (!intel_context_is_pinned(ce))
+                       continue;
+
+               if (intel_engine_is_virtual(ce->engine)) {
+                       if (!(ce->engine->mask & engine->mask))
+                               continue;
+               } else {
+                       if (ce->engine != engine)
+                               continue;
+               }
+
+               list_for_each_entry(rq, &ce->guc_active.requests, sched.link) {
+                       if (i915_test_request_state(rq) != I915_REQUEST_ACTIVE)
+                               continue;
+
+                       intel_engine_set_hung_context(engine, ce);
+
+                       /* Can only cope with one hang at a time... */
+                       return;
+               }
+       }
+}
+
+void intel_guc_dump_active_requests(struct intel_engine_cs *engine,
+                                   struct i915_request *hung_rq,
+                                   struct drm_printer *m)
+{
+       struct intel_guc *guc = &engine->gt->uc.guc;
+       struct intel_context *ce;
+       unsigned long index;
+       unsigned long flags;
+
+       /* Reset called during driver load? GuC not yet initialised! */
+       if (unlikely(!guc_submission_initialized(guc)))
+               return;
+
+       xa_for_each(&guc->context_lookup, index, ce) {
+               if (!intel_context_is_pinned(ce))
+                       continue;
+
+               if (intel_engine_is_virtual(ce->engine)) {
+                       if (!(ce->engine->mask & engine->mask))
+                               continue;
+               } else {
+                       if (ce->engine != engine)
+                               continue;
+               }
+
+               spin_lock_irqsave(&ce->guc_active.lock, flags);
+               intel_engine_dump_active_requests(&ce->guc_active.requests,
+                                                 hung_rq, m);
+               spin_unlock_irqrestore(&ce->guc_active.lock, flags);
+       }
+}
+
 void intel_guc_submission_print_info(struct intel_guc *guc,
                                     struct drm_printer *p)
 {
index 08ff77c5c50e7b9afd3a6f460612d63bd0ab865a..03bc1c83a4d2cb4d384b97e4a858809412f7697b 100644 (file)
@@ -25,6 +25,9 @@ void intel_guc_submission_print_info(struct intel_guc *guc,
                                     struct drm_printer *p);
 void intel_guc_submission_print_context_info(struct intel_guc *guc,
                                             struct drm_printer *p);
+void intel_guc_dump_active_requests(struct intel_engine_cs *engine,
+                                   struct i915_request *hung_rq,
+                                   struct drm_printer *m);
 
 bool intel_guc_virtual_engine_has_heartbeat(const struct intel_engine_cs *ve);
 
index aeef456798978842d99ff0d19020b82ab9ac80f3..28f38b02a5d2623c234630dcb475903bf426de43 100644 (file)
@@ -2041,6 +2041,47 @@ void i915_request_show(struct drm_printer *m,
                   name);
 }
 
+static bool engine_match_ring(struct intel_engine_cs *engine, struct i915_request *rq)
+{
+       u32 ring = ENGINE_READ(engine, RING_START);
+
+       return ring == i915_ggtt_offset(rq->ring->vma);
+}
+
+static bool match_ring(struct i915_request *rq)
+{
+       struct intel_engine_cs *engine;
+       bool found;
+       int i;
+
+       if (!intel_engine_is_virtual(rq->engine))
+               return engine_match_ring(rq->engine, rq);
+
+       found = false;
+       i = 0;
+       while ((engine = intel_engine_get_sibling(rq->engine, i++))) {
+               found = engine_match_ring(engine, rq);
+               if (found)
+                       break;
+       }
+
+       return found;
+}
+
+enum i915_request_state i915_test_request_state(struct i915_request *rq)
+{
+       if (i915_request_completed(rq))
+               return I915_REQUEST_COMPLETE;
+
+       if (!i915_request_started(rq))
+               return I915_REQUEST_PENDING;
+
+       if (match_ring(rq))
+               return I915_REQUEST_ACTIVE;
+
+       return I915_REQUEST_QUEUED;
+}
+
 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
 #include "selftests/mock_request.c"
 #include "selftests/i915_request.c"
index 128030f43bbff729d0e0d1350da556928ee2b688..ac0e3326c067b0d997b040a2f0175351be58a208 100644 (file)
@@ -649,4 +649,14 @@ i915_request_active_engine(struct i915_request *rq,
 
 void i915_request_notify_execute_cb_imm(struct i915_request *rq);
 
+enum i915_request_state {
+       I915_REQUEST_UNKNOWN = 0,
+       I915_REQUEST_COMPLETE,
+       I915_REQUEST_PENDING,
+       I915_REQUEST_QUEUED,
+       I915_REQUEST_ACTIVE,
+};
+
+enum i915_request_state i915_test_request_state(struct i915_request *rq);
+
 #endif /* I915_REQUEST_H */