drm/i915: Only include active engines in the capture state

author Chris Wilson <chris@chris-wilson.co.uk>

Thu, 8 Aug 2019 14:45:11 +0000 (15:45 +0100)

committer Chris Wilson <chris@chris-wilson.co.uk>

Thu, 8 Aug 2019 20:54:08 +0000 (21:54 +0100)
author Chris Wilson <chris@chris-wilson.co.uk>
Thu, 8 Aug 2019 14:45:11 +0000 (15:45 +0100)
committer Chris Wilson <chris@chris-wilson.co.uk>
Thu, 8 Aug 2019 20:54:08 +0000 (21:54 +0100)
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c

index 86333cd561946e63df049746dd177cbb94e13146..9822ed30656ea3b2de16c00764765b291fa6c3b0 100644 (file)
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -49,27 +49,6 @@
  #define ALLOW_FAIL (GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN)
  #define ATOMIC_MAYFAIL (GFP_ATOMIC | __GFP_NOWARN)
  
-static inline const struct intel_engine_cs *
-engine_lookup(const struct drm_i915_private *i915, unsigned int id)
-{
-       if (id >= I915_NUM_ENGINES)
-               return NULL;
-
-       return i915->engine[id];
-}
-
-static inline const char *
-__engine_name(const struct intel_engine_cs *engine)
-{
-       return engine ? engine->name : "";
-}
-
-static const char *
-engine_name(const struct drm_i915_private *i915, unsigned int id)
-{
-       return __engine_name(engine_lookup(i915, id));
-}
-
  static void __sg_set_buf(struct scatterlist *sg,
                          void *addr, unsigned int len, loff_t it)
  {
@@ -447,7 +426,7 @@ static void error_print_instdone(struct drm_i915_error_state_buf *m,
         err_printf(m, "  INSTDONE: 0x%08x\n",
                    ee->instdone.instdone);
  
-       if (ee->engine_id != RCS0 || INTEL_GEN(m->i915) <= 3)
+       if (ee->engine->class != RENDER_CLASS || INTEL_GEN(m->i915) <= 3)
                 return;
  
         err_printf(m, "  SC_INSTDONE: 0x%08x\n",
@@ -501,8 +480,7 @@ static void error_print_engine(struct drm_i915_error_state_buf *m,
  {
         int n;
  
-       err_printf(m, "%s command stream:\n",
-                  engine_name(m->i915, ee->engine_id));
+       err_printf(m, "%s command stream:\n", ee->engine->name);
         err_printf(m, "  IDLE?: %s\n", yesno(ee->idle));
         err_printf(m, "  START: 0x%08x\n", ee->start);
         err_printf(m, "  HEAD:  0x%08x [0x%08x]\n", ee->head, ee->rq_head);
@@ -578,9 +556,9 @@ void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...)
  }
  
  static void print_error_obj(struct drm_i915_error_state_buf *m,
-                           struct intel_engine_cs *engine,
+                           const struct intel_engine_cs *engine,
                             const char *name,
-                           struct drm_i915_error_object *obj)
+                           const struct drm_i915_error_object *obj)
  {
         char out[ASCII85_BUFSZ];
         int page;
@@ -677,7 +655,7 @@ static void err_free_sgl(struct scatterlist *sgl)
  static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
                                struct i915_gpu_state *error)
  {
-       struct drm_i915_error_object *obj;
+       const struct drm_i915_error_engine *ee;
         struct timespec64 ts;
         int i, j;
  
@@ -702,15 +680,12 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
                    jiffies_to_msecs(jiffies - error->capture),
                    jiffies_to_msecs(error->capture - error->epoch));
  
-       for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
-               if (!error->engine[i].context.pid)
-                       continue;
-
+       for (ee = error->engine; ee; ee = ee->next)
                 err_printf(m, "Active process (on ring %s): %s [%d]\n",
-                          engine_name(m->i915, i),
-                          error->engine[i].context.comm,
-                          error->engine[i].context.pid);
-       }
+                          ee->engine->name,
+                          ee->context.comm,
+                          ee->context.pid);
+
         err_printf(m, "Reset count: %u\n", error->reset_count);
         err_printf(m, "Suspend count: %u\n", error->suspend_count);
         err_printf(m, "Platform: %s\n", intel_platform_name(error->device_info.platform));
@@ -758,17 +733,15 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
         if (IS_GEN(m->i915, 7))
                 err_printf(m, "ERR_INT: 0x%08x\n", error->err_int);
  
-       for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
-               if (error->engine[i].engine_id != -1)
-                       error_print_engine(m, &error->engine[i], error->epoch);
-       }
+       for (ee = error->engine; ee; ee = ee->next)
+               error_print_engine(m, ee, error->epoch);
  
-       for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
-               const struct drm_i915_error_engine *ee = &error->engine[i];
+       for (ee = error->engine; ee; ee = ee->next) {
+               const struct drm_i915_error_object *obj;
  
                 obj = ee->batchbuffer;
                 if (obj) {
-                       err_puts(m, m->i915->engine[i]->name);
+                       err_puts(m, ee->engine->name);
                         if (ee->context.pid)
                                 err_printf(m, " (submitted by %s [%d])",
                                            ee->context.comm,
@@ -776,16 +749,15 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
                         err_printf(m, " --- gtt_offset = 0x%08x %08x\n",
                                    upper_32_bits(obj->gtt_offset),
                                    lower_32_bits(obj->gtt_offset));
-                       print_error_obj(m, m->i915->engine[i], NULL, obj);
+                       print_error_obj(m, ee->engine, NULL, obj);
                 }
  
                 for (j = 0; j < ee->user_bo_count; j++)
-                       print_error_obj(m, m->i915->engine[i],
-                                       "user", ee->user_bo[j]);
+                       print_error_obj(m, ee->engine, "user", ee->user_bo[j]);
  
                 if (ee->num_requests) {
                         err_printf(m, "%s --- %d requests\n",
-                                  m->i915->engine[i]->name,
+                                  ee->engine->name,
                                    ee->num_requests);
                         for (j = 0; j < ee->num_requests; j++)
                                 error_print_request(m, " ",
@@ -793,22 +765,13 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
                                                     error->epoch);
                 }
  
-               print_error_obj(m, m->i915->engine[i],
-                               "ringbuffer", ee->ringbuffer);
-
-               print_error_obj(m, m->i915->engine[i],
-                               "HW Status", ee->hws_page);
-
-               print_error_obj(m, m->i915->engine[i],
-                               "HW context", ee->ctx);
-
-               print_error_obj(m, m->i915->engine[i],
-                               "WA context", ee->wa_ctx);
-
-               print_error_obj(m, m->i915->engine[i],
+               print_error_obj(m, ee->engine, "ringbuffer", ee->ringbuffer);
+               print_error_obj(m, ee->engine, "HW Status", ee->hws_page);
+               print_error_obj(m, ee->engine, "HW context", ee->ctx);
+               print_error_obj(m, ee->engine, "WA context", ee->wa_ctx);
+               print_error_obj(m, ee->engine,
                                 "WA batchbuffer", ee->wa_batchbuffer);
-
-               print_error_obj(m, m->i915->engine[i],
+               print_error_obj(m, ee->engine,
                                 "NULL context", ee->default_state);
         }
  
@@ -957,13 +920,15 @@ void __i915_gpu_state_free(struct kref *error_ref)
  {
         struct i915_gpu_state *error =
                 container_of(error_ref, typeof(*error), ref);
-       long i, j;
+       long i;
  
-       for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
-               struct drm_i915_error_engine *ee = &error->engine[i];
+       while (error->engine) {
+               struct drm_i915_error_engine *ee = error->engine;
  
-               for (j = 0; j < ee->user_bo_count; j++)
-                       i915_error_object_free(ee->user_bo[j]);
+               error->engine = ee->next;
+
+               for (i = 0; i < ee->user_bo_count; i++)
+                       i915_error_object_free(ee->user_bo[i]);
                 kfree(ee->user_bo);
  
                 i915_error_object_free(ee->batchbuffer);
@@ -974,6 +939,7 @@ void __i915_gpu_state_free(struct kref *error_ref)
                 i915_error_object_free(ee->wa_ctx);
  
                 kfree(ee->requests);
+               kfree(ee);
         }
  
         kfree(error->overlay);
@@ -1055,23 +1021,17 @@ i915_error_object_create(struct drm_i915_private *i915,
   *
   * It's only a small step better than a random number in its current form.
   */
-static u32 i915_error_generate_code(struct i915_gpu_state *error,
-                                   intel_engine_mask_t engine_mask)
+static u32 i915_error_generate_code(struct i915_gpu_state *error)
  {
+       const struct drm_i915_error_engine *ee = error->engine;
+
         /*
          * IPEHR would be an ideal way to detect errors, as it's the gross
          * measure of "the command that hung." However, has some very common
          * synchronization commands which almost always appear in the case
          * strictly a client bug. Use instdone to differentiate those some.
          */
-       if (engine_mask) {
-               struct drm_i915_error_engine *ee =
-                       &error->engine[ffs(engine_mask)];
-
-               return ee->ipehr ^ ee->instdone.instdone;
-       }
-
-       return 0;
+       return ee ? ee->ipehr ^ ee->instdone.instdone : 0;
  }
  
  static void gem_record_fences(struct i915_gpu_state *error)
@@ -1284,9 +1244,11 @@ static void error_record_engine_execlists(const struct intel_engine_cs *engine,
         ee->num_ports = n;
  }
  
-static void record_context(struct drm_i915_error_context *e,
-                          struct i915_gem_context *ctx)
+static bool record_context(struct drm_i915_error_context *e,
+                          const struct i915_request *rq)
  {
+       const struct i915_gem_context *ctx = rq->gem_context;
+
         if (ctx->pid) {
                 struct task_struct *task;
  
@@ -1303,6 +1265,8 @@ static void record_context(struct drm_i915_error_context *e,
         e->sched_attr = ctx->sched;
         e->guilty = atomic_read(&ctx->guilty_count);
         e->active = atomic_read(&ctx->active_count);
+
+       return i915_gem_context_no_error_capture(ctx);
  }
  
  struct capture_vma {
@@ -1397,74 +1361,67 @@ static void
  gem_record_rings(struct i915_gpu_state *error, struct compress *compress)
  {
         struct drm_i915_private *i915 = error->i915;
-       int i;
+       struct intel_engine_cs *engine;
+       struct drm_i915_error_engine *ee;
+
+       ee = kzalloc(sizeof(*ee), GFP_KERNEL);
+       if (!ee)
+               return;
  
-       for (i = 0; i < I915_NUM_ENGINES; i++) {
-               struct intel_engine_cs *engine = i915->engine[i];
-               struct drm_i915_error_engine *ee = &error->engine[i];
+       for_each_uabi_engine(engine, i915) {
                 struct capture_vma *capture = NULL;
                 struct i915_request *request;
                 unsigned long flags;
  
-               ee->engine_id = -1;
-
-               if (!engine)
-                       continue;
-
-               ee->engine_id = i;
-
                 /* Refill our page pool before entering atomic section */
                 pool_refill(&compress->pool, ALLOW_FAIL);
  
-               error_record_engine_registers(error, engine, ee);
-               error_record_engine_execlists(engine, ee);
-
                 spin_lock_irqsave(&engine->active.lock, flags);
                 request = intel_engine_find_active_request(engine);
-               if (request) {
-                       struct i915_gem_context *ctx = request->gem_context;
-                       struct intel_ring *ring = request->ring;
-
-                       record_context(&ee->context, ctx);
-
-                       /*
-                        * We need to copy these to an anonymous buffer
-                        * as the simplest method to avoid being overwritten
-                        * by userspace.
-                        */
-                       capture = capture_vma(capture,
-                                             request->batch,
-                                             &ee->batchbuffer);
+               if (!request) {
+                       spin_unlock_irqrestore(&engine->active.lock, flags);
+                       continue;
+               }
  
-                       if (HAS_BROKEN_CS_TLB(i915))
-                               capture = capture_vma(capture,
-                                                     engine->gt->scratch,
-                                                     &ee->wa_batchbuffer);
+               error->simulated |= record_context(&ee->context, request);
  
-                       capture = request_record_user_bo(request, ee, capture);
+               /*
+                * We need to copy these to an anonymous buffer
+                * as the simplest method to avoid being overwritten
+                * by userspace.
+                */
+               capture = capture_vma(capture,
+                                     request->batch,
+                                     &ee->batchbuffer);
  
+               if (HAS_BROKEN_CS_TLB(i915))
                         capture = capture_vma(capture,
-                                             request->hw_context->state,
-                                             &ee->ctx);
+                                             engine->gt->scratch,
+                                             &ee->wa_batchbuffer);
  
-                       capture = capture_vma(capture,
-                                             ring->vma,
-                                             &ee->ringbuffer);
+               capture = request_record_user_bo(request, ee, capture);
  
-                       error->simulated |=
-                               i915_gem_context_no_error_capture(ctx);
+               capture = capture_vma(capture,
+                                     request->hw_context->state,
+                                     &ee->ctx);
  
-                       ee->rq_head = request->head;
-                       ee->rq_post = request->postfix;
-                       ee->rq_tail = request->tail;
+               capture = capture_vma(capture,
+                                     request->ring->vma,
+                                     &ee->ringbuffer);
  
-                       ee->cpu_ring_head = ring->head;
-                       ee->cpu_ring_tail = ring->tail;
+               ee->cpu_ring_head = request->ring->head;
+               ee->cpu_ring_tail = request->ring->tail;
  
-                       engine_record_requests(engine, request, ee);
-               }
+               ee->rq_head = request->head;
+               ee->rq_post = request->postfix;
+               ee->rq_tail = request->tail;
+
+               engine_record_requests(engine, request, ee);
                 spin_unlock_irqrestore(&engine->active.lock, flags);
  
+               error_record_engine_registers(error, engine, ee);
+               error_record_engine_execlists(engine, ee);
+
                 while (capture) {
                         struct capture_vma *this = capture;
                         struct i915_vma *vma = *this->slot;
@@ -1491,7 +1448,18 @@ gem_record_rings(struct i915_gpu_state *error, struct compress *compress)
  
                 ee->default_state =
                         capture_object(i915, engine->default_state, compress);
+
+               ee->engine = engine;
+
+               ee->next = error->engine;
+               error->engine = ee;
+
+               ee = kzalloc(sizeof(*ee), GFP_KERNEL);
+               if (!ee)
+                       return;
         }
+
+       kfree(ee);
  }
  
  static void
@@ -1627,24 +1595,18 @@ error_msg(struct i915_gpu_state *error,
           intel_engine_mask_t engines, const char *msg)
  {
         int len;
-       int i;
-
-       for (i = 0; i < ARRAY_SIZE(error->engine); i++)
-               if (!error->engine[i].context.pid)
-                       engines &= ~BIT(i);
  
         len = scnprintf(error->error_msg, sizeof(error->error_msg),
                         "GPU HANG: ecode %d:%x:0x%08x",
                         INTEL_GEN(error->i915), engines,
-                       i915_error_generate_code(error, engines));
-       if (engines) {
+                       i915_error_generate_code(error));
+       if (error->engine) {
                 /* Just show the first executing process, more is confusing */
-               i = __ffs(engines);
                 len += scnprintf(error->error_msg + len,
                                  sizeof(error->error_msg) - len,
                                  ", in %s [%d]",
-                                error->engine[i].context.comm,
-                                error->engine[i].context.pid);
+                                error->engine->context.comm,
+                                error->engine->context.pid);
         }
         if (msg)
                 len += scnprintf(error->error_msg + len,
@@ -1685,12 +1647,10 @@ static void capture_params(struct i915_gpu_state *error)
  
  static unsigned long capture_find_epoch(const struct i915_gpu_state *error)
  {
+       const struct drm_i915_error_engine *ee;
         unsigned long epoch = error->capture;
-       int i;
-
-       for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
-               const struct drm_i915_error_engine *ee = &error->engine[i];
  
+       for (ee = error->engine; ee; ee = ee->next) {
                 if (ee->hangcheck_timestamp &&
                     time_before(ee->hangcheck_timestamp, epoch))
                         epoch = ee->hangcheck_timestamp;
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h

index a24c35107d165c4e04c77548f6a6369969bec4a4..df9f577666262aec5071dc8a480a413a489c96a3 100644 (file)
--- a/drivers/gpu/drm/i915/i915_gpu_error.h
+++ b/drivers/gpu/drm/i915/i915_gpu_error.h
@@ -81,7 +81,8 @@ struct i915_gpu_state {
         struct intel_display_error_state *display;
  
         struct drm_i915_error_engine {
-               int engine_id;
+               const struct intel_engine_cs *engine;
+
                 /* Software tracked state */
                 bool idle;
                 unsigned long hangcheck_timestamp;
@@ -158,7 +159,9 @@ struct i915_gpu_state {
                                 u32 pp_dir_base;
                         };
                 } vm_info;
-       } engine[I915_NUM_ENGINES];
+
+               struct drm_i915_error_engine *next;
+       } *engine;
  
         struct scatterlist *sgl, *fit;
  };
author	Chris Wilson <chris@chris-wilson.co.uk>
	Thu, 8 Aug 2019 14:45:11 +0000 (15:45 +0100)
committer	Chris Wilson <chris@chris-wilson.co.uk>
	Thu, 8 Aug 2019 20:54:08 +0000 (21:54 +0100)
drivers/gpu/drm/i915/i915_gpu_error.c		patch | blob | history
drivers/gpu/drm/i915/i915_gpu_error.h		patch | blob | history