Revert "drm/i915/gem: Async GPU relocations only"

author Dave Airlie <airlied@redhat.com>

Tue, 8 Sep 2020 05:41:17 +0000 (15:41 +1000)

committer Dave Airlie <airlied@redhat.com>

Tue, 8 Sep 2020 05:45:17 +0000 (15:45 +1000)
author Dave Airlie <airlied@redhat.com>
Tue, 8 Sep 2020 05:41:17 +0000 (15:41 +1000)
committer Dave Airlie <airlied@redhat.com>
Tue, 8 Sep 2020 05:45:17 +0000 (15:45 +1000)
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c

index 6b4ec66cb558dca8b893d8b3d13d7622a5ce1073..64901cf52b7ae3f81ad8c8cc605cb252eb590591 100644 (file)
--- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c
@@ -45,6 +45,13 @@ struct eb_vma_array {
         struct eb_vma vma[];
  };
  
+enum {
+       FORCE_CPU_RELOC = 1,
+       FORCE_GTT_RELOC,
+       FORCE_GPU_RELOC,
+#define DBG_FORCE_RELOC 0 /* choose one of the above! */
+};
+
  #define __EXEC_OBJECT_HAS_PIN          BIT(31)
  #define __EXEC_OBJECT_HAS_FENCE                BIT(30)
  #define __EXEC_OBJECT_NEEDS_MAP                BIT(29)
@@ -253,6 +260,8 @@ struct i915_execbuffer {
          */
         struct reloc_cache {
                 struct drm_mm_node node; /** temporary GTT binding */
+               unsigned long vaddr; /** Current kmap address */
+               unsigned long page; /** Currently mapped page index */
                 unsigned int gen; /** Cached value of INTEL_GEN */
                 bool use_64bit_reloc : 1;
                 bool has_llc : 1;
@@ -596,6 +605,23 @@ eb_add_vma(struct i915_execbuffer *eb,
         }
  }
  
+static inline int use_cpu_reloc(const struct reloc_cache *cache,
+                               const struct drm_i915_gem_object *obj)
+{
+       if (!i915_gem_object_has_struct_page(obj))
+               return false;
+
+       if (DBG_FORCE_RELOC == FORCE_CPU_RELOC)
+               return true;
+
+       if (DBG_FORCE_RELOC == FORCE_GTT_RELOC)
+               return false;
+
+       return (cache->has_llc ||
+               obj->cache_dirty ||
+               obj->cache_level != I915_CACHE_NONE);
+}
+
  static int eb_reserve_vma(const struct i915_execbuffer *eb,
                           struct eb_vma *ev,
                           u64 pin_flags)
@@ -926,6 +952,8 @@ relocation_target(const struct drm_i915_gem_relocation_entry *reloc,
  static void reloc_cache_init(struct reloc_cache *cache,
                              struct drm_i915_private *i915)
  {
+       cache->page = -1;
+       cache->vaddr = 0;
         /* Must be a variable in the struct to allow GCC to unroll. */
         cache->gen = INTEL_GEN(i915);
         cache->has_llc = HAS_LLC(i915);
@@ -1049,6 +1077,181 @@ static int reloc_gpu_flush(struct reloc_cache *cache)
         return err;
  }
  
+static void reloc_cache_reset(struct reloc_cache *cache)
+{
+       void *vaddr;
+
+       if (!cache->vaddr)
+               return;
+
+       vaddr = unmask_page(cache->vaddr);
+       if (cache->vaddr & KMAP) {
+               if (cache->vaddr & CLFLUSH_AFTER)
+                       mb();
+
+               kunmap_atomic(vaddr);
+               i915_gem_object_finish_access((struct drm_i915_gem_object *)cache->node.mm);
+       } else {
+               struct i915_ggtt *ggtt = cache_to_ggtt(cache);
+
+               intel_gt_flush_ggtt_writes(ggtt->vm.gt);
+               io_mapping_unmap_atomic((void __iomem *)vaddr);
+
+               if (drm_mm_node_allocated(&cache->node)) {
+                       ggtt->vm.clear_range(&ggtt->vm,
+                                            cache->node.start,
+                                            cache->node.size);
+                       mutex_lock(&ggtt->vm.mutex);
+                       drm_mm_remove_node(&cache->node);
+                       mutex_unlock(&ggtt->vm.mutex);
+               } else {
+                       i915_vma_unpin((struct i915_vma *)cache->node.mm);
+               }
+       }
+
+       cache->vaddr = 0;
+       cache->page = -1;
+}
+
+static void *reloc_kmap(struct drm_i915_gem_object *obj,
+                       struct reloc_cache *cache,
+                       unsigned long page)
+{
+       void *vaddr;
+
+       if (cache->vaddr) {
+               kunmap_atomic(unmask_page(cache->vaddr));
+       } else {
+               unsigned int flushes;
+               int err;
+
+               err = i915_gem_object_prepare_write(obj, &flushes);
+               if (err)
+                       return ERR_PTR(err);
+
+               BUILD_BUG_ON(KMAP & CLFLUSH_FLAGS);
+               BUILD_BUG_ON((KMAP | CLFLUSH_FLAGS) & PAGE_MASK);
+
+               cache->vaddr = flushes | KMAP;
+               cache->node.mm = (void *)obj;
+               if (flushes)
+                       mb();
+       }
+
+       vaddr = kmap_atomic(i915_gem_object_get_dirty_page(obj, page));
+       cache->vaddr = unmask_flags(cache->vaddr) | (unsigned long)vaddr;
+       cache->page = page;
+
+       return vaddr;
+}
+
+static void *reloc_iomap(struct drm_i915_gem_object *obj,
+                        struct reloc_cache *cache,
+                        unsigned long page)
+{
+       struct i915_ggtt *ggtt = cache_to_ggtt(cache);
+       unsigned long offset;
+       void *vaddr;
+
+       if (cache->vaddr) {
+               intel_gt_flush_ggtt_writes(ggtt->vm.gt);
+               io_mapping_unmap_atomic((void __force __iomem *) unmask_page(cache->vaddr));
+       } else {
+               struct i915_vma *vma;
+               int err;
+
+               if (i915_gem_object_is_tiled(obj))
+                       return ERR_PTR(-EINVAL);
+
+               if (use_cpu_reloc(cache, obj))
+                       return NULL;
+
+               i915_gem_object_lock(obj);
+               err = i915_gem_object_set_to_gtt_domain(obj, true);
+               i915_gem_object_unlock(obj);
+               if (err)
+                       return ERR_PTR(err);
+
+               vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
+                                              PIN_MAPPABLE |
+                                              PIN_NONBLOCK /* NOWARN */ |
+                                              PIN_NOEVICT);
+               if (IS_ERR(vma)) {
+                       memset(&cache->node, 0, sizeof(cache->node));
+                       mutex_lock(&ggtt->vm.mutex);
+                       err = drm_mm_insert_node_in_range
+                               (&ggtt->vm.mm, &cache->node,
+                                PAGE_SIZE, 0, I915_COLOR_UNEVICTABLE,
+                                0, ggtt->mappable_end,
+                                DRM_MM_INSERT_LOW);
+                       mutex_unlock(&ggtt->vm.mutex);
+                       if (err) /* no inactive aperture space, use cpu reloc */
+                               return NULL;
+               } else {
+                       cache->node.start = vma->node.start;
+                       cache->node.mm = (void *)vma;
+               }
+       }
+
+       offset = cache->node.start;
+       if (drm_mm_node_allocated(&cache->node)) {
+               ggtt->vm.insert_page(&ggtt->vm,
+                                    i915_gem_object_get_dma_address(obj, page),
+                                    offset, I915_CACHE_NONE, 0);
+       } else {
+               offset += page << PAGE_SHIFT;
+       }
+
+       vaddr = (void __force *)io_mapping_map_atomic_wc(&ggtt->iomap,
+                                                        offset);
+       cache->page = page;
+       cache->vaddr = (unsigned long)vaddr;
+
+       return vaddr;
+}
+
+static void *reloc_vaddr(struct drm_i915_gem_object *obj,
+                        struct reloc_cache *cache,
+                        unsigned long page)
+{
+       void *vaddr;
+
+       if (cache->page == page) {
+               vaddr = unmask_page(cache->vaddr);
+       } else {
+               vaddr = NULL;
+               if ((cache->vaddr & KMAP) == 0)
+                       vaddr = reloc_iomap(obj, cache, page);
+               if (!vaddr)
+                       vaddr = reloc_kmap(obj, cache, page);
+       }
+
+       return vaddr;
+}
+
+static void clflush_write32(u32 *addr, u32 value, unsigned int flushes)
+{
+       if (unlikely(flushes & (CLFLUSH_BEFORE | CLFLUSH_AFTER))) {
+               if (flushes & CLFLUSH_BEFORE) {
+                       clflushopt(addr);
+                       mb();
+               }
+
+               *addr = value;
+
+               /*
+                * Writes to the same cacheline are serialised by the CPU
+                * (including clflush). On the write path, we only require
+                * that it hits memory in an orderly fashion and place
+                * mb barriers at the start and end of the relocation phase
+                * to ensure ordering of clflush wrt to the system.
+                */
+               if (flushes & CLFLUSH_AFTER)
+                       clflushopt(addr);
+       } else
+               *addr = value;
+}
+
  static int reloc_move_to_gpu(struct i915_request *rq, struct i915_vma *vma)
  {
         struct drm_i915_gem_object *obj = vma->obj;
@@ -1214,6 +1417,17 @@ static u32 *reloc_gpu(struct i915_execbuffer *eb,
         return cmd;
  }
  
+static inline bool use_reloc_gpu(struct i915_vma *vma)
+{
+       if (DBG_FORCE_RELOC == FORCE_GPU_RELOC)
+               return true;
+
+       if (DBG_FORCE_RELOC)
+               return false;
+
+       return !dma_resv_test_signaled_rcu(vma->resv, true);
+}
+
  static unsigned long vma_phys_addr(struct i915_vma *vma, u32 offset)
  {
         struct page *page;
@@ -1228,10 +1442,10 @@ static unsigned long vma_phys_addr(struct i915_vma *vma, u32 offset)
         return addr + offset_in_page(offset);
  }
  
-static int __reloc_entry_gpu(struct i915_execbuffer *eb,
-                            struct i915_vma *vma,
-                            u64 offset,
-                            u64 target_addr)
+static bool __reloc_entry_gpu(struct i915_execbuffer *eb,
+                             struct i915_vma *vma,
+                             u64 offset,
+                             u64 target_addr)
  {
         const unsigned int gen = eb->reloc_cache.gen;
         unsigned int len;
@@ -1247,7 +1461,7 @@ static int __reloc_entry_gpu(struct i915_execbuffer *eb,
  
         batch = reloc_gpu(eb, vma, len);
         if (IS_ERR(batch))
-               return PTR_ERR(batch);
+               return false;
  
         addr = gen8_canonical_addr(vma->node.start + offset);
         if (gen >= 8) {
@@ -1296,21 +1510,55 @@ static int __reloc_entry_gpu(struct i915_execbuffer *eb,
                 *batch++ = target_addr;
         }
  
-       return 0;
+       return true;
+}
+
+static bool reloc_entry_gpu(struct i915_execbuffer *eb,
+                           struct i915_vma *vma,
+                           u64 offset,
+                           u64 target_addr)
+{
+       if (eb->reloc_cache.vaddr)
+               return false;
+
+       if (!use_reloc_gpu(vma))
+               return false;
+
+       return __reloc_entry_gpu(eb, vma, offset, target_addr);
  }
  
  static u64
-relocate_entry(struct i915_execbuffer *eb,
-              struct i915_vma *vma,
+relocate_entry(struct i915_vma *vma,
                const struct drm_i915_gem_relocation_entry *reloc,
+              struct i915_execbuffer *eb,
                const struct i915_vma *target)
  {
         u64 target_addr = relocation_target(reloc, target);
-       int err;
-
-       err = __reloc_entry_gpu(eb, vma, reloc->offset, target_addr);
-       if (err)
-               return err;
+       u64 offset = reloc->offset;
+
+       if (!reloc_entry_gpu(eb, vma, offset, target_addr)) {
+               bool wide = eb->reloc_cache.use_64bit_reloc;
+               void *vaddr;
+
+repeat:
+               vaddr = reloc_vaddr(vma->obj,
+                                   &eb->reloc_cache,
+                                   offset >> PAGE_SHIFT);
+               if (IS_ERR(vaddr))
+                       return PTR_ERR(vaddr);
+
+               GEM_BUG_ON(!IS_ALIGNED(offset, sizeof(u32)));
+               clflush_write32(vaddr + offset_in_page(offset),
+                               lower_32_bits(target_addr),
+                               eb->reloc_cache.vaddr);
+
+               if (wide) {
+                       offset += sizeof(u32);
+                       target_addr >>= 32;
+                       wide = false;
+                       goto repeat;
+               }
+       }
  
         return target->node.start | UPDATE;
  }
@@ -1375,7 +1623,8 @@ eb_relocate_entry(struct i915_execbuffer *eb,
          * If the relocation already has the right value in it, no
          * more work needs to be done.
          */
-       if (gen8_canonical_addr(target->vma->node.start) == reloc->presumed_offset)
+       if (!DBG_FORCE_RELOC &&
+           gen8_canonical_addr(target->vma->node.start) == reloc->presumed_offset)
                 return 0;
  
         /* Check that the relocation address is valid... */
@@ -1407,7 +1656,7 @@ eb_relocate_entry(struct i915_execbuffer *eb,
         ev->flags &= ~EXEC_OBJECT_ASYNC;
  
         /* and update the user's relocation entry */
-       return relocate_entry(eb, ev->vma, reloc, target->vma);
+       return relocate_entry(ev->vma, reloc, eb, target->vma);
  }
  
  static int eb_relocate_vma(struct i915_execbuffer *eb, struct eb_vma *ev)
@@ -1445,8 +1694,10 @@ static int eb_relocate_vma(struct i915_execbuffer *eb, struct eb_vma *ev)
                  * this is bad and so lockdep complains vehemently.
                  */
                 copied = __copy_from_user(r, urelocs, count * sizeof(r[0]));
-               if (unlikely(copied))
-                       return -EFAULT;
+               if (unlikely(copied)) {
+                       remain = -EFAULT;
+                       goto out;
+               }
  
                 remain -= count;
                 do {
@@ -1454,7 +1705,8 @@ static int eb_relocate_vma(struct i915_execbuffer *eb, struct eb_vma *ev)
  
                         if (likely(offset == 0)) {
                         } else if ((s64)offset < 0) {
-                               return (int)offset;
+                               remain = (int)offset;
+                               goto out;
                         } else {
                                 /*
                                  * Note that reporting an error now
@@ -1484,8 +1736,9 @@ static int eb_relocate_vma(struct i915_execbuffer *eb, struct eb_vma *ev)
                 } while (r++, --count);
                 urelocs += ARRAY_SIZE(stack);
         } while (remain);
-
-       return 0;
+out:
+       reloc_cache_reset(&eb->reloc_cache);
+       return remain;
  }
  
  static int eb_relocate(struct i915_execbuffer *eb)
@@ -2392,7 +2645,7 @@ i915_gem_do_execbuffer(struct drm_device *dev,
         eb.i915 = i915;
         eb.file = file;
         eb.args = args;
-       if (!(args->flags & I915_EXEC_NO_RELOC))
+       if (DBG_FORCE_RELOC || !(args->flags & I915_EXEC_NO_RELOC))
                 args->flags |= __EXEC_HAS_RELOC;
  
         eb.exec = exec;
diff --git a/drivers/gpu/drm/i915/gem/selftests/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/selftests/i915_gem_execbuffer.c

index 57c14d3340cde50d456b3d123ca7392556a797ff..a49016f8ee0dc0bce31bdc99c88729f448f32988 100644 (file)
--- a/drivers/gpu/drm/i915/gem/selftests/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/gem/selftests/i915_gem_execbuffer.c
@@ -37,14 +37,20 @@ static int __igt_gpu_reloc(struct i915_execbuffer *eb,
                 return err;
  
         /* 8-Byte aligned */
-       err = __reloc_entry_gpu(eb, vma, offsets[0] * sizeof(u32), 0);
-       if (err)
+       if (!__reloc_entry_gpu(eb, vma,
+                              offsets[0] * sizeof(u32),
+                              0)) {
+               err = -EIO;
                 goto unpin_vma;
+       }
  
         /* !8-Byte aligned */
-       err = __reloc_entry_gpu(eb, vma, offsets[1] * sizeof(u32), 1);
-       if (err)
+       if (!__reloc_entry_gpu(eb, vma,
+                              offsets[1] * sizeof(u32),
+                              1)) {
+               err = -EIO;
                 goto unpin_vma;
+       }
  
         /* Skip to the end of the cmd page */
         i = PAGE_SIZE / sizeof(u32) - RELOC_TAIL - 1;
@@ -54,9 +60,12 @@ static int __igt_gpu_reloc(struct i915_execbuffer *eb,
         eb->reloc_cache.rq_size += i;
  
         /* Force batch chaining */
-       err = __reloc_entry_gpu(eb, vma, offsets[2] * sizeof(u32), 2);
-       if (err)
+       if (!__reloc_entry_gpu(eb, vma,
+                              offsets[2] * sizeof(u32),
+                              2)) {
+               err = -EIO;
                 goto unpin_vma;
+       }
  
         GEM_BUG_ON(!eb->reloc_cache.rq);
         rq = i915_request_get(eb->reloc_cache.rq);
author	Dave Airlie <airlied@redhat.com>
	Tue, 8 Sep 2020 05:41:17 +0000 (15:41 +1000)
committer	Dave Airlie <airlied@redhat.com>
	Tue, 8 Sep 2020 05:45:17 +0000 (15:45 +1000)
drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c		patch \| blob \| history
drivers/gpu/drm/i915/gem/selftests/i915_gem_execbuffer.c		patch \| blob \| history