drm/i915: document caching related bits

author Matthew Auld <matthew.auld@intel.com>

Fri, 23 Jul 2021 10:50:44 +0000 (11:50 +0100)

committer Matthew Auld <matthew.auld@intel.com>

Tue, 27 Jul 2021 08:16:44 +0000 (09:16 +0100)
author Matthew Auld <matthew.auld@intel.com>
Fri, 23 Jul 2021 10:50:44 +0000 (11:50 +0100)
committer Matthew Auld <matthew.auld@intel.com>
Tue, 27 Jul 2021 08:16:44 +0000 (09:16 +0100)
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h

index afbadfc5516b68408ebc075c96a3b2767fedb9ef..79de925aecfdbeca6cf481019c86bb0dde5abcc9 100644 (file)
--- a/drivers/gpu/drm/i915/gem/i915_gem_object_types.h
+++ b/drivers/gpu/drm/i915/gem/i915_gem_object_types.h
@@ -92,6 +92,86 @@ struct drm_i915_gem_object_ops {
         const char *name; /* friendly name for debug, e.g. lockdep classes */
  };
  
+/**
+ * enum i915_cache_level - The supported GTT caching values for system memory
+ * pages.
+ *
+ * These translate to some special GTT PTE bits when binding pages into some
+ * address space. It also determines whether an object, or rather its pages, are
+ * coherent with the GPU, when also reading or writing through the CPU cache
+ * with those pages.
+ *
+ * Userspace can also control this through struct drm_i915_gem_caching.
+ */
+enum i915_cache_level {
+       /**
+        * @I915_CACHE_NONE:
+        *
+        * GPU access is not coherent with the CPU cache. If the cache is dirty
+        * and we need the underlying pages to be coherent with some later GPU
+        * access then we need to manually flush the pages.
+        *
+        * On shared LLC platforms reads and writes through the CPU cache are
+        * still coherent even with this setting. See also
+        * &drm_i915_gem_object.cache_coherent for more details. Due to this we
+        * should only ever use uncached for scanout surfaces, otherwise we end
+        * up over-flushing in some places.
+        *
+        * This is the default on non-LLC platforms.
+        */
+       I915_CACHE_NONE = 0,
+       /**
+        * @I915_CACHE_LLC:
+        *
+        * GPU access is coherent with the CPU cache. If the cache is dirty,
+        * then the GPU will ensure that access remains coherent, when both
+        * reading and writing through the CPU cache. GPU writes can dirty the
+        * CPU cache.
+        *
+        * Not used for scanout surfaces.
+        *
+        * Applies to both platforms with shared LLC(HAS_LLC), and snooping
+        * based platforms(HAS_SNOOP).
+        *
+        * This is the default on shared LLC platforms.  The only exception is
+        * scanout objects, where the display engine is not coherent with the
+        * CPU cache. For such objects I915_CACHE_NONE or I915_CACHE_WT is
+        * automatically applied by the kernel in pin_for_display, if userspace
+        * has not done so already.
+        */
+       I915_CACHE_LLC,
+       /**
+        * @I915_CACHE_L3_LLC:
+        *
+        * Explicitly enable the Gfx L3 cache, with coherent LLC.
+        *
+        * The Gfx L3 sits between the domain specific caches, e.g.
+        * sampler/render caches, and the larger LLC. LLC is coherent with the
+        * GPU, but L3 is only visible to the GPU, so likely needs to be flushed
+        * when the workload completes.
+        *
+        * Not used for scanout surfaces.
+        *
+        * Only exposed on some gen7 + GGTT. More recent hardware has dropped
+        * this explicit setting, where it should now be enabled by default.
+        */
+       I915_CACHE_L3_LLC,
+       /**
+        * @I915_CACHE_WT:
+        *
+        * Write-through. Used for scanout surfaces.
+        *
+        * The GPU can utilise the caches, while still having the display engine
+        * be coherent with GPU writes, as a result we don't need to flush the
+        * CPU caches when moving out of the render domain. This is the default
+        * setting chosen by the kernel, if supported by the HW, otherwise we
+        * fall back to I915_CACHE_NONE. On the CPU side writes through the CPU
+        * cache still need to be flushed, to remain coherent with the display
+        * engine.
+        */
+       I915_CACHE_WT,
+};
+
  enum i915_map_type {
         I915_MAP_WB = 0,
         I915_MAP_WC,
@@ -229,14 +309,113 @@ struct drm_i915_gem_object {
         unsigned int mem_flags;
  #define I915_BO_FLAG_STRUCT_PAGE BIT(0) /* Object backed by struct pages */
  #define I915_BO_FLAG_IOMEM       BIT(1) /* Object backed by IO memory */
-       /*
-        * Is the object to be mapped as read-only to the GPU
-        * Only honoured if hardware has relevant pte bit
+       /**
+        * @cache_level: The desired GTT caching level.
+        *
+        * See enum i915_cache_level for possible values, along with what
+        * each does.
          */
         unsigned int cache_level:3;
-       unsigned int cache_coherent:2;
+       /**
+        * @cache_coherent:
+        *
+        * Track whether the pages are coherent with the GPU if reading or
+        * writing through the CPU caches. This largely depends on the
+        * @cache_level setting.
+        *
+        * On platforms which don't have the shared LLC(HAS_SNOOP), like on Atom
+        * platforms, coherency must be explicitly requested with some special
+        * GTT caching bits(see enum i915_cache_level). When enabling coherency
+        * it does come at a performance and power cost on such platforms. On
+        * the flip side the kernel does not need to manually flush any buffers
+        * which need to be coherent with the GPU, as it would have to if the
+        * object were not coherent, i.e. @cache_coherent is zero.
+        *
+        * On platforms that share the LLC with the CPU(HAS_LLC), all GT memory
+        * access will automatically snoop the CPU caches(even with CACHE_NONE).
+        * The one exception is when dealing with the display engine, like with
+        * scanout surfaces. To handle this the kernel will always flush the
+        * surface out of the CPU caches when preparing it for scanout.  Also
+        * note that since scanout surfaces are only ever read by the display
+        * engine we only need to care about flushing any writes through the CPU
+        * cache, reads on the other hand will always be coherent.
+        *
+        * Something strange here is why @cache_coherent is not a simple
+        * boolean, i.e coherent vs non-coherent. The reasoning for this is back
+        * to the display engine not being fully coherent. As a result scanout
+        * surfaces will either be marked as I915_CACHE_NONE or I915_CACHE_WT.
+        * In the case of seeing I915_CACHE_NONE the kernel makes the assumption
+        * that this is likely a scanout surface, and will set @cache_coherent
+        * as only I915_BO_CACHE_COHERENT_FOR_READ, on platforms with the shared
+        * LLC. The kernel uses this to always flush writes through the CPU
+        * cache as early as possible, where it can, in effect keeping
+        * @cache_dirty clean, so we can potentially avoid stalling when
+        * flushing the surface just before doing the scanout.  This does mean
+        * we might unnecessarily flush non-scanout objects in some places, but
+        * the default assumption is that all normal objects should be using
+        * I915_CACHE_LLC, at least on platforms with the shared LLC.
+        *
+        * Supported values:
+        *
+        * I915_BO_CACHE_COHERENT_FOR_READ:
+        *
+        * On shared LLC platforms, we use this for special scanout surfaces,
+        * where the display engine is not coherent with the CPU cache. As such
+        * we need to ensure we flush any writes before doing the scanout. As an
+        * optimisation we try to flush any writes as early as possible to avoid
+        * stalling later.
+        *
+        * Thus for scanout surfaces using I915_CACHE_NONE, on shared LLC
+        * platforms, we use:
+        *
+        *      cache_coherent = I915_BO_CACHE_COHERENT_FOR_READ
+        *
+        * While for normal objects that are fully coherent, including special
+        * scanout surfaces marked as I915_CACHE_WT, we use:
+        *
+        *      cache_coherent = I915_BO_CACHE_COHERENT_FOR_READ |
+        *                       I915_BO_CACHE_COHERENT_FOR_WRITE
+        *
+        * And then for objects that are not coherent at all we use:
+        *
+        *      cache_coherent = 0
+        *
+        * I915_BO_CACHE_COHERENT_FOR_WRITE:
+        *
+        * When writing through the CPU cache, the GPU is still coherent. Note
+        * that this also implies I915_BO_CACHE_COHERENT_FOR_READ.
+        */
  #define I915_BO_CACHE_COHERENT_FOR_READ BIT(0)
  #define I915_BO_CACHE_COHERENT_FOR_WRITE BIT(1)
+       unsigned int cache_coherent:2;
+
+       /**
+        * @cache_dirty:
+        *
+        * Track if we are dirty with writes through the CPU cache for this
+        * object. As a result reading directly from main memory might yield
+        * stale data.
+        *
+        * This also ties into whether the kernel is tracking the object as
+        * coherent with the GPU, as per @cache_coherent, as it determines if
+        * flushing might be needed at various points.
+        *
+        * Another part of @cache_dirty is managing flushing when first
+        * acquiring the pages for system memory, at this point the pages are
+        * considered foreign, so the default assumption is that the cache is
+        * dirty, for example the page zeroing done by the kernel might leave
+        * writes through the CPU cache, or swapping-in, while the actual data in
+        * main memory is potentially stale.  Note that this is a potential
+        * security issue when dealing with userspace objects and zeroing. Now,
+        * whether we actually need to apply the big sledgehammer of flushing all
+        * the pages on acquire depends on if @cache_coherent is marked as
+        * I915_BO_CACHE_COHERENT_FOR_WRITE, i.e that the GPU will be coherent
+        * for both reads and writes through the CPU cache.
+        *
+        * Note that on shared LLC platforms we still apply the heavy flush for
+        * I915_CACHE_NONE objects, under the assumption that this is going to
+        * be used for scanout.
+        */
         unsigned int cache_dirty:1;
  
         /**
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h

index b83389c06550dc6749a5288a852f0ae7dabb8092..5c82a80123328c21c63f2194017e7eb8c4d1d389 100644 (file)
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -392,15 +392,6 @@ struct drm_i915_display_funcs {
         void (*read_luts)(struct intel_crtc_state *crtc_state);
  };
  
-enum i915_cache_level {
-       I915_CACHE_NONE = 0,
-       I915_CACHE_LLC, /* also used for snoopable memory on non-LLC */
-       I915_CACHE_L3_LLC, /* gen7+, L3 sits between the domain specifc
-                             caches, eg sampler/render caches, and the
-                             large Last-Level-Cache. LLC is coherent with
-                             the CPU, but L3 is only visible to the GPU. */
-       I915_CACHE_WT, /* hsw:gt3e WriteThrough for scanouts */
-};
  
  #define I915_COLOR_UNEVICTABLE (-1) /* a non-vma sharing the address space */
author	Matthew Auld <matthew.auld@intel.com>
	Fri, 23 Jul 2021 10:50:44 +0000 (11:50 +0100)
committer	Matthew Auld <matthew.auld@intel.com>
	Tue, 27 Jul 2021 08:16:44 +0000 (09:16 +0100)
drivers/gpu/drm/i915/gem/i915_gem_object_types.h		patch \| blob \| history
drivers/gpu/drm/i915/i915_drv.h		patch \| blob \| history