drm/amdgpu: Add autodump debugfs node for gpu reset v8

author Jiange Zhao <Jiange.Zhao@amd.com>

Sun, 26 Apr 2020 09:57:00 +0000 (17:57 +0800)

committer Alex Deucher <alexander.deucher@amd.com>

Mon, 18 May 2020 15:23:37 +0000 (11:23 -0400)
author Jiange Zhao <Jiange.Zhao@amd.com>
Sun, 26 Apr 2020 09:57:00 +0000 (17:57 +0800)
committer Alex Deucher <alexander.deucher@amd.com>
Mon, 18 May 2020 15:23:37 +0000 (11:23 -0400)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

index 4a03a24348f4346093815def6c6af1dcd51af702..7975f8e157df22c4bf62f155638182dea142e86f 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -989,6 +989,8 @@ struct amdgpu_device {
         char                            product_number[16];
         char                            product_name[32];
         char                            serial[16];
+
+       struct amdgpu_autodump          autodump;
  };
  
  static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c

index 1a4894fa369399ed27fcc7edaed56589667e4799..d33cb344be69f5266b4933137151ffa5feb5117a 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -27,7 +27,7 @@
  #include <linux/pci.h>
  #include <linux/uaccess.h>
  #include <linux/pm_runtime.h>
-
+#include <linux/poll.h>
  #include <drm/drm_debugfs.h>
  
  #include "amdgpu.h"
@@ -74,8 +74,82 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
         return 0;
  }
  
+/**
+ * amdgpu_debugfs_wait_dump - wait for the autodump completion before a reset
+ *
+ * @adev: amdgpu device whose autodump state is used
+ *
+ * Wakes every waiter sleeping on the autodump.gpu_hang wait queue and then
+ * waits, interruptibly, for up to 600 seconds on the autodump.dumping
+ * completion. When CONFIG_DEBUG_FS is not set this does nothing.
+ *
+ * Returns 0 when the completion was signalled (or debugfs is disabled),
+ * -ETIMEDOUT when the wait timed out, or the negative error code reported
+ * by the wait when it was interrupted.
+ */
+int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev)
+{
+#if defined(CONFIG_DEBUG_FS)
+       unsigned long timeout = 600 * HZ;
+       /* The wait helper returns long: <0 interrupted, 0 timeout, >0 done. */
+       long ret;
+
+       wake_up_interruptible(&adev->autodump.gpu_hang);
+
+       ret = wait_for_completion_interruptible_timeout(&adev->autodump.dumping, timeout);
+       if (ret == 0) {
+               pr_err("autodump: timeout, move on to gpu recovery\n");
+               return -ETIMEDOUT;
+       }
+       /* An interrupted wait must not be reported as a finished dump. */
+       if (ret < 0)
+               return ret;
+#endif
+       return 0;
+}
+
  #if defined(CONFIG_DEBUG_FS)
  
+/*
+ * Open handler for the autodump debugfs file.
+ *
+ * Stores the device taken from inode->i_private in file->private_data.
+ * Under adev->lock_reset it then checks the autodump.dumping completion:
+ * if it is currently done, it is re-armed and the open succeeds; otherwise
+ * the open is refused.
+ *
+ * Returns 0 on success or -EBUSY when the completion is not done.
+ */
+static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file)
+{
+       struct amdgpu_device *adev = inode->i_private;
+       int ret = -EBUSY;
+
+       file->private_data = adev;
+
+       mutex_lock(&adev->lock_reset);
+       /* Query the completion through its accessor, not its internals. */
+       if (completion_done(&adev->autodump.dumping)) {
+               reinit_completion(&adev->autodump.dumping);
+               ret = 0;
+       }
+       mutex_unlock(&adev->lock_reset);
+
+       return ret;
+}
+
+/*
+ * Release handler for the autodump debugfs file.
+ *
+ * Signals the autodump.dumping completion of the device stored in
+ * file->private_data, releasing all of its waiters. Always returns 0.
+ */
+static int amdgpu_debugfs_autodump_release(struct inode *inode, struct file *file)
+{
+       struct amdgpu_device *dev = file->private_data;
+       struct completion *dump_done = &dev->autodump.dumping;
+
+       complete_all(dump_done);
+
+       return 0;
+}
+
+/*
+ * Poll handler for the autodump debugfs file.
+ *
+ * Registers the caller on the autodump.gpu_hang wait queue, then reports
+ * the file as readable and writable while adev->in_gpu_reset is set.
+ *
+ * Returns an EPOLL* event mask, or 0 when no reset is in progress.
+ */
+static __poll_t amdgpu_debugfs_autodump_poll(struct file *file, struct poll_table_struct *poll_table)
+{
+       struct amdgpu_device *adev = file->private_data;
+
+       poll_wait(file, &adev->autodump.gpu_hang, poll_table);
+
+       /* Kernel-side poll masks are __poll_t EPOLL* values, not POLL*. */
+       if (adev->in_gpu_reset)
+               return EPOLLIN | EPOLLRDNORM | EPOLLWRNORM;
+
+       return 0;
+}
+
+/* File operations of the autodump debugfs file: open, poll and release. */
+static const struct file_operations autodump_debug_fops = {
+       .owner   = THIS_MODULE,
+       /* lifetime of one opener */
+       .open    = amdgpu_debugfs_autodump_open,
+       .release = amdgpu_debugfs_autodump_release,
+       /* event notification */
+       .poll    = amdgpu_debugfs_autodump_poll,
+};
+
+/*
+ * Set up the autodump state of @adev and create its debugfs file.
+ *
+ * The wait queue and the completion are initialised, the completion is
+ * left in the signalled state, and an "amdgpu_autodump" file with mode
+ * 0600 is created under the primary DRM minor's debugfs root.
+ */
+static void amdgpu_debugfs_autodump_init(struct amdgpu_device *adev)
+{
+       struct amdgpu_autodump *dump = &adev->autodump;
+
+       init_waitqueue_head(&dump->gpu_hang);
+
+       /* Start out signalled so that waiters do not block. */
+       init_completion(&dump->dumping);
+       complete_all(&dump->dumping);
+
+       debugfs_create_file("amdgpu_autodump", 0600,
+                           adev->ddev->primary->debugfs_root,
+                           adev, &autodump_debug_fops);
+}
+
  /**
   * amdgpu_debugfs_process_reg_op - Handle MMIO register reads/writes
   *
@@ -1434,6 +1508,8 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
  
         amdgpu_ras_debugfs_create_all(adev);
  
+       amdgpu_debugfs_autodump_init(adev);
+
         return amdgpu_debugfs_add_files(adev, amdgpu_debugfs_list,
                                         ARRAY_SIZE(amdgpu_debugfs_list));
  }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h

index de12d11015260eb3c44a9ef1cb95a434a26b5fda..2803884d338d53c1218b969fdb61c13658885660 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
@@ -31,6 +31,11 @@ struct amdgpu_debugfs {
         unsigned                num_files;
  };
  
+/* Per-device state of the autodump debugfs file. */
+struct amdgpu_autodump {
+       /* completion the reset path waits on before recovering the GPU */
+       struct completion               dumping;
+       /* wait queue that pollers of the debugfs file sleep on */
+       wait_queue_head_t               gpu_hang;
+};
+
  int amdgpu_debugfs_regs_init(struct amdgpu_device *adev);
  int amdgpu_debugfs_init(struct amdgpu_device *adev);
  void amdgpu_debugfs_fini(struct amdgpu_device *adev);
@@ -40,3 +45,4 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
  int amdgpu_debugfs_fence_init(struct amdgpu_device *adev);
  int amdgpu_debugfs_firmware_init(struct amdgpu_device *adev);
  int amdgpu_debugfs_gem_init(struct amdgpu_device *adev);
+int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index cc41e8f5ad14f9b5728b70d42cfab06c23fcc32e..545beebcf43e8758488219350c08d9c950e505e1 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3927,6 +3927,8 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
         int i, r = 0;
         bool need_full_reset  = *need_full_reset_arg;
  
+       amdgpu_debugfs_wait_dump(adev);
+
         /* block all schedulers and reset given job's ring */
         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
                 struct amdgpu_ring *ring = adev->rings[i];
author	Jiange Zhao <Jiange.Zhao@amd.com>
	Sun, 26 Apr 2020 09:57:00 +0000 (17:57 +0800)
committer	Alex Deucher <alexander.deucher@amd.com>
	Mon, 18 May 2020 15:23:37 +0000 (11:23 -0400)
drivers/gpu/drm/amd/amdgpu/amdgpu.h		patch \| blob \| history
drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c		patch \| blob \| history
drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h		patch \| blob \| history
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c		patch \| blob \| history