From 41de9fc855350e387df021b1e49a225c4fc97d70 Mon Sep 17 00:00:00 2001 From: Evan Quan Date: Sat, 9 May 2020 13:49:26 +0800 Subject: [PATCH] drm/amd/powerplay: shutdown on HW CTF To prevent further damage. Signed-off-by: Evan Quan Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- .../gpu/drm/amd/powerplay/hwmgr/smu_helper.c | 16 +++++++++++++-- drivers/gpu/drm/amd/powerplay/smu_v11_0.c | 20 ++++++++++++++++++- 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/smu_helper.c b/drivers/gpu/drm/amd/powerplay/hwmgr/smu_helper.c index 782f6d295202c..4279f95ba7791 100644 --- a/drivers/gpu/drm/amd/powerplay/hwmgr/smu_helper.c +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/smu_helper.c @@ -612,11 +612,17 @@ int phm_irq_process(struct amdgpu_device *adev, PCI_BUS_NUM(adev->pdev->devfn), PCI_SLOT(adev->pdev->devfn), PCI_FUNC(adev->pdev->devfn)); - else if (src_id == VISLANDS30_IV_SRCID_GPIO_19) + else if (src_id == VISLANDS30_IV_SRCID_GPIO_19) { pr_warn("GPU Critical Temperature Fault detected on PCIe %d:%d.%d!\n", PCI_BUS_NUM(adev->pdev->devfn), PCI_SLOT(adev->pdev->devfn), PCI_FUNC(adev->pdev->devfn)); + /* + * HW CTF just occurred. Shutdown to prevent further damage. + */ + dev_emerg(adev->dev, "System is going to shutdown due to HW CTF!\n"); + orderly_poweroff(true); + } } else if (client_id == SOC15_IH_CLIENTID_THM) { if (src_id == 0) { pr_warn("GPU over temperature range detected on PCIe %d:%d.%d!\n", @@ -634,11 +640,17 @@ int phm_irq_process(struct amdgpu_device *adev, PCI_BUS_NUM(adev->pdev->devfn), PCI_SLOT(adev->pdev->devfn), PCI_FUNC(adev->pdev->devfn)); - } else if (client_id == SOC15_IH_CLIENTID_ROM_SMUIO) + } else if (client_id == SOC15_IH_CLIENTID_ROM_SMUIO) { pr_warn("GPU Critical Temperature Fault detected on PCIe %d:%d.%d!\n", PCI_BUS_NUM(adev->pdev->devfn), PCI_SLOT(adev->pdev->devfn), PCI_FUNC(adev->pdev->devfn)); + /* + * HW CTF just occurred. Shutdown to prevent further damage. + */ + dev_emerg(adev->dev, "System is going to shutdown due to HW CTF!\n"); + orderly_poweroff(true); + } return 0; } diff --git a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c index d1fe762c8d3de..3979973680ecc 100644 --- a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c +++ b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c @@ -1548,6 +1548,8 @@ static int smu_v11_0_ack_ac_dc_interrupt(struct smu_context *smu) #define THM_11_0__SRCID__THM_DIG_THERM_L2H 0 /* ASIC_TEMP > CG_THERMAL_INT.DIG_THERM_INTH */ #define THM_11_0__SRCID__THM_DIG_THERM_H2L 1 /* ASIC_TEMP < CG_THERMAL_INT.DIG_THERM_INTL */ +#define SMUIO_11_0__SRCID__SMUIO_GPIO19 83 + static int smu_v11_0_irq_process(struct amdgpu_device *adev, struct amdgpu_irq_src *source, struct amdgpu_iv_entry *entry) @@ -1582,8 +1584,17 @@ static int smu_v11_0_irq_process(struct amdgpu_device *adev, PCI_SLOT(adev->pdev->devfn), PCI_FUNC(adev->pdev->devfn)); break; - } + } else if (client_id == SOC15_IH_CLIENTID_ROM_SMUIO) { + pr_warn("GPU Critical Temperature Fault detected on PCIe %d:%d.%d!\n", + PCI_BUS_NUM(adev->pdev->devfn), + PCI_SLOT(adev->pdev->devfn), + PCI_FUNC(adev->pdev->devfn)); + /* + * HW CTF just occurred. Shutdown to prevent further damage. + */ + dev_emerg(adev->dev, "System is going to shutdown due to HW CTF!\n"); + orderly_poweroff(true); } else if (client_id == SOC15_IH_CLIENTID_MP1) { if (src_id == 0xfe) smu_v11_0_ack_ac_dc_interrupt(&adev->smu); @@ -1626,6 +1637,13 @@ int smu_v11_0_register_irq_handler(struct smu_context *smu) if (ret) return ret; + /* Register CTF(GPIO_19) interrupt */ + ret = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_ROM_SMUIO, + SMUIO_11_0__SRCID__SMUIO_GPIO19, + irq_src); + if (ret) + return ret; + ret = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_MP1, 0xfe, irq_src); -- 2.39.5