struct list_head device_list, *device_list_handle = NULL;
struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, false);
- /* Build list of devices to query RAS related errors */
- if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
- device_list_handle = &hive->device_list;
- else {
- INIT_LIST_HEAD(&device_list);
- list_add_tail(&adev->gmc.xgmi.head, &device_list);
- device_list_handle = &device_list;
- }
+ if (!ras->disable_ras_err_cnt_harvest) {
+ /* Build list of devices to query RAS related errors */
+ if (hive && adev->gmc.xgmi.num_physical_nodes > 1) {
+ device_list_handle = &hive->device_list;
+ } else {
+ INIT_LIST_HEAD(&device_list);
+ list_add_tail(&adev->gmc.xgmi.head, &device_list);
+ device_list_handle = &device_list;
+ }
- list_for_each_entry(remote_adev, device_list_handle, gmc.xgmi.head) {
- amdgpu_ras_log_on_err_counter(remote_adev);
+ list_for_each_entry(remote_adev,
+ device_list_handle, gmc.xgmi.head)
+ amdgpu_ras_log_on_err_counter(remote_adev);
}
if (amdgpu_device_should_recover_gpu(ras->adev))
uint32_t bif_doorbell_intr_cntl;
struct ras_manager *obj = amdgpu_ras_find_obj(adev, adev->nbio.ras_if);
struct ras_err_data err_data = {0, 0, 0, NULL};
+ struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
bif_doorbell_intr_cntl = RREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL);
if (REG_GET_FIELD(bif_doorbell_intr_cntl,
RAS_CNTLR_INTERRUPT_CLEAR, 1);
WREG32_SOC15(NBIO, 0, mmBIF_DOORBELL_INT_CNTL, bif_doorbell_intr_cntl);
- /*
- * clear error status after ras_controller_intr according to
- * hw team and count ue number for query
- */
- nbio_v7_4_query_ras_error_count(adev, &err_data);
-
- /* logging on error counter and printing for awareness */
- obj->err_data.ue_count += err_data.ue_count;
- obj->err_data.ce_count += err_data.ce_count;
-
- if (err_data.ce_count)
- dev_info(adev->dev, "%ld correctable hardware "
- "errors detected in %s block, "
- "no user action is needed.\n",
- obj->err_data.ce_count,
- adev->nbio.ras_if->name);
-
- if (err_data.ue_count)
- dev_info(adev->dev, "%ld uncorrectable hardware "
- "errors detected in %s block\n",
- obj->err_data.ue_count,
- adev->nbio.ras_if->name);
+ if (!ras->disable_ras_err_cnt_harvest) {
+ /*
+ * clear error status after ras_controller_intr
+ * according to hw team and count ue number
+ * for query
+ */
+ nbio_v7_4_query_ras_error_count(adev, &err_data);
+
+ /* logging on error cnt and printing for awareness */
+ obj->err_data.ue_count += err_data.ue_count;
+ obj->err_data.ce_count += err_data.ce_count;
+
+ if (err_data.ce_count)
+ dev_info(adev->dev, "%ld correctable hardware "
+ "errors detected in %s block, "
+ "no user action is needed.\n",
+ obj->err_data.ce_count,
+ adev->nbio.ras_if->name);
+
+ if (err_data.ue_count)
+ dev_info(adev->dev, "%ld uncorrectable hardware "
+ "errors detected in %s block\n",
+ obj->err_data.ue_count,
+ adev->nbio.ras_if->name);
+ }
dev_info(adev->dev, "RAS controller interrupt triggered "
"by NBIF error\n");