static struct dma_fence *amdgpu_job_run(struct amd_sched_job *sched_job)
{
- struct dma_fence *fence = NULL;
+ struct dma_fence *fence = NULL, *finished;
struct amdgpu_device *adev;
struct amdgpu_job *job;
int r;
if (!sched_job) {
DRM_ERROR("job is null\n");
return NULL;
}
job = to_amdgpu_job(sched_job);
+ finished = &job->base.s_fence->finished;
adev = job->adev;
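/* every sync dependency of the job must already be signaled by the time it runs */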
BUG_ON(amdgpu_sync_peek_fence(&job->sync, NULL));
trace_amdgpu_sched_run_job(job);
- /* skip ib schedule when vram is lost */
- if (job->vram_lost_counter != atomic_read(&adev->vram_lost_counter)) {
- dma_fence_set_error(&job->base.s_fence->finished, -ECANCELED);
- DRM_ERROR("Skip scheduling IBs!\n");
+
+ if (job->vram_lost_counter != atomic_read(&adev->vram_lost_counter))
+ dma_fence_set_error(finished, -ECANCELED);/* skip IB as well if VRAM lost */
+
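+ /* an error already recorded on the finished fence (VRAM lost, guilty context) makes us skip the IBs */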
+ if (finished->error < 0) {
+ DRM_INFO("Skip scheduling IBs!\n");
} else {
r = amdgpu_ib_schedule(job->ring, job->num_ibs, job->ibs, job,
&fence);
if (r)
DRM_ERROR("Error scheduling IBs (%d)\n", r);
}

if (amd_sched_entity_add_dependency_cb(entity))
return NULL;
+ /* skip jobs from an entity that was marked guilty */
+ if (entity->guilty && atomic_read(entity->guilty))
+ dma_fence_set_error(&sched_job->s_fence->finished, -ECANCELED);
+
spsc_queue_pop(&entity->job_queue);
return sched_job;
}
job->sched->ops->timedout_job(job);
}
-static void amd_sched_set_guilty(struct amd_sched_job *s_job,
- struct amd_sched_entity *s_entity)
-{
- if (atomic_inc_return(&s_job->karma) > s_job->sched->hang_limit)
- if (s_entity->guilty)
- atomic_set(s_entity->guilty, 1);
-}
-
void amd_sched_hw_job_reset(struct amd_gpu_scheduler *sched, struct amd_sched_job *bad)
{
struct amd_sched_job *s_job;
struct amd_sched_entity *entity, *tmp;
int i;

spin_unlock(&sched->job_list_lock);
if (bad) {
- bool found = false;
-
- for (i = AMD_SCHED_PRIORITY_MIN; i < AMD_SCHED_PRIORITY_MAX; i++ ) {
+ /* don't increase @bad's karma if it comes from the KERNEL run queue,
+ * because a GPU hang can sometimes corrupt kernel jobs (like VM updating jobs),
+ * but kernel jobs are always considered good.
+ */
+ for (i = AMD_SCHED_PRIORITY_MIN; i < AMD_SCHED_PRIORITY_KERNEL; i++ ) {
struct amd_sched_rq *rq = &sched->sched_rq[i];
spin_lock(&rq->lock);
list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
if (bad->s_fence->scheduled.context == entity->fence_context) {
- found = true;
- amd_sched_set_guilty(bad, entity);
+ if (atomic_inc_return(&bad->karma) > bad->sched->hang_limit)
+ if (entity->guilty)
+ atomic_set(entity->guilty, 1);
break;
}
}
spin_unlock(&rq->lock);
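+ /* the inner loop breaking early means the entity owning @bad was found in this run queue, so stop scanning the rest */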
- if (found)
+ if (&entity->list != &rq->entities)
break;
}
}
void amd_sched_job_recovery(struct amd_gpu_scheduler *sched)
{
struct amd_sched_job *s_job, *tmp;
+ bool found_guilty = false;
int r;
spin_lock(&sched->job_list_lock);
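/* resubmit every job that was already pushed to the hardware ring when the hang happened */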
list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
struct amd_sched_fence *s_fence = s_job->s_fence;
struct dma_fence *fence;
+ uint64_t guilty_context;
+
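+ /* the first job over the hang limit marks its context as guilty; all jobs
+ * from that context get -ECANCELED so their IBs are skipped on resubmit
+ */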
+ if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) {
+ found_guilty = true;
+ guilty_context = s_job->s_fence->scheduled.context;
+ }
+
+ if (found_guilty && s_job->s_fence->scheduled.context == guilty_context)
+ dma_fence_set_error(&s_fence->finished, -ECANCELED);
spin_unlock(&sched->job_list_lock);
fence = sched->ops->run_job(s_job);
if (fence) {
s_fence->parent = dma_fence_get(fence);
r = dma_fence_add_callback(fence, &s_fence->cb,
amd_sched_process_job);
if (r == -ENOENT)
amd_sched_process_job(fence, &s_fence->cb);
else if (r)
DRM_ERROR("fence add callback failed (%d)\n",
r);
dma_fence_put(fence);
} else {
- DRM_ERROR("Failed to run job!\n");
amd_sched_process_job(NULL, &s_fence->cb);
}
spin_lock(&sched->job_list_lock);