On Fri, 25 Jun 2021 15:33:21 +0200 Boris Brezillon <boris.brezillon@collabora.com> wrote:
@@ -379,57 +370,73 @@ void panfrost_job_enable_interrupts(struct panfrost_device *pfdev)
 	job_write(pfdev, JOB_INT_MASK, irq_mask);
 }
-static bool panfrost_scheduler_stop(struct panfrost_queue_state *queue,
struct drm_sched_job *bad)
+static void panfrost_reset(struct panfrost_device *pfdev,
struct drm_sched_job *bad)
{
- enum panfrost_queue_status old_status;
- bool stopped = false;
- unsigned int i;
- bool cookie;
- mutex_lock(&queue->lock);
- old_status = atomic_xchg(&queue->status,
PANFROST_QUEUE_STATUS_STOPPED);
- if (old_status == PANFROST_QUEUE_STATUS_STOPPED)
goto out;
- if (WARN_ON(!atomic_read(&pfdev->reset.pending)))
return;
- /* Stop the schedulers.
*
* FIXME: We temporarily get out of the dma_fence_signalling section
* because the cleanup path generate lockdep splats when taking locks
* to release job resources. We should rework the code to follow this
* pattern:
*
* try_lock
* if (locked)
* release
* else
* schedule_work_to_release_later
*/
- for (i = 0; i < NUM_JOB_SLOTS; i++)
drm_sched_stop(&pfdev->js->queue[i].sched, bad);
- cookie = dma_fence_begin_signalling();
WARN_ON(old_status != PANFROST_QUEUE_STATUS_ACTIVE);
-	drm_sched_stop(&queue->sched, bad);
	if (bad)
		drm_sched_increase_karma(bad);
stopped = true;
- spin_lock(&pfdev->js->job_lock);
- for (i = 0; i < NUM_JOB_SLOTS; i++) {
if (pfdev->jobs[i]) {
pm_runtime_put_noidle(pfdev->dev);
panfrost_devfreq_record_idle(&pfdev->pfdevfreq);
pfdev->jobs[i] = NULL;
}
- }
- spin_unlock(&pfdev->js->job_lock);
- /*
* Set the timeout to max so the timer doesn't get started
* when we return from the timeout handler (restored in
* panfrost_scheduler_start()).
- panfrost_device_reset(pfdev);
- /* GPU has been reset, we can cancel timeout/fault work that may have
	 * been queued in the meantime and clear the reset pending bit.
	 */
- queue->sched.timeout = MAX_SCHEDULE_TIMEOUT;
- atomic_set(&pfdev->reset.pending, 0);
- cancel_work_sync(&pfdev->reset.work);
This is introducing a deadlock since panfrost_reset() might be called from the reset handler, and cancel_work_sync() waits for the handler to return. Unfortunately there's no cancel_work() variant, so I'll just remove the
WARN_ON(!atomic_read(&pfdev->reset.pending))
and return directly when the pending bit is cleared.
- for (i = 0; i < NUM_JOB_SLOTS; i++)
cancel_delayed_work(&pfdev->js->queue[i].sched.work_tdr);
-out:
mutex_unlock(&queue->lock);
return stopped;
-}
- /* Now resubmit jobs that were previously queued but didn't have a
* chance to finish.
* FIXME: We temporarily get out of the DMA fence signalling section
* while resubmitting jobs because the job submission logic will
* allocate memory with the GFP_KERNEL flag which can trigger memory
* reclaim and exposes a lock ordering issue.
*/
- dma_fence_end_signalling(cookie);
- for (i = 0; i < NUM_JOB_SLOTS; i++)
drm_sched_resubmit_jobs(&pfdev->js->queue[i].sched);
- cookie = dma_fence_begin_signalling();
-static void panfrost_scheduler_start(struct panfrost_queue_state *queue)
-{
- enum panfrost_queue_status old_status;
- for (i = 0; i < NUM_JOB_SLOTS; i++)
drm_sched_start(&pfdev->js->queue[i].sched, true);
- mutex_lock(&queue->lock);
- old_status = atomic_xchg(&queue->status,
PANFROST_QUEUE_STATUS_STARTING);
- WARN_ON(old_status != PANFROST_QUEUE_STATUS_STOPPED);
- /* Restore the original timeout before starting the scheduler. */
- queue->sched.timeout = msecs_to_jiffies(JOB_TIMEOUT_MS);
- drm_sched_resubmit_jobs(&queue->sched);
- drm_sched_start(&queue->sched, true);
- old_status = atomic_xchg(&queue->status,
PANFROST_QUEUE_STATUS_ACTIVE);
- if (old_status == PANFROST_QUEUE_STATUS_FAULT_PENDING)
drm_sched_fault(&queue->sched);
- mutex_unlock(&queue->lock);
- dma_fence_end_signalling(cookie);
}