On Fri, 25 Jun 2021 15:33:21 +0200 Boris Brezillon <boris.brezillon@collabora.com> wrote:
@@ -379,57 +370,73 @@ void panfrost_job_enable_interrupts(struct panfrost_device *pfdev)
 	job_write(pfdev, JOB_INT_MASK, irq_mask);
 }
-static bool panfrost_scheduler_stop(struct panfrost_queue_state *queue,
struct drm_sched_job *bad)
+static void panfrost_reset(struct panfrost_device *pfdev,
struct drm_sched_job *bad)
{
- enum panfrost_queue_status old_status;
- bool stopped = false;
- unsigned int i;
- bool cookie;
- mutex_lock(&queue->lock);
- old_status = atomic_xchg(&queue->status,
PANFROST_QUEUE_STATUS_STOPPED);
- if (old_status == PANFROST_QUEUE_STATUS_STOPPED)
goto out;
- if (WARN_ON(!atomic_read(&pfdev->reset.pending)))
return;
- /* Stop the schedulers.
*
* FIXME: We temporarily get out of the dma_fence_signalling section
* because the cleanup path generate lockdep splats when taking locks
* to release job resources. We should rework the code to follow this
* pattern:
*
* try_lock
* if (locked)
* release
* else
* schedule_work_to_release_later
*/
- for (i = 0; i < NUM_JOB_SLOTS; i++)
drm_sched_stop(&pfdev->js->queue[i].sched, bad);
- cookie = dma_fence_begin_signalling();
WARN_ON(old_status != PANFROST_QUEUE_STATUS_ACTIVE);
-	drm_sched_stop(&queue->sched, bad);
	if (bad)
		drm_sched_increase_karma(bad);
stopped = true;
- spin_lock(&pfdev->js->job_lock);
- for (i = 0; i < NUM_JOB_SLOTS; i++) {
if (pfdev->jobs[i]) {
pm_runtime_put_noidle(pfdev->dev);
panfrost_devfreq_record_idle(&pfdev->pfdevfreq);
pfdev->jobs[i] = NULL;
}
- }
- spin_unlock(&pfdev->js->job_lock);
- /*
* Set the timeout to max so the timer doesn't get started
* when we return from the timeout handler (restored in
* panfrost_scheduler_start()).
- panfrost_device_reset(pfdev);
- /* GPU has been reset, we can cancel timeout/fault work that may have
	 * been queued in the meantime and clear the reset pending bit.
	 */
- queue->sched.timeout = MAX_SCHEDULE_TIMEOUT;
- atomic_set(&pfdev->reset.pending, 0);
- cancel_work_sync(&pfdev->reset.work);
This is introducing a deadlock since panfrost_reset() might be called from the reset handler, and cancel_work_sync() waits for the handler to return. Unfortunately there's no cancel_work() variant, so I'll just remove the
WARN_ON(!atomic_read(&pfdev->reset.pending))
and return directly when the pending bit is cleared.
- for (i = 0; i < NUM_JOB_SLOTS; i++)
cancel_delayed_work(&pfdev->js->queue[i].sched.work_tdr);
-out:
mutex_unlock(&queue->lock);
return stopped;
-}
- /* Now resubmit jobs that were previously queued but didn't have a
* chance to finish.
* FIXME: We temporarily get out of the DMA fence signalling section
* while resubmitting jobs because the job submission logic will
* allocate memory with the GFP_KERNEL flag which can trigger memory
* reclaim and exposes a lock ordering issue.
*/
- dma_fence_end_signalling(cookie);
- for (i = 0; i < NUM_JOB_SLOTS; i++)
drm_sched_resubmit_jobs(&pfdev->js->queue[i].sched);
- cookie = dma_fence_begin_signalling();
-static void panfrost_scheduler_start(struct panfrost_queue_state *queue)
-{
- enum panfrost_queue_status old_status;
- for (i = 0; i < NUM_JOB_SLOTS; i++)
drm_sched_start(&pfdev->js->queue[i].sched, true);
- mutex_lock(&queue->lock);
- old_status = atomic_xchg(&queue->status,
PANFROST_QUEUE_STATUS_STARTING);
- WARN_ON(old_status != PANFROST_QUEUE_STATUS_STOPPED);
- /* Restore the original timeout before starting the scheduler. */
- queue->sched.timeout = msecs_to_jiffies(JOB_TIMEOUT_MS);
- drm_sched_resubmit_jobs(&queue->sched);
- drm_sched_start(&queue->sched, true);
- old_status = atomic_xchg(&queue->status,
PANFROST_QUEUE_STATUS_ACTIVE);
- if (old_status == PANFROST_QUEUE_STATUS_FAULT_PENDING)
drm_sched_fault(&queue->sched);
- mutex_unlock(&queue->lock);
- dma_fence_end_signalling(cookie);
}