This small patch-set fixes a corner-case bug in kernel queue handling, which was caused by using a function called sync_with_hw().
The patch-set removes this function, as it is not needed anymore. As a result, we need to replace cpu_relax() with schedule() in DQM, so this patch-set takes care of that as well.
I would like this fix to be in 3.19 so please review.
Thanks,
Oded
Oded Gabbay (3): drm/amdkfd: Replace cpu_relax() with schedule() in DQM drm/amdkfd: Remove unused function busy_wait() drm/amdkfd: Remove sync_with_hw() from amdkfd
.../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 3 ++- drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 24 ---------------------- drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h | 2 -- drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c | 6 ------ drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c | 5 +---- 5 files changed, 3 insertions(+), 37 deletions(-)
In order not to occupy the current core and thus prevent the core from servicing IOMMU PPR requests, this patch replaces the call in DQM to cpu_relax() with a call to schedule().
Signed-off-by: Oded Gabbay oded.gabbay@amd.com --- drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index 102aaba..08cf5b3 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -26,6 +26,7 @@ #include <linux/types.h> #include <linux/printk.h> #include <linux/bitops.h> +#include <linux/sched.h> #include "kfd_priv.h" #include "kfd_device_queue_manager.h" #include "kfd_mqd_manager.h" @@ -827,7 +828,7 @@ static int fence_wait_timeout(unsigned int *fence_addr, pr_err("kfd: qcm fence wait loop timeout expired\n"); return -ETIME; } - cpu_relax(); + schedule(); }
return 0;
Signed-off-by: Oded Gabbay oded.gabbay@amd.com --- drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c | 6 ------ 1 file changed, 6 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c index 4c3828c..5e065bd 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c @@ -28,12 +28,6 @@ #include "cik_regs.h" #include "../../radeon/cik_reg.h"
-inline void busy_wait(unsigned long ms) -{ - while (time_before(jiffies, ms)) - cpu_relax(); -} - static inline struct cik_mqd *get_mqd(void *mqd) { return (struct cik_mqd *)mqd;
This patch completely removes the sync_with_hw() because it was broken and actually there is no point of using it.
This function was used to:
- Make sure that the submitted packet to the HIQ (which is a kernel queue) was read by the CP. However, it was discovered that the method this function used to do that (checking wptr == rptr) is not consistent with how the actual CP firmware works in all cases.
- Make sure that the queue is empty before issuing the next packet. To achieve that, the function blocked amdkfd from continuing until the recently submitted packet was consumed. However, the acquire_packet_buffer() already checks if there is enough room for a new packet so calling sync_with_hw() is redundant.
Signed-off-by: Oded Gabbay oded.gabbay@amd.com --- drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 24 ------------------------ drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h | 2 -- drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c | 5 +---- 3 files changed, 1 insertion(+), 30 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c index 9350714..5ffca94 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c @@ -265,28 +265,6 @@ static void submit_packet(struct kernel_queue *kq) kq->pending_wptr); }
-static int sync_with_hw(struct kernel_queue *kq, unsigned long timeout_ms) -{ - unsigned long org_timeout_ms; - - BUG_ON(!kq); - - org_timeout_ms = timeout_ms; - timeout_ms += jiffies * 1000 / HZ; - while (*kq->wptr_kernel != *kq->rptr_kernel) { - if (time_after(jiffies * 1000 / HZ, timeout_ms)) { - pr_err("kfd: kernel_queue %s timeout expired %lu\n", - __func__, org_timeout_ms); - pr_err("kfd: wptr: %d rptr: %d\n", - *kq->wptr_kernel, *kq->rptr_kernel); - return -ETIME; - } - schedule(); - } - - return 0; -} - static void rollback_packet(struct kernel_queue *kq) { BUG_ON(!kq); @@ -308,7 +286,6 @@ struct kernel_queue *kernel_queue_init(struct kfd_dev *dev, kq->uninitialize = uninitialize; kq->acquire_packet_buffer = acquire_packet_buffer; kq->submit_packet = submit_packet; - kq->sync_with_hw = sync_with_hw; kq->rollback_packet = rollback_packet;
if (kq->initialize(kq, dev, type, KFD_KERNEL_QUEUE_SIZE) == false) { @@ -345,7 +322,6 @@ static __attribute__((unused)) void test_kq(struct kfd_dev *dev) for (i = 0; i < 5; i++) buffer[i] = kq->nop_packet; kq->submit_packet(kq); - kq->sync_with_hw(kq, 1000);
pr_debug("kfd: ending kernel queue test\n"); } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h index dcd2bdb..11bfd92d 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h @@ -38,8 +38,6 @@ struct kernel_queue { unsigned int **buffer_ptr);
void (*submit_packet)(struct kernel_queue *kq); - int (*sync_with_hw)(struct kernel_queue *kq, - unsigned long timeout_ms); void (*rollback_packet)(struct kernel_queue *kq);
/* data */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c index 5ce9233..24acd46 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c @@ -379,7 +379,6 @@ int pm_send_set_resources(struct packet_manager *pm, packet->queue_mask_hi = upper_32_bits(res->queue_mask);
pm->priv_queue->submit_packet(pm->priv_queue); - pm->priv_queue->sync_with_hw(pm->priv_queue, KFD_HIQ_TIMEOUT);
mutex_unlock(&pm->lock);
@@ -416,7 +415,6 @@ int pm_send_runlist(struct packet_manager *pm, struct list_head *dqm_queues) goto fail_create_runlist;
pm->priv_queue->submit_packet(pm->priv_queue); - pm->priv_queue->sync_with_hw(pm->priv_queue, KFD_HIQ_TIMEOUT);
mutex_unlock(&pm->lock);
@@ -463,7 +461,7 @@ int pm_send_query_status(struct packet_manager *pm, uint64_t fence_address, packet->data_lo = lower_32_bits((uint64_t)fence_value);
pm->priv_queue->submit_packet(pm->priv_queue); - pm->priv_queue->sync_with_hw(pm->priv_queue, KFD_HIQ_TIMEOUT); + mutex_unlock(&pm->lock);
return 0; @@ -541,7 +539,6 @@ int pm_send_unmap_queue(struct packet_manager *pm, enum kfd_queue_type type, };
pm->priv_queue->submit_packet(pm->priv_queue); - pm->priv_queue->sync_with_hw(pm->priv_queue, KFD_HIQ_TIMEOUT);
mutex_unlock(&pm->lock); return 0;
On Thu, Jan 15, 2015 at 5:46 AM, Oded Gabbay oded.gabbay@amd.com wrote:
This small patch-set fixes a corner-case bug in kernel queue handling, which was caused by using a function called sync_with_hw().
The patch-set removes this function, as it is not needed anymore. As a result, we need to replace cpu_relax() with schedule() in DQM, so this patch-set takes care of that as well.
I would like this fix to be in 3.19 so please review.
I'm not really too familiar with these corner cases, but the logic seems correct so series is: Acked-by: Alex Deucher alexander.deucher@amd.com
Thanks,
Oded
Oded Gabbay (3): drm/amdkfd: Replace cpu_relax() with schedule() in DQM drm/amdkfd: Remove unused function busy_wait() drm/amdkfd: Remove sync_with_hw() from amdkfd
.../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 3 ++- drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.c | 24 ---------------------- drivers/gpu/drm/amd/amdkfd/kfd_kernel_queue.h | 2 -- drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager.c | 6 ------ drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c | 5 +---- 5 files changed, 3 insertions(+), 37 deletions(-)
-- 2.1.0
dri-devel mailing list dri-devel@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/dri-devel
dri-devel@lists.freedesktop.org