On my Kaveri system, streaming data from CPU to GPU via VRAM is faster than via GTT, both with the integrated GPU and with discrete GPUs.
The following kernel patches make this safe by always flushing the HDP cache before submitting a command stream to the GPU, and bump the radeon DRM minor version.
The following Mesa patches check for the bumped radeon DRM minor version; if the kernel is new enough, they prefer CPU -> GPU streaming via VRAM and relax the restrictions on persistent mappings.
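(Illustrative only: a minimal sketch of the user-space gating this implies. The helper name is hypothetical; the real check is in the Mesa patches below, using the drm_minor value reported by the winsys.)

#include <stdbool.h>

/* Prefer VRAM for CPU -> GPU streaming only when the kernel guarantees
 * an HDP cache flush before command stream execution, i.e. radeon DRM
 * version 2.40.0 or newer (see the kernel patches below). */
static bool can_stream_via_vram(int drm_major, int drm_minor)
{
        return drm_major > 2 || (drm_major == 2 && drm_minor >= 40);
}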
[PATCH 1/2] drm/radeon: s/ioctl_wait_idle/mmio_hdp_flush/
[PATCH 2/2] drm/radeon: Always flush the HDP cache before submitting a CS to the GPU
[PATCH 1/2] r600g/radeonsi: Reduce or even drop special treatment of persistent mappings
[PATCH 2/2] r600g/radeonsi: Prefer VRAM for CPU -> GPU streaming
From: Michel Dänzer <michel.daenzer@amd.com>
And clean up the function comment a little.
Signed-off-by: Michel Dänzer <michel.daenzer@amd.com>
---
 drivers/gpu/drm/radeon/r600.c        | 13 +++++------
 drivers/gpu/drm/radeon/radeon.h      |  9 ++------
 drivers/gpu/drm/radeon/radeon_asic.c | 44 ++++++++++++++++++------------------
 drivers/gpu/drm/radeon/radeon_asic.h |  2 +-
 drivers/gpu/drm/radeon/radeon_gem.c  |  6 ++---
 5 files changed, 34 insertions(+), 40 deletions(-)
diff --git a/drivers/gpu/drm/radeon/r600.c b/drivers/gpu/drm/radeon/r600.c
index c17ff5d..76e1616 100644
--- a/drivers/gpu/drm/radeon/r600.c
+++ b/drivers/gpu/drm/radeon/r600.c
@@ -4088,16 +4088,15 @@ int r600_debugfs_mc_info_init(struct radeon_device *rdev)
 }
 
 /**
- * r600_ioctl_wait_idle - flush host path cache on wait idle ioctl
+ * r600_mmio_hdp_flush - flush Host Data Path cache via MMIO
  * rdev: radeon device structure
- * bo: buffer object struct which userspace is waiting for idle
  *
- * Some R6XX/R7XX doesn't seems to take into account HDP flush performed
- * through ring buffer, this leads to corruption in rendering, see
- * http://bugzilla.kernel.org/show_bug.cgi?id=15186 to avoid this we
- * directly perform HDP flush by writing register through MMIO.
+ * Some R6XX/R7XX don't seem to take into account HDP flushes performed
+ * through the ring buffer. This leads to corruption in rendering, see
+ * http://bugzilla.kernel.org/show_bug.cgi?id=15186 . To avoid this, we
+ * directly perform the HDP flush by writing the register through MMIO.
  */
-void r600_ioctl_wait_idle(struct radeon_device *rdev, struct radeon_bo *bo)
+void r600_mmio_hdp_flush(struct radeon_device *rdev)
 {
         /* r7xx hw bug. write to HDP_DEBUG1 followed by fb read
          * rather than write to HDP_REG_COHERENCY_FLUSH_CNTL.
diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
index 6695b62..4a76e13 100644
--- a/drivers/gpu/drm/radeon/radeon.h
+++ b/drivers/gpu/drm/radeon/radeon.h
@@ -1771,13 +1771,8 @@ struct radeon_asic {
         int (*suspend)(struct radeon_device *rdev);
         void (*vga_set_state)(struct radeon_device *rdev, bool state);
         int (*asic_reset)(struct radeon_device *rdev);
-        /* ioctl hw specific callback. Some hw might want to perform special
-         * operation on specific ioctl. For instance on wait idle some hw
-         * might want to perform and HDP flush through MMIO as it seems that
-         * some R6XX/R7XX hw doesn't take HDP flush into account if programmed
-         * through ring.
-         */
-        void (*ioctl_wait_idle)(struct radeon_device *rdev, struct radeon_bo *bo);
+        /* Flush the HDP cache via MMIO */
+        void (*mmio_hdp_flush)(struct radeon_device *rdev);
         /* check if 3D engine is idle */
         bool (*gui_idle)(struct radeon_device *rdev);
         /* wait for mc_idle */
diff --git a/drivers/gpu/drm/radeon/radeon_asic.c b/drivers/gpu/drm/radeon/radeon_asic.c
index 34b9aa9..ba8caa7 100644
--- a/drivers/gpu/drm/radeon/radeon_asic.c
+++ b/drivers/gpu/drm/radeon/radeon_asic.c
@@ -194,7 +194,7 @@ static struct radeon_asic r100_asic = {
         .resume = &r100_resume,
         .vga_set_state = &r100_vga_set_state,
         .asic_reset = &r100_asic_reset,
-        .ioctl_wait_idle = NULL,
+        .mmio_hdp_flush = NULL,
         .gui_idle = &r100_gui_idle,
         .mc_wait_for_idle = &r100_mc_wait_for_idle,
         .gart = {
@@ -260,7 +260,7 @@ static struct radeon_asic r200_asic = {
         .resume = &r100_resume,
         .vga_set_state = &r100_vga_set_state,
         .asic_reset = &r100_asic_reset,
-        .ioctl_wait_idle = NULL,
+        .mmio_hdp_flush = NULL,
         .gui_idle = &r100_gui_idle,
         .mc_wait_for_idle = &r100_mc_wait_for_idle,
         .gart = {
@@ -340,7 +340,7 @@ static struct radeon_asic r300_asic = {
         .resume = &r300_resume,
         .vga_set_state = &r100_vga_set_state,
         .asic_reset = &r300_asic_reset,
-        .ioctl_wait_idle = NULL,
+        .mmio_hdp_flush = NULL,
         .gui_idle = &r100_gui_idle,
         .mc_wait_for_idle = &r300_mc_wait_for_idle,
         .gart = {
@@ -406,7 +406,7 @@ static struct radeon_asic r300_asic_pcie = {
         .resume = &r300_resume,
         .vga_set_state = &r100_vga_set_state,
         .asic_reset = &r300_asic_reset,
-        .ioctl_wait_idle = NULL,
+        .mmio_hdp_flush = NULL,
         .gui_idle = &r100_gui_idle,
         .mc_wait_for_idle = &r300_mc_wait_for_idle,
         .gart = {
@@ -472,7 +472,7 @@ static struct radeon_asic r420_asic = {
         .resume = &r420_resume,
         .vga_set_state = &r100_vga_set_state,
         .asic_reset = &r300_asic_reset,
-        .ioctl_wait_idle = NULL,
+        .mmio_hdp_flush = NULL,
         .gui_idle = &r100_gui_idle,
         .mc_wait_for_idle = &r300_mc_wait_for_idle,
         .gart = {
@@ -538,7 +538,7 @@ static struct radeon_asic rs400_asic = {
         .resume = &rs400_resume,
         .vga_set_state = &r100_vga_set_state,
         .asic_reset = &r300_asic_reset,
-        .ioctl_wait_idle = NULL,
+        .mmio_hdp_flush = NULL,
         .gui_idle = &r100_gui_idle,
         .mc_wait_for_idle = &rs400_mc_wait_for_idle,
         .gart = {
@@ -604,7 +604,7 @@ static struct radeon_asic rs600_asic = {
         .resume = &rs600_resume,
         .vga_set_state = &r100_vga_set_state,
         .asic_reset = &rs600_asic_reset,
-        .ioctl_wait_idle = NULL,
+        .mmio_hdp_flush = NULL,
         .gui_idle = &r100_gui_idle,
         .mc_wait_for_idle = &rs600_mc_wait_for_idle,
         .gart = {
@@ -672,7 +672,7 @@ static struct radeon_asic rs690_asic = {
         .resume = &rs690_resume,
         .vga_set_state = &r100_vga_set_state,
         .asic_reset = &rs600_asic_reset,
-        .ioctl_wait_idle = NULL,
+        .mmio_hdp_flush = NULL,
         .gui_idle = &r100_gui_idle,
         .mc_wait_for_idle = &rs690_mc_wait_for_idle,
         .gart = {
@@ -740,7 +740,7 @@ static struct radeon_asic rv515_asic = {
         .resume = &rv515_resume,
         .vga_set_state = &r100_vga_set_state,
         .asic_reset = &rs600_asic_reset,
-        .ioctl_wait_idle = NULL,
+        .mmio_hdp_flush = NULL,
         .gui_idle = &r100_gui_idle,
         .mc_wait_for_idle = &rv515_mc_wait_for_idle,
         .gart = {
@@ -806,7 +806,7 @@ static struct radeon_asic r520_asic = {
         .resume = &r520_resume,
         .vga_set_state = &r100_vga_set_state,
         .asic_reset = &rs600_asic_reset,
-        .ioctl_wait_idle = NULL,
+        .mmio_hdp_flush = NULL,
         .gui_idle = &r100_gui_idle,
         .mc_wait_for_idle = &r520_mc_wait_for_idle,
         .gart = {
@@ -898,7 +898,7 @@ static struct radeon_asic r600_asic = {
         .resume = &r600_resume,
         .vga_set_state = &r600_vga_set_state,
         .asic_reset = &r600_asic_reset,
-        .ioctl_wait_idle = r600_ioctl_wait_idle,
+        .mmio_hdp_flush = r600_mmio_hdp_flush,
         .gui_idle = &r600_gui_idle,
         .mc_wait_for_idle = &r600_mc_wait_for_idle,
         .get_xclk = &r600_get_xclk,
@@ -970,7 +970,7 @@ static struct radeon_asic rv6xx_asic = {
         .resume = &r600_resume,
         .vga_set_state = &r600_vga_set_state,
         .asic_reset = &r600_asic_reset,
-        .ioctl_wait_idle = r600_ioctl_wait_idle,
+        .mmio_hdp_flush = r600_mmio_hdp_flush,
         .gui_idle = &r600_gui_idle,
         .mc_wait_for_idle = &r600_mc_wait_for_idle,
         .get_xclk = &r600_get_xclk,
@@ -1060,7 +1060,7 @@ static struct radeon_asic rs780_asic = {
         .resume = &r600_resume,
         .vga_set_state = &r600_vga_set_state,
         .asic_reset = &r600_asic_reset,
-        .ioctl_wait_idle = r600_ioctl_wait_idle,
+        .mmio_hdp_flush = r600_mmio_hdp_flush,
         .gui_idle = &r600_gui_idle,
         .mc_wait_for_idle = &r600_mc_wait_for_idle,
         .get_xclk = &r600_get_xclk,
@@ -1163,7 +1163,7 @@ static struct radeon_asic rv770_asic = {
         .resume = &rv770_resume,
         .asic_reset = &r600_asic_reset,
         .vga_set_state = &r600_vga_set_state,
-        .ioctl_wait_idle = r600_ioctl_wait_idle,
+        .mmio_hdp_flush = r600_mmio_hdp_flush,
         .gui_idle = &r600_gui_idle,
         .mc_wait_for_idle = &r600_mc_wait_for_idle,
         .get_xclk = &rv770_get_xclk,
@@ -1281,7 +1281,7 @@ static struct radeon_asic evergreen_asic = {
         .resume = &evergreen_resume,
         .asic_reset = &evergreen_asic_reset,
         .vga_set_state = &r600_vga_set_state,
-        .ioctl_wait_idle = r600_ioctl_wait_idle,
+        .mmio_hdp_flush = r600_mmio_hdp_flush,
         .gui_idle = &r600_gui_idle,
         .mc_wait_for_idle = &evergreen_mc_wait_for_idle,
         .get_xclk = &rv770_get_xclk,
@@ -1373,7 +1373,7 @@ static struct radeon_asic sumo_asic = {
         .resume = &evergreen_resume,
         .asic_reset = &evergreen_asic_reset,
         .vga_set_state = &r600_vga_set_state,
-        .ioctl_wait_idle = r600_ioctl_wait_idle,
+        .mmio_hdp_flush = r600_mmio_hdp_flush,
         .gui_idle = &r600_gui_idle,
         .mc_wait_for_idle = &evergreen_mc_wait_for_idle,
         .get_xclk = &r600_get_xclk,
@@ -1464,7 +1464,7 @@ static struct radeon_asic btc_asic = {
         .resume = &evergreen_resume,
         .asic_reset = &evergreen_asic_reset,
         .vga_set_state = &r600_vga_set_state,
-        .ioctl_wait_idle = r600_ioctl_wait_idle,
+        .mmio_hdp_flush = r600_mmio_hdp_flush,
         .gui_idle = &r600_gui_idle,
         .mc_wait_for_idle = &evergreen_mc_wait_for_idle,
         .get_xclk = &rv770_get_xclk,
@@ -1599,7 +1599,7 @@ static struct radeon_asic cayman_asic = {
         .resume = &cayman_resume,
         .asic_reset = &cayman_asic_reset,
         .vga_set_state = &r600_vga_set_state,
-        .ioctl_wait_idle = r600_ioctl_wait_idle,
+        .mmio_hdp_flush = r600_mmio_hdp_flush,
         .gui_idle = &r600_gui_idle,
         .mc_wait_for_idle = &evergreen_mc_wait_for_idle,
         .get_xclk = &rv770_get_xclk,
@@ -1699,7 +1699,7 @@ static struct radeon_asic trinity_asic = {
         .resume = &cayman_resume,
         .asic_reset = &cayman_asic_reset,
         .vga_set_state = &r600_vga_set_state,
-        .ioctl_wait_idle = r600_ioctl_wait_idle,
+        .mmio_hdp_flush = r600_mmio_hdp_flush,
         .gui_idle = &r600_gui_idle,
         .mc_wait_for_idle = &evergreen_mc_wait_for_idle,
         .get_xclk = &r600_get_xclk,
@@ -1829,7 +1829,7 @@ static struct radeon_asic si_asic = {
         .resume = &si_resume,
         .asic_reset = &si_asic_reset,
         .vga_set_state = &r600_vga_set_state,
-        .ioctl_wait_idle = r600_ioctl_wait_idle,
+        .mmio_hdp_flush = r600_mmio_hdp_flush,
         .gui_idle = &r600_gui_idle,
         .mc_wait_for_idle = &evergreen_mc_wait_for_idle,
         .get_xclk = &si_get_xclk,
@@ -1987,7 +1987,7 @@ static struct radeon_asic ci_asic = {
         .resume = &cik_resume,
         .asic_reset = &cik_asic_reset,
         .vga_set_state = &r600_vga_set_state,
-        .ioctl_wait_idle = NULL,
+        .mmio_hdp_flush = NULL,
         .gui_idle = &r600_gui_idle,
         .mc_wait_for_idle = &evergreen_mc_wait_for_idle,
         .get_xclk = &cik_get_xclk,
@@ -2091,7 +2091,7 @@ static struct radeon_asic kv_asic = {
         .resume = &cik_resume,
         .asic_reset = &cik_asic_reset,
         .vga_set_state = &r600_vga_set_state,
-        .ioctl_wait_idle = NULL,
+        .mmio_hdp_flush = NULL,
         .gui_idle = &r600_gui_idle,
         .mc_wait_for_idle = &evergreen_mc_wait_for_idle,
         .get_xclk = &cik_get_xclk,
diff --git a/drivers/gpu/drm/radeon/radeon_asic.h b/drivers/gpu/drm/radeon/radeon_asic.h
index f632e31..b8826c6 100644
--- a/drivers/gpu/drm/radeon/radeon_asic.h
+++ b/drivers/gpu/drm/radeon/radeon_asic.h
@@ -351,7 +351,7 @@ void r600_hpd_fini(struct radeon_device *rdev);
 bool r600_hpd_sense(struct radeon_device *rdev, enum radeon_hpd_id hpd);
 void r600_hpd_set_polarity(struct radeon_device *rdev,
                            enum radeon_hpd_id hpd);
-extern void r600_ioctl_wait_idle(struct radeon_device *rdev, struct radeon_bo *bo);
+extern void r600_mmio_hdp_flush(struct radeon_device *rdev);
 extern bool r600_gui_idle(struct radeon_device *rdev);
 extern void r600_pm_misc(struct radeon_device *rdev);
 extern void r600_pm_init_profile(struct radeon_device *rdev);
diff --git a/drivers/gpu/drm/radeon/radeon_gem.c b/drivers/gpu/drm/radeon/radeon_gem.c
index 08756f6..a350cf9 100644
--- a/drivers/gpu/drm/radeon/radeon_gem.c
+++ b/drivers/gpu/drm/radeon/radeon_gem.c
@@ -366,9 +366,9 @@ int radeon_gem_wait_idle_ioctl(struct drm_device *dev, void *data,
         }
         robj = gem_to_radeon_bo(gobj);
         r = radeon_bo_wait(robj, NULL, false);
-        /* callback hw specific functions if any */
-        if (rdev->asic->ioctl_wait_idle)
-                robj->rdev->asic->ioctl_wait_idle(rdev, robj);
+        /* Flush HDP cache via MMIO if necessary */
+        if (rdev->asic->mmio_hdp_flush)
+                robj->rdev->asic->mmio_hdp_flush(rdev);
         drm_gem_object_unreference_unlocked(gobj);
         r = radeon_gem_handle_lockup(rdev, r);
         return r;
On Thu, Jul 31, 2014 at 5:43 AM, Michel Dänzer <michel@daenzer.net> wrote:
Applied the series to my 3.17 tree.
Alex
From: Michel Dänzer <michel.daenzer@amd.com>
This ensures the GPU sees all previous CPU writes to VRAM, which makes it safe:
* For userspace to stream data from CPU to GPU via VRAM instead of GTT
* For IBs to be stored in VRAM instead of GTT
* For ring buffers to be stored in VRAM instead of GTT, if the HDP flush
  is performed via MMIO
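(Not part of the patch: a rough sketch of the first use case, with hypothetical bo_*/cs_* helpers standing in for a real winsys API.)

#include <string.h>

struct bo;
void *bo_map_wc(struct bo *bo);   /* write-combined CPU mapping */
void bo_unmap(struct bo *bo);
void cs_emit_read(struct bo *bo); /* command stream reading the BO */
void cs_flush(void);              /* submits the CS to the kernel */

/* CPU -> GPU streaming through VRAM: the CPU writes may still sit in
 * the HDP cache at submission time; with this patch the kernel flushes
 * the HDP cache before the GPU executes the CS, so the GPU is
 * guaranteed to see the data. */
void stream_to_vram(struct bo *vram_bo, const void *data, size_t size)
{
        void *map = bo_map_wc(vram_bo);

        memcpy(map, data, size);
        bo_unmap(vram_bo);
        cs_emit_read(vram_bo);
        cs_flush();
}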
Signed-off-by: Michel Dänzer <michel.daenzer@amd.com>
---
 drivers/gpu/drm/radeon/cik.c         |  4 ----
 drivers/gpu/drm/radeon/r100.c        | 20 +++++++++++++++-----
 drivers/gpu/drm/radeon/radeon.h      |  1 +
 drivers/gpu/drm/radeon/radeon_asic.c |  6 ++++--
 drivers/gpu/drm/radeon/radeon_asic.h |  3 ++-
 drivers/gpu/drm/radeon/radeon_drv.c  |  4 +++-
 drivers/gpu/drm/radeon/radeon_ring.c | 10 ++++++++++
 7 files changed, 35 insertions(+), 13 deletions(-)
diff --git a/drivers/gpu/drm/radeon/cik.c b/drivers/gpu/drm/radeon/cik.c
index 67194a5..a5dfe58 100644
--- a/drivers/gpu/drm/radeon/cik.c
+++ b/drivers/gpu/drm/radeon/cik.c
@@ -3666,8 +3666,6 @@ void cik_fence_gfx_ring_emit(struct radeon_device *rdev,
         radeon_ring_write(ring, (upper_32_bits(addr) & 0xffff) | DATA_SEL(1) | INT_SEL(2));
         radeon_ring_write(ring, fence->seq);
         radeon_ring_write(ring, 0);
-        /* HDP flush */
-        cik_hdp_flush_cp_ring_emit(rdev, fence->ring);
 }
 
 /**
@@ -3696,8 +3694,6 @@ void cik_fence_compute_ring_emit(struct radeon_device *rdev,
         radeon_ring_write(ring, upper_32_bits(addr));
         radeon_ring_write(ring, fence->seq);
         radeon_ring_write(ring, 0);
-        /* HDP flush */
-        cik_hdp_flush_cp_ring_emit(rdev, fence->ring);
 }
 
 bool cik_semaphore_ring_emit(struct radeon_device *rdev,
diff --git a/drivers/gpu/drm/radeon/r100.c b/drivers/gpu/drm/radeon/r100.c
index 9241b89..e1bed43 100644
--- a/drivers/gpu/drm/radeon/r100.c
+++ b/drivers/gpu/drm/radeon/r100.c
@@ -837,11 +837,7 @@ void r100_fence_ring_emit(struct radeon_device *rdev,
         /* Wait until IDLE & CLEAN */
         radeon_ring_write(ring, PACKET0(RADEON_WAIT_UNTIL, 0));
         radeon_ring_write(ring, RADEON_WAIT_2D_IDLECLEAN | RADEON_WAIT_3D_IDLECLEAN);
-        radeon_ring_write(ring, PACKET0(RADEON_HOST_PATH_CNTL, 0));
-        radeon_ring_write(ring, rdev->config.r100.hdp_cntl |
-                                RADEON_HDP_READ_BUFFER_INVALIDATE);
-        radeon_ring_write(ring, PACKET0(RADEON_HOST_PATH_CNTL, 0));
-        radeon_ring_write(ring, rdev->config.r100.hdp_cntl);
+        r100_ring_hdp_flush(rdev, ring);
         /* Emit fence sequence & fire IRQ */
         radeon_ring_write(ring, PACKET0(rdev->fence_drv[fence->ring].scratch_reg, 0));
         radeon_ring_write(ring, fence->seq);
@@ -1060,6 +1056,20 @@ void r100_gfx_set_wptr(struct radeon_device *rdev,
         (void)RREG32(RADEON_CP_RB_WPTR);
 }
 
+/**
+ * r100_ring_hdp_flush - flush Host Data Path via the ring buffer
+ * rdev: radeon device structure
+ * ring: ring buffer struct for emitting packets
+ */
+void r100_ring_hdp_flush(struct radeon_device *rdev, struct radeon_ring *ring)
+{
+        radeon_ring_write(ring, PACKET0(RADEON_HOST_PATH_CNTL, 0));
+        radeon_ring_write(ring, rdev->config.r100.hdp_cntl |
+                                RADEON_HDP_READ_BUFFER_INVALIDATE);
+        radeon_ring_write(ring, PACKET0(RADEON_HOST_PATH_CNTL, 0));
+        radeon_ring_write(ring, rdev->config.r100.hdp_cntl);
+}
+
 static void r100_cp_load_microcode(struct radeon_device *rdev)
 {
         const __be32 *fw_data;
diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
index 4a76e13..bc970b6 100644
--- a/drivers/gpu/drm/radeon/radeon.h
+++ b/drivers/gpu/drm/radeon/radeon.h
@@ -1748,6 +1748,7 @@ struct radeon_asic_ring {
         /* command emmit functions */
         void (*ib_execute)(struct radeon_device *rdev, struct radeon_ib *ib);
         void (*emit_fence)(struct radeon_device *rdev, struct radeon_fence *fence);
+        void (*hdp_flush)(struct radeon_device *rdev, struct radeon_ring *ring);
         bool (*emit_semaphore)(struct radeon_device *rdev, struct radeon_ring *cp,
                                struct radeon_semaphore *semaphore, bool emit_wait);
         void (*vm_flush)(struct radeon_device *rdev, int ridx, struct radeon_vm *vm);
diff --git a/drivers/gpu/drm/radeon/radeon_asic.c b/drivers/gpu/drm/radeon/radeon_asic.c
index ba8caa7..1cb9330 100644
--- a/drivers/gpu/drm/radeon/radeon_asic.c
+++ b/drivers/gpu/drm/radeon/radeon_asic.c
@@ -185,6 +185,7 @@ static struct radeon_asic_ring r100_gfx_ring = {
         .get_rptr = &r100_gfx_get_rptr,
         .get_wptr = &r100_gfx_get_wptr,
         .set_wptr = &r100_gfx_set_wptr,
+        .hdp_flush = &r100_ring_hdp_flush,
 };
 
 static struct radeon_asic r100_asic = {
@@ -331,6 +332,7 @@ static struct radeon_asic_ring r300_gfx_ring = {
         .get_rptr = &r100_gfx_get_rptr,
         .get_wptr = &r100_gfx_get_wptr,
         .set_wptr = &r100_gfx_set_wptr,
+        .hdp_flush = &r100_ring_hdp_flush,
 };
 
 static struct radeon_asic r300_asic = {
@@ -1987,7 +1989,7 @@ static struct radeon_asic ci_asic = {
         .resume = &cik_resume,
         .asic_reset = &cik_asic_reset,
         .vga_set_state = &r600_vga_set_state,
-        .mmio_hdp_flush = NULL,
+        .mmio_hdp_flush = &r600_mmio_hdp_flush,
         .gui_idle = &r600_gui_idle,
         .mc_wait_for_idle = &evergreen_mc_wait_for_idle,
         .get_xclk = &cik_get_xclk,
@@ -2091,7 +2093,7 @@ static struct radeon_asic kv_asic = {
         .resume = &cik_resume,
         .asic_reset = &cik_asic_reset,
         .vga_set_state = &r600_vga_set_state,
-        .mmio_hdp_flush = NULL,
+        .mmio_hdp_flush = &r600_mmio_hdp_flush,
         .gui_idle = &r600_gui_idle,
         .mc_wait_for_idle = &evergreen_mc_wait_for_idle,
         .get_xclk = &cik_get_xclk,
diff --git a/drivers/gpu/drm/radeon/radeon_asic.h b/drivers/gpu/drm/radeon/radeon_asic.h
index b8826c6..3cf6be6 100644
--- a/drivers/gpu/drm/radeon/radeon_asic.h
+++ b/drivers/gpu/drm/radeon/radeon_asic.h
@@ -148,7 +148,8 @@ u32 r100_gfx_get_wptr(struct radeon_device *rdev,
                       struct radeon_ring *ring);
 void r100_gfx_set_wptr(struct radeon_device *rdev,
                        struct radeon_ring *ring);
-
+void r100_ring_hdp_flush(struct radeon_device *rdev,
+                         struct radeon_ring *ring);
 /*
  * r200,rv250,rs300,rv280
  */
diff --git a/drivers/gpu/drm/radeon/radeon_drv.c b/drivers/gpu/drm/radeon/radeon_drv.c
index e9e3610..54aa293 100644
--- a/drivers/gpu/drm/radeon/radeon_drv.c
+++ b/drivers/gpu/drm/radeon/radeon_drv.c
@@ -82,9 +82,11 @@
  * 2.38.0 - RADEON_GEM_OP (GET_INITIAL_DOMAIN, SET_INITIAL_DOMAIN),
  *          CIK: 1D and linear tiling modes contain valid PIPE_CONFIG
  * 2.39.0 - Add INFO query for number of active CUs
+ * 2.40.0 - Add RADEON_GEM_GTT_WC/UC, flush HDP cache before submitting
+ *          CS to GPU
  */
 #define KMS_DRIVER_MAJOR        2
-#define KMS_DRIVER_MINOR        39
+#define KMS_DRIVER_MINOR        40
 #define KMS_DRIVER_PATCHLEVEL   0
 int radeon_driver_load_kms(struct drm_device *dev, unsigned long flags);
 int radeon_driver_unload_kms(struct drm_device *dev);
diff --git a/drivers/gpu/drm/radeon/radeon_ring.c b/drivers/gpu/drm/radeon/radeon_ring.c
index 20b0e4f..ec27e05 100644
--- a/drivers/gpu/drm/radeon/radeon_ring.c
+++ b/drivers/gpu/drm/radeon/radeon_ring.c
@@ -445,11 +445,21 @@ int radeon_ring_lock(struct radeon_device *rdev, struct radeon_ring *ring, unsigned ndw)
  */
 void radeon_ring_commit(struct radeon_device *rdev, struct radeon_ring *ring)
 {
+        /* If we are emitting the HDP flush via the ring buffer, we need to
+         * do it before padding.
+         */
+        if (rdev->asic->ring[ring->idx]->hdp_flush)
+                rdev->asic->ring[ring->idx]->hdp_flush(rdev, ring);
         /* We pad to match fetch size */
         while (ring->wptr & ring->align_mask) {
                 radeon_ring_write(ring, ring->nop);
         }
         mb();
+        /* If we are emitting the HDP flush via MMIO, we need to do it after
+         * all CPU writes to VRAM finished.
+         */
+        if (rdev->asic->mmio_hdp_flush)
+                rdev->asic->mmio_hdp_flush(rdev);
         radeon_ring_set_wptr(rdev, ring);
 }
From: Michel Dänzer <michel.daenzer@amd.com>
Signed-off-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/radeon/r600_buffer_common.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)
diff --git a/src/gallium/drivers/radeon/r600_buffer_common.c b/src/gallium/drivers/radeon/r600_buffer_common.c
index 4e6b897..154c33d 100644
--- a/src/gallium/drivers/radeon/r600_buffer_common.c
+++ b/src/gallium/drivers/radeon/r600_buffer_common.c
@@ -127,13 +127,17 @@ bool r600_init_resource(struct r600_common_screen *rscreen,
                 break;
         }
 
-        /* Use GTT for all persistent mappings, because they are
-         * always cached and coherent. */
-        if (res->b.b.target == PIPE_BUFFER &&
+        /* Use GTT for all persistent mappings with older kernels, because they
+         * didn't always flush the HDP cache before CS execution.
+         *
+         * Write-combined CPU mappings are fine, the kernel ensures all CPU
+         * writes finish before the GPU executes a command stream.
+         */
+        if (rscreen->info.drm_minor < 40 &&
+            res->b.b.target == PIPE_BUFFER &&
             res->b.b.flags & (PIPE_RESOURCE_FLAG_MAP_PERSISTENT |
                               PIPE_RESOURCE_FLAG_MAP_COHERENT)) {
                 res->domains = RADEON_DOMAIN_GTT;
-                flags = 0;
         }
 
         /* Tiled textures are unmappable. Always put them in VRAM. */
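(For context, the buffers affected here are typically created through ARB_buffer_storage. A minimal GL-side sketch, assuming a GL 4.4 context with loaded entry points:)

/* Persistently + coherently mapped buffer. Mesa previously forced such
 * buffers into GTT; with DRM minor >= 40 the requested domain (e.g.
 * VRAM) can be honoured, since the kernel now flushes the HDP cache
 * before each command stream. */
const GLbitfield flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT |
                         GL_MAP_COHERENT_BIT;
GLuint buf;

glGenBuffers(1, &buf);
glBindBuffer(GL_ARRAY_BUFFER, buf);
glBufferStorage(GL_ARRAY_BUFFER, 1 << 20, NULL, flags);
void *map = glMapBufferRange(GL_ARRAY_BUFFER, 0, 1 << 20, flags);
/* The application can now stream data through 'map' every frame without
 * remapping, while the GPU consumes it. */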
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Marek
On Thu, Jul 31, 2014 at 11:43 AM, Michel Dänzer <michel@daenzer.net> wrote:
From: Michel Dänzer <michel.daenzer@amd.com>
Signed-off-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/drivers/radeon/r600_buffer_common.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)
diff --git a/src/gallium/drivers/radeon/r600_buffer_common.c b/src/gallium/drivers/radeon/r600_buffer_common.c
index 154c33d..d747cbc 100644
--- a/src/gallium/drivers/radeon/r600_buffer_common.c
+++ b/src/gallium/drivers/radeon/r600_buffer_common.c
@@ -110,14 +110,21 @@ bool r600_init_resource(struct r600_common_screen *rscreen,
         enum radeon_bo_flag flags = 0;
 
         switch (res->b.b.usage) {
-        case PIPE_USAGE_DYNAMIC:
-        case PIPE_USAGE_STREAM:
-                flags = RADEON_FLAG_GTT_WC;
-                /* fall through */
         case PIPE_USAGE_STAGING:
                 /* Transfers are likely to occur more often with these resources. */
                 res->domains = RADEON_DOMAIN_GTT;
                 break;
+        case PIPE_USAGE_STREAM:
+        case PIPE_USAGE_DYNAMIC:
+                /* Older kernels didn't always flush the HDP cache before
+                 * CS execution
+                 */
+                if (rscreen->info.drm_minor < 40) {
+                        res->domains = RADEON_DOMAIN_GTT;
+                        flags = RADEON_FLAG_GTT_WC;
+                        break;
+                }
+                /* fall through */
         case PIPE_USAGE_DEFAULT:
         case PIPE_USAGE_IMMUTABLE:
         default:
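(Taken together with patch 1, the resulting placement policy can be summarised by this simplified, self-contained model. The enum and function are mine, not the driver's; the real code also handles textures and extra flags.)

enum domain { DOMAIN_GTT, DOMAIN_VRAM };
enum usage  { USAGE_DEFAULT, USAGE_IMMUTABLE, USAGE_DYNAMIC,
              USAGE_STREAM, USAGE_STAGING };

/* Model of r600_init_resource() after both Mesa patches: staging stays
 * in CPU-cached GTT for fast readback; stream/dynamic prefer VRAM once
 * the kernel flushes the HDP cache before CS execution (DRM minor >= 40). */
static enum domain buffer_domain(enum usage usage, int drm_minor)
{
        switch (usage) {
        case USAGE_STAGING:
                return DOMAIN_GTT;              /* fast CPU access */
        case USAGE_STREAM:
        case USAGE_DYNAMIC:
                if (drm_minor < 40)
                        return DOMAIN_GTT;      /* old kernel: GTT + WC */
                /* fall through */
        default:
                return DOMAIN_VRAM;             /* fast GPU access */
        }
}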
On 31.07.2014 at 11:43, Michel Dänzer wrote:
From: Michel Dänzer <michel.daenzer@amd.com>
Signed-off-by: Michel Dänzer <michel.daenzer@amd.com>
At least for PIPE_USAGE_STREAM buffers that's a bad idea, because they are used by VDPAU to read data back to a CPU buffer, and that's really slow from VRAM.
Christian.
On 31.07.2014 18:52, Christian König wrote:
From src/gallium/docs/source/screen.rst:
* ``PIPE_USAGE_DEFAULT``: Optimized for fast GPU access.
* ``PIPE_USAGE_IMMUTABLE``: Optimized for fast GPU access and the resource is
  not expected to be mapped or changed (even by the GPU) after the first upload.
* ``PIPE_USAGE_DYNAMIC``: Expect frequent write-only CPU access. What is
  uploaded is expected to be used at least several times by the GPU.
* ``PIPE_USAGE_STREAM``: Expect frequent write-only CPU access. What is
  uploaded is expected to be used only once by the GPU.
* ``PIPE_USAGE_STAGING``: Optimized for fast CPU access.
That reads to me like only PIPE_USAGE_STAGING is expected to provide fast CPU reads.
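(In other words, readback-heavy paths should bounce through a staging buffer. A rough sketch, assuming the Gallium types from p_state.h/p_defines.h; the helpers are hypothetical wrappers, not the actual entry points:)

struct pipe_resource *create_buffer(struct pipe_screen *s, unsigned usage,
                                    unsigned size);
void copy_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
                 struct pipe_resource *src, unsigned size);
const void *map_for_read(struct pipe_context *ctx, struct pipe_resource *buf);

/* Read back GPU-written data without mapping VRAM directly: copy into a
 * PIPE_USAGE_STAGING buffer (placed in CPU-cached GTT) on the GPU, then
 * map that for reading. */
const void *read_back(struct pipe_context *ctx, struct pipe_screen *screen,
                      struct pipe_resource *src, unsigned size)
{
        struct pipe_resource *staging =
                create_buffer(screen, PIPE_USAGE_STAGING, size);

        copy_buffer(ctx, staging, src, size);
        return map_for_read(ctx, staging); /* waits for the copy to finish */
}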
On 31.07.2014 at 11:57, Michel Dänzer wrote:
Forget what I wrote; we handle this by letting the driver copy the bitmap content to a staging texture. All other use cases indeed use PIPE_USAGE_STAGING.
Christian.
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Marek
On Thu, Jul 31, 2014 at 11:43 AM, Michel Dänzer <michel@daenzer.net> wrote: