From: Alex Deucher alexander.deucher@amd.com
Allows us to use the DMA ring from userspace. DMA doesn't have a good NOP packet in which to embed the reloc idx, so userspace has to add a reloc for each buffer used and order them to match the command stream.
Signed-off-by: Alex Deucher alexander.deucher@amd.com --- drivers/gpu/drm/radeon/r600_cs.c | 193 ++++++++++++++++++++++++++++++++++ drivers/gpu/drm/radeon/radeon.h | 1 + drivers/gpu/drm/radeon/radeon_asic.c | 6 +- drivers/gpu/drm/radeon/radeon_asic.h | 1 + drivers/gpu/drm/radeon/radeon_cs.c | 1 + 5 files changed, 199 insertions(+), 3 deletions(-)
diff --git a/drivers/gpu/drm/radeon/r600_cs.c b/drivers/gpu/drm/radeon/r600_cs.c index 5d6e7f9..f23609ac 100644 --- a/drivers/gpu/drm/radeon/r600_cs.c +++ b/drivers/gpu/drm/radeon/r600_cs.c @@ -2568,3 +2568,196 @@ void r600_cs_legacy_init(void) { r600_cs_packet_next_reloc = &r600_cs_packet_next_reloc_nomm; } + +/* + * DMA + */ +/** + * r600_dma_cs_next_reloc() - parse next reloc + * @p: parser structure holding parsing context. + * @cs_reloc: reloc informations + * + * Return the next reloc, do bo validation and compute + * GPU offset using the provided start. + **/ +int r600_dma_cs_next_reloc(struct radeon_cs_parser *p, + struct radeon_cs_reloc **cs_reloc) +{ + struct radeon_cs_chunk *relocs_chunk; + unsigned idx; + + if (p->chunk_relocs_idx == -1) { + DRM_ERROR("No relocation chunk !\n"); + return -EINVAL; + } + *cs_reloc = NULL; + relocs_chunk = &p->chunks[p->chunk_relocs_idx]; + idx = p->dma_reloc_idx; + if (idx >= relocs_chunk->length_dw) { + DRM_ERROR("Relocs at %d after relocations chunk end %d !\n", + idx, relocs_chunk->length_dw); + return -EINVAL; + } + /* FIXME: we assume reloc size is 4 dwords */ + *cs_reloc = p->relocs_ptr[(idx / 4)]; + p->dma_reloc_idx++; + return 0; +} + +#define GET_DMA_CMD(h) (((h) & 0xf0000000) >> 28) +#define GET_DMA_COUNT(h) ((h) & 0x0000ffff) +#define GET_DMA_T(h) (((h) & 0x00800000) >> 23) + +/** + * r600_dma_cs_parse() - parse the DMA IB + * @p: parser structure holding parsing context. + * + * Parses the DMA IB from the CS ioctl and updates + * the GPU addresses based on the reloc information and + * checks for errors. (R6xx-R7xx) + * Returns 0 for success and an error on failure. + **/ +int r600_dma_cs_parse(struct radeon_cs_parser *p) +{ + struct radeon_cs_chunk *ib_chunk = &p->chunks[p->chunk_ib_idx]; + struct radeon_cs_reloc *src_reloc, *dst_reloc; + u32 header, cmd, count, tiled; + volatile u32 *ib = p->ib.ptr; + u32 idx, idx_value; + u64 src_offset, dst_offset; + int r; + + do { + if (p->idx >= ib_chunk->length_dw) { + DRM_ERROR("Can not parse packet at %d after CS end %d !\n", + p->idx, ib_chunk->length_dw); + return -EINVAL; + } + idx = p->idx; + header = radeon_get_ib_value(p, idx); + cmd = GET_DMA_CMD(header); + count = GET_DMA_COUNT(header); + tiled = GET_DMA_T(header); + + switch (cmd) { + case DMA_PACKET_WRITE: + r = r600_dma_cs_next_reloc(p, &dst_reloc); + if (r) { + DRM_ERROR("bad DMA_PACKET_WRITE\n"); + return -EINVAL; + } + if (tiled) { + ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8); + dst_offset = ib[idx+1]; + dst_offset <<= 8; + p->idx += count + 5; + } else { + ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc); + ib[idx+2] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff; + dst_offset = ib[idx+1]; + dst_offset |= ((u64)(ib[idx+2] & 0xff)) << 32; + p->idx += count + 3; + } + if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) { + dev_warn(p->dev, "DMA write buffer too small (%llu %lu)\n", + dst_offset, radeon_bo_size(dst_reloc->robj)); + return -EINVAL; + } + break; + case DMA_PACKET_COPY: + r = r600_dma_cs_next_reloc(p, &src_reloc); + if (r) { + DRM_ERROR("bad DMA_PACKET_COPY\n"); + return -EINVAL; + } + r = r600_dma_cs_next_reloc(p, &dst_reloc); + if (r) { + DRM_ERROR("bad DMA_PACKET_COPY\n"); + return -EINVAL; + } + if (tiled) { + idx_value = radeon_get_ib_value(p, idx + 2); + /* detile bit */ + if (idx_value & (1 << 31)) { + /* tiled src, linear dst */ + ib[idx+1] += (u32)(src_reloc->lobj.gpu_offset >> 8); + src_offset = ib[idx+1]; + src_offset <<= 8; + + ib[idx+5] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc); + ib[idx+6] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff; + dst_offset = ib[idx+5]; + dst_offset |= ((u64)(ib[idx+6] & 0xff)) << 32; + } else { + /* linear src, tiled dst */ + ib[idx+5] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc); + ib[idx+6] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff; + src_offset = ib[idx+5]; + src_offset |= ((u64)(ib[idx+6] & 0xff)) << 32; + + ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8); + dst_offset = ib[idx+1]; + dst_offset <<= 8; + } + p->idx += 7; + } else { + ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc); + ib[idx+2] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc); + ib[idx+3] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff; + ib[idx+4] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff; + + src_offset = ib[idx+2]; + src_offset |= ((u64)(ib[idx+4] & 0xff)) << 32; + dst_offset = ib[idx+1]; + dst_offset |= ((u64)(ib[idx+3] & 0xff)) << 32; + + p->idx += 5; + } + if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) { + dev_warn(p->dev, "DMA copy src buffer too small (%llu %lu)\n", + src_offset, radeon_bo_size(src_reloc->robj)); + return -EINVAL; + } + if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) { + dev_warn(p->dev, "DMA write dst buffer too small (%llu %lu)\n", + dst_offset, radeon_bo_size(dst_reloc->robj)); + return -EINVAL; + } + break; + case DMA_PACKET_CONSTANT_FILL: + if (p->family < CHIP_RV770) { + DRM_ERROR("Constant Fill is 7xx only !\n"); + return -EINVAL; + } + r = r600_dma_cs_next_reloc(p, &dst_reloc); + if (r) { + DRM_ERROR("bad DMA_PACKET_WRITE\n"); + return -EINVAL; + } + ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc); + ib[idx+3] += (upper_32_bits(dst_reloc->lobj.gpu_offset) << 16) & 0x00ff0000; + dst_offset = ib[idx+1]; + dst_offset |= ((u64)(ib[idx+3] & 0x00ff0000)) << 16; + if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) { + dev_warn(p->dev, "DMA constant fill buffer too small (%llu %lu)\n", + dst_offset, radeon_bo_size(dst_reloc->robj)); + return -EINVAL; + } + p->idx += 4; + break; + case DMA_PACKET_NOP: + p->idx += 1; + break; + default: + DRM_ERROR("Unknown packet type %d at %d !\n", cmd, idx); + return -EINVAL; + } + } while (p->idx < p->chunks[p->chunk_ib_idx].length_dw); +#if 0 + for (r = 0; r < p->ib->length_dw; r++) { + printk(KERN_INFO "%05d 0x%08X\n", r, p->ib.ptr[r]); + mdelay(1); + } +#endif + return 0; +} diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h index 285fb3f..5dc744d 100644 --- a/drivers/gpu/drm/radeon/radeon.h +++ b/drivers/gpu/drm/radeon/radeon.h @@ -839,6 +839,7 @@ struct radeon_cs_parser { struct radeon_cs_reloc *relocs; struct radeon_cs_reloc **relocs_ptr; struct list_head validated; + unsigned dma_reloc_idx; /* indices of various chunks */ int chunk_ib_idx; int chunk_relocs_idx; diff --git a/drivers/gpu/drm/radeon/radeon_asic.c b/drivers/gpu/drm/radeon/radeon_asic.c index 3ea0475..d360341 100644 --- a/drivers/gpu/drm/radeon/radeon_asic.c +++ b/drivers/gpu/drm/radeon/radeon_asic.c @@ -952,7 +952,7 @@ static struct radeon_asic r600_asic = { .ib_execute = &r600_dma_ring_ib_execute, .emit_fence = &r600_dma_fence_ring_emit, .emit_semaphore = &r600_dma_semaphore_ring_emit, - .cs_parse = NULL, + .cs_parse = &r600_dma_cs_parse, .ring_test = &r600_dma_ring_test, .ib_test = &r600_dma_ib_test, .is_lockup = &r600_dma_is_lockup, @@ -1036,7 +1036,7 @@ static struct radeon_asic rs780_asic = { .ib_execute = &r600_dma_ring_ib_execute, .emit_fence = &r600_dma_fence_ring_emit, .emit_semaphore = &r600_dma_semaphore_ring_emit, - .cs_parse = NULL, + .cs_parse = &r600_dma_cs_parse, .ring_test = &r600_dma_ring_test, .ib_test = &r600_dma_ib_test, .is_lockup = &r600_dma_is_lockup, @@ -1120,7 +1120,7 @@ static struct radeon_asic rv770_asic = { .ib_execute = &r600_dma_ring_ib_execute, .emit_fence = &r600_dma_fence_ring_emit, .emit_semaphore = &r600_dma_semaphore_ring_emit, - .cs_parse = NULL, + .cs_parse = &r600_dma_cs_parse, .ring_test = &r600_dma_ring_test, .ib_test = &r600_dma_ib_test, .is_lockup = &r600_dma_is_lockup, diff --git a/drivers/gpu/drm/radeon/radeon_asic.h b/drivers/gpu/drm/radeon/radeon_asic.h index c338931..b311c0a 100644 --- a/drivers/gpu/drm/radeon/radeon_asic.h +++ b/drivers/gpu/drm/radeon/radeon_asic.h @@ -304,6 +304,7 @@ void r600_pcie_gart_tlb_flush(struct radeon_device *rdev); uint32_t r600_pciep_rreg(struct radeon_device *rdev, uint32_t reg); void r600_pciep_wreg(struct radeon_device *rdev, uint32_t reg, uint32_t v); int r600_cs_parse(struct radeon_cs_parser *p); +int r600_dma_cs_parse(struct radeon_cs_parser *p); void r600_fence_ring_emit(struct radeon_device *rdev, struct radeon_fence *fence); void r600_semaphore_ring_emit(struct radeon_device *rdev, diff --git a/drivers/gpu/drm/radeon/radeon_cs.c b/drivers/gpu/drm/radeon/radeon_cs.c index 41672cc..1b32a5a 100644 --- a/drivers/gpu/drm/radeon/radeon_cs.c +++ b/drivers/gpu/drm/radeon/radeon_cs.c @@ -43,6 +43,7 @@ static int radeon_cs_parser_relocs(struct radeon_cs_parser *p) return 0; } chunk = &p->chunks[p->chunk_relocs_idx]; + p->dma_reloc_idx = 0; /* FIXME: we assume that each relocs use 4 dwords */ p->nrelocs = chunk->length_dw / 4; p->relocs_ptr = kcalloc(p->nrelocs, sizeof(void *), GFP_KERNEL);
From: Alex Deucher alexander.deucher@amd.com
Allows us to use the DMA ring from userspace. DMA doesn't have a good NOP packet in which to embed the reloc idx, so userspace has to add a reloc for each buffer used and order them to match the command stream.
Signed-off-by: Alex Deucher alexander.deucher@amd.com --- drivers/gpu/drm/radeon/evergreen_cs.c | 449 +++++++++++++++++++++++++++++++++ drivers/gpu/drm/radeon/radeon_asic.c | 14 +- drivers/gpu/drm/radeon/radeon_asic.h | 1 + 3 files changed, 457 insertions(+), 7 deletions(-)
diff --git a/drivers/gpu/drm/radeon/evergreen_cs.c b/drivers/gpu/drm/radeon/evergreen_cs.c index 62c2271..adfc66f 100644 --- a/drivers/gpu/drm/radeon/evergreen_cs.c +++ b/drivers/gpu/drm/radeon/evergreen_cs.c @@ -34,6 +34,8 @@ #define MAX(a,b) (((a)>(b))?(a):(b)) #define MIN(a,b) (((a)<(b))?(a):(b))
+int r600_dma_cs_next_reloc(struct radeon_cs_parser *p, + struct radeon_cs_reloc **cs_reloc); static int evergreen_cs_packet_next_reloc(struct radeon_cs_parser *p, struct radeon_cs_reloc **cs_reloc);
@@ -2804,6 +2806,453 @@ int evergreen_cs_parse(struct radeon_cs_parser *p) return 0; }
+/* + * DMA + */ + +#define GET_DMA_CMD(h) (((h) & 0xf0000000) >> 28) +#define GET_DMA_COUNT(h) ((h) & 0x000fffff) +#define GET_DMA_T(h) (((h) & 0x00800000) >> 23) +#define GET_DMA_NEW(h) (((h) & 0x04000000) >> 26) +#define GET_DMA_MISC(h) (((h) & 0x0700000) >> 20) + +/** + * evergreen_dma_cs_parse() - parse the DMA IB + * @p: parser structure holding parsing context. + * + * Parses the DMA IB from the CS ioctl and updates + * the GPU addresses based on the reloc information and + * checks for errors. (Evergreen-Cayman) + * Returns 0 for success and an error on failure. + **/ +int evergreen_dma_cs_parse(struct radeon_cs_parser *p) +{ + struct radeon_cs_chunk *ib_chunk = &p->chunks[p->chunk_ib_idx]; + struct radeon_cs_reloc *src_reloc, *dst_reloc, *dst2_reloc; + u32 header, cmd, count, tiled, new_cmd, misc; + volatile u32 *ib = p->ib.ptr; + u32 idx, idx_value; + u64 src_offset, dst_offset, dst2_offset; + int r; + + do { + if (p->idx >= ib_chunk->length_dw) { + DRM_ERROR("Can not parse packet at %d after CS end %d !\n", + p->idx, ib_chunk->length_dw); + return -EINVAL; + } + idx = p->idx; + header = radeon_get_ib_value(p, idx); + cmd = GET_DMA_CMD(header); + count = GET_DMA_COUNT(header); + tiled = GET_DMA_T(header); + new_cmd = GET_DMA_NEW(header); + misc = GET_DMA_MISC(header); + + switch (cmd) { + case DMA_PACKET_WRITE: + r = r600_dma_cs_next_reloc(p, &dst_reloc); + if (r) { + DRM_ERROR("bad DMA_PACKET_WRITE\n"); + return -EINVAL; + } + if (tiled) { + ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8); + dst_offset = ib[idx+1]; + dst_offset <<= 8; + p->idx += count + 7; + } else { + ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc); + ib[idx+2] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff; + dst_offset = ib[idx+1]; + dst_offset |= ((u64)(ib[idx+2] & 0xff)) << 32; + p->idx += count + 3; + } + if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) { + dev_warn(p->dev, "DMA write buffer too small (%llu %lu)\n", + dst_offset, radeon_bo_size(dst_reloc->robj)); + return -EINVAL; + } + break; + case DMA_PACKET_COPY: + r = r600_dma_cs_next_reloc(p, &src_reloc); + if (r) { + DRM_ERROR("bad DMA_PACKET_COPY\n"); + return -EINVAL; + } + r = r600_dma_cs_next_reloc(p, &dst_reloc); + if (r) { + DRM_ERROR("bad DMA_PACKET_COPY\n"); + return -EINVAL; + } + if (tiled) { + idx_value = radeon_get_ib_value(p, idx + 2); + if (new_cmd) { + switch (misc) { + case 0: + /* L2T, frame to fields */ + if (idx_value & (1 << 31)) { + DRM_ERROR("bad L2T, frame to fields DMA_PACKET_COPY\n"); + return -EINVAL; + } + r = r600_dma_cs_next_reloc(p, &dst2_reloc); + if (r) { + DRM_ERROR("bad L2T, frame to fields DMA_PACKET_COPY\n"); + return -EINVAL; + } + ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8); + ib[idx+2] += (u32)(dst2_reloc->lobj.gpu_offset >> 8); + ib[idx+8] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc); + ib[idx+9] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff; + dst_offset = ib[idx+1]; + dst_offset <<= 8; + dst2_offset = ib[idx+2]; + dst2_offset <<= 8; + src_offset = ib[idx+8]; + src_offset |= ((u64)(ib[idx+9] & 0xff)) << 32; + if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) { + dev_warn(p->dev, "DMA L2T, frame to fields src buffer too small (%llu %lu)\n", + src_offset, radeon_bo_size(src_reloc->robj)); + return -EINVAL; + } + if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) { + dev_warn(p->dev, "DMA L2T, frame to fields buffer too small (%llu %lu)\n", + dst_offset, radeon_bo_size(dst_reloc->robj)); + return -EINVAL; + } + if ((dst2_offset + (count * 4)) > radeon_bo_size(dst2_reloc->robj)) { + dev_warn(p->dev, "DMA L2T, frame to fields buffer too small (%llu %lu)\n", + dst2_offset, radeon_bo_size(dst2_reloc->robj)); + return -EINVAL; + } + p->idx += 10; + break; + case 1: + /* L2T, T2L partial */ + if (p->family < CHIP_CAYMAN) { + DRM_ERROR("L2T, T2L Partial is cayman only !\n"); + return -EINVAL; + } + /* detile bit */ + if (idx_value & (1 << 31)) { + /* tiled src, linear dst */ + ib[idx+1] += (u32)(src_reloc->lobj.gpu_offset >> 8); + + ib[idx+7] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc); + ib[idx+8] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff; + } else { + /* linear src, tiled dst */ + ib[idx+7] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc); + ib[idx+8] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff; + + ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8); + } + p->idx += 12; + break; + case 3: + /* L2T, broadcast */ + if (idx_value & (1 << 31)) { + DRM_ERROR("bad L2T, broadcast DMA_PACKET_COPY\n"); + return -EINVAL; + } + r = r600_dma_cs_next_reloc(p, &dst2_reloc); + if (r) { + DRM_ERROR("bad L2T, broadcast DMA_PACKET_COPY\n"); + return -EINVAL; + } + ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8); + ib[idx+2] += (u32)(dst2_reloc->lobj.gpu_offset >> 8); + ib[idx+8] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc); + ib[idx+9] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff; + dst_offset = ib[idx+1]; + dst_offset <<= 8; + dst2_offset = ib[idx+2]; + dst2_offset <<= 8; + src_offset = ib[idx+8]; + src_offset |= ((u64)(ib[idx+9] & 0xff)) << 32; + if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) { + dev_warn(p->dev, "DMA L2T, broadcast src buffer too small (%llu %lu)\n", + src_offset, radeon_bo_size(src_reloc->robj)); + return -EINVAL; + } + if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) { + dev_warn(p->dev, "DMA L2T, broadcast dst buffer too small (%llu %lu)\n", + dst_offset, radeon_bo_size(dst_reloc->robj)); + return -EINVAL; + } + if ((dst2_offset + (count * 4)) > radeon_bo_size(dst2_reloc->robj)) { + dev_warn(p->dev, "DMA L2T, broadcast dst2 buffer too small (%llu %lu)\n", + dst2_offset, radeon_bo_size(dst2_reloc->robj)); + return -EINVAL; + } + p->idx += 10; + break; + case 4: + /* L2T, T2L */ + /* detile bit */ + if (idx_value & (1 << 31)) { + /* tiled src, linear dst */ + ib[idx+1] += (u32)(src_reloc->lobj.gpu_offset >> 8); + src_offset = ib[idx+1]; + src_offset <<= 8; + + ib[idx+7] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc); + ib[idx+8] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff; + dst_offset = ib[idx+7]; + dst_offset |= ((u64)(ib[idx+8] & 0xff)) << 32; + } else { + /* linear src, tiled dst */ + ib[idx+7] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc); + ib[idx+8] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff; + src_offset = ib[idx+7]; + src_offset |= ((u64)(ib[idx+8] & 0xff)) << 32; + + ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8); + dst_offset = ib[idx+1]; + dst_offset <<= 8; + } + if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) { + dev_warn(p->dev, "DMA L2T, T2L src buffer too small (%llu %lu)\n", + src_offset, radeon_bo_size(src_reloc->robj)); + return -EINVAL; + } + if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) { + dev_warn(p->dev, "DMA L2T, T2L dst buffer too small (%llu %lu)\n", + dst_offset, radeon_bo_size(dst_reloc->robj)); + return -EINVAL; + } + p->idx += 9; + break; + case 5: + /* T2T partial */ + if (p->family < CHIP_CAYMAN) { + DRM_ERROR("L2T, T2L Partial is cayman only !\n"); + return -EINVAL; + } + ib[idx+1] += (u32)(src_reloc->lobj.gpu_offset >> 8); + ib[idx+4] += (u32)(dst_reloc->lobj.gpu_offset >> 8); + p->idx += 13; + break; + case 7: + /* L2T, broadcast */ + if (idx_value & (1 << 31)) { + DRM_ERROR("bad L2T, broadcast DMA_PACKET_COPY\n"); + return -EINVAL; + } + r = r600_dma_cs_next_reloc(p, &dst2_reloc); + if (r) { + DRM_ERROR("bad L2T, broadcast DMA_PACKET_COPY\n"); + return -EINVAL; + } + ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8); + ib[idx+2] += (u32)(dst2_reloc->lobj.gpu_offset >> 8); + ib[idx+8] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc); + ib[idx+9] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff; + dst_offset = ib[idx+1]; + dst_offset <<= 8; + dst2_offset = ib[idx+2]; + dst2_offset <<= 8; + src_offset = ib[idx+8]; + src_offset |= ((u64)(ib[idx+9] & 0xff)) << 32; + if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) { + dev_warn(p->dev, "DMA L2T, broadcast src buffer too small (%llu %lu)\n", + src_offset, radeon_bo_size(src_reloc->robj)); + return -EINVAL; + } + if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) { + dev_warn(p->dev, "DMA L2T, broadcast dst buffer too small (%llu %lu)\n", + dst_offset, radeon_bo_size(dst_reloc->robj)); + return -EINVAL; + } + if ((dst2_offset + (count * 4)) > radeon_bo_size(dst2_reloc->robj)) { + dev_warn(p->dev, "DMA L2T, broadcast dst2 buffer too small (%llu %lu)\n", + dst2_offset, radeon_bo_size(dst2_reloc->robj)); + return -EINVAL; + } + p->idx += 10; + break; + default: + DRM_ERROR("bad DMA_PACKET_COPY misc %u\n", misc); + return -EINVAL; + } + } else { + switch (misc) { + case 0: + /* detile bit */ + if (idx_value & (1 << 31)) { + /* tiled src, linear dst */ + ib[idx+1] += (u32)(src_reloc->lobj.gpu_offset >> 8); + src_offset = ib[idx+1]; + src_offset <<= 8; + + ib[idx+7] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc); + ib[idx+8] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff; + dst_offset = ib[idx+7]; + dst_offset |= ((u64)(ib[idx+8] & 0xff)) << 32; + } else { + /* linear src, tiled dst */ + ib[idx+7] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc); + ib[idx+8] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff; + src_offset = ib[idx+7]; + src_offset |= ((u64)(ib[idx+8] & 0xff)) << 32; + + ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8); + dst_offset = ib[idx+1]; + dst_offset <<= 8; + } + if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) { + dev_warn(p->dev, "DMA L2T, broadcast src buffer too small (%llu %lu)\n", + src_offset, radeon_bo_size(src_reloc->robj)); + return -EINVAL; + } + if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) { + dev_warn(p->dev, "DMA L2T, broadcast dst buffer too small (%llu %lu)\n", + dst_offset, radeon_bo_size(dst_reloc->robj)); + return -EINVAL; + } + p->idx += 9; + break; + default: + DRM_ERROR("bad DMA_PACKET_COPY misc %u\n", misc); + return -EINVAL; + } + } + } else { + if (new_cmd) { + switch (misc) { + case 0: + /* L2L, byte */ + ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset & 0xffffffff); + ib[idx+2] += (u32)(src_reloc->lobj.gpu_offset & 0xffffffff); + ib[idx+3] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff; + ib[idx+4] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff; + src_offset = ib[idx+2]; + src_offset |= ((u64)(ib[idx+4] & 0xff)) << 32; + dst_offset = ib[idx+1]; + dst_offset |= ((u64)(ib[idx+3] & 0xff)) << 32; + if ((src_offset + count) > radeon_bo_size(src_reloc->robj)) { + dev_warn(p->dev, "DMA L2L, byte src buffer too small (%llu %lu)\n", + src_offset, radeon_bo_size(src_reloc->robj)); + return -EINVAL; + } + if ((dst_offset + count) > radeon_bo_size(dst_reloc->robj)) { + dev_warn(p->dev, "DMA L2L, byte dst buffer too small (%llu %lu)\n", + dst_offset, radeon_bo_size(dst_reloc->robj)); + return -EINVAL; + } + p->idx += 5; + break; + case 1: + /* L2L, partial */ + if (p->family < CHIP_CAYMAN) { + DRM_ERROR("L2L Partial is cayman only !\n"); + return -EINVAL; + } + ib[idx+1] += (u32)(src_reloc->lobj.gpu_offset & 0xffffffff); + ib[idx+2] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff; + ib[idx+4] += (u32)(dst_reloc->lobj.gpu_offset & 0xffffffff); + ib[idx+5] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff; + + p->idx += 9; + break; + case 4: + /* L2L, dw, broadcast */ + r = r600_dma_cs_next_reloc(p, &dst2_reloc); + if (r) { + DRM_ERROR("bad L2L, dw, broadcast DMA_PACKET_COPY\n"); + return -EINVAL; + } + ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc); + ib[idx+2] += (u32)(dst2_reloc->lobj.gpu_offset & 0xfffffffc); + ib[idx+3] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc); + ib[idx+4] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff; + ib[idx+5] += upper_32_bits(dst2_reloc->lobj.gpu_offset) & 0xff; + ib[idx+6] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff; + dst_offset = ib[idx+1]; + dst_offset |= ((u64)(ib[idx+4] & 0xff)) << 32; + dst2_offset = ib[idx+2]; + dst2_offset |= ((u64)(ib[idx+5] & 0xff)) << 32; + src_offset = ib[idx+3]; + src_offset |= ((u64)(ib[idx+6] & 0xff)) << 32; + if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) { + dev_warn(p->dev, "DMA L2L, dw, broadcast src buffer too small (%llu %lu)\n", + src_offset, radeon_bo_size(src_reloc->robj)); + return -EINVAL; + } + if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) { + dev_warn(p->dev, "DMA L2L, dw, broadcast dst buffer too small (%llu %lu)\n", + dst_offset, radeon_bo_size(dst_reloc->robj)); + return -EINVAL; + } + if ((dst2_offset + (count * 4)) > radeon_bo_size(dst2_reloc->robj)) { + dev_warn(p->dev, "DMA L2L, dw, broadcast dst2 buffer too small (%llu %lu)\n", + dst2_offset, radeon_bo_size(dst2_reloc->robj)); + return -EINVAL; + } + p->idx += 7; + break; + default: + DRM_ERROR("bad DMA_PACKET_COPY misc %u\n", misc); + return -EINVAL; + } + } else { + /* L2L, dw */ + ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc); + ib[idx+2] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc); + ib[idx+3] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff; + ib[idx+4] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff; + src_offset = ib[idx+2]; + src_offset |= ((u64)(ib[idx+4] & 0xff)) << 32; + dst_offset = ib[idx+1]; + dst_offset |= ((u64)(ib[idx+3] & 0xff)) << 32; + if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) { + dev_warn(p->dev, "DMA L2L, dw src buffer too small (%llu %lu)\n", + src_offset, radeon_bo_size(src_reloc->robj)); + return -EINVAL; + } + if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) { + dev_warn(p->dev, "DMA L2L, dw dst buffer too small (%llu %lu)\n", + dst_offset, radeon_bo_size(dst_reloc->robj)); + return -EINVAL; + } + p->idx += 5; + } + } + break; + case DMA_PACKET_CONSTANT_FILL: + r = r600_dma_cs_next_reloc(p, &dst_reloc); + if (r) { + DRM_ERROR("bad DMA_PACKET_CONSTANT_FILL\n"); + return -EINVAL; + } + ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc); + ib[idx+3] += (upper_32_bits(dst_reloc->lobj.gpu_offset) << 16) & 0x00ff0000; + dst_offset = ib[idx+1]; + dst_offset |= ((u64)(ib[idx+3] & 0x00ff0000)) << 16; + if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) { + dev_warn(p->dev, "DMA constant fill buffer too small (%llu %lu)\n", + dst_offset, radeon_bo_size(dst_reloc->robj)); + return -EINVAL; + } + p->idx += 4; + break; + case DMA_PACKET_NOP: + p->idx += 1; + break; + default: + DRM_ERROR("Unknown packet type %d at %d !\n", cmd, idx); + return -EINVAL; + } + } while (p->idx < p->chunks[p->chunk_ib_idx].length_dw); +#if 0 + for (r = 0; r < p->ib->length_dw; r++) { + printk(KERN_INFO "%05d 0x%08X\n", r, p->ib.ptr[r]); + mdelay(1); + } +#endif + return 0; +} + /* vm parser */ static bool evergreen_vm_reg_valid(u32 reg) { diff --git a/drivers/gpu/drm/radeon/radeon_asic.c b/drivers/gpu/drm/radeon/radeon_asic.c index d360341..ac1d570 100644 --- a/drivers/gpu/drm/radeon/radeon_asic.c +++ b/drivers/gpu/drm/radeon/radeon_asic.c @@ -1204,7 +1204,7 @@ static struct radeon_asic evergreen_asic = { .ib_execute = &evergreen_dma_ring_ib_execute, .emit_fence = &evergreen_dma_fence_ring_emit, .emit_semaphore = &r600_dma_semaphore_ring_emit, - .cs_parse = NULL, + .cs_parse = &evergreen_dma_cs_parse, .ring_test = &r600_dma_ring_test, .ib_test = &r600_dma_ib_test, .is_lockup = &r600_dma_is_lockup, @@ -1288,7 +1288,7 @@ static struct radeon_asic sumo_asic = { .ib_execute = &evergreen_dma_ring_ib_execute, .emit_fence = &evergreen_dma_fence_ring_emit, .emit_semaphore = &r600_dma_semaphore_ring_emit, - .cs_parse = NULL, + .cs_parse = &evergreen_dma_cs_parse, .ring_test = &r600_dma_ring_test, .ib_test = &r600_dma_ib_test, .is_lockup = &r600_dma_is_lockup, @@ -1372,7 +1372,7 @@ static struct radeon_asic btc_asic = { .ib_execute = &evergreen_dma_ring_ib_execute, .emit_fence = &evergreen_dma_fence_ring_emit, .emit_semaphore = &r600_dma_semaphore_ring_emit, - .cs_parse = NULL, + .cs_parse = &evergreen_dma_cs_parse, .ring_test = &r600_dma_ring_test, .ib_test = &r600_dma_ib_test, .is_lockup = &r600_dma_is_lockup, @@ -1486,7 +1486,7 @@ static struct radeon_asic cayman_asic = { .ib_execute = &cayman_dma_ring_ib_execute, .emit_fence = &evergreen_dma_fence_ring_emit, .emit_semaphore = &r600_dma_semaphore_ring_emit, - .cs_parse = NULL, + .cs_parse = &evergreen_dma_cs_parse, .ring_test = &r600_dma_ring_test, .ib_test = &r600_dma_ib_test, .is_lockup = &cayman_dma_is_lockup, @@ -1496,7 +1496,7 @@ static struct radeon_asic cayman_asic = { .ib_execute = &cayman_dma_ring_ib_execute, .emit_fence = &evergreen_dma_fence_ring_emit, .emit_semaphore = &r600_dma_semaphore_ring_emit, - .cs_parse = NULL, + .cs_parse = &evergreen_dma_cs_parse, .ring_test = &r600_dma_ring_test, .ib_test = &r600_dma_ib_test, .is_lockup = &cayman_dma_is_lockup, @@ -1611,7 +1611,7 @@ static struct radeon_asic trinity_asic = { .ib_execute = &cayman_dma_ring_ib_execute, .emit_fence = &evergreen_dma_fence_ring_emit, .emit_semaphore = &r600_dma_semaphore_ring_emit, - .cs_parse = NULL, + .cs_parse = &evergreen_dma_cs_parse, .ring_test = &r600_dma_ring_test, .ib_test = &r600_dma_ib_test, .is_lockup = &cayman_dma_is_lockup, @@ -1621,7 +1621,7 @@ static struct radeon_asic trinity_asic = { .ib_execute = &cayman_dma_ring_ib_execute, .emit_fence = &evergreen_dma_fence_ring_emit, .emit_semaphore = &r600_dma_semaphore_ring_emit, - .cs_parse = NULL, + .cs_parse = &evergreen_dma_cs_parse, .ring_test = &r600_dma_ring_test, .ib_test = &r600_dma_ib_test, .is_lockup = &cayman_dma_is_lockup, diff --git a/drivers/gpu/drm/radeon/radeon_asic.h b/drivers/gpu/drm/radeon/radeon_asic.h index b311c0a..d2ac646 100644 --- a/drivers/gpu/drm/radeon/radeon_asic.h +++ b/drivers/gpu/drm/radeon/radeon_asic.h @@ -431,6 +431,7 @@ u32 evergreen_get_vblank_counter(struct radeon_device *rdev, int crtc); int evergreen_irq_set(struct radeon_device *rdev); int evergreen_irq_process(struct radeon_device *rdev); extern int evergreen_cs_parse(struct radeon_cs_parser *p); +extern int evergreen_dma_cs_parse(struct radeon_cs_parser *p); extern void evergreen_pm_misc(struct radeon_device *rdev); extern void evergreen_pm_prepare(struct radeon_device *rdev); extern void evergreen_pm_finish(struct radeon_device *rdev);
From: Alex Deucher alexander.deucher@amd.com
Allows us to use async DMA from userspace.
Signed-off-by: Alex Deucher alexander.deucher@amd.com --- drivers/gpu/drm/radeon/evergreen_cs.c | 111 +++++++++++++++++++++++++++++++++ drivers/gpu/drm/radeon/radeon_asic.c | 6 ++ drivers/gpu/drm/radeon/radeon_asic.h | 1 + 3 files changed, 118 insertions(+), 0 deletions(-)
diff --git a/drivers/gpu/drm/radeon/evergreen_cs.c b/drivers/gpu/drm/radeon/evergreen_cs.c index adfc66f..fefb2cc 100644 --- a/drivers/gpu/drm/radeon/evergreen_cs.c +++ b/drivers/gpu/drm/radeon/evergreen_cs.c @@ -3543,3 +3543,114 @@ int evergreen_ib_parse(struct radeon_device *rdev, struct radeon_ib *ib)
return ret; } + +/** + * evergreen_dma_ib_parse() - parse the DMA IB for VM + * @rdev: radeon_device pointer + * @ib: radeon_ib pointer + * + * Parses the DMA IB from the VM CS ioctl + * checks for errors. (Cayman-SI) + * Returns 0 for success and an error on failure. + **/ +int evergreen_dma_ib_parse(struct radeon_device *rdev, struct radeon_ib *ib) +{ + u32 idx = 0; + u32 header, cmd, count, tiled, new_cmd, misc; + + do { + header = ib->ptr[idx]; + cmd = GET_DMA_CMD(header); + count = GET_DMA_COUNT(header); + tiled = GET_DMA_T(header); + new_cmd = GET_DMA_NEW(header); + misc = GET_DMA_MISC(header); + + switch (cmd) { + case DMA_PACKET_WRITE: + if (tiled) + idx += count + 7; + else + idx += count + 3; + break; + case DMA_PACKET_COPY: + if (tiled) { + if (new_cmd) { + switch (misc) { + case 0: + /* L2T, frame to fields */ + idx += 10; + break; + case 1: + /* L2T, T2L partial */ + idx += 12; + break; + case 3: + /* L2T, broadcast */ + idx += 10; + break; + case 4: + /* L2T, T2L */ + idx += 9; + break; + case 5: + /* T2T partial */ + idx += 13; + break; + case 7: + /* L2T, broadcast */ + idx += 10; + break; + default: + DRM_ERROR("bad DMA_PACKET_COPY misc %u\n", misc); + return -EINVAL; + } + } else { + switch (misc) { + case 0: + idx += 9; + break; + default: + DRM_ERROR("bad DMA_PACKET_COPY misc %u\n", misc); + return -EINVAL; + } + } + } else { + if (new_cmd) { + switch (misc) { + case 0: + /* L2L, byte */ + idx += 5; + break; + case 1: + /* L2L, partial */ + idx += 9; + break; + case 4: + /* L2L, dw, broadcast */ + idx += 7; + break; + default: + DRM_ERROR("bad DMA_PACKET_COPY misc %u\n", misc); + return -EINVAL; + } + } else { + /* L2L, dw */ + idx += 5; + } + } + break; + case DMA_PACKET_CONSTANT_FILL: + idx += 4; + break; + case DMA_PACKET_NOP: + idx += 1; + break; + default: + DRM_ERROR("Unknown packet type %d at %d !\n", cmd, idx); + return -EINVAL; + } + } while (idx < ib->length_dw); + + return 0; +} diff --git a/drivers/gpu/drm/radeon/radeon_asic.c b/drivers/gpu/drm/radeon/radeon_asic.c index ac1d570..596bcbe 100644 --- a/drivers/gpu/drm/radeon/radeon_asic.c +++ b/drivers/gpu/drm/radeon/radeon_asic.c @@ -1484,6 +1484,7 @@ static struct radeon_asic cayman_asic = { }, [R600_RING_TYPE_DMA_INDEX] = { .ib_execute = &cayman_dma_ring_ib_execute, + .ib_parse = &evergreen_dma_ib_parse, .emit_fence = &evergreen_dma_fence_ring_emit, .emit_semaphore = &r600_dma_semaphore_ring_emit, .cs_parse = &evergreen_dma_cs_parse, @@ -1494,6 +1495,7 @@ static struct radeon_asic cayman_asic = { }, [CAYMAN_RING_TYPE_DMA1_INDEX] = { .ib_execute = &cayman_dma_ring_ib_execute, + .ib_parse = &evergreen_dma_ib_parse, .emit_fence = &evergreen_dma_fence_ring_emit, .emit_semaphore = &r600_dma_semaphore_ring_emit, .cs_parse = &evergreen_dma_cs_parse, @@ -1609,6 +1611,7 @@ static struct radeon_asic trinity_asic = { }, [R600_RING_TYPE_DMA_INDEX] = { .ib_execute = &cayman_dma_ring_ib_execute, + .ib_parse = &evergreen_dma_ib_parse, .emit_fence = &evergreen_dma_fence_ring_emit, .emit_semaphore = &r600_dma_semaphore_ring_emit, .cs_parse = &evergreen_dma_cs_parse, @@ -1619,6 +1622,7 @@ static struct radeon_asic trinity_asic = { }, [CAYMAN_RING_TYPE_DMA1_INDEX] = { .ib_execute = &cayman_dma_ring_ib_execute, + .ib_parse = &evergreen_dma_ib_parse, .emit_fence = &evergreen_dma_fence_ring_emit, .emit_semaphore = &r600_dma_semaphore_ring_emit, .cs_parse = &evergreen_dma_cs_parse, @@ -1734,6 +1738,7 @@ static struct radeon_asic si_asic = { }, [R600_RING_TYPE_DMA_INDEX] = { .ib_execute = &cayman_dma_ring_ib_execute, + .ib_parse = &evergreen_dma_ib_parse, .emit_fence = &evergreen_dma_fence_ring_emit, .emit_semaphore = &r600_dma_semaphore_ring_emit, .cs_parse = NULL, @@ -1744,6 +1749,7 @@ static struct radeon_asic si_asic = { }, [CAYMAN_RING_TYPE_DMA1_INDEX] = { .ib_execute = &cayman_dma_ring_ib_execute, + .ib_parse = &evergreen_dma_ib_parse, .emit_fence = &evergreen_dma_fence_ring_emit, .emit_semaphore = &r600_dma_semaphore_ring_emit, .cs_parse = NULL, diff --git a/drivers/gpu/drm/radeon/radeon_asic.h b/drivers/gpu/drm/radeon/radeon_asic.h index d2ac646..5f4882c 100644 --- a/drivers/gpu/drm/radeon/radeon_asic.h +++ b/drivers/gpu/drm/radeon/radeon_asic.h @@ -473,6 +473,7 @@ void cayman_vm_set_page(struct radeon_device *rdev, uint64_t pe, uint64_t addr, unsigned count, uint32_t incr, uint32_t flags); int evergreen_ib_parse(struct radeon_device *rdev, struct radeon_ib *ib); +int evergreen_dma_ib_parse(struct radeon_device *rdev, struct radeon_ib *ib); void cayman_dma_ring_ib_execute(struct radeon_device *rdev, struct radeon_ib *ib); bool cayman_dma_is_lockup(struct radeon_device *rdev, struct radeon_ring *ring);
From: Alex Deucher alexander.deucher@amd.com
This enables the functionality added in the previous patches. Userspace acceleration drivers can use the CS ioctl to submit command buffers to the async DMA rings.
Signed-off-by: Alex Deucher alexander.deucher@amd.com --- drivers/gpu/drm/radeon/radeon_cs.c | 12 ++++++++++++ include/uapi/drm/radeon_drm.h | 1 + 2 files changed, 13 insertions(+), 0 deletions(-)
diff --git a/drivers/gpu/drm/radeon/radeon_cs.c b/drivers/gpu/drm/radeon/radeon_cs.c index 1b32a5a..396baba 100644 --- a/drivers/gpu/drm/radeon/radeon_cs.c +++ b/drivers/gpu/drm/radeon/radeon_cs.c @@ -112,6 +112,18 @@ static int radeon_cs_get_ring(struct radeon_cs_parser *p, u32 ring, s32 priority } else p->ring = RADEON_RING_TYPE_GFX_INDEX; break; + case RADEON_CS_RING_DMA: + if (p->rdev->family >= CHIP_CAYMAN) { + if (p->priority > 0) + p->ring = R600_RING_TYPE_DMA_INDEX; + else + p->ring = CAYMAN_RING_TYPE_DMA1_INDEX; + } else if (p->rdev->family >= CHIP_R600) { + p->ring = R600_RING_TYPE_DMA_INDEX; + } else { + return -EINVAL; + } + break; } return 0; } diff --git a/include/uapi/drm/radeon_drm.h b/include/uapi/drm/radeon_drm.h index 5645a87..eeda917 100644 --- a/include/uapi/drm/radeon_drm.h +++ b/include/uapi/drm/radeon_drm.h @@ -917,6 +917,7 @@ struct drm_radeon_gem_va { /* The second dword of RADEON_CHUNK_ID_FLAGS is a uint32 that sets the ring type */ #define RADEON_CS_RING_GFX 0 #define RADEON_CS_RING_COMPUTE 1 +#define RADEON_CS_RING_DMA 2 /* The third dword of RADEON_CHUNK_ID_FLAGS is a sint32 that sets the priority */ /* 0 = normal, + = higher priority, - = lower priority */
On 13.12.2012 18:57, alexdeucher@gmail.com wrote:
From: Alex Deucher alexander.deucher@amd.com
Allows us to use the DMA ring from userspace. DMA doesn't have a good NOP packet in which to embed the reloc idx, so userspace has to add a reloc for each buffer used and order them to match the command stream.
Signed-off-by: Alex Deucher alexander.deucher@amd.com
Looks good at first glance:
Reviewed-by: Christian König christian.koenig@amd.com
drivers/gpu/drm/radeon/r600_cs.c | 193 ++++++++++++++++++++++++++++++++++ drivers/gpu/drm/radeon/radeon.h | 1 + drivers/gpu/drm/radeon/radeon_asic.c | 6 +- drivers/gpu/drm/radeon/radeon_asic.h | 1 + drivers/gpu/drm/radeon/radeon_cs.c | 1 + 5 files changed, 199 insertions(+), 3 deletions(-)
diff --git a/drivers/gpu/drm/radeon/r600_cs.c b/drivers/gpu/drm/radeon/r600_cs.c index 5d6e7f9..f23609ac 100644 --- a/drivers/gpu/drm/radeon/r600_cs.c +++ b/drivers/gpu/drm/radeon/r600_cs.c @@ -2568,3 +2568,196 @@ void r600_cs_legacy_init(void) { r600_cs_packet_next_reloc = &r600_cs_packet_next_reloc_nomm; }
+/*
- DMA
- */
+/**
- r600_dma_cs_next_reloc() - parse next reloc
- @p: parser structure holding parsing context.
- @cs_reloc: reloc informations
- Return the next reloc, do bo validation and compute
- GPU offset using the provided start.
- **/
+int r600_dma_cs_next_reloc(struct radeon_cs_parser *p,
struct radeon_cs_reloc **cs_reloc)
+{
- struct radeon_cs_chunk *relocs_chunk;
- unsigned idx;
- if (p->chunk_relocs_idx == -1) {
DRM_ERROR("No relocation chunk !\n");
return -EINVAL;
- }
- *cs_reloc = NULL;
- relocs_chunk = &p->chunks[p->chunk_relocs_idx];
- idx = p->dma_reloc_idx;
- if (idx >= relocs_chunk->length_dw) {
DRM_ERROR("Relocs at %d after relocations chunk end %d !\n",
idx, relocs_chunk->length_dw);
return -EINVAL;
- }
- /* FIXME: we assume reloc size is 4 dwords */
- *cs_reloc = p->relocs_ptr[(idx / 4)];
- p->dma_reloc_idx++;
- return 0;
+}
+#define GET_DMA_CMD(h) (((h) & 0xf0000000) >> 28) +#define GET_DMA_COUNT(h) ((h) & 0x0000ffff) +#define GET_DMA_T(h) (((h) & 0x00800000) >> 23)
+/**
- r600_dma_cs_parse() - parse the DMA IB
- @p: parser structure holding parsing context.
- Parses the DMA IB from the CS ioctl and updates
- the GPU addresses based on the reloc information and
- checks for errors. (R6xx-R7xx)
- Returns 0 for success and an error on failure.
- **/
+int r600_dma_cs_parse(struct radeon_cs_parser *p) +{
- struct radeon_cs_chunk *ib_chunk = &p->chunks[p->chunk_ib_idx];
- struct radeon_cs_reloc *src_reloc, *dst_reloc;
- u32 header, cmd, count, tiled;
- volatile u32 *ib = p->ib.ptr;
- u32 idx, idx_value;
- u64 src_offset, dst_offset;
- int r;
- do {
if (p->idx >= ib_chunk->length_dw) {
DRM_ERROR("Can not parse packet at %d after CS end %d !\n",
p->idx, ib_chunk->length_dw);
return -EINVAL;
}
idx = p->idx;
header = radeon_get_ib_value(p, idx);
cmd = GET_DMA_CMD(header);
count = GET_DMA_COUNT(header);
tiled = GET_DMA_T(header);
switch (cmd) {
case DMA_PACKET_WRITE:
r = r600_dma_cs_next_reloc(p, &dst_reloc);
if (r) {
DRM_ERROR("bad DMA_PACKET_WRITE\n");
return -EINVAL;
}
if (tiled) {
ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8);
dst_offset = ib[idx+1];
dst_offset <<= 8;
p->idx += count + 5;
} else {
ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc);
ib[idx+2] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff;
dst_offset = ib[idx+1];
dst_offset |= ((u64)(ib[idx+2] & 0xff)) << 32;
p->idx += count + 3;
}
if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) {
dev_warn(p->dev, "DMA write buffer too small (%llu %lu)\n",
dst_offset, radeon_bo_size(dst_reloc->robj));
return -EINVAL;
}
break;
case DMA_PACKET_COPY:
r = r600_dma_cs_next_reloc(p, &src_reloc);
if (r) {
DRM_ERROR("bad DMA_PACKET_COPY\n");
return -EINVAL;
}
r = r600_dma_cs_next_reloc(p, &dst_reloc);
if (r) {
DRM_ERROR("bad DMA_PACKET_COPY\n");
return -EINVAL;
}
if (tiled) {
idx_value = radeon_get_ib_value(p, idx + 2);
/* detile bit */
if (idx_value & (1 << 31)) {
/* tiled src, linear dst */
ib[idx+1] += (u32)(src_reloc->lobj.gpu_offset >> 8);
src_offset = ib[idx+1];
src_offset <<= 8;
ib[idx+5] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc);
ib[idx+6] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff;
dst_offset = ib[idx+5];
dst_offset |= ((u64)(ib[idx+6] & 0xff)) << 32;
} else {
/* linear src, tiled dst */
ib[idx+5] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc);
ib[idx+6] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
src_offset = ib[idx+5];
src_offset |= ((u64)(ib[idx+6] & 0xff)) << 32;
ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8);
dst_offset = ib[idx+1];
dst_offset <<= 8;
}
p->idx += 7;
} else {
ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc);
ib[idx+2] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc);
ib[idx+3] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff;
ib[idx+4] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff;
src_offset = ib[idx+2];
src_offset |= ((u64)(ib[idx+4] & 0xff)) << 32;
dst_offset = ib[idx+1];
dst_offset |= ((u64)(ib[idx+3] & 0xff)) << 32;
p->idx += 5;
}
if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) {
dev_warn(p->dev, "DMA copy src buffer too small (%llu %lu)\n",
src_offset, radeon_bo_size(src_reloc->robj));
return -EINVAL;
}
if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) {
dev_warn(p->dev, "DMA write dst buffer too small (%llu %lu)\n",
dst_offset, radeon_bo_size(dst_reloc->robj));
return -EINVAL;
}
break;
case DMA_PACKET_CONSTANT_FILL:
if (p->family < CHIP_RV770) {
DRM_ERROR("Constant Fill is 7xx only !\n");
return -EINVAL;
}
r = r600_dma_cs_next_reloc(p, &dst_reloc);
if (r) {
DRM_ERROR("bad DMA_PACKET_WRITE\n");
return -EINVAL;
}
ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc);
ib[idx+3] += (upper_32_bits(dst_reloc->lobj.gpu_offset) << 16) & 0x00ff0000;
dst_offset = ib[idx+1];
dst_offset |= ((u64)(ib[idx+3] & 0x00ff0000)) << 16;
if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) {
dev_warn(p->dev, "DMA constant fill buffer too small (%llu %lu)\n",
dst_offset, radeon_bo_size(dst_reloc->robj));
return -EINVAL;
}
p->idx += 4;
break;
case DMA_PACKET_NOP:
p->idx += 1;
break;
default:
DRM_ERROR("Unknown packet type %d at %d !\n", cmd, idx);
return -EINVAL;
}
- } while (p->idx < p->chunks[p->chunk_ib_idx].length_dw);
+#if 0
- for (r = 0; r < p->ib->length_dw; r++) {
printk(KERN_INFO "%05d 0x%08X\n", r, p->ib.ptr[r]);
mdelay(1);
- }
+#endif
- return 0;
+} diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h index 285fb3f..5dc744d 100644 --- a/drivers/gpu/drm/radeon/radeon.h +++ b/drivers/gpu/drm/radeon/radeon.h @@ -839,6 +839,7 @@ struct radeon_cs_parser { struct radeon_cs_reloc *relocs; struct radeon_cs_reloc **relocs_ptr; struct list_head validated;
- unsigned dma_reloc_idx; /* indices of various chunks */ int chunk_ib_idx; int chunk_relocs_idx;
diff --git a/drivers/gpu/drm/radeon/radeon_asic.c b/drivers/gpu/drm/radeon/radeon_asic.c index 3ea0475..d360341 100644 --- a/drivers/gpu/drm/radeon/radeon_asic.c +++ b/drivers/gpu/drm/radeon/radeon_asic.c @@ -952,7 +952,7 @@ static struct radeon_asic r600_asic = { .ib_execute = &r600_dma_ring_ib_execute, .emit_fence = &r600_dma_fence_ring_emit, .emit_semaphore = &r600_dma_semaphore_ring_emit,
.cs_parse = NULL,
.cs_parse = &r600_dma_cs_parse, .ring_test = &r600_dma_ring_test, .ib_test = &r600_dma_ib_test, .is_lockup = &r600_dma_is_lockup,
@@ -1036,7 +1036,7 @@ static struct radeon_asic rs780_asic = { .ib_execute = &r600_dma_ring_ib_execute, .emit_fence = &r600_dma_fence_ring_emit, .emit_semaphore = &r600_dma_semaphore_ring_emit,
.cs_parse = NULL,
.cs_parse = &r600_dma_cs_parse, .ring_test = &r600_dma_ring_test, .ib_test = &r600_dma_ib_test, .is_lockup = &r600_dma_is_lockup,
@@ -1120,7 +1120,7 @@ static struct radeon_asic rv770_asic = { .ib_execute = &r600_dma_ring_ib_execute, .emit_fence = &r600_dma_fence_ring_emit, .emit_semaphore = &r600_dma_semaphore_ring_emit,
.cs_parse = NULL,
.cs_parse = &r600_dma_cs_parse, .ring_test = &r600_dma_ring_test, .ib_test = &r600_dma_ib_test, .is_lockup = &r600_dma_is_lockup,
diff --git a/drivers/gpu/drm/radeon/radeon_asic.h b/drivers/gpu/drm/radeon/radeon_asic.h index c338931..b311c0a 100644 --- a/drivers/gpu/drm/radeon/radeon_asic.h +++ b/drivers/gpu/drm/radeon/radeon_asic.h @@ -304,6 +304,7 @@ void r600_pcie_gart_tlb_flush(struct radeon_device *rdev); uint32_t r600_pciep_rreg(struct radeon_device *rdev, uint32_t reg); void r600_pciep_wreg(struct radeon_device *rdev, uint32_t reg, uint32_t v); int r600_cs_parse(struct radeon_cs_parser *p); +int r600_dma_cs_parse(struct radeon_cs_parser *p); void r600_fence_ring_emit(struct radeon_device *rdev, struct radeon_fence *fence); void r600_semaphore_ring_emit(struct radeon_device *rdev, diff --git a/drivers/gpu/drm/radeon/radeon_cs.c b/drivers/gpu/drm/radeon/radeon_cs.c index 41672cc..1b32a5a 100644 --- a/drivers/gpu/drm/radeon/radeon_cs.c +++ b/drivers/gpu/drm/radeon/radeon_cs.c @@ -43,6 +43,7 @@ static int radeon_cs_parser_relocs(struct radeon_cs_parser *p) return 0; } chunk = &p->chunks[p->chunk_relocs_idx];
- p->dma_reloc_idx = 0; /* FIXME: we assume that each relocs use 4 dwords */ p->nrelocs = chunk->length_dw / 4; p->relocs_ptr = kcalloc(p->nrelocs, sizeof(void *), GFP_KERNEL);
From: Alex Deucher alexander.deucher@amd.com
Allows us to use the DMA ring from userspace. DMA doesn't have a good NOP packet in which to embed the reloc idx, so userspace has to add a reloc for each buffer used and order them to match the command stream.
v2: fix address bounds checking, reloc indexing
Signed-off-by: Alex Deucher alexander.deucher@amd.com --- drivers/gpu/drm/radeon/r600_cs.c | 193 ++++++++++++++++++++++++++++++++++ drivers/gpu/drm/radeon/radeon.h | 1 + drivers/gpu/drm/radeon/radeon_asic.c | 6 +- drivers/gpu/drm/radeon/radeon_asic.h | 1 + drivers/gpu/drm/radeon/radeon_cs.c | 1 + 5 files changed, 199 insertions(+), 3 deletions(-)
diff --git a/drivers/gpu/drm/radeon/r600_cs.c b/drivers/gpu/drm/radeon/r600_cs.c index 5d6e7f9..1e2935e 100644 --- a/drivers/gpu/drm/radeon/r600_cs.c +++ b/drivers/gpu/drm/radeon/r600_cs.c @@ -2568,3 +2568,196 @@ void r600_cs_legacy_init(void) { r600_cs_packet_next_reloc = &r600_cs_packet_next_reloc_nomm; } + +/* + * DMA + */ +/** + * r600_dma_cs_next_reloc() - parse next reloc + * @p: parser structure holding parsing context. + * @cs_reloc: reloc informations + * + * Return the next reloc, do bo validation and compute + * GPU offset using the provided start. + **/ +int r600_dma_cs_next_reloc(struct radeon_cs_parser *p, + struct radeon_cs_reloc **cs_reloc) +{ + struct radeon_cs_chunk *relocs_chunk; + unsigned idx; + + if (p->chunk_relocs_idx == -1) { + DRM_ERROR("No relocation chunk !\n"); + return -EINVAL; + } + *cs_reloc = NULL; + relocs_chunk = &p->chunks[p->chunk_relocs_idx]; + idx = p->dma_reloc_idx; + if (idx >= relocs_chunk->length_dw) { + DRM_ERROR("Relocs at %d after relocations chunk end %d !\n", + idx, relocs_chunk->length_dw); + return -EINVAL; + } + *cs_reloc = p->relocs_ptr[idx]; + p->dma_reloc_idx++; + return 0; +} + +#define GET_DMA_CMD(h) (((h) & 0xf0000000) >> 28) +#define GET_DMA_COUNT(h) ((h) & 0x0000ffff) +#define GET_DMA_T(h) (((h) & 0x00800000) >> 23) + +/** + * r600_dma_cs_parse() - parse the DMA IB + * @p: parser structure holding parsing context. + * + * Parses the DMA IB from the CS ioctl and updates + * the GPU addresses based on the reloc information and + * checks for errors. (R6xx-R7xx) + * Returns 0 for success and an error on failure. + **/ +int r600_dma_cs_parse(struct radeon_cs_parser *p) +{ + struct radeon_cs_chunk *ib_chunk = &p->chunks[p->chunk_ib_idx]; + struct radeon_cs_reloc *src_reloc, *dst_reloc; + u32 header, cmd, count, tiled; + volatile u32 *ib = p->ib.ptr; + u32 idx, idx_value; + u64 src_offset, dst_offset; + int r; + + do { + if (p->idx >= ib_chunk->length_dw) { + DRM_ERROR("Can not parse packet at %d after CS end %d !\n", + p->idx, ib_chunk->length_dw); + return -EINVAL; + } + idx = p->idx; + header = radeon_get_ib_value(p, idx); + cmd = GET_DMA_CMD(header); + count = GET_DMA_COUNT(header); + tiled = GET_DMA_T(header); + + switch (cmd) { + case DMA_PACKET_WRITE: + r = r600_dma_cs_next_reloc(p, &dst_reloc); + if (r) { + DRM_ERROR("bad DMA_PACKET_WRITE\n"); + return -EINVAL; + } + if (tiled) { + dst_offset = ib[idx+1]; + dst_offset <<= 8; + + ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8); + p->idx += count + 5; + } else { + dst_offset = ib[idx+1]; + dst_offset |= ((u64)(ib[idx+2] & 0xff)) << 32; + + ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc); + ib[idx+2] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff; + p->idx += count + 3; + } + if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) { + dev_warn(p->dev, "DMA write buffer too small (%llu %lu)\n", + dst_offset + (count * 4), radeon_bo_size(dst_reloc->robj)); + return -EINVAL; + } + break; + case DMA_PACKET_COPY: + r = r600_dma_cs_next_reloc(p, &src_reloc); + if (r) { + DRM_ERROR("bad DMA_PACKET_COPY\n"); + return -EINVAL; + } + r = r600_dma_cs_next_reloc(p, &dst_reloc); + if (r) { + DRM_ERROR("bad DMA_PACKET_COPY\n"); + return -EINVAL; + } + if (tiled) { + idx_value = radeon_get_ib_value(p, idx + 2); + /* detile bit */ + if (idx_value & (1 << 31)) { + /* tiled src, linear dst */ + src_offset = ib[idx+1]; + src_offset <<= 8; + ib[idx+1] += (u32)(src_reloc->lobj.gpu_offset >> 8); + + dst_offset = ib[idx+5]; + dst_offset |= ((u64)(ib[idx+6] & 0xff)) << 32; + ib[idx+5] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc); + ib[idx+6] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff; + } else { + /* linear src, tiled dst */ + src_offset = ib[idx+5]; + src_offset |= ((u64)(ib[idx+6] & 0xff)) << 32; + ib[idx+5] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc); + ib[idx+6] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff; + + dst_offset = ib[idx+1]; + dst_offset <<= 8; + ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8); + } + p->idx += 7; + } else { + src_offset = ib[idx+2]; + src_offset |= ((u64)(ib[idx+4] & 0xff)) << 32; + dst_offset = ib[idx+1]; + dst_offset |= ((u64)(ib[idx+3] & 0xff)) << 32; + + ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc); + ib[idx+2] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc); + ib[idx+3] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff; + ib[idx+4] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff; + p->idx += 5; + } + if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) { + dev_warn(p->dev, "DMA copy src buffer too small (%llu %lu)\n", + src_offset + (count * 4), radeon_bo_size(src_reloc->robj)); + return -EINVAL; + } + if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) { + dev_warn(p->dev, "DMA write dst buffer too small (%llu %lu)\n", + dst_offset + (count * 4), radeon_bo_size(dst_reloc->robj)); + return -EINVAL; + } + break; + case DMA_PACKET_CONSTANT_FILL: + if (p->family < CHIP_RV770) { + DRM_ERROR("Constant Fill is 7xx only !\n"); + return -EINVAL; + } + r = r600_dma_cs_next_reloc(p, &dst_reloc); + if (r) { + DRM_ERROR("bad DMA_PACKET_WRITE\n"); + return -EINVAL; + } + dst_offset = ib[idx+1]; + dst_offset |= ((u64)(ib[idx+3] & 0x00ff0000)) << 16; + if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) { + dev_warn(p->dev, "DMA constant fill buffer too small (%llu %lu)\n", + dst_offset + (count * 4), radeon_bo_size(dst_reloc->robj)); + return -EINVAL; + } + ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc); + ib[idx+3] += (upper_32_bits(dst_reloc->lobj.gpu_offset) << 16) & 0x00ff0000; + p->idx += 4; + break; + case DMA_PACKET_NOP: + p->idx += 1; + break; + default: + DRM_ERROR("Unknown packet type %d at %d !\n", cmd, idx); + return -EINVAL; + } + } while (p->idx < p->chunks[p->chunk_ib_idx].length_dw); +#if 0 + for (r = 0; r < p->ib->length_dw; r++) { + printk(KERN_INFO "%05d 0x%08X\n", r, p->ib.ptr[r]); + mdelay(1); + } +#endif + return 0; +} diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h index 285fb3f..5dc744d 100644 --- a/drivers/gpu/drm/radeon/radeon.h +++ b/drivers/gpu/drm/radeon/radeon.h @@ -839,6 +839,7 @@ struct radeon_cs_parser { struct radeon_cs_reloc *relocs; struct radeon_cs_reloc **relocs_ptr; struct list_head validated; + unsigned dma_reloc_idx; /* indices of various chunks */ int chunk_ib_idx; int chunk_relocs_idx; diff --git a/drivers/gpu/drm/radeon/radeon_asic.c b/drivers/gpu/drm/radeon/radeon_asic.c index 3ea0475..d360341 100644 --- a/drivers/gpu/drm/radeon/radeon_asic.c +++ b/drivers/gpu/drm/radeon/radeon_asic.c @@ -952,7 +952,7 @@ static struct radeon_asic r600_asic = { .ib_execute = &r600_dma_ring_ib_execute, .emit_fence = &r600_dma_fence_ring_emit, .emit_semaphore = &r600_dma_semaphore_ring_emit, - .cs_parse = NULL, + .cs_parse = &r600_dma_cs_parse, .ring_test = &r600_dma_ring_test, .ib_test = &r600_dma_ib_test, .is_lockup = &r600_dma_is_lockup, @@ -1036,7 +1036,7 @@ static struct radeon_asic rs780_asic = { .ib_execute = &r600_dma_ring_ib_execute, .emit_fence = &r600_dma_fence_ring_emit, .emit_semaphore = &r600_dma_semaphore_ring_emit, - .cs_parse = NULL, + .cs_parse = &r600_dma_cs_parse, .ring_test = &r600_dma_ring_test, .ib_test = &r600_dma_ib_test, .is_lockup = &r600_dma_is_lockup, @@ -1120,7 +1120,7 @@ static struct radeon_asic rv770_asic = { .ib_execute = &r600_dma_ring_ib_execute, .emit_fence = &r600_dma_fence_ring_emit, .emit_semaphore = &r600_dma_semaphore_ring_emit, - .cs_parse = NULL, + .cs_parse = &r600_dma_cs_parse, .ring_test = &r600_dma_ring_test, .ib_test = &r600_dma_ib_test, .is_lockup = &r600_dma_is_lockup, diff --git a/drivers/gpu/drm/radeon/radeon_asic.h b/drivers/gpu/drm/radeon/radeon_asic.h index c338931..b311c0a 100644 --- a/drivers/gpu/drm/radeon/radeon_asic.h +++ b/drivers/gpu/drm/radeon/radeon_asic.h @@ -304,6 +304,7 @@ void r600_pcie_gart_tlb_flush(struct radeon_device *rdev); uint32_t r600_pciep_rreg(struct radeon_device *rdev, uint32_t reg); void r600_pciep_wreg(struct radeon_device *rdev, uint32_t reg, uint32_t v); int r600_cs_parse(struct radeon_cs_parser *p); +int r600_dma_cs_parse(struct radeon_cs_parser *p); void r600_fence_ring_emit(struct radeon_device *rdev, struct radeon_fence *fence); void r600_semaphore_ring_emit(struct radeon_device *rdev, diff --git a/drivers/gpu/drm/radeon/radeon_cs.c b/drivers/gpu/drm/radeon/radeon_cs.c index 41672cc..1b32a5a 100644 --- a/drivers/gpu/drm/radeon/radeon_cs.c +++ b/drivers/gpu/drm/radeon/radeon_cs.c @@ -43,6 +43,7 @@ static int radeon_cs_parser_relocs(struct radeon_cs_parser *p) return 0; } chunk = &p->chunks[p->chunk_relocs_idx]; + p->dma_reloc_idx = 0; /* FIXME: we assume that each relocs use 4 dwords */ p->nrelocs = chunk->length_dw / 4; p->relocs_ptr = kcalloc(p->nrelocs, sizeof(void *), GFP_KERNEL);
From: Alex Deucher alexander.deucher@amd.com
Allows us to use the DMA ring from userspace. DMA doesn't have a good NOP packet in which to embed the reloc idx, so userspace has to add a reloc for each buffer used and order them to match the command stream.
v2: fix address bounds checking
Signed-off-by: Alex Deucher alexander.deucher@amd.com --- drivers/gpu/drm/radeon/evergreen_cs.c | 451 +++++++++++++++++++++++++++++++++ drivers/gpu/drm/radeon/radeon_asic.c | 14 +- drivers/gpu/drm/radeon/radeon_asic.h | 1 + 3 files changed, 459 insertions(+), 7 deletions(-)
diff --git a/drivers/gpu/drm/radeon/evergreen_cs.c b/drivers/gpu/drm/radeon/evergreen_cs.c index 62c2271..1e8bd46 100644 --- a/drivers/gpu/drm/radeon/evergreen_cs.c +++ b/drivers/gpu/drm/radeon/evergreen_cs.c @@ -34,6 +34,8 @@ #define MAX(a,b) (((a)>(b))?(a):(b)) #define MIN(a,b) (((a)<(b))?(a):(b))
+int r600_dma_cs_next_reloc(struct radeon_cs_parser *p, + struct radeon_cs_reloc **cs_reloc); static int evergreen_cs_packet_next_reloc(struct radeon_cs_parser *p, struct radeon_cs_reloc **cs_reloc);
@@ -2804,6 +2806,455 @@ int evergreen_cs_parse(struct radeon_cs_parser *p) return 0; }
+/* + * DMA + */ + +#define GET_DMA_CMD(h) (((h) & 0xf0000000) >> 28) +#define GET_DMA_COUNT(h) ((h) & 0x000fffff) +#define GET_DMA_T(h) (((h) & 0x00800000) >> 23) +#define GET_DMA_NEW(h) (((h) & 0x04000000) >> 26) +#define GET_DMA_MISC(h) (((h) & 0x0700000) >> 20) + +/** + * evergreen_dma_cs_parse() - parse the DMA IB + * @p: parser structure holding parsing context. + * + * Parses the DMA IB from the CS ioctl and updates + * the GPU addresses based on the reloc information and + * checks for errors. (Evergreen-Cayman) + * Returns 0 for success and an error on failure. + **/ +int evergreen_dma_cs_parse(struct radeon_cs_parser *p) +{ + struct radeon_cs_chunk *ib_chunk = &p->chunks[p->chunk_ib_idx]; + struct radeon_cs_reloc *src_reloc, *dst_reloc, *dst2_reloc; + u32 header, cmd, count, tiled, new_cmd, misc; + volatile u32 *ib = p->ib.ptr; + u32 idx, idx_value; + u64 src_offset, dst_offset, dst2_offset; + int r; + + do { + if (p->idx >= ib_chunk->length_dw) { + DRM_ERROR("Can not parse packet at %d after CS end %d !\n", + p->idx, ib_chunk->length_dw); + return -EINVAL; + } + idx = p->idx; + header = radeon_get_ib_value(p, idx); + cmd = GET_DMA_CMD(header); + count = GET_DMA_COUNT(header); + tiled = GET_DMA_T(header); + new_cmd = GET_DMA_NEW(header); + misc = GET_DMA_MISC(header); + + switch (cmd) { + case DMA_PACKET_WRITE: + r = r600_dma_cs_next_reloc(p, &dst_reloc); + if (r) { + DRM_ERROR("bad DMA_PACKET_WRITE\n"); + return -EINVAL; + } + if (tiled) { + dst_offset = ib[idx+1]; + dst_offset <<= 8; + + ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8); + p->idx += count + 7; + } else { + dst_offset = ib[idx+1]; + dst_offset |= ((u64)(ib[idx+2] & 0xff)) << 32; + + ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc); + ib[idx+2] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff; + p->idx += count + 3; + } + if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) { + dev_warn(p->dev, "DMA write buffer too small (%llu %lu)\n", + dst_offset, radeon_bo_size(dst_reloc->robj)); + return -EINVAL; + } + break; + case DMA_PACKET_COPY: + r = r600_dma_cs_next_reloc(p, &src_reloc); + if (r) { + DRM_ERROR("bad DMA_PACKET_COPY\n"); + return -EINVAL; + } + r = r600_dma_cs_next_reloc(p, &dst_reloc); + if (r) { + DRM_ERROR("bad DMA_PACKET_COPY\n"); + return -EINVAL; + } + if (tiled) { + idx_value = radeon_get_ib_value(p, idx + 2); + if (new_cmd) { + switch (misc) { + case 0: + /* L2T, frame to fields */ + if (idx_value & (1 << 31)) { + DRM_ERROR("bad L2T, frame to fields DMA_PACKET_COPY\n"); + return -EINVAL; + } + r = r600_dma_cs_next_reloc(p, &dst2_reloc); + if (r) { + DRM_ERROR("bad L2T, frame to fields DMA_PACKET_COPY\n"); + return -EINVAL; + } + dst_offset = ib[idx+1]; + dst_offset <<= 8; + dst2_offset = ib[idx+2]; + dst2_offset <<= 8; + src_offset = ib[idx+8]; + src_offset |= ((u64)(ib[idx+9] & 0xff)) << 32; + if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) { + dev_warn(p->dev, "DMA L2T, frame to fields src buffer too small (%llu %lu)\n", + src_offset + (count * 4), radeon_bo_size(src_reloc->robj)); + return -EINVAL; + } + if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) { + dev_warn(p->dev, "DMA L2T, frame to fields buffer too small (%llu %lu)\n", + dst_offset + (count * 4), radeon_bo_size(dst_reloc->robj)); + return -EINVAL; + } + if ((dst2_offset + (count * 4)) > radeon_bo_size(dst2_reloc->robj)) { + dev_warn(p->dev, "DMA L2T, frame to fields buffer too small (%llu %lu)\n", + dst2_offset + (count * 4), radeon_bo_size(dst2_reloc->robj)); + return -EINVAL; + } + ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8); + ib[idx+2] += (u32)(dst2_reloc->lobj.gpu_offset >> 8); + ib[idx+8] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc); + ib[idx+9] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff; + p->idx += 10; + break; + case 1: + /* L2T, T2L partial */ + if (p->family < CHIP_CAYMAN) { + DRM_ERROR("L2T, T2L Partial is cayman only !\n"); + return -EINVAL; + } + /* detile bit */ + if (idx_value & (1 << 31)) { + /* tiled src, linear dst */ + ib[idx+1] += (u32)(src_reloc->lobj.gpu_offset >> 8); + + ib[idx+7] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc); + ib[idx+8] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff; + } else { + /* linear src, tiled dst */ + ib[idx+7] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc); + ib[idx+8] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff; + + ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8); + } + p->idx += 12; + break; + case 3: + /* L2T, broadcast */ + if (idx_value & (1 << 31)) { + DRM_ERROR("bad L2T, broadcast DMA_PACKET_COPY\n"); + return -EINVAL; + } + r = r600_dma_cs_next_reloc(p, &dst2_reloc); + if (r) { + DRM_ERROR("bad L2T, broadcast DMA_PACKET_COPY\n"); + return -EINVAL; + } + dst_offset = ib[idx+1]; + dst_offset <<= 8; + dst2_offset = ib[idx+2]; + dst2_offset <<= 8; + src_offset = ib[idx+8]; + src_offset |= ((u64)(ib[idx+9] & 0xff)) << 32; + if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) { + dev_warn(p->dev, "DMA L2T, broadcast src buffer too small (%llu %lu)\n", + src_offset + (count * 4), radeon_bo_size(src_reloc->robj)); + return -EINVAL; + } + if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) { + dev_warn(p->dev, "DMA L2T, broadcast dst buffer too small (%llu %lu)\n", + dst_offset + (count * 4), radeon_bo_size(dst_reloc->robj)); + return -EINVAL; + } + if ((dst2_offset + (count * 4)) > radeon_bo_size(dst2_reloc->robj)) { + dev_warn(p->dev, "DMA L2T, broadcast dst2 buffer too small (%llu %lu)\n", + dst2_offset + (count * 4), radeon_bo_size(dst2_reloc->robj)); + return -EINVAL; + } + ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8); + ib[idx+2] += (u32)(dst2_reloc->lobj.gpu_offset >> 8); + ib[idx+8] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc); + ib[idx+9] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff; + p->idx += 10; + break; + case 4: + /* L2T, T2L */ + /* detile bit */ + if (idx_value & (1 << 31)) { + /* tiled src, linear dst */ + src_offset = ib[idx+1]; + src_offset <<= 8; + ib[idx+1] += (u32)(src_reloc->lobj.gpu_offset >> 8); + + dst_offset = ib[idx+7]; + dst_offset |= ((u64)(ib[idx+8] & 0xff)) << 32; + ib[idx+7] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc); + ib[idx+8] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff; + } else { + /* linear src, tiled dst */ + src_offset = ib[idx+7]; + src_offset |= ((u64)(ib[idx+8] & 0xff)) << 32; + ib[idx+7] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc); + ib[idx+8] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff; + + dst_offset = ib[idx+1]; + dst_offset <<= 8; + ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8); + } + if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) { + dev_warn(p->dev, "DMA L2T, T2L src buffer too small (%llu %lu)\n", + src_offset + (count * 4), radeon_bo_size(src_reloc->robj)); + return -EINVAL; + } + if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) { + dev_warn(p->dev, "DMA L2T, T2L dst buffer too small (%llu %lu)\n", + dst_offset + (count * 4), radeon_bo_size(dst_reloc->robj)); + return -EINVAL; + } + p->idx += 9; + break; + case 5: + /* T2T partial */ + if (p->family < CHIP_CAYMAN) { + DRM_ERROR("L2T, T2L Partial is cayman only !\n"); + return -EINVAL; + } + ib[idx+1] += (u32)(src_reloc->lobj.gpu_offset >> 8); + ib[idx+4] += (u32)(dst_reloc->lobj.gpu_offset >> 8); + p->idx += 13; + break; + case 7: + /* L2T, broadcast */ + if (idx_value & (1 << 31)) { + DRM_ERROR("bad L2T, broadcast DMA_PACKET_COPY\n"); + return -EINVAL; + } + r = r600_dma_cs_next_reloc(p, &dst2_reloc); + if (r) { + DRM_ERROR("bad L2T, broadcast DMA_PACKET_COPY\n"); + return -EINVAL; + } + dst_offset = ib[idx+1]; + dst_offset <<= 8; + dst2_offset = ib[idx+2]; + dst2_offset <<= 8; + src_offset = ib[idx+8]; + src_offset |= ((u64)(ib[idx+9] & 0xff)) << 32; + if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) { + dev_warn(p->dev, "DMA L2T, broadcast src buffer too small (%llu %lu)\n", + src_offset + (count * 4), radeon_bo_size(src_reloc->robj)); + return -EINVAL; + } + if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) { + dev_warn(p->dev, "DMA L2T, broadcast dst buffer too small (%llu %lu)\n", + dst_offset + (count * 4), radeon_bo_size(dst_reloc->robj)); + return -EINVAL; + } + if ((dst2_offset + (count * 4)) > radeon_bo_size(dst2_reloc->robj)) { + dev_warn(p->dev, "DMA L2T, broadcast dst2 buffer too small (%llu %lu)\n", + dst2_offset + (count * 4), radeon_bo_size(dst2_reloc->robj)); + return -EINVAL; + } + ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8); + ib[idx+2] += (u32)(dst2_reloc->lobj.gpu_offset >> 8); + ib[idx+8] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc); + ib[idx+9] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff; + p->idx += 10; + break; + default: + DRM_ERROR("bad DMA_PACKET_COPY misc %u\n", misc); + return -EINVAL; + } + } else { + switch (misc) { + case 0: + /* detile bit */ + if (idx_value & (1 << 31)) { + /* tiled src, linear dst */ + src_offset = ib[idx+1]; + src_offset <<= 8; + ib[idx+1] += (u32)(src_reloc->lobj.gpu_offset >> 8); + + dst_offset = ib[idx+7]; + dst_offset |= ((u64)(ib[idx+8] & 0xff)) << 32; + ib[idx+7] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc); + ib[idx+8] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff; + } else { + /* linear src, tiled dst */ + src_offset = ib[idx+7]; + src_offset |= ((u64)(ib[idx+8] & 0xff)) << 32; + ib[idx+7] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc); + ib[idx+8] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff; + + dst_offset = ib[idx+1]; + dst_offset <<= 8; + ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset >> 8); + } + if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) { + dev_warn(p->dev, "DMA L2T, broadcast src buffer too small (%llu %lu)\n", + src_offset + (count * 4), radeon_bo_size(src_reloc->robj)); + return -EINVAL; + } + if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) { + dev_warn(p->dev, "DMA L2T, broadcast dst buffer too small (%llu %lu)\n", + dst_offset + (count * 4), radeon_bo_size(dst_reloc->robj)); + return -EINVAL; + } + p->idx += 9; + break; + default: + DRM_ERROR("bad DMA_PACKET_COPY misc %u\n", misc); + return -EINVAL; + } + } + } else { + if (new_cmd) { + switch (misc) { + case 0: + /* L2L, byte */ + src_offset = ib[idx+2]; + src_offset |= ((u64)(ib[idx+4] & 0xff)) << 32; + dst_offset = ib[idx+1]; + dst_offset |= ((u64)(ib[idx+3] & 0xff)) << 32; + if ((src_offset + count) > radeon_bo_size(src_reloc->robj)) { + dev_warn(p->dev, "DMA L2L, byte src buffer too small (%llu %lu)\n", + src_offset + count, radeon_bo_size(src_reloc->robj)); + return -EINVAL; + } + if ((dst_offset + count) > radeon_bo_size(dst_reloc->robj)) { + dev_warn(p->dev, "DMA L2L, byte dst buffer too small (%llu %lu)\n", + dst_offset + count, radeon_bo_size(dst_reloc->robj)); + return -EINVAL; + } + ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset & 0xffffffff); + ib[idx+2] += (u32)(src_reloc->lobj.gpu_offset & 0xffffffff); + ib[idx+3] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff; + ib[idx+4] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff; + p->idx += 5; + break; + case 1: + /* L2L, partial */ + if (p->family < CHIP_CAYMAN) { + DRM_ERROR("L2L Partial is cayman only !\n"); + return -EINVAL; + } + ib[idx+1] += (u32)(src_reloc->lobj.gpu_offset & 0xffffffff); + ib[idx+2] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff; + ib[idx+4] += (u32)(dst_reloc->lobj.gpu_offset & 0xffffffff); + ib[idx+5] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff; + + p->idx += 9; + break; + case 4: + /* L2L, dw, broadcast */ + r = r600_dma_cs_next_reloc(p, &dst2_reloc); + if (r) { + DRM_ERROR("bad L2L, dw, broadcast DMA_PACKET_COPY\n"); + return -EINVAL; + } + dst_offset = ib[idx+1]; + dst_offset |= ((u64)(ib[idx+4] & 0xff)) << 32; + dst2_offset = ib[idx+2]; + dst2_offset |= ((u64)(ib[idx+5] & 0xff)) << 32; + src_offset = ib[idx+3]; + src_offset |= ((u64)(ib[idx+6] & 0xff)) << 32; + if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) { + dev_warn(p->dev, "DMA L2L, dw, broadcast src buffer too small (%llu %lu)\n", + src_offset + (count * 4), radeon_bo_size(src_reloc->robj)); + return -EINVAL; + } + if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) { + dev_warn(p->dev, "DMA L2L, dw, broadcast dst buffer too small (%llu %lu)\n", + dst_offset + (count * 4), radeon_bo_size(dst_reloc->robj)); + return -EINVAL; + } + if ((dst2_offset + (count * 4)) > radeon_bo_size(dst2_reloc->robj)) { + dev_warn(p->dev, "DMA L2L, dw, broadcast dst2 buffer too small (%llu %lu)\n", + dst2_offset + (count * 4), radeon_bo_size(dst2_reloc->robj)); + return -EINVAL; + } + ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc); + ib[idx+2] += (u32)(dst2_reloc->lobj.gpu_offset & 0xfffffffc); + ib[idx+3] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc); + ib[idx+4] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff; + ib[idx+5] += upper_32_bits(dst2_reloc->lobj.gpu_offset) & 0xff; + ib[idx+6] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff; + p->idx += 7; + break; + default: + DRM_ERROR("bad DMA_PACKET_COPY misc %u\n", misc); + return -EINVAL; + } + } else { + /* L2L, dw */ + src_offset = ib[idx+2]; + src_offset |= ((u64)(ib[idx+4] & 0xff)) << 32; + dst_offset = ib[idx+1]; + dst_offset |= ((u64)(ib[idx+3] & 0xff)) << 32; + if ((src_offset + (count * 4)) > radeon_bo_size(src_reloc->robj)) { + dev_warn(p->dev, "DMA L2L, dw src buffer too small (%llu %lu)\n", + src_offset + (count * 4), radeon_bo_size(src_reloc->robj)); + return -EINVAL; + } + if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) { + dev_warn(p->dev, "DMA L2L, dw dst buffer too small (%llu %lu)\n", + dst_offset + (count * 4), radeon_bo_size(dst_reloc->robj)); + return -EINVAL; + } + ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc); + ib[idx+2] += (u32)(src_reloc->lobj.gpu_offset & 0xfffffffc); + ib[idx+3] += upper_32_bits(dst_reloc->lobj.gpu_offset) & 0xff; + ib[idx+4] += upper_32_bits(src_reloc->lobj.gpu_offset) & 0xff; + p->idx += 5; + } + } + break; + case DMA_PACKET_CONSTANT_FILL: + r = r600_dma_cs_next_reloc(p, &dst_reloc); + if (r) { + DRM_ERROR("bad DMA_PACKET_CONSTANT_FILL\n"); + return -EINVAL; + } + dst_offset = ib[idx+1]; + dst_offset |= ((u64)(ib[idx+3] & 0x00ff0000)) << 16; + if ((dst_offset + (count * 4)) > radeon_bo_size(dst_reloc->robj)) { + dev_warn(p->dev, "DMA constant fill buffer too small (%llu %lu)\n", + dst_offset, radeon_bo_size(dst_reloc->robj)); + return -EINVAL; + } + ib[idx+1] += (u32)(dst_reloc->lobj.gpu_offset & 0xfffffffc); + ib[idx+3] += (upper_32_bits(dst_reloc->lobj.gpu_offset) << 16) & 0x00ff0000; + p->idx += 4; + break; + case DMA_PACKET_NOP: + p->idx += 1; + break; + default: + DRM_ERROR("Unknown packet type %d at %d !\n", cmd, idx); + return -EINVAL; + } + } while (p->idx < p->chunks[p->chunk_ib_idx].length_dw); +#if 0 + for (r = 0; r < p->ib->length_dw; r++) { + printk(KERN_INFO "%05d 0x%08X\n", r, p->ib.ptr[r]); + mdelay(1); + } +#endif + return 0; +} + /* vm parser */ static bool evergreen_vm_reg_valid(u32 reg) { diff --git a/drivers/gpu/drm/radeon/radeon_asic.c b/drivers/gpu/drm/radeon/radeon_asic.c index d360341..ac1d570 100644 --- a/drivers/gpu/drm/radeon/radeon_asic.c +++ b/drivers/gpu/drm/radeon/radeon_asic.c @@ -1204,7 +1204,7 @@ static struct radeon_asic evergreen_asic = { .ib_execute = &evergreen_dma_ring_ib_execute, .emit_fence = &evergreen_dma_fence_ring_emit, .emit_semaphore = &r600_dma_semaphore_ring_emit, - .cs_parse = NULL, + .cs_parse = &evergreen_dma_cs_parse, .ring_test = &r600_dma_ring_test, .ib_test = &r600_dma_ib_test, .is_lockup = &r600_dma_is_lockup, @@ -1288,7 +1288,7 @@ static struct radeon_asic sumo_asic = { .ib_execute = &evergreen_dma_ring_ib_execute, .emit_fence = &evergreen_dma_fence_ring_emit, .emit_semaphore = &r600_dma_semaphore_ring_emit, - .cs_parse = NULL, + .cs_parse = &evergreen_dma_cs_parse, .ring_test = &r600_dma_ring_test, .ib_test = &r600_dma_ib_test, .is_lockup = &r600_dma_is_lockup, @@ -1372,7 +1372,7 @@ static struct radeon_asic btc_asic = { .ib_execute = &evergreen_dma_ring_ib_execute, .emit_fence = &evergreen_dma_fence_ring_emit, .emit_semaphore = &r600_dma_semaphore_ring_emit, - .cs_parse = NULL, + .cs_parse = &evergreen_dma_cs_parse, .ring_test = &r600_dma_ring_test, .ib_test = &r600_dma_ib_test, .is_lockup = &r600_dma_is_lockup, @@ -1486,7 +1486,7 @@ static struct radeon_asic cayman_asic = { .ib_execute = &cayman_dma_ring_ib_execute, .emit_fence = &evergreen_dma_fence_ring_emit, .emit_semaphore = &r600_dma_semaphore_ring_emit, - .cs_parse = NULL, + .cs_parse = &evergreen_dma_cs_parse, .ring_test = &r600_dma_ring_test, .ib_test = &r600_dma_ib_test, .is_lockup = &cayman_dma_is_lockup, @@ -1496,7 +1496,7 @@ static struct radeon_asic cayman_asic = { .ib_execute = &cayman_dma_ring_ib_execute, .emit_fence = &evergreen_dma_fence_ring_emit, .emit_semaphore = &r600_dma_semaphore_ring_emit, - .cs_parse = NULL, + .cs_parse = &evergreen_dma_cs_parse, .ring_test = &r600_dma_ring_test, .ib_test = &r600_dma_ib_test, .is_lockup = &cayman_dma_is_lockup, @@ -1611,7 +1611,7 @@ static struct radeon_asic trinity_asic = { .ib_execute = &cayman_dma_ring_ib_execute, .emit_fence = &evergreen_dma_fence_ring_emit, .emit_semaphore = &r600_dma_semaphore_ring_emit, - .cs_parse = NULL, + .cs_parse = &evergreen_dma_cs_parse, .ring_test = &r600_dma_ring_test, .ib_test = &r600_dma_ib_test, .is_lockup = &cayman_dma_is_lockup, @@ -1621,7 +1621,7 @@ static struct radeon_asic trinity_asic = { .ib_execute = &cayman_dma_ring_ib_execute, .emit_fence = &evergreen_dma_fence_ring_emit, .emit_semaphore = &r600_dma_semaphore_ring_emit, - .cs_parse = NULL, + .cs_parse = &evergreen_dma_cs_parse, .ring_test = &r600_dma_ring_test, .ib_test = &r600_dma_ib_test, .is_lockup = &cayman_dma_is_lockup, diff --git a/drivers/gpu/drm/radeon/radeon_asic.h b/drivers/gpu/drm/radeon/radeon_asic.h index b311c0a..d2ac646 100644 --- a/drivers/gpu/drm/radeon/radeon_asic.h +++ b/drivers/gpu/drm/radeon/radeon_asic.h @@ -431,6 +431,7 @@ u32 evergreen_get_vblank_counter(struct radeon_device *rdev, int crtc); int evergreen_irq_set(struct radeon_device *rdev); int evergreen_irq_process(struct radeon_device *rdev); extern int evergreen_cs_parse(struct radeon_cs_parser *p); +extern int evergreen_dma_cs_parse(struct radeon_cs_parser *p); extern void evergreen_pm_misc(struct radeon_device *rdev); extern void evergreen_pm_prepare(struct radeon_device *rdev); extern void evergreen_pm_finish(struct radeon_device *rdev);
From: Alex Deucher alexander.deucher@amd.com
Allows us to use async DMA from userspace.
Signed-off-by: Alex Deucher alexander.deucher@amd.com --- drivers/gpu/drm/radeon/evergreen_cs.c | 111 +++++++++++++++++++++++++++++++++ drivers/gpu/drm/radeon/radeon_asic.c | 6 ++ drivers/gpu/drm/radeon/radeon_asic.h | 1 + 3 files changed, 118 insertions(+), 0 deletions(-)
diff --git a/drivers/gpu/drm/radeon/evergreen_cs.c b/drivers/gpu/drm/radeon/evergreen_cs.c index 1e8bd46..5e7ba99 100644 --- a/drivers/gpu/drm/radeon/evergreen_cs.c +++ b/drivers/gpu/drm/radeon/evergreen_cs.c @@ -3545,3 +3545,114 @@ int evergreen_ib_parse(struct radeon_device *rdev, struct radeon_ib *ib)
return ret; } + +/** + * evergreen_dma_ib_parse() - parse the DMA IB for VM + * @rdev: radeon_device pointer + * @ib: radeon_ib pointer + * + * Parses the DMA IB from the VM CS ioctl + * checks for errors. (Cayman-SI) + * Returns 0 for success and an error on failure. + **/ +int evergreen_dma_ib_parse(struct radeon_device *rdev, struct radeon_ib *ib) +{ + u32 idx = 0; + u32 header, cmd, count, tiled, new_cmd, misc; + + do { + header = ib->ptr[idx]; + cmd = GET_DMA_CMD(header); + count = GET_DMA_COUNT(header); + tiled = GET_DMA_T(header); + new_cmd = GET_DMA_NEW(header); + misc = GET_DMA_MISC(header); + + switch (cmd) { + case DMA_PACKET_WRITE: + if (tiled) + idx += count + 7; + else + idx += count + 3; + break; + case DMA_PACKET_COPY: + if (tiled) { + if (new_cmd) { + switch (misc) { + case 0: + /* L2T, frame to fields */ + idx += 10; + break; + case 1: + /* L2T, T2L partial */ + idx += 12; + break; + case 3: + /* L2T, broadcast */ + idx += 10; + break; + case 4: + /* L2T, T2L */ + idx += 9; + break; + case 5: + /* T2T partial */ + idx += 13; + break; + case 7: + /* L2T, broadcast */ + idx += 10; + break; + default: + DRM_ERROR("bad DMA_PACKET_COPY misc %u\n", misc); + return -EINVAL; + } + } else { + switch (misc) { + case 0: + idx += 9; + break; + default: + DRM_ERROR("bad DMA_PACKET_COPY misc %u\n", misc); + return -EINVAL; + } + } + } else { + if (new_cmd) { + switch (misc) { + case 0: + /* L2L, byte */ + idx += 5; + break; + case 1: + /* L2L, partial */ + idx += 9; + break; + case 4: + /* L2L, dw, broadcast */ + idx += 7; + break; + default: + DRM_ERROR("bad DMA_PACKET_COPY misc %u\n", misc); + return -EINVAL; + } + } else { + /* L2L, dw */ + idx += 5; + } + } + break; + case DMA_PACKET_CONSTANT_FILL: + idx += 4; + break; + case DMA_PACKET_NOP: + idx += 1; + break; + default: + DRM_ERROR("Unknown packet type %d at %d !\n", cmd, idx); + return -EINVAL; + } + } while (idx < ib->length_dw); + + return 0; +} diff --git a/drivers/gpu/drm/radeon/radeon_asic.c b/drivers/gpu/drm/radeon/radeon_asic.c index ac1d570..596bcbe 100644 --- a/drivers/gpu/drm/radeon/radeon_asic.c +++ b/drivers/gpu/drm/radeon/radeon_asic.c @@ -1484,6 +1484,7 @@ static struct radeon_asic cayman_asic = { }, [R600_RING_TYPE_DMA_INDEX] = { .ib_execute = &cayman_dma_ring_ib_execute, + .ib_parse = &evergreen_dma_ib_parse, .emit_fence = &evergreen_dma_fence_ring_emit, .emit_semaphore = &r600_dma_semaphore_ring_emit, .cs_parse = &evergreen_dma_cs_parse, @@ -1494,6 +1495,7 @@ static struct radeon_asic cayman_asic = { }, [CAYMAN_RING_TYPE_DMA1_INDEX] = { .ib_execute = &cayman_dma_ring_ib_execute, + .ib_parse = &evergreen_dma_ib_parse, .emit_fence = &evergreen_dma_fence_ring_emit, .emit_semaphore = &r600_dma_semaphore_ring_emit, .cs_parse = &evergreen_dma_cs_parse, @@ -1609,6 +1611,7 @@ static struct radeon_asic trinity_asic = { }, [R600_RING_TYPE_DMA_INDEX] = { .ib_execute = &cayman_dma_ring_ib_execute, + .ib_parse = &evergreen_dma_ib_parse, .emit_fence = &evergreen_dma_fence_ring_emit, .emit_semaphore = &r600_dma_semaphore_ring_emit, .cs_parse = &evergreen_dma_cs_parse, @@ -1619,6 +1622,7 @@ static struct radeon_asic trinity_asic = { }, [CAYMAN_RING_TYPE_DMA1_INDEX] = { .ib_execute = &cayman_dma_ring_ib_execute, + .ib_parse = &evergreen_dma_ib_parse, .emit_fence = &evergreen_dma_fence_ring_emit, .emit_semaphore = &r600_dma_semaphore_ring_emit, .cs_parse = &evergreen_dma_cs_parse, @@ -1734,6 +1738,7 @@ static struct radeon_asic si_asic = { }, [R600_RING_TYPE_DMA_INDEX] = { .ib_execute = &cayman_dma_ring_ib_execute, + .ib_parse = &evergreen_dma_ib_parse, .emit_fence = &evergreen_dma_fence_ring_emit, .emit_semaphore = &r600_dma_semaphore_ring_emit, .cs_parse = NULL, @@ -1744,6 +1749,7 @@ static struct radeon_asic si_asic = { }, [CAYMAN_RING_TYPE_DMA1_INDEX] = { .ib_execute = &cayman_dma_ring_ib_execute, + .ib_parse = &evergreen_dma_ib_parse, .emit_fence = &evergreen_dma_fence_ring_emit, .emit_semaphore = &r600_dma_semaphore_ring_emit, .cs_parse = NULL, diff --git a/drivers/gpu/drm/radeon/radeon_asic.h b/drivers/gpu/drm/radeon/radeon_asic.h index d2ac646..5f4882c 100644 --- a/drivers/gpu/drm/radeon/radeon_asic.h +++ b/drivers/gpu/drm/radeon/radeon_asic.h @@ -473,6 +473,7 @@ void cayman_vm_set_page(struct radeon_device *rdev, uint64_t pe, uint64_t addr, unsigned count, uint32_t incr, uint32_t flags); int evergreen_ib_parse(struct radeon_device *rdev, struct radeon_ib *ib); +int evergreen_dma_ib_parse(struct radeon_device *rdev, struct radeon_ib *ib); void cayman_dma_ring_ib_execute(struct radeon_device *rdev, struct radeon_ib *ib); bool cayman_dma_is_lockup(struct radeon_device *rdev, struct radeon_ring *ring);
From: Alex Deucher alexander.deucher@amd.com
This enables the functionality added in the previous patches. Userspace acceleration drivers can use the CS ioctl to submit command buffers to the async DMA rings.
Signed-off-by: Alex Deucher alexander.deucher@amd.com --- drivers/gpu/drm/radeon/radeon_cs.c | 12 ++++++++++++ include/uapi/drm/radeon_drm.h | 1 + 2 files changed, 13 insertions(+), 0 deletions(-)
diff --git a/drivers/gpu/drm/radeon/radeon_cs.c b/drivers/gpu/drm/radeon/radeon_cs.c index 1b32a5a..396baba 100644 --- a/drivers/gpu/drm/radeon/radeon_cs.c +++ b/drivers/gpu/drm/radeon/radeon_cs.c @@ -112,6 +112,18 @@ static int radeon_cs_get_ring(struct radeon_cs_parser *p, u32 ring, s32 priority } else p->ring = RADEON_RING_TYPE_GFX_INDEX; break; + case RADEON_CS_RING_DMA: + if (p->rdev->family >= CHIP_CAYMAN) { + if (p->priority > 0) + p->ring = R600_RING_TYPE_DMA_INDEX; + else + p->ring = CAYMAN_RING_TYPE_DMA1_INDEX; + } else if (p->rdev->family >= CHIP_R600) { + p->ring = R600_RING_TYPE_DMA_INDEX; + } else { + return -EINVAL; + } + break; } return 0; } diff --git a/include/uapi/drm/radeon_drm.h b/include/uapi/drm/radeon_drm.h index 5645a87..eeda917 100644 --- a/include/uapi/drm/radeon_drm.h +++ b/include/uapi/drm/radeon_drm.h @@ -917,6 +917,7 @@ struct drm_radeon_gem_va { /* The second dword of RADEON_CHUNK_ID_FLAGS is a uint32 that sets the ring type */ #define RADEON_CS_RING_GFX 0 #define RADEON_CS_RING_COMPUTE 1 +#define RADEON_CS_RING_DMA 2 /* The third dword of RADEON_CHUNK_ID_FLAGS is a sint32 that sets the priority */ /* 0 = normal, + = higher priority, - = lower priority */
dri-devel@lists.freedesktop.org