From: Christian König <christian.koenig@amd.com>
Nobody is interested in the index at which a chunk is stored; what's needed is a pointer to the chunk. Remove the unused chunk_id field as well.
Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/radeon/evergreen_cs.c |  6 ++--
 drivers/gpu/drm/radeon/r100.c         |  2 +-
 drivers/gpu/drm/radeon/r300.c         |  2 +-
 drivers/gpu/drm/radeon/r600_cs.c      | 14 ++++-----
 drivers/gpu/drm/radeon/radeon.h       | 11 +++----
 drivers/gpu/drm/radeon/radeon_cs.c    | 57 +++++++++++++++++------------------
 drivers/gpu/drm/radeon/radeon_trace.h |  2 +-
 drivers/gpu/drm/radeon/radeon_uvd.c   | 10 +++---
 drivers/gpu/drm/radeon/radeon_vce.c   |  4 +--
 9 files changed, 53 insertions(+), 55 deletions(-)
diff --git a/drivers/gpu/drm/radeon/evergreen_cs.c b/drivers/gpu/drm/radeon/evergreen_cs.c index 5c8b358..6c60e70 100644 --- a/drivers/gpu/drm/radeon/evergreen_cs.c +++ b/drivers/gpu/drm/radeon/evergreen_cs.c @@ -2661,7 +2661,7 @@ int evergreen_cs_parse(struct radeon_cs_parser *p) p->track = NULL; return r; } - } while (p->idx < p->chunks[p->chunk_ib_idx].length_dw); + } while (p->idx < p->chunk_ib->length_dw); #if 0 for (r = 0; r < p->ib.length_dw; r++) { printk(KERN_INFO "%05d 0x%08X\n", r, p->ib.ptr[r]); @@ -2684,7 +2684,7 @@ int evergreen_cs_parse(struct radeon_cs_parser *p) **/ int evergreen_dma_cs_parse(struct radeon_cs_parser *p) { - struct radeon_cs_chunk *ib_chunk = &p->chunks[p->chunk_ib_idx]; + struct radeon_cs_chunk *ib_chunk = p->chunk_ib; struct radeon_cs_reloc *src_reloc, *dst_reloc, *dst2_reloc; u32 header, cmd, count, sub_cmd; volatile u32 *ib = p->ib.ptr; @@ -3100,7 +3100,7 @@ int evergreen_dma_cs_parse(struct radeon_cs_parser *p) DRM_ERROR("Unknown packet type %d at %d !\n", cmd, idx); return -EINVAL; } - } while (p->idx < p->chunks[p->chunk_ib_idx].length_dw); + } while (p->idx < p->chunk_ib->length_dw); #if 0 for (r = 0; r < p->ib->length_dw; r++) { printk(KERN_INFO "%05d 0x%08X\n", r, p->ib.ptr[r]); diff --git a/drivers/gpu/drm/radeon/r100.c b/drivers/gpu/drm/radeon/r100.c index c6b486f..43d129a 100644 --- a/drivers/gpu/drm/radeon/r100.c +++ b/drivers/gpu/drm/radeon/r100.c @@ -2061,7 +2061,7 @@ int r100_cs_parse(struct radeon_cs_parser *p) } if (r) return r; - } while (p->idx < p->chunks[p->chunk_ib_idx].length_dw); + } while (p->idx < p->chunk_ib->length_dw); return 0; }
diff --git a/drivers/gpu/drm/radeon/r300.c b/drivers/gpu/drm/radeon/r300.c index 1bc4704..b0a5ade 100644 --- a/drivers/gpu/drm/radeon/r300.c +++ b/drivers/gpu/drm/radeon/r300.c @@ -1283,7 +1283,7 @@ int r300_cs_parse(struct radeon_cs_parser *p) if (r) { return r; } - } while (p->idx < p->chunks[p->chunk_ib_idx].length_dw); + } while (p->idx < p->chunk_ib->length_dw); return 0; }
diff --git a/drivers/gpu/drm/radeon/r600_cs.c b/drivers/gpu/drm/radeon/r600_cs.c index c47537a..654b38b 100644 --- a/drivers/gpu/drm/radeon/r600_cs.c +++ b/drivers/gpu/drm/radeon/r600_cs.c @@ -2316,7 +2316,7 @@ int r600_cs_parse(struct radeon_cs_parser *p) p->track = NULL; return r; } - } while (p->idx < p->chunks[p->chunk_ib_idx].length_dw); + } while (p->idx < p->chunk_ib->length_dw); #if 0 for (r = 0; r < p->ib.length_dw; r++) { printk(KERN_INFO "%05d 0x%08X\n", r, p->ib.ptr[r]); @@ -2351,7 +2351,7 @@ static void r600_cs_parser_fini(struct radeon_cs_parser *parser, int error)
static int r600_cs_parser_relocs_legacy(struct radeon_cs_parser *p) { - if (p->chunk_relocs_idx == -1) { + if (p->chunk_relocs == NULL) { return 0; } p->relocs = kzalloc(sizeof(struct radeon_cs_reloc), GFP_KERNEL); @@ -2398,7 +2398,7 @@ int r600_cs_legacy(struct drm_device *dev, void *data, struct drm_file *filp, /* Copy the packet into the IB, the parser will read from the * input memory (cached) and write to the IB (which can be * uncached). */ - ib_chunk = &parser.chunks[parser.chunk_ib_idx]; + ib_chunk = parser.chunk_ib; parser.ib.length_dw = ib_chunk->length_dw; *l = parser.ib.length_dw; if (copy_from_user(ib, ib_chunk->user_ptr, ib_chunk->length_dw * 4)) { @@ -2441,11 +2441,11 @@ int r600_dma_cs_next_reloc(struct radeon_cs_parser *p, unsigned idx;
*cs_reloc = NULL; - if (p->chunk_relocs_idx == -1) { + if (p->chunk_relocs == NULL) { DRM_ERROR("No relocation chunk !\n"); return -EINVAL; } - relocs_chunk = &p->chunks[p->chunk_relocs_idx]; + relocs_chunk = p->chunk_relocs; idx = p->dma_reloc_idx; if (idx >= p->nrelocs) { DRM_ERROR("Relocs at %d after relocations chunk end %d !\n", @@ -2472,7 +2472,7 @@ int r600_dma_cs_next_reloc(struct radeon_cs_parser *p, **/ int r600_dma_cs_parse(struct radeon_cs_parser *p) { - struct radeon_cs_chunk *ib_chunk = &p->chunks[p->chunk_ib_idx]; + struct radeon_cs_chunk *ib_chunk = p->chunk_ib; struct radeon_cs_reloc *src_reloc, *dst_reloc; u32 header, cmd, count, tiled; volatile u32 *ib = p->ib.ptr; @@ -2619,7 +2619,7 @@ int r600_dma_cs_parse(struct radeon_cs_parser *p) DRM_ERROR("Unknown packet type %d at %d !\n", cmd, idx); return -EINVAL; } - } while (p->idx < p->chunks[p->chunk_ib_idx].length_dw); + } while (p->idx < p->chunk_ib->length_dw); #if 0 for (r = 0; r < p->ib->length_dw; r++) { printk(KERN_INFO "%05d 0x%08X\n", r, p->ib.ptr[r]); diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h index ab71050..b3b4e96 100644 --- a/drivers/gpu/drm/radeon/radeon.h +++ b/drivers/gpu/drm/radeon/radeon.h @@ -1046,7 +1046,6 @@ struct radeon_cs_reloc { };
struct radeon_cs_chunk { - uint32_t chunk_id; uint32_t length_dw; uint32_t *kdata; void __user *user_ptr; @@ -1070,10 +1069,10 @@ struct radeon_cs_parser { struct list_head validated; unsigned dma_reloc_idx; /* indices of various chunks */ - int chunk_ib_idx; - int chunk_relocs_idx; - int chunk_flags_idx; - int chunk_const_ib_idx; + struct radeon_cs_chunk *chunk_ib; + struct radeon_cs_chunk *chunk_relocs; + struct radeon_cs_chunk *chunk_flags; + struct radeon_cs_chunk *chunk_const_ib; struct radeon_ib ib; struct radeon_ib const_ib; void *track; @@ -1087,7 +1086,7 @@ struct radeon_cs_parser {
static inline u32 radeon_get_ib_value(struct radeon_cs_parser *p, int idx) { - struct radeon_cs_chunk *ibc = &p->chunks[p->chunk_ib_idx]; + struct radeon_cs_chunk *ibc = p->chunk_ib;
if (ibc->kdata) return ibc->kdata[idx]; diff --git a/drivers/gpu/drm/radeon/radeon_cs.c b/drivers/gpu/drm/radeon/radeon_cs.c index 9e6714b..b8f20a5 100644 --- a/drivers/gpu/drm/radeon/radeon_cs.c +++ b/drivers/gpu/drm/radeon/radeon_cs.c @@ -81,10 +81,10 @@ static int radeon_cs_parser_relocs(struct radeon_cs_parser *p) bool duplicate, need_mmap_lock = false; int r;
- if (p->chunk_relocs_idx == -1) { + if (p->chunk_relocs == NULL) { return 0; } - chunk = &p->chunks[p->chunk_relocs_idx]; + chunk = p->chunk_relocs; p->dma_reloc_idx = 0; /* FIXME: we assume that each relocs use 4 dwords */ p->nrelocs = chunk->length_dw / 4; @@ -281,10 +281,10 @@ int radeon_cs_parser_init(struct radeon_cs_parser *p, void *data) p->idx = 0; p->ib.sa_bo = NULL; p->const_ib.sa_bo = NULL; - p->chunk_ib_idx = -1; - p->chunk_relocs_idx = -1; - p->chunk_flags_idx = -1; - p->chunk_const_ib_idx = -1; + p->chunk_ib = NULL; + p->chunk_relocs = NULL; + p->chunk_flags = NULL; + p->chunk_const_ib = NULL; p->chunks_array = kcalloc(cs->num_chunks, sizeof(uint64_t), GFP_KERNEL); if (p->chunks_array == NULL) { return -ENOMEM; @@ -311,24 +311,23 @@ int radeon_cs_parser_init(struct radeon_cs_parser *p, void *data) return -EFAULT; } p->chunks[i].length_dw = user_chunk.length_dw; - p->chunks[i].chunk_id = user_chunk.chunk_id; - if (p->chunks[i].chunk_id == RADEON_CHUNK_ID_RELOCS) { - p->chunk_relocs_idx = i; + if (user_chunk.chunk_id == RADEON_CHUNK_ID_RELOCS) { + p->chunk_relocs = &p->chunks[i]; } - if (p->chunks[i].chunk_id == RADEON_CHUNK_ID_IB) { - p->chunk_ib_idx = i; + if (user_chunk.chunk_id == RADEON_CHUNK_ID_IB) { + p->chunk_ib = &p->chunks[i]; /* zero length IB isn't useful */ if (p->chunks[i].length_dw == 0) return -EINVAL; } - if (p->chunks[i].chunk_id == RADEON_CHUNK_ID_CONST_IB) { - p->chunk_const_ib_idx = i; + if (user_chunk.chunk_id == RADEON_CHUNK_ID_CONST_IB) { + p->chunk_const_ib = &p->chunks[i]; /* zero length CONST IB isn't useful */ if (p->chunks[i].length_dw == 0) return -EINVAL; } - if (p->chunks[i].chunk_id == RADEON_CHUNK_ID_FLAGS) { - p->chunk_flags_idx = i; + if (user_chunk.chunk_id == RADEON_CHUNK_ID_FLAGS) { + p->chunk_flags = &p->chunks[i]; /* zero length flags aren't useful */ if (p->chunks[i].length_dw == 0) return -EINVAL; @@ -337,10 +336,10 @@ int radeon_cs_parser_init(struct radeon_cs_parser *p, void *data) size = p->chunks[i].length_dw; cdata = (void __user *)(unsigned long)user_chunk.chunk_data; p->chunks[i].user_ptr = cdata; - if (p->chunks[i].chunk_id == RADEON_CHUNK_ID_CONST_IB) + if (user_chunk.chunk_id == RADEON_CHUNK_ID_CONST_IB) continue;
- if (p->chunks[i].chunk_id == RADEON_CHUNK_ID_IB) { + if (user_chunk.chunk_id == RADEON_CHUNK_ID_IB) { if (!p->rdev || !(p->rdev->flags & RADEON_IS_AGP)) continue; } @@ -353,7 +352,7 @@ int radeon_cs_parser_init(struct radeon_cs_parser *p, void *data) if (copy_from_user(p->chunks[i].kdata, cdata, size)) { return -EFAULT; } - if (p->chunks[i].chunk_id == RADEON_CHUNK_ID_FLAGS) { + if (user_chunk.chunk_id == RADEON_CHUNK_ID_FLAGS) { p->cs_flags = p->chunks[i].kdata[0]; if (p->chunks[i].length_dw > 1) ring = p->chunks[i].kdata[1]; @@ -457,7 +456,7 @@ static int radeon_cs_ib_chunk(struct radeon_device *rdev, { int r;
- if (parser->chunk_ib_idx == -1) + if (parser->chunk_ib == NULL) return 0;
if (parser->cs_flags & RADEON_CS_USE_VM) @@ -541,7 +540,7 @@ static int radeon_cs_ib_vm_chunk(struct radeon_device *rdev, struct radeon_vm *vm = &fpriv->vm; int r;
- if (parser->chunk_ib_idx == -1) + if (parser->chunk_ib == NULL) return 0; if ((parser->cs_flags & RADEON_CS_USE_VM) == 0) return 0; @@ -569,7 +568,7 @@ static int radeon_cs_ib_vm_chunk(struct radeon_device *rdev, radeon_cs_sync_rings(parser);
if ((rdev->family >= CHIP_TAHITI) && - (parser->chunk_const_ib_idx != -1)) { + (parser->chunk_const_ib != NULL)) { r = radeon_ib_schedule(rdev, &parser->ib, &parser->const_ib, true); } else { r = radeon_ib_schedule(rdev, &parser->ib, NULL, true); @@ -596,7 +595,7 @@ static int radeon_cs_ib_fill(struct radeon_device *rdev, struct radeon_cs_parser struct radeon_vm *vm = NULL; int r;
- if (parser->chunk_ib_idx == -1) + if (parser->chunk_ib == NULL) return 0;
if (parser->cs_flags & RADEON_CS_USE_VM) { @@ -604,8 +603,8 @@ static int radeon_cs_ib_fill(struct radeon_device *rdev, struct radeon_cs_parser vm = &fpriv->vm;
if ((rdev->family >= CHIP_TAHITI) && - (parser->chunk_const_ib_idx != -1)) { - ib_chunk = &parser->chunks[parser->chunk_const_ib_idx]; + (parser->chunk_const_ib != NULL)) { + ib_chunk = parser->chunk_const_ib; if (ib_chunk->length_dw > RADEON_IB_VM_MAX_SIZE) { DRM_ERROR("cs IB CONST too big: %d\n", ib_chunk->length_dw); return -EINVAL; @@ -624,13 +623,13 @@ static int radeon_cs_ib_fill(struct radeon_device *rdev, struct radeon_cs_parser return -EFAULT; }
- ib_chunk = &parser->chunks[parser->chunk_ib_idx]; + ib_chunk = parser->chunk_ib; if (ib_chunk->length_dw > RADEON_IB_VM_MAX_SIZE) { DRM_ERROR("cs IB too big: %d\n", ib_chunk->length_dw); return -EINVAL; } } - ib_chunk = &parser->chunks[parser->chunk_ib_idx]; + ib_chunk = parser->chunk_ib;
r = radeon_ib_get(rdev, parser->ring, &parser->ib, vm, ib_chunk->length_dw * 4); @@ -722,7 +721,7 @@ int radeon_cs_packet_parse(struct radeon_cs_parser *p, struct radeon_cs_packet *pkt, unsigned idx) { - struct radeon_cs_chunk *ib_chunk = &p->chunks[p->chunk_ib_idx]; + struct radeon_cs_chunk *ib_chunk = p->chunk_ib; struct radeon_device *rdev = p->rdev; uint32_t header;
@@ -824,12 +823,12 @@ int radeon_cs_packet_next_reloc(struct radeon_cs_parser *p, unsigned idx; int r;
- if (p->chunk_relocs_idx == -1) { + if (p->chunk_relocs == NULL) { DRM_ERROR("No relocation chunk !\n"); return -EINVAL; } *cs_reloc = NULL; - relocs_chunk = &p->chunks[p->chunk_relocs_idx]; + relocs_chunk = p->chunk_relocs; r = radeon_cs_packet_parse(p, &p3reloc, p->idx); if (r) return r; diff --git a/drivers/gpu/drm/radeon/radeon_trace.h b/drivers/gpu/drm/radeon/radeon_trace.h index 9db74a9..ce075cb 100644 --- a/drivers/gpu/drm/radeon/radeon_trace.h +++ b/drivers/gpu/drm/radeon/radeon_trace.h @@ -38,7 +38,7 @@ TRACE_EVENT(radeon_cs,
TP_fast_assign( __entry->ring = p->ring; - __entry->dw = p->chunks[p->chunk_ib_idx].length_dw; + __entry->dw = p->chunk_ib->length_dw; __entry->fences = radeon_fence_count_emitted( p->rdev, p->ring); ), diff --git a/drivers/gpu/drm/radeon/radeon_uvd.c b/drivers/gpu/drm/radeon/radeon_uvd.c index ba4f389..eb1285d 100644 --- a/drivers/gpu/drm/radeon/radeon_uvd.c +++ b/drivers/gpu/drm/radeon/radeon_uvd.c @@ -492,7 +492,7 @@ static int radeon_uvd_cs_reloc(struct radeon_cs_parser *p, uint64_t start, end; int r;
- relocs_chunk = &p->chunks[p->chunk_relocs_idx]; + relocs_chunk = p->chunk_relocs; offset = radeon_get_ib_value(p, data0); idx = radeon_get_ib_value(p, data1); if (idx >= relocs_chunk->length_dw) { @@ -609,13 +609,13 @@ int radeon_uvd_cs_parse(struct radeon_cs_parser *p) [0x00000003] = 2048, };
- if (p->chunks[p->chunk_ib_idx].length_dw % 16) { + if (p->chunk_ib->length_dw % 16) { DRM_ERROR("UVD IB length (%d) not 16 dwords aligned!\n", - p->chunks[p->chunk_ib_idx].length_dw); + p->chunk_ib->length_dw); return -EINVAL; }
- if (p->chunk_relocs_idx == -1) { + if (p->chunk_relocs == NULL) { DRM_ERROR("No relocation chunk !\n"); return -EINVAL; } @@ -639,7 +639,7 @@ int radeon_uvd_cs_parse(struct radeon_cs_parser *p) DRM_ERROR("Unknown packet type %d !\n", pkt.type); return -EINVAL; } - } while (p->idx < p->chunks[p->chunk_ib_idx].length_dw); + } while (p->idx < p->chunk_ib->length_dw);
if (!has_msg_cmd) { DRM_ERROR("UVD-IBs need a msg command!\n"); diff --git a/drivers/gpu/drm/radeon/radeon_vce.c b/drivers/gpu/drm/radeon/radeon_vce.c index c7190aa..4c57938 100644 --- a/drivers/gpu/drm/radeon/radeon_vce.c +++ b/drivers/gpu/drm/radeon/radeon_vce.c @@ -456,7 +456,7 @@ int radeon_vce_cs_reloc(struct radeon_cs_parser *p, int lo, int hi, uint64_t start, end, offset; unsigned idx;
- relocs_chunk = &p->chunks[p->chunk_relocs_idx]; + relocs_chunk = p->chunk_relocs; offset = radeon_get_ib_value(p, lo); idx = radeon_get_ib_value(p, hi);
@@ -533,7 +533,7 @@ int radeon_vce_cs_parse(struct radeon_cs_parser *p) uint32_t *size = &tmp; int i, r;
- while (p->idx < p->chunks[p->chunk_ib_idx].length_dw) { + while (p->idx < p->chunk_ib->length_dw) { uint32_t len = radeon_get_ib_value(p, p->idx); uint32_t cmd = radeon_get_ib_value(p, p->idx + 1);
From: Christian König <christian.koenig@amd.com>
This patch adds a new 64bit ID returned as the result of each command submission.
Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/radeon/Makefile     |   2 +-
 drivers/gpu/drm/radeon/radeon.h     |  13 +-
 drivers/gpu/drm/radeon/radeon_cs.c  |  13 ++
 drivers/gpu/drm/radeon/radeon_kms.c |  41 +++----
 drivers/gpu/drm/radeon/radeon_seq.c | 229 ++++++++++++++++++++++++++++++++++++
 include/uapi/drm/radeon_drm.h       |   1 +
 6 files changed, 277 insertions(+), 22 deletions(-)
 create mode 100644 drivers/gpu/drm/radeon/radeon_seq.c
diff --git a/drivers/gpu/drm/radeon/Makefile b/drivers/gpu/drm/radeon/Makefile index 8cc9f68..21ee928 100644 --- a/drivers/gpu/drm/radeon/Makefile +++ b/drivers/gpu/drm/radeon/Makefile @@ -81,7 +81,7 @@ radeon-y += radeon_device.o radeon_asic.o radeon_kms.o \ rv770_smc.o cypress_dpm.o btc_dpm.o sumo_dpm.o sumo_smc.o trinity_dpm.o \ trinity_smc.o ni_dpm.o si_smc.o si_dpm.o kv_smc.o kv_dpm.o ci_smc.o \ ci_dpm.o dce6_afmt.o radeon_vm.o radeon_ucode.o radeon_ib.o radeon_mn.o \ - radeon_sync.o + radeon_sync.o radeon_seq.o
# add async DMA block radeon-y += \ diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h index b3b4e96..dbfd346 100644 --- a/drivers/gpu/drm/radeon/radeon.h +++ b/drivers/gpu/drm/radeon/radeon.h @@ -423,6 +423,15 @@ static inline bool radeon_fence_is_earlier(struct radeon_fence *a, }
/* + * Userspace command submission identifier generation + */ +struct radeon_seq; + +uint64_t radeon_seq_push(struct radeon_seq **seq, struct radeon_fence *fence); +struct radeon_fence *radeon_seq_query(struct radeon_seq *seq, uint64_t id); +void radeon_seq_destroy(struct radeon_seq **seq); + +/* * Tiling registers */ struct radeon_surface_reg { @@ -946,7 +955,9 @@ struct radeon_vm_manager { * file private structure */ struct radeon_fpriv { - struct radeon_vm vm; + struct radeon_vm vm; + struct mutex seq_lock; + struct radeon_seq *seq; };
/* diff --git a/drivers/gpu/drm/radeon/radeon_cs.c b/drivers/gpu/drm/radeon/radeon_cs.c index b8f20a5..0c0f0b3 100644 --- a/drivers/gpu/drm/radeon/radeon_cs.c +++ b/drivers/gpu/drm/radeon/radeon_cs.c @@ -413,6 +413,19 @@ static void radeon_cs_parser_fini(struct radeon_cs_parser *parser, int error, bo unsigned i;
if (!error) { + if (parser->chunk_flags && + parser->chunk_flags->length_dw > 4) { + struct radeon_fpriv *fpriv = parser->filp->driver_priv; + uint32_t __user *to = parser->chunk_flags->user_ptr; + uint64_t id; + + mutex_lock(&fpriv->seq_lock); + id = radeon_seq_push(&fpriv->seq, parser->ib.fence); + mutex_unlock(&fpriv->seq_lock); + + copy_to_user(&to[3], &id, sizeof(uint64_t)); + } + /* Sort the buffer list from the smallest to largest buffer, * which affects the order of buffers in the LRU list. * This assures that the smallest buffers are added first diff --git a/drivers/gpu/drm/radeon/radeon_kms.c b/drivers/gpu/drm/radeon/radeon_kms.c index 85ee6f7..38880af 100644 --- a/drivers/gpu/drm/radeon/radeon_kms.c +++ b/drivers/gpu/drm/radeon/radeon_kms.c @@ -578,39 +578,34 @@ void radeon_driver_lastclose_kms(struct drm_device *dev) */ int radeon_driver_open_kms(struct drm_device *dev, struct drm_file *file_priv) { + struct radeon_fpriv *fpriv = kzalloc(sizeof(*fpriv), GFP_KERNEL); struct radeon_device *rdev = dev->dev_private; int r;
- file_priv->driver_priv = NULL; + if (unlikely(!fpriv)) + return -ENOMEM; + + file_priv->driver_priv = fpriv;
r = pm_runtime_get_sync(dev->dev); if (r < 0) - return r; + goto error;
/* new gpu have virtual address space support */ if (rdev->family >= CHIP_CAYMAN) { - struct radeon_fpriv *fpriv; struct radeon_vm *vm; int r;
- fpriv = kzalloc(sizeof(*fpriv), GFP_KERNEL); - if (unlikely(!fpriv)) { - return -ENOMEM; - } - vm = &fpriv->vm; r = radeon_vm_init(rdev, vm); - if (r) { - kfree(fpriv); - return r; - } + if (r) + goto error;
if (rdev->accel_working) { r = radeon_bo_reserve(rdev->ring_tmp_bo.bo, false); if (r) { radeon_vm_fini(rdev, vm); - kfree(fpriv); - return r; + goto error; }
/* map the ib pool buffer read only into @@ -623,16 +618,20 @@ int radeon_driver_open_kms(struct drm_device *dev, struct drm_file *file_priv) RADEON_VM_PAGE_SNOOPED); if (r) { radeon_vm_fini(rdev, vm); - kfree(fpriv); - return r; + goto error; } } - file_priv->driver_priv = fpriv; }
+ mutex_init(&fpriv->seq_lock); + pm_runtime_mark_last_busy(dev->dev); pm_runtime_put_autosuspend(dev->dev); return 0; + +error: + kfree(fpriv); + return r; }
/** @@ -646,11 +645,13 @@ int radeon_driver_open_kms(struct drm_device *dev, struct drm_file *file_priv) void radeon_driver_postclose_kms(struct drm_device *dev, struct drm_file *file_priv) { + struct radeon_fpriv *fpriv = file_priv->driver_priv; struct radeon_device *rdev = dev->dev_private;
+ radeon_seq_destroy(&fpriv->seq); + /* new gpu have virtual address space support */ if (rdev->family >= CHIP_CAYMAN && file_priv->driver_priv) { - struct radeon_fpriv *fpriv = file_priv->driver_priv; struct radeon_vm *vm = &fpriv->vm; int r;
@@ -664,9 +665,9 @@ void radeon_driver_postclose_kms(struct drm_device *dev, }
radeon_vm_fini(rdev, vm); - kfree(fpriv); - file_priv->driver_priv = NULL; } + kfree(fpriv); + file_priv->driver_priv = NULL; }
/** diff --git a/drivers/gpu/drm/radeon/radeon_seq.c b/drivers/gpu/drm/radeon/radeon_seq.c new file mode 100644 index 0000000..d8857f1 --- /dev/null +++ b/drivers/gpu/drm/radeon/radeon_seq.c @@ -0,0 +1,229 @@ +/* + * Copyright 2014 Advanced Micro Devices, Inc. + * All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR + * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE + * USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + */ +/* + * Authors: + * Christian König christian.koenig@amd.com + */ + +#include <drm/drmP.h> +#include "radeon.h" + +/* + * ID sequences + * This code generates a 64bit identifier for a command submission. + * It works by adding the fence of the command submission to a automatically + * resizing ring buffer. + */ + +struct radeon_seq { + uint64_t start; + uint64_t end; + uint64_t mask; + struct radeon_seq *replacement; +}; + +/** + * radeon_seq_create - create a new sequence object + * + * @start: start value for this sequence + * @size: size of the ring buffer, must be power of two + * + * Allocate and initialize a new ring buffer and header. + * Returns NULL if allocation fails, new object otherwise. + */ +static struct radeon_seq *radeon_seq_create(uint64_t start, unsigned size) +{ + unsigned bytes = sizeof(struct radeon_seq) + + size * sizeof(struct radeon_fence *); + + struct radeon_seq *seq; + + seq = kmalloc(bytes, GFP_KERNEL); + if (!seq) + return NULL; + + seq->start = start; + seq->end = start; + seq->mask = size - 1; + seq->replacement = NULL; + + return seq; +} + +/** + * radeon_seq_ring - get pointer to ring buffer + * + * @seq: sequence object + * + * Calculate the address of the ring buffer. + */ +static struct radeon_fence **radeon_seq_ring(struct radeon_seq *seq) +{ + return (struct radeon_fence **)&seq[1]; +} + +/** + * radeon_seq_try_free - try to free fences from the ring buffer + * + * @seq: sequence object + * + * Try to free fences from the start of the ring buffer. + */ +static void radeon_seq_try_free(struct radeon_seq *seq) +{ + struct radeon_fence **ring = radeon_seq_ring(seq); + + while (seq->start != seq->end) { + unsigned idx = seq->start & seq->mask; + struct radeon_fence *fence = ring[idx]; + + if (!radeon_fence_signaled(fence)) + break; + + radeon_fence_unref(&fence); + ++seq->start; + } +} + +/** + * radeon_seq_add - add new fence to the end of the ring buffer + * + * @seq: sequence object + * @f: the fence object + * + * Add the fence and return the generated ID. 
+ */ +static uint64_t radeon_seq_add(struct radeon_seq *seq, struct radeon_fence *f) +{ + struct radeon_fence **ring = radeon_seq_ring(seq); + + ring[seq->end & seq->mask] = radeon_fence_ref(f); + return seq->end++; +} + +/** + * radeon_seq_push - check for room and add the fence + * + * @seq: sequence object + * @fence: the fence object + * + * Check for room on the ring buffer, if there isn't enough + * reallocate the sequence object and add the fence. + * Returns the generated ID. + */ +uint64_t radeon_seq_push(struct radeon_seq **seq, struct radeon_fence *fence) +{ + unsigned size_for_new_seq = 4; + uint64_t start_for_new_seq = 1; + + if (*seq) { + /* try to release old replacements */ + while ((*seq)->replacement) { + radeon_seq_try_free(*seq); + if ((*seq)->start == (*seq)->end) { + struct radeon_seq *repl = (*seq)->replacement; + + kfree(*seq); + *seq = repl; + } else { + /* move on to the current container */ + seq = &(*seq)->replacement; + } + } + + /* check if we have enough room for one more fence */ + radeon_seq_try_free(*seq); + if (((*seq)->end - (*seq)->start) <= (*seq)->mask) + return radeon_seq_add(*seq, fence); + + /* not enough room, let's allocate a replacement */ + size_for_new_seq = ((*seq)->mask + 1) * 2; + start_for_new_seq = (*seq)->end + 1; + seq = &(*seq)->replacement; + } + + *seq = radeon_seq_create(start_for_new_seq, size_for_new_seq); + if (!*seq) { + /* not enough memory for a new sequence object, but failing + here isn't a good idea either cause the commands are already + submitted to the hardware. So just block on the fence. */ + int r = radeon_fence_wait(fence, false); + if (r) + DRM_ERROR("Error waiting for fence (%d)\n", r); + return 0; + } + return radeon_seq_add(*seq, fence); +} + +/** + * radeon_seq_query - lockup fence by it's ID + * + * @seq: sequence object + * @id: the generated ID + * + * Lockup the associated fence by it's ID. + * Returns fence object or NULL if it couldn't be found. + */ +struct radeon_fence *radeon_seq_query(struct radeon_seq *seq, uint64_t id) +{ + struct radeon_fence **ring; + + while (seq && id > seq->end) + seq = seq->replacement; + + if (!seq || id < seq->start) + return NULL; + + ring = radeon_seq_ring(seq); + return ring[id & seq->mask]; +} + +/** + * radeon_seq_destroy - destroy the sequence object + * + * @seq_ptr: pointer to sequence object + * + * Destroy the sequence objects and release all fence references taken. + */ +void radeon_seq_destroy(struct radeon_seq **seq_ptr) +{ + struct radeon_seq *seq = *seq_ptr; + while (seq) { + struct radeon_seq *repl = seq->replacement; + unsigned start = seq->start & seq->mask; + unsigned end = seq->end & seq->mask; + struct radeon_fence **ring; + unsigned i; + + ring = radeon_seq_ring(seq); + for (i = start; i < end; ++i) + radeon_fence_unref(&ring[i]); + + kfree(seq); + seq = repl; + } + *seq_ptr = NULL; +} diff --git a/include/uapi/drm/radeon_drm.h b/include/uapi/drm/radeon_drm.h index 50d0fb4..6b2b2e7 100644 --- a/include/uapi/drm/radeon_drm.h +++ b/include/uapi/drm/radeon_drm.h @@ -959,6 +959,7 @@ struct drm_radeon_gem_va { #define RADEON_CS_RING_VCE 4 /* The third dword of RADEON_CHUNK_ID_FLAGS is a sint32 that sets the priority */ /* 0 = normal, + = higher priority, - = lower priority */ +/* The fourth and fives dword are a 64bit fence ID generated for this CS */
struct drm_radeon_cs_chunk { uint32_t chunk_id;
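For illustration, here is a minimal userspace sketch of how a client could consume the new ID (not part of the patch; it assumes libdrm's drmCommandWriteRead wrapper, the existing radeon CS chunk structures, and the flags-chunk layout from the uapi comment above — the kernel only writes the ID back when the flags chunk has more than 4 dwords):

#include <stdint.h>
#include <string.h>
#include <xf86drm.h>       /* drmCommandWriteRead() */
#include <radeon_drm.h>    /* struct drm_radeon_cs*, RADEON_* (include path depends on the libdrm install) */

static int submit_and_get_id(int fd, struct drm_radeon_cs_chunk *ib_chunk,
                             uint64_t *id)
{
	/* dw0 = cs flags, dw1 = ring, dw2 = priority, dw3..4 = returned 64bit ID */
	uint32_t flags[5] = { 0, RADEON_CS_RING_GFX, 0, 0, 0 };
	struct drm_radeon_cs_chunk flags_chunk = {
		.chunk_id   = RADEON_CHUNK_ID_FLAGS,
		.length_dw  = 5,                  /* > 4 so the kernel writes the ID back */
		.chunk_data = (uintptr_t)flags,
	};
	uint64_t chunk_ptrs[] = { (uintptr_t)ib_chunk, (uintptr_t)&flags_chunk };
	struct drm_radeon_cs cs = {
		.num_chunks = 2,
		.chunks     = (uintptr_t)chunk_ptrs,
	};
	int r = drmCommandWriteRead(fd, DRM_RADEON_CS, &cs, sizeof(cs));

	if (r)
		return r;

	/* the kernel copied the generated ID into dwords 3 and 4 of the flags chunk */
	memcpy(id, &flags[3], sizeof(*id));
	return 0;
}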
On Thu, Sep 18, 2014 at 05:30:12PM +0200, Christian König wrote:
> From: Christian König <christian.koenig@amd.com>
>
> This patch adds a new 64bit ID returned as the result of each command submission.
NAK for design reasons.
First and foremost, we DO NOT WANT TO ALLOCATE ANYTHING for a submission id. I know that hooking it up to the fence makes life easy, but this is not how it should be done.

What you want to expose is the per-ring 32bit seq value, and report to user space that value together with the ring id. To handle wrap around you add a unique 32bit wrap counter which is incremented every 1 << 30 seq values or some other big number (but it must be smaller than 1 << 31). Then you use arithmetic to determine whether a cs seq number is done or not (ie the wrap counter diff should not be bigger than 2 or 3, and the seq number should obviously be after, again using an arithmetic diff like the jiffies code).

All this being hidden in the kernel, all that user space has to know is:
- a 32bit seq value per ring
- a 32bit unique ring id
- a 32bit wrap counter per ring

With such a scheme you never allocate anything or worry about any fence. Way simpler and less code.
Cheers, Jérôme
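A rough sketch of the wrap-around arithmetic described above (illustrative only: the struct layout, names and the wrap threshold are assumptions, not the attached patch):

#include <stdbool.h>
#include <stdint.h>

/* what the kernel would report for a submission */
struct radeon_cs_id {
	uint32_t ring;  /* unique ring id */
	uint32_t wrap;  /* wrap counter at submission time */
	uint32_t seq;   /* 32bit seq value at submission time */
};

/* what the kernel would report as the ring's current state */
struct radeon_ring_state {
	uint32_t wrap;  /* current wrap counter of the ring */
	uint32_t seq;   /* last signaled seq value of the ring */
};

static bool radeon_cs_id_signaled(const struct radeon_cs_id *id,
                                  const struct radeon_ring_state *cur)
{
	/* jiffies-style signed difference handles 32bit wrap around */
	int32_t wrap_diff = (int32_t)(cur->wrap - id->wrap);

	if (wrap_diff < 0)
		return false;   /* id claims to be from the future */
	if (wrap_diff >= 2)
		return true;    /* at least one full wrap period old, long signaled */

	/* same or adjacent wrap period: a plain signed seq comparison is safe
	 * as long as the wrap counter advances at least every 1 << 30 seqs */
	return (int32_t)(cur->seq - id->seq) >= 0;
}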
On Thu, Sep 18, 2014 at 08:42:16PM -0400, Jerome Glisse wrote:
> On Thu, Sep 18, 2014 at 05:30:12PM +0200, Christian König wrote:
> > From: Christian König <christian.koenig@amd.com>
> >
> > This patch adds a new 64bit ID returned as the result of each command submission.
>
> NAK for design reasons.
>
> First and foremost, we DO NOT WANT TO ALLOCATE ANYTHING for a submission id. I know that hooking it up to the fence makes life easy, but this is not how it should be done.
>
> What you want to expose is the per-ring 32bit seq value, and report to user space that value together with the ring id. To handle wrap around you add a unique 32bit wrap counter which is incremented every 1 << 30 seq values or some other big number (but it must be smaller than 1 << 31). Then you use arithmetic to determine whether a cs seq number is done or not (ie the wrap counter diff should not be bigger than 2 or 3, and the seq number should obviously be after, again using an arithmetic diff like the jiffies code).
>
> All this being hidden in the kernel, all that user space has to know is:
> - a 32bit seq value per ring
> - a 32bit unique ring id
> - a 32bit wrap counter per ring
>
> With such a scheme you never allocate anything or worry about any fence. Way simpler and less code.
>
> Cheers, Jérôme
Because code is clearer than words, attached is a patch of what I am thinking about. The arithmetic should be right, as well as the coherency and proper memory barriers. Though it is completely untested and will require a couple of fixes for properly checking userspace arguments and other aspects (see FIXME in the patch).

There is no allocation, it is a pretty small patch, and it provides a fast lightweight solution. While the new ioctl arguments and return value can be improved (for instance by warning userspace when it is querying very old fences), I think the overall design is sound.
Let me know.
Cheers, Jérôme
On Thu, Sep 18, 2014 at 11:11:57PM -0400, Jerome Glisse wrote:
> On Thu, Sep 18, 2014 at 08:42:16PM -0400, Jerome Glisse wrote:
> > On Thu, Sep 18, 2014 at 05:30:12PM +0200, Christian König wrote:
> > > From: Christian König <christian.koenig@amd.com>
> > >
> > > This patch adds a new 64bit ID returned as the result of each command submission.
> >
> > NAK for design reasons.
> >
> > First and foremost, we DO NOT WANT TO ALLOCATE ANYTHING for a submission id. I know that hooking it up to the fence makes life easy, but this is not how it should be done.
> >
> > What you want to expose is the per-ring 32bit seq value, and report to user space that value together with the ring id. To handle wrap around you add a unique 32bit wrap counter which is incremented every 1 << 30 seq values or some other big number (but it must be smaller than 1 << 31). Then you use arithmetic to determine whether a cs seq number is done or not (ie the wrap counter diff should not be bigger than 2 or 3, and the seq number should obviously be after, again using an arithmetic diff like the jiffies code).
> >
> > All this being hidden in the kernel, all that user space has to know is:
> > - a 32bit seq value per ring
> > - a 32bit unique ring id
> > - a 32bit wrap counter per ring
> >
> > With such a scheme you never allocate anything or worry about any fence. Way simpler and less code.
> >
> > Cheers, Jérôme
>
> Because code is clearer than words, attached is a patch of what I am thinking about. The arithmetic should be right, as well as the coherency and proper memory barriers. Though it is completely untested and will require a couple of fixes for properly checking userspace arguments and other aspects (see FIXME in the patch).
>
> There is no allocation, it is a pretty small patch, and it provides a fast lightweight solution. While the new ioctl arguments and return value can be improved (for instance by warning userspace when it is querying very old fences), I think the overall design is sound.
>
> Let me know.
Ok, actually after a beer (given that the Planet Express manual for Bender, which also applies to me, clearly stipulates to expect malfunction if not properly inebriated) it became clear that by reversing the problem I could make it a lot simpler. So attached is an even simpler patch that gracefully handles user space querying for very old fences (ie ones whose seq could have wrapped around).
Code explains it all.
Cheers, Jérôme
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sub license, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
- THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
- DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- USE OR OTHER DEALINGS IN THE SOFTWARE.
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial portions
- of the Software.
- */
+/*
- Authors:
- Christian König christian.koenig@amd.com
- */
+#include <drm/drmP.h> +#include "radeon.h"
+/*
- ID sequences
- This code generates a 64bit identifier for a command submission.
- It works by adding the fence of the command submission to an automatically
- resizing ring buffer.
- */
+struct radeon_seq {
- uint64_t start;
- uint64_t end;
- uint64_t mask;
- struct radeon_seq *replacement;
+};
+/**
- radeon_seq_create - create a new sequence object
- @start: start value for this sequence
- @size: size of the ring buffer, must be power of two
- Allocate and initialize a new ring buffer and header.
- Returns NULL if allocation fails, new object otherwise.
- */
+static struct radeon_seq *radeon_seq_create(uint64_t start, unsigned size) +{
- unsigned bytes = sizeof(struct radeon_seq) +
size * sizeof(struct radeon_fence *);
- struct radeon_seq *seq;
- seq = kmalloc(bytes, GFP_KERNEL);
- if (!seq)
return NULL;
- seq->start = start;
- seq->end = start;
- seq->mask = size - 1;
- seq->replacement = NULL;
- return seq;
+}
+/**
- radeon_seq_ring - get pointer to ring buffer
- @seq: sequence object
- Calculate the address of the ring buffer.
- */
+static struct radeon_fence **radeon_seq_ring(struct radeon_seq *seq) +{
- return (struct radeon_fence **)&seq[1];
+}
+/**
- radeon_seq_try_free - try to free fences from the ring buffer
- @seq: sequence object
- Try to free fences from the start of the ring buffer.
- */
+static void radeon_seq_try_free(struct radeon_seq *seq) +{
- struct radeon_fence **ring = radeon_seq_ring(seq);
- while (seq->start != seq->end) {
unsigned idx = seq->start & seq->mask;
struct radeon_fence *fence = ring[idx];
if (!radeon_fence_signaled(fence))
break;
radeon_fence_unref(&fence);
++seq->start;
- }
+}
+/**
- radeon_seq_add - add new fence to the end of the ring buffer
- @seq: sequence object
- @f: the fence object
- Add the fence and return the generated ID.
- */
+static uint64_t radeon_seq_add(struct radeon_seq *seq, struct radeon_fence *f) +{
- struct radeon_fence **ring = radeon_seq_ring(seq);
- ring[seq->end & seq->mask] = radeon_fence_ref(f);
- return seq->end++;
+}
+/**
- radeon_seq_push - check for room and add the fence
- @seq: sequence object
- @fence: the fence object
- Check for room on the ring buffer, if there isn't enough
- reallocate the sequence object and add the fence.
- Returns the generated ID.
- */
+uint64_t radeon_seq_push(struct radeon_seq **seq, struct radeon_fence *fence) +{
- unsigned size_for_new_seq = 4;
- uint64_t start_for_new_seq = 1;
- if (*seq) {
/* try to release old replacements */
while ((*seq)->replacement) {
radeon_seq_try_free(*seq);
if ((*seq)->start == (*seq)->end) {
struct radeon_seq *repl = (*seq)->replacement;
kfree(*seq);
*seq = repl;
} else {
/* move on to the current container */
seq = &(*seq)->replacement;
}
}
/* check if we have enough room for one more fence */
radeon_seq_try_free(*seq);
if (((*seq)->end - (*seq)->start) <= (*seq)->mask)
return radeon_seq_add(*seq, fence);
/* not enough room, let's allocate a replacement */
size_for_new_seq = ((*seq)->mask + 1) * 2;
start_for_new_seq = (*seq)->end + 1;
seq = &(*seq)->replacement;
- }
- *seq = radeon_seq_create(start_for_new_seq, size_for_new_seq);
- if (!*seq) {
/* not enough memory for a new sequence object, but failing
here isn't a good idea either cause the commands are already
submitted to the hardware. So just block on the fence. */
int r = radeon_fence_wait(fence, false);
if (r)
DRM_ERROR("Error waiting for fence (%d)\n", r);
return 0;
- }
- return radeon_seq_add(*seq, fence);
+}
+/**
- radeon_seq_query - look up fence by its ID
- @seq: sequence object
- @id: the generated ID
- Look up the associated fence by its ID.
- Returns fence object or NULL if it couldn't be found.
- */
+struct radeon_fence *radeon_seq_query(struct radeon_seq *seq, uint64_t id) +{
- struct radeon_fence **ring;
- while (seq && id > seq->end)
seq = seq->replacement;
- if (!seq || id < seq->start)
return NULL;
- ring = radeon_seq_ring(seq);
- return ring[id & seq->mask];
+}
+/**
- radeon_seq_destroy - destroy the sequence object
- @seq_ptr: pointer to sequence object
- Destroy the sequence objects and release all fence references taken.
- */
+void radeon_seq_destroy(struct radeon_seq **seq_ptr) +{
- struct radeon_seq *seq = *seq_ptr;
- while (seq) {
struct radeon_seq *repl = seq->replacement;
unsigned start = seq->start & seq->mask;
unsigned end = seq->end & seq->mask;
struct radeon_fence **ring;
unsigned i;
ring = radeon_seq_ring(seq);
for (i = start; i < end; ++i)
radeon_fence_unref(&ring[i]);
kfree(seq);
seq = repl;
- }
- *seq_ptr = NULL;
+} diff --git a/include/uapi/drm/radeon_drm.h b/include/uapi/drm/radeon_drm.h index 50d0fb4..6b2b2e7 100644 --- a/include/uapi/drm/radeon_drm.h +++ b/include/uapi/drm/radeon_drm.h @@ -959,6 +959,7 @@ struct drm_radeon_gem_va { #define RADEON_CS_RING_VCE 4 /* The third dword of RADEON_CHUNK_ID_FLAGS is a sint32 that sets the priority */ /* 0 = normal, + = higher priority, - = lower priority */ +/* The fourth and fifth dwords are a 64bit fence ID generated for this CS */
struct drm_radeon_cs_chunk { uint32_t chunk_id; -- 1.9.1
dri-devel mailing list dri-devel@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/dri-devel
From b95fe503dad89875da9db2d11d05f93eca41232d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= jglisse@redhat.com Date: Thu, 18 Sep 2014 22:51:21 -0400 Subject: [PATCH] drm/radeon: cs sequence id and cs completion query. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit
This reports back to userspace the ring id and sequence number that userspace can use to query for the completion of the cs on the hardware. This also adds a new ioctl to perform such a query.
There is an extra value supplied to userspace, the wrap counter, which is used to detect wrap-around and to gracefully handle the case where userspace might be querying about a very, very, very old cs executed a long time ago in a far, far away universe.
This patch aims to introduce the necessary groundwork for user space explicit synchronization. By allowing userspace to query for cs completion on the hardware, user space can perform operations and synchronization on buffers by itself, without having the cs ioctl implicitly wait for older cs completion before scheduling a new cs. That part is however left to a follow-up patch.
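As a sketch of how the userspace side of that follow-up could look, the snippet below assumes a radeon_drm.h that already contains the additions from this RFC (struct drm_radeon_cs_done and DRM_IOCTL_RADEON_CS_DONE); it is illustrative only and not a definitive interface.

#include <stdint.h>
#include <sys/ioctl.h>
/* Assumes the uAPI header patched by this RFC. */
#include <drm/radeon_drm.h>

/* Returns >0 if the cs identified by (seq, ring, wrap) has completed,
 * 0 if it is still pending, and a negative value on ioctl error. The
 * three values are the ones the patched CS ioctl writes back into the
 * flags chunk. */
static int radeon_cs_is_done(int drm_fd, int32_t seq, int32_t ring, int32_t wrap)
{
	struct drm_radeon_cs_done args = {
		.seq  = seq,
		.ring = ring,
		.wrap = wrap,
	};

	return ioctl(drm_fd, DRM_IOCTL_RADEON_CS_DONE, &args);
}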
Signed-off-by: Jérôme Glisse jglisse@redhat.com
drivers/gpu/drm/radeon/radeon.h | 2 ++ drivers/gpu/drm/radeon/radeon_cs.c | 60 +++++++++++++++++++++++++++++++++++ drivers/gpu/drm/radeon/radeon_fence.c | 4 +++ drivers/gpu/drm/radeon/radeon_kms.c | 1 + include/uapi/drm/radeon_drm.h | 9 ++++++ 5 files changed, 76 insertions(+)
diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h index 5f05b4c..1f1dd1f 100644 --- a/drivers/gpu/drm/radeon/radeon.h +++ b/drivers/gpu/drm/radeon/radeon.h @@ -118,6 +118,7 @@ extern int radeon_bapm; #define RADEON_DEBUGFS_MAX_COMPONENTS 32 #define RADEONFB_CONN_LIMIT 4 #define RADEON_BIOS_NUM_SCRATCH 8 +#define RADEON_SEQ_WRAP_VALUE (1 << 30)
/* fence seq are set to this number when signaled */ #define RADEON_FENCE_SIGNALED_SEQ 0LL @@ -355,6 +356,7 @@ struct radeon_fence_driver { /* sync_seq is protected by ring emission lock */ uint64_t sync_seq[RADEON_NUM_RINGS]; atomic64_t last_seq;
- int32_t wrap_seq; bool initialized;
};
diff --git a/drivers/gpu/drm/radeon/radeon_cs.c b/drivers/gpu/drm/radeon/radeon_cs.c index 83f382e..be4ae25 100644 --- a/drivers/gpu/drm/radeon/radeon_cs.c +++ b/drivers/gpu/drm/radeon/radeon_cs.c @@ -404,6 +404,17 @@ static void radeon_cs_parser_fini(struct radeon_cs_parser *parser, int error, bo ttm_eu_fence_buffer_objects(&parser->ticket, &parser->validated, parser->ib.fence);
if (parser->chunk_flags && parser->chunk_flags->length_dw > 4) {
uint32_t __user *to = parser->chunk_flags->user_ptr;
uint32_t tmp;
tmp = lower_32_bits(parser->ib.fence->seq);
copy_to_user(&to[3], &tmp, sizeof(uint32_t));
tmp = parser->ib.fence->ring;
copy_to_user(&to[4], &tmp, sizeof(uint32_t));
tmp = rdev->fence_drv[tmp]->wrap_seq;
copy_to_user(&to[5], &tmp, sizeof(uint32_t));
} else if (backoff) { ttm_eu_backoff_reservation(&parser->ticket, &parser->validated);}
@@ -823,3 +834,52 @@ int radeon_cs_packet_next_reloc(struct radeon_cs_parser *p, *cs_reloc = p->relocs_ptr[(idx / 4)]; return 0; }
+int radeon_cs_done_ioctl(struct drm_device *dev, void *data, struct drm_file *filp) +{
- struct radeon_device *rdev = dev->dev_private;
- struct drm_radeon_cs_done *args = data;
- unsigned i = args->ring;
- int32_t last_seq, sync_seq, wrap_seq;
- /* FIXME check args->ring value is ok. */
- /*
* The memory barrier is matched with the one in radeon_fence_emit() and
* it ensures that we get the right matching wrap_seq and sync_seq.
*
* Note that there is no need to protect fence_drv.sync_seq here as the
* barrier ensures we get the coherency we need.
*/
- wrap_seq = ACCESS_ONCE(rdev->fence_drv[i].wrap_seq);
- smp_rmb();
- sync_seq = lower_32_bits(ACCESS_ONCE(rdev->fence_drv[i].sync_seq[i]));
- /*
* So if the current wrap_seq and the one we are queried with differ by
* more than one, this means we are being queried about a very old fence
* seq value and we can assume it is long done.
*
* Well, this is not entirely true; for it to be true we would need to
* stall when we increment the wrap counter if cs from the previous wrap
* were not completed, but that is highly unlikely. So live with the
* thrill of it going wrong!
*/
- if (abs((unsigned)wrap_seq - (unsigned)args->wrap) > 1)
return 1;
- /* Now check if currently reported fence seq is done or not. */
- /* FIXME call fence func to update last_seq just in case. */
- last_seq = lower_32_bits(atomic64_read(&rdev->fence_drv[i].last_seq));
- if ((last_seq - args->seq) >= 0)
return 1;
- /*
* Last failsafe to handle the horrible case where userspace held on to
* a wrap and seq value for so long without querying that we wrapped
* around. This is here to avoid userspace waiting for a fence that was
* emitted a long time ago but the current sync_seq[ring] value might be
* stuck and thus we might go bigger than this very old seq value.
*/
- if (((sync_seq - args->seq) < 0) && args->wrap == wrap_seq)
return 1;
- return 0;
+} diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c index 9137870..a6adcf6 100644 --- a/drivers/gpu/drm/radeon/radeon_fence.c +++ b/drivers/gpu/drm/radeon/radeon_fence.c @@ -119,6 +119,10 @@ int radeon_fence_emit(struct radeon_device *rdev, kref_init(&((*fence)->kref)); (*fence)->rdev = rdev; (*fence)->seq = ++rdev->fence_drv[ring].sync_seq[ring];
- /* Barrier is important for radeon_cs_done_ioctl. */
- smp_wmb();
- if (rdev->fence_drv[ring].sync_seq[ring] == RADEON_SEQ_WRAP_VALUE)
- rdev->fence_drv[ring].wrap_seq++;
(*fence)->ring = ring; radeon_fence_ring_emit(rdev, ring, *fence); trace_radeon_fence_emit(rdev->ddev, ring, (*fence)->seq);
diff --git a/drivers/gpu/drm/radeon/radeon_kms.c b/drivers/gpu/drm/radeon/radeon_kms.c index eb7164d..c9cfcf5 100644 --- a/drivers/gpu/drm/radeon/radeon_kms.c +++ b/drivers/gpu/drm/radeon/radeon_kms.c @@ -885,5 +885,6 @@ const struct drm_ioctl_desc radeon_ioctls_kms[] = { DRM_IOCTL_DEF_DRV(RADEON_GEM_BUSY, radeon_gem_busy_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(RADEON_GEM_VA, radeon_gem_va_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW), DRM_IOCTL_DEF_DRV(RADEON_GEM_OP, radeon_gem_op_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW),
- DRM_IOCTL_DEF_DRV(RADEON_CS_DONE, radeon_cs_done_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW),
}; int radeon_max_kms_ioctl = ARRAY_SIZE(radeon_ioctls_kms); diff --git a/include/uapi/drm/radeon_drm.h b/include/uapi/drm/radeon_drm.h index fea6099..a0c215d 100644 --- a/include/uapi/drm/radeon_drm.h +++ b/include/uapi/drm/radeon_drm.h @@ -554,6 +554,7 @@ typedef struct { #define DRM_IOCTL_RADEON_GEM_BUSY DRM_IOWR(DRM_COMMAND_BASE + DRM_RADEON_GEM_BUSY, struct drm_radeon_gem_busy) #define DRM_IOCTL_RADEON_GEM_VA DRM_IOWR(DRM_COMMAND_BASE + DRM_RADEON_GEM_VA, struct drm_radeon_gem_va) #define DRM_IOCTL_RADEON_GEM_OP DRM_IOWR(DRM_COMMAND_BASE + DRM_RADEON_GEM_OP, struct drm_radeon_gem_op) +#define DRM_IOCTL_RADEON_CS_DONE DRM_IOWR(DRM_COMMAND_BASE + DRM_RADEON_CS_DONE, struct drm_radeon_cs_done)
typedef struct drm_radeon_init { enum { @@ -936,6 +937,7 @@ struct drm_radeon_gem_va { #define RADEON_CS_RING_VCE 4 /* The third dword of RADEON_CHUNK_ID_FLAGS is a sint32 that sets the priority */ /* 0 = normal, + = higher priority, - = lower priority */ +/* The fifth, sixth and seventh dwords are the 32bit fence ID, ring id and wrap id of this CS */
struct drm_radeon_cs_chunk { uint32_t chunk_id; @@ -1038,4 +1040,11 @@ struct drm_radeon_info {
#define CIK_TILE_MODE_DEPTH_STENCIL_1D 5
+struct drm_radeon_cs_done {
- int32_t seq;
- int32_t ring;
- int32_t wrap;
- int32_t pad;
+};
#endif
1.9.3
Hi Jerome,
this is exactly the approach which we wanted to avoid. And according to Alex it was actually you guys (I think Dave) who noted that this is probably not such a good idea.
The issue is that by telling userspace the ring-specific sequence number we nail down the order of execution. E.g. a scheduler that wants to change the order in which IBs are submitted to the hardware can't do so any more, because we have already given userspace a sequence number to identify the command submission.
Additionally, for hardware inter-device synchronization we want to refer to a certain kernel object and not just the fence number. So we can only use some kind of handle-based approach, e.g. as discussed before we want to be able to turn this sequence number into an FD fence.
Regards, Christian.
Am 19.09.2014 um 06:06 schrieb Jerome Glisse:
On Thu, Sep 18, 2014 at 11:11:57PM -0400, Jerome Glisse wrote:
On Thu, Sep 18, 2014 at 08:42:16PM -0400, Jerome Glisse wrote:
On Thu, Sep 18, 2014 at 05:30:12PM +0200, Christian König wrote:
From: Christian König christian.koenig@amd.com
This patch adds a new 64bit ID as a result to each command submission.
NAK for design reasons.
First and foremost we DO NOT WANT TO ALLOCATE ANYTHING for submission id. I know that hooking up to fence make life easy but this is not how it should be done.
On Fri, Sep 19, 2014 at 10:03:43AM +0200, Christian König wrote:
Hi Jerome,
this is exactly the approach which we wanted to avoid. And according to Alex it was actually you guys (I think Dave) who noted that this is probably not such a good idea.
The issue is that by telling userspace the ring specific sequence number we nail down the order of execution. E.g. a scheduler who want to change the order in which IBs are submitted to the hardware can't do so any more because we already gave userspace a sequence number to identify the command submission.
Well, for the current code this design does work; for hw where you want to have the hw/driver do some scheduling this can still be achieved by having a seq number per ring per process.
Additional to that for hardware inter device synchronization we want to refer to a certain kernel object and not just the fence number. So we can only use some kind of handle based approach, e.g. as discussed before we want to be able to turn this sequence number in a FD fence.
Inter-device is not an issue here. For inter-device synchronization you want to convert this seq number to an fd, at which point you can create a fence structure with the proper information, i.e. create a structure only if it is needed.
Really this is the only sane design; if we start allocating a structure for each cs (well, for each cs userspace cares about) this is going to play badly with memory pressure.
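To make the allocation argument concrete, here is a tiny sketch of the pattern being advocated, with invented names (this is not radeon code): per submission only a few integers are copied around, and an object is allocated only when a submission actually has to be exported.

#include <stdint.h>
#include <stdlib.h>

/* Per-submission bookkeeping is just three integers handed back to the
 * caller; nothing is allocated on the submission path itself. */
struct submit_id {
	uint32_t seq;
	uint32_t ring;
	uint32_t wrap;
};

/* An exportable object is only materialized on demand, e.g. when the
 * submission has to be shared with another device as an fd fence. */
struct exported_fence {
	struct submit_id id;
	/* ... whatever the export mechanism needs ... */
};

static struct exported_fence *export_submission(const struct submit_id *id)
{
	struct exported_fence *f = malloc(sizeof(*f));

	if (!f)
		return NULL;
	f->id = *id;
	return f;
}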
Cheers, Jérôme
Regards, Christian.
Am 19.09.2014 um 06:06 schrieb Jerome Glisse:
On Thu, Sep 18, 2014 at 11:11:57PM -0400, Jerome Glisse wrote:
On Thu, Sep 18, 2014 at 08:42:16PM -0400, Jerome Glisse wrote:
On Thu, Sep 18, 2014 at 05:30:12PM +0200, Christian König wrote:
From: Christian König christian.koenig@amd.com
This patch adds a new 64bit ID as a result to each command submission.
NAK for design reasons.
First and foremost we DO NOT WANT TO ALLOCATE ANYTHING for submission id. I know that hooking up to fence make life easy but this is not how it should be done.
What you want to expose is the per ring 32bit seq value and report to user space the value and the ring id. To handle the wrap around you add a uniq 32bit wrap counter which is incremented every 1 << 30 or some big number seq value number (but must be smaller than 1 << 31). Then you use arithmetic to determine if a cs seq number is done or not (ie the seq wrap around counter diff should not be bigger then 2 or 3) and the seq number should obviously be after again using arithmetic diff like jiffies code.
All this being hidden in the kernel all the user space have to know is : 32 bit seq value per ring 32 bit ring uniq id 32 wrap counter per ring
With such scheme you never allocate anything or worry about any fence. Way simpler and less code.
Cheers, Jérôme
Because code is clearer than words attached is a patch of what i am thinking about. The arithmetic should be right as well as the coherency and proper memory barrier. Thought it is completely untested and will require couple fixes for properly checking userspace arguments and other aspect (see FIXME in patches).
There is no allocation, it is a pretty small patch, and it provide a fast ligthweight solution. While the new ioctl argument and return value can be improved (like reporting more information to userspace like for instance warning userspace when it is querying very old fences). I think the overall design is sound.
Let me know.
Ok actually after a beer (given that planet express manual for Bender, which also apply to me, clearly stipulate to expect malfunction if not properly inebriate) it became clear that by reversing the problem i could make it a lot simpler. So attached is an even simpler patch that handle gracefully user space querying for very old fence (ie that could have wrap around).
Code explains it all.
Cheers, Jérôme
Signed-off-by: Christian König christian.koenig@amd.com
drivers/gpu/drm/radeon/Makefile | 2 +- drivers/gpu/drm/radeon/radeon.h | 13 +- drivers/gpu/drm/radeon/radeon_cs.c | 13 ++ drivers/gpu/drm/radeon/radeon_kms.c | 41 +++---- drivers/gpu/drm/radeon/radeon_seq.c | 229 ++++++++++++++++++++++++++++++++++++ include/uapi/drm/radeon_drm.h | 1 + 6 files changed, 277 insertions(+), 22 deletions(-) create mode 100644 drivers/gpu/drm/radeon/radeon_seq.c
diff --git a/drivers/gpu/drm/radeon/Makefile b/drivers/gpu/drm/radeon/Makefile index 8cc9f68..21ee928 100644 --- a/drivers/gpu/drm/radeon/Makefile +++ b/drivers/gpu/drm/radeon/Makefile @@ -81,7 +81,7 @@ radeon-y += radeon_device.o radeon_asic.o radeon_kms.o \ rv770_smc.o cypress_dpm.o btc_dpm.o sumo_dpm.o sumo_smc.o trinity_dpm.o \ trinity_smc.o ni_dpm.o si_smc.o si_dpm.o kv_smc.o kv_dpm.o ci_smc.o \ ci_dpm.o dce6_afmt.o radeon_vm.o radeon_ucode.o radeon_ib.o radeon_mn.o \
- radeon_sync.o
- radeon_sync.o radeon_seq.o
# add async DMA block radeon-y += \ diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h index b3b4e96..dbfd346 100644 --- a/drivers/gpu/drm/radeon/radeon.h +++ b/drivers/gpu/drm/radeon/radeon.h @@ -423,6 +423,15 @@ static inline bool radeon_fence_is_earlier(struct radeon_fence *a, } /*
- Userspace command submission identifier generation
- */
+struct radeon_seq;
+uint64_t radeon_seq_push(struct radeon_seq **seq, struct radeon_fence *fence); +struct radeon_fence *radeon_seq_query(struct radeon_seq *seq, uint64_t id); +void radeon_seq_destroy(struct radeon_seq **seq);
+/*
- Tiling registers
*/ struct radeon_surface_reg { @@ -946,7 +955,9 @@ struct radeon_vm_manager {
- file private structure
*/ struct radeon_fpriv {
- struct radeon_vm vm;
- struct radeon_vm vm;
- struct mutex seq_lock;
- struct radeon_seq *seq;
}; /* diff --git a/drivers/gpu/drm/radeon/radeon_cs.c b/drivers/gpu/drm/radeon/radeon_cs.c index b8f20a5..0c0f0b3 100644 --- a/drivers/gpu/drm/radeon/radeon_cs.c +++ b/drivers/gpu/drm/radeon/radeon_cs.c @@ -413,6 +413,19 @@ static void radeon_cs_parser_fini(struct radeon_cs_parser *parser, int error, bo unsigned i; if (!error) {
if (parser->chunk_flags &&
parser->chunk_flags->length_dw > 4) {
struct radeon_fpriv *fpriv = parser->filp->driver_priv;
uint32_t __user *to = parser->chunk_flags->user_ptr;
uint64_t id;
mutex_lock(&fpriv->seq_lock);
id = radeon_seq_push(&fpriv->seq, parser->ib.fence);
mutex_unlock(&fpriv->seq_lock);
copy_to_user(&to[3], &id, sizeof(uint64_t));
}
- /* Sort the buffer list from the smallest to largest buffer,
- which affects the order of buffers in the LRU list.
- This assures that the smallest buffers are added first
diff --git a/drivers/gpu/drm/radeon/radeon_kms.c b/drivers/gpu/drm/radeon/radeon_kms.c index 85ee6f7..38880af 100644 --- a/drivers/gpu/drm/radeon/radeon_kms.c +++ b/drivers/gpu/drm/radeon/radeon_kms.c @@ -578,39 +578,34 @@ void radeon_driver_lastclose_kms(struct drm_device *dev) */ int radeon_driver_open_kms(struct drm_device *dev, struct drm_file *file_priv) {
- struct radeon_fpriv *fpriv = kzalloc(sizeof(*fpriv), GFP_KERNEL); struct radeon_device *rdev = dev->dev_private; int r;
- file_priv->driver_priv = NULL;
- if (unlikely(!fpriv))
return -ENOMEM;
- file_priv->driver_priv = fpriv; r = pm_runtime_get_sync(dev->dev); if (r < 0)
return r;
/* new gpu have virtual address space support */ if (rdev->family >= CHIP_CAYMAN) {goto error;
struct radeon_vm *vm; int r;struct radeon_fpriv *fpriv;
fpriv = kzalloc(sizeof(*fpriv), GFP_KERNEL);
if (unlikely(!fpriv)) {
return -ENOMEM;
}
- vm = &fpriv->vm; r = radeon_vm_init(rdev, vm);
if (r) {
kfree(fpriv);
return r;
}
if (r)
if (rdev->accel_working) { r = radeon_bo_reserve(rdev->ring_tmp_bo.bo, false); if (r) { radeon_vm_fini(rdev, vm);goto error;
kfree(fpriv);
return r;
goto error; } /* map the ib pool buffer read only into
@@ -623,16 +618,20 @@ int radeon_driver_open_kms(struct drm_device *dev, struct drm_file *file_priv) RADEON_VM_PAGE_SNOOPED); if (r) { radeon_vm_fini(rdev, vm);
kfree(fpriv);
return r;
}goto error; }
}file_priv->driver_priv = fpriv;
- mutex_init(&fpriv->seq_lock);
- pm_runtime_mark_last_busy(dev->dev); pm_runtime_put_autosuspend(dev->dev); return 0;
+error:
- kfree(fpriv);
- return r;
} /** @@ -646,11 +645,13 @@ int radeon_driver_open_kms(struct drm_device *dev, struct drm_file *file_priv) void radeon_driver_postclose_kms(struct drm_device *dev, struct drm_file *file_priv) {
- struct radeon_fpriv *fpriv = file_priv->driver_priv; struct radeon_device *rdev = dev->dev_private;
- radeon_seq_destroy(&fpriv->seq);
- /* new gpu have virtual address space support */ if (rdev->family >= CHIP_CAYMAN && file_priv->driver_priv) {
struct radeon_vm *vm = &fpriv->vm; int r;struct radeon_fpriv *fpriv = file_priv->driver_priv;
@@ -664,9 +665,9 @@ void radeon_driver_postclose_kms(struct drm_device *dev, } radeon_vm_fini(rdev, vm);
kfree(fpriv);
}file_priv->driver_priv = NULL;
- kfree(fpriv);
- file_priv->driver_priv = NULL;
} /** diff --git a/drivers/gpu/drm/radeon/radeon_seq.c b/drivers/gpu/drm/radeon/radeon_seq.c new file mode 100644 index 0000000..d8857f1 --- /dev/null +++ b/drivers/gpu/drm/radeon/radeon_seq.c @@ -0,0 +1,229 @@ +/*
- Copyright 2014 Advanced Micro Devices, Inc.
- All Rights Reserved.
- Permission is hereby granted, free of charge, to any person obtaining a
- copy of this software and associated documentation files (the
- "Software"), to deal in the Software without restriction, including
- without limitation the rights to use, copy, modify, merge, publish,
- distribute, sub license, and/or sell copies of the Software, and to
- permit persons to whom the Software is furnished to do so, subject to
- the following conditions:
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
- THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
- DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- USE OR OTHER DEALINGS IN THE SOFTWARE.
- The above copyright notice and this permission notice (including the
- next paragraph) shall be included in all copies or substantial portions
- of the Software.
- */
+/*
- Authors:
- Christian König christian.koenig@amd.com
- */
+#include <drm/drmP.h> +#include "radeon.h"
+/*
- ID sequences
- This code generates a 64bit identifier for a command submission.
- It works by adding the fence of the command submission to a automatically
- resizing ring buffer.
- */
+struct radeon_seq {
- uint64_t start;
- uint64_t end;
- uint64_t mask;
- struct radeon_seq *replacement;
+};
+/**
- radeon_seq_create - create a new sequence object
- @start: start value for this sequence
- @size: size of the ring buffer, must be power of two
- Allocate and initialize a new ring buffer and header.
- Returns NULL if allocation fails, new object otherwise.
- */
+static struct radeon_seq *radeon_seq_create(uint64_t start, unsigned size) +{
- unsigned bytes = sizeof(struct radeon_seq) +
size * sizeof(struct radeon_fence *);
- struct radeon_seq *seq;
- seq = kmalloc(bytes, GFP_KERNEL);
- if (!seq)
return NULL;
- seq->start = start;
- seq->end = start;
- seq->mask = size - 1;
- seq->replacement = NULL;
- return seq;
+}
+/**
- radeon_seq_ring - get pointer to ring buffer
- @seq: sequence object
- Calculate the address of the ring buffer.
- */
+static struct radeon_fence **radeon_seq_ring(struct radeon_seq *seq) +{
- return (struct radeon_fence **)&seq[1];
+}
+/**
- radeon_seq_try_free - try to free fences from the ring buffer
- @seq: sequence object
- Try to free fences from the start of the ring buffer.
- */
+static void radeon_seq_try_free(struct radeon_seq *seq) +{
- struct radeon_fence **ring = radeon_seq_ring(seq);
- while (seq->start != seq->end) {
unsigned idx = seq->start & seq->mask;
struct radeon_fence *fence = ring[idx];
if (!radeon_fence_signaled(fence))
break;
radeon_fence_unref(&fence);
++seq->start;
- }
+}
+/**
- radeon_seq_add - add new fence to the end of the ring buffer
- @seq: sequence object
- @f: the fence object
- Add the fence and return the generated ID.
- */
+static uint64_t radeon_seq_add(struct radeon_seq *seq, struct radeon_fence *f) +{
- struct radeon_fence **ring = radeon_seq_ring(seq);
- ring[seq->end & seq->mask] = radeon_fence_ref(f);
- return seq->end++;
+}
+/**
- radeon_seq_push - check for room and add the fence
- @seq: sequence object
- @fence: the fence object
- Check for room on the ring buffer, if there isn't enough
- reallocate the sequence object and add the fence.
- Returns the generated ID.
- */
+uint64_t radeon_seq_push(struct radeon_seq **seq, struct radeon_fence *fence) +{
- unsigned size_for_new_seq = 4;
- uint64_t start_for_new_seq = 1;
- if (*seq) {
/* try to release old replacements */
while ((*seq)->replacement) {
radeon_seq_try_free(*seq);
if ((*seq)->start == (*seq)->end) {
struct radeon_seq *repl = (*seq)->replacement;
kfree(*seq);
*seq = repl;
} else {
/* move on to the current container */
seq = &(*seq)->replacement;
}
}
/* check if we have enough room for one more fence */
radeon_seq_try_free(*seq);
if (((*seq)->end - (*seq)->start) <= (*seq)->mask)
return radeon_seq_add(*seq, fence);
/* not enough room, let's allocate a replacement */
size_for_new_seq = ((*seq)->mask + 1) * 2;
start_for_new_seq = (*seq)->end + 1;
seq = &(*seq)->replacement;
- }
- *seq = radeon_seq_create(start_for_new_seq, size_for_new_seq);
- if (!*seq) {
/* not enough memory for a new sequence object, but failing
here isn't a good idea either cause the commands are already
submitted to the hardware. So just block on the fence. */
int r = radeon_fence_wait(fence, false);
if (r)
DRM_ERROR("Error waiting for fence (%d)\n", r);
return 0;
- }
- return radeon_seq_add(*seq, fence);
+}
+/**
- radeon_seq_query - lockup fence by it's ID
- @seq: sequence object
- @id: the generated ID
- Lockup the associated fence by it's ID.
- Returns fence object or NULL if it couldn't be found.
- */
+struct radeon_fence *radeon_seq_query(struct radeon_seq *seq, uint64_t id) +{
- struct radeon_fence **ring;
- while (seq && id > seq->end)
seq = seq->replacement;
- if (!seq || id < seq->start)
return NULL;
- ring = radeon_seq_ring(seq);
- return ring[id & seq->mask];
+}
+/**
- radeon_seq_destroy - destroy the sequence object
- @seq_ptr: pointer to sequence object
- Destroy the sequence objects and release all fence references taken.
- */
+void radeon_seq_destroy(struct radeon_seq **seq_ptr) +{
- struct radeon_seq *seq = *seq_ptr;
- while (seq) {
struct radeon_seq *repl = seq->replacement;
unsigned start = seq->start & seq->mask;
unsigned end = seq->end & seq->mask;
struct radeon_fence **ring;
unsigned i;
ring = radeon_seq_ring(seq);
for (i = start; i < end; ++i)
radeon_fence_unref(&ring[i]);
kfree(seq);
seq = repl;
- }
- *seq_ptr = NULL;
+} diff --git a/include/uapi/drm/radeon_drm.h b/include/uapi/drm/radeon_drm.h index 50d0fb4..6b2b2e7 100644 --- a/include/uapi/drm/radeon_drm.h +++ b/include/uapi/drm/radeon_drm.h @@ -959,6 +959,7 @@ struct drm_radeon_gem_va { #define RADEON_CS_RING_VCE 4 /* The third dword of RADEON_CHUNK_ID_FLAGS is a sint32 that sets the priority */ /* 0 = normal, + = higher priority, - = lower priority */ +/* The fourth and fives dword are a 64bit fence ID generated for this CS */ struct drm_radeon_cs_chunk { uint32_t chunk_id; -- 1.9.1
dri-devel mailing list dri-devel@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/dri-devel
From b95fe503dad89875da9db2d11d05f93eca41232d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= jglisse@redhat.com Date: Thu, 18 Sep 2014 22:51:21 -0400 Subject: [PATCH] drm/radeon: cs sequence id and cs completion query. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit
This report back to userspace ring id and sequence number that can be use by userspace to query for the completetion of the cs on the hardware. This also add a new ioctl to perform such query.
There is an extra value supplied to userspace the wrap counter that is use to detect wrap around and to gracefuly handle the case where userspace might be querying about a very, very, very old cs executed long time ago in a far far away universe.
This patch is aimed to introduce the necessary ground work for user space explicit synchronization. By allowing userspace to query about cs completetion on hardware, user space can perform operation and synchronization on buffer by itself without having the cs ioctl to implicitly wait for older cs completetion before scheduling new cs. This part is however left to a follow-up patch.
Signed-off-by: Jérôme Glisse jglisse@redhat.com
drivers/gpu/drm/radeon/radeon.h | 2 ++ drivers/gpu/drm/radeon/radeon_cs.c | 60 +++++++++++++++++++++++++++++++++++ drivers/gpu/drm/radeon/radeon_fence.c | 4 +++ drivers/gpu/drm/radeon/radeon_kms.c | 1 + include/uapi/drm/radeon_drm.h | 9 ++++++ 5 files changed, 76 insertions(+)
diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h
index 5f05b4c..1f1dd1f 100644
--- a/drivers/gpu/drm/radeon/radeon.h
+++ b/drivers/gpu/drm/radeon/radeon.h
@@ -118,6 +118,7 @@ extern int radeon_bapm;
 #define RADEON_DEBUGFS_MAX_COMPONENTS	32
 #define RADEONFB_CONN_LIMIT		4
 #define RADEON_BIOS_NUM_SCRATCH		8
+#define RADEON_SEQ_WRAP_VALUE		(1 << 30)
 
 /* fence seq are set to this number when signaled */
 #define RADEON_FENCE_SIGNALED_SEQ		0LL
@@ -355,6 +356,7 @@ struct radeon_fence_driver {
 	/* sync_seq is protected by ring emission lock */
 	uint64_t			sync_seq[RADEON_NUM_RINGS];
 	atomic64_t			last_seq;
+	int32_t				wrap_seq;
 	bool				initialized;
 };
diff --git a/drivers/gpu/drm/radeon/radeon_cs.c b/drivers/gpu/drm/radeon/radeon_cs.c
index 83f382e..be4ae25 100644
--- a/drivers/gpu/drm/radeon/radeon_cs.c
+++ b/drivers/gpu/drm/radeon/radeon_cs.c
@@ -404,6 +404,17 @@ static void radeon_cs_parser_fini(struct radeon_cs_parser *parser, int error, bo
 		ttm_eu_fence_buffer_objects(&parser->ticket, &parser->validated,
 					    parser->ib.fence);
+		if (parser->chunk_flags && parser->chunk_flags->length_dw > 4) {
+			uint32_t __user *to = parser->chunk_flags->user_ptr;
+			uint32_t tmp;
+
+			tmp = lower_32_bits(parser->ib.fence->seq);
+			copy_to_user(&to[3], &tmp, sizeof(uint32_t));
+			tmp = parser->ib.fence->ring;
+			copy_to_user(&to[4], &tmp, sizeof(uint32_t));
+			tmp = rdev->fence_drv[tmp].wrap_seq;
+			copy_to_user(&to[5], &tmp, sizeof(uint32_t));
+		}
 	} else if (backoff) {
 		ttm_eu_backoff_reservation(&parser->ticket,
 					   &parser->validated);
 	}
@@ -823,3 +834,52 @@ int radeon_cs_packet_next_reloc(struct radeon_cs_parser *p,
 	*cs_reloc = p->relocs_ptr[(idx / 4)];
 	return 0;
 }
+
+int radeon_cs_done_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
+{
+	struct radeon_device *rdev = dev->dev_private;
+	struct drm_radeon_cs_done *args = data;
+	unsigned i = args->ring;
+	int32_t last_seq, sync_seq, wrap_seq;
+
+	/* FIXME check args->ring value is ok. */
+
+	/*
+	 * The memory barrier here matches the one in radeon_fence_emit() and
+	 * ensures that we get the right matching wrap_seq and sync_seq.
+	 *
+	 * Note that there is no need to protect fence_drv.sync_seq here as
+	 * the barrier gives us the coherency we need.
+	 */
+	wrap_seq = ACCESS_ONCE(rdev->fence_drv[i].wrap_seq);
+	smp_rmb();
+	sync_seq = lower_32_bits(ACCESS_ONCE(rdev->fence_drv[i].sync_seq[i]));
+
+	/*
+	 * So if the current wrap_seq and the one we are queried with differ
+	 * by more than one, this means we are being queried about a very old
+	 * fence seq value and we can assume it is long done.
+	 *
+	 * Well, this is not entirely true; for it to be true we would need to
+	 * stall when we increment the wrap counter if cs in the previous wrap
+	 * were not completed, but that is highly unlikely. So live with the
+	 * thrill of it going wrong!
+	 */
+	if (abs((unsigned)wrap_seq - (unsigned)args->wrap) > 1)
+		return 1;
+
+	/* Now check if the currently reported fence seq is done or not. */
+	/* FIXME call fence func to update last_seq just in case. */
+	last_seq = lower_32_bits(atomic64_read(&rdev->fence_drv[i].last_seq));
+	if ((last_seq - args->seq) >= 0)
+		return 1;
+
+	/*
+	 * Last failsafe to handle the horrible case where userspace held on
+	 * to a wrap and seq value for so long without querying that we
+	 * wrapped around. This is here to avoid userspace waiting for a fence
+	 * that was emitted a long time ago while the current sync_seq[ring]
+	 * value might be stuck and thus we might go bigger than this very old
+	 * seq value.
+	 */
+	if (((sync_seq - args->seq) < 0) && args->wrap == wrap_seq)
+		return 1;
+
+	return 0;
+}
diff --git a/drivers/gpu/drm/radeon/radeon_fence.c b/drivers/gpu/drm/radeon/radeon_fence.c
index 9137870..a6adcf6 100644
--- a/drivers/gpu/drm/radeon/radeon_fence.c
+++ b/drivers/gpu/drm/radeon/radeon_fence.c
@@ -119,6 +119,10 @@ int radeon_fence_emit(struct radeon_device *rdev,
 	kref_init(&((*fence)->kref));
 	(*fence)->rdev = rdev;
 	(*fence)->seq = ++rdev->fence_drv[ring].sync_seq[ring];
+	/* Barrier is important for radeon_cs_done_ioctl. */
+	smp_wmb();
+	if (rdev->fence_drv[ring].sync_seq[ring] == RADEON_SEQ_WRAP_VALUE)
+		rdev->fence_drv[ring].wrap_seq++;
 	(*fence)->ring = ring;
 	radeon_fence_ring_emit(rdev, ring, *fence);
 	trace_radeon_fence_emit(rdev->ddev, ring, (*fence)->seq);
diff --git a/drivers/gpu/drm/radeon/radeon_kms.c b/drivers/gpu/drm/radeon/radeon_kms.c
index eb7164d..c9cfcf5 100644
--- a/drivers/gpu/drm/radeon/radeon_kms.c
+++ b/drivers/gpu/drm/radeon/radeon_kms.c
@@ -885,5 +885,6 @@ const struct drm_ioctl_desc radeon_ioctls_kms[] = {
 	DRM_IOCTL_DEF_DRV(RADEON_GEM_BUSY, radeon_gem_busy_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW),
 	DRM_IOCTL_DEF_DRV(RADEON_GEM_VA, radeon_gem_va_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW),
 	DRM_IOCTL_DEF_DRV(RADEON_GEM_OP, radeon_gem_op_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW),
+	DRM_IOCTL_DEF_DRV(RADEON_CS_DONE, radeon_cs_done_ioctl, DRM_AUTH|DRM_UNLOCKED|DRM_RENDER_ALLOW),
 };
 int radeon_max_kms_ioctl = ARRAY_SIZE(radeon_ioctls_kms);
diff --git a/include/uapi/drm/radeon_drm.h b/include/uapi/drm/radeon_drm.h
index fea6099..a0c215d 100644
--- a/include/uapi/drm/radeon_drm.h
+++ b/include/uapi/drm/radeon_drm.h
@@ -554,6 +554,7 @@ typedef struct {
 #define DRM_IOCTL_RADEON_GEM_BUSY	DRM_IOWR(DRM_COMMAND_BASE + DRM_RADEON_GEM_BUSY, struct drm_radeon_gem_busy)
 #define DRM_IOCTL_RADEON_GEM_VA		DRM_IOWR(DRM_COMMAND_BASE + DRM_RADEON_GEM_VA, struct drm_radeon_gem_va)
 #define DRM_IOCTL_RADEON_GEM_OP		DRM_IOWR(DRM_COMMAND_BASE + DRM_RADEON_GEM_OP, struct drm_radeon_gem_op)
+#define DRM_IOCTL_RADEON_CS_DONE	DRM_IOWR(DRM_COMMAND_BASE + DRM_RADEON_CS_DONE, struct drm_radeon_cs_done)
 
 typedef struct drm_radeon_init {
 	enum {
@@ -936,6 +937,7 @@ struct drm_radeon_gem_va {
 #define RADEON_CS_RING_VCE	4
 /* The third dword of RADEON_CHUNK_ID_FLAGS is a sint32 that sets the priority */
 /* 0 = normal, + = higher priority, - = lower priority */
+/* The fifth, sixth and seventh dwords are the 32bit fence ID, ring id and wrap id of this CS */
 
 struct drm_radeon_cs_chunk {
 	uint32_t		chunk_id;
@@ -1038,4 +1040,11 @@ struct drm_radeon_info {
 #define CIK_TILE_MODE_DEPTH_STENCIL_1D		5
 
+struct drm_radeon_cs_done {
+	int32_t seq;
+	int32_t ring;
+	int32_t wrap;
+	int32_t pad;
+};
+
 #endif
1.9.3
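(Again an illustrative aside rather than part of the patch: with the above applied, a userspace poll of the three returned values could look roughly like the sketch below. DRM_IOCTL_RADEON_CS_DONE and struct drm_radeon_cs_done come from the uapi hunk above; the helper itself and the plain ioctl() call are assumptions.)

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <drm/radeon_drm.h>	/* assumed to be the patched header defining DRM_IOCTL_RADEON_CS_DONE */

/* Returns the raw ioctl result: 1 when the CS identified by (seq, ring, wrap)
 * has completed on the hardware, 0 when it is still pending, -1 on error
 * (mirrors the radeon_cs_done_ioctl() return values above). */
static int radeon_cs_is_done(int fd, int32_t seq, int32_t ring, int32_t wrap)
{
	struct drm_radeon_cs_done args;

	memset(&args, 0, sizeof(args));
	args.seq = seq;
	args.ring = ring;
	args.wrap = wrap;

	return ioctl(fd, DRM_IOCTL_RADEON_CS_DONE, &args);
}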
On 19.09.2014 at 15:40, Jerome Glisse wrote:
On Fri, Sep 19, 2014 at 10:03:43AM +0200, Christian König wrote:
Hi Jerome,
this is exactly the approach which we wanted to avoid. And according to Alex it was actually you guys (I think Dave) who noted that this is probably not such a good idea.
The issue is that by telling userspace the ring specific sequence number we nail down the order of execution. E.g. a scheduler that wants to change the order in which IBs are submitted to the hardware can't do so any more, because we already gave userspace a sequence number to identify the command submission.

Well, for the current code this design does work; for hw where you want the hw/driver to do some scheduling this can still be achieved by having a seq number per ring per process.

Correct, for the current design we could do so, but we have more long term goals with that in mind. I will reorder my work a bit, which should make it more clear why we need the original fence and not only the sequence for this.
Additionally, for hardware inter-device synchronization we want to refer to a certain kernel object and not just the fence number. So we can only use some kind of handle based approach, e.g. as discussed before we want to be able to turn this sequence number into an FD fence.

Inter-device is not an issue here. For inter-device you want to convert this seq number to an fd, at which point you can create a fence structure with the proper information, i.e. create a structure only if it is needed.

Really this is the only sane design. If we start allocating a structure for each cs (well, for each cs userspace cares about) this is going to play badly with memory pressure.
Mhm, what's the matter with this?
My implementation only allocates a single ring buffer for each client, which automatically resizes; apart from that we only have the fence structure, which is allocated anyway and released automatically as soon as all users go away.

And the scheduler implementation the Intel guys are working on would use even more memory. As long as everything is accounted to the correct process I don't really see a problem with that.
Regards, Christian.
Cheers, Jérôme
Regards, Christian.
On 19.09.2014 at 06:06, Jerome Glisse wrote:
On Thu, Sep 18, 2014 at 11:11:57PM -0400, Jerome Glisse wrote:
On Thu, Sep 18, 2014 at 08:42:16PM -0400, Jerome Glisse wrote:
On Thu, Sep 18, 2014 at 05:30:12PM +0200, Christian König wrote:
From: Christian König christian.koenig@amd.com
This patch adds a new 64bit ID as a result to each command submission.
NAK for design reasons.
First and foremost we DO NOT WANT TO ALLOCATE ANYTHING for a submission id. I know that hooking up to the fence makes life easy, but this is not how it should be done.

What you want to expose is the per ring 32bit seq value and report to user space that value and the ring id. To handle the wrap around you add a unique 32bit wrap counter which is incremented every 1 << 30 seq values (or some other big number, but it must be smaller than 1 << 31). Then you use arithmetic to determine if a cs seq number is done or not (i.e. the wrap counter diff should not be bigger than 2 or 3, and the seq number should obviously be after), again using an arithmetic diff like the jiffies code.

All this being hidden in the kernel, all the user space has to know is: a 32 bit seq value per ring, a 32 bit unique ring id, and a 32 bit wrap counter per ring.

With such a scheme you never allocate anything or worry about any fence. Way simpler and less code.
Cheers, Jérôme
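(A minimal sketch of the arithmetic being described, using a 1 << 30 wrap period as in the patch further down; the standalone helper and its name are illustrative only, not code from either patch.)

#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

/* Jiffies-style signed-difference check: the queried cs is done once the last
 * seq the ring reached is at or past it, taking the wrap counter into account. */
static bool cs_seq_done(int32_t last_seq, int32_t last_wrap,
			int32_t query_seq, int32_t query_wrap)
{
	/* queried wrap counter is two or more periods behind: long done */
	if (abs(last_wrap - query_wrap) > 1)
		return true;

	/* same or adjacent wrap period: plain signed difference, as with jiffies */
	return (last_seq - query_seq) >= 0;
}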
Because code is clearer than words, attached is a patch of what I am thinking about. The arithmetic should be right, as well as the coherency and proper memory barriers. Though it is completely untested and will require a couple of fixes for properly checking userspace arguments and other aspects (see FIXME in patches).

There is no allocation, it is a pretty small patch, and it provides a fast, lightweight solution. While the new ioctl arguments and return value can be improved (for instance by reporting more information to userspace, like warning userspace when it is querying very old fences), I think the overall design is sound.
Let me know.
Ok, actually after a beer (given that the Planet Express manual for Bender, which also applies to me, clearly stipulates to expect malfunction if not properly inebriated) it became clear that by reversing the problem I could make it a lot simpler. So attached is an even simpler patch that gracefully handles user space querying for a very old fence (i.e. one that could have wrapped around).
Code explains it all.
Cheers, Jérôme
On Fri, Sep 19, 2014 at 04:29:35PM +0200, Christian König wrote:
On 19.09.2014 at 15:40, Jerome Glisse wrote:
On Fri, Sep 19, 2014 at 10:03:43AM +0200, Christian König wrote:
Hi Jerome,
this is exactly the approach which we wanted to avoid. And according to Alex it was actually you guys (I think Dave) who noted that this is probably not such a good idea.
The issue is that by telling userspace the ring specific sequence number we nail down the order of execution. E.g. a scheduler that wants to change the order in which IBs are submitted to the hardware can't do so any more, because we already gave userspace a sequence number to identify the command submission.

Well, for the current code this design does work; for hw where you want the hw/driver to do some scheduling this can still be achieved by having a seq number per ring per process.

Correct, for the current design we could do so, but we have more long term goals with that in mind. I will reorder my work a bit, which should make it more clear why we need the original fence and not only the sequence for this.

Well yes, I am eager to understand why a simple solution is not the one people are aiming for. I should add that if you are working on a new kernel driver to support new hardware, there is one thing you should definitely do and that is getting rid of implicit synchronization support. If you do so then there is no longer any need to allocate a fence whatsoever. That was my whole point in the dma fence discussion. With explicit fencing all you need is to give some seq number back to userspace.

Note that memory placement obviously is another beast, and if the hardware you are targeting has more scheduling capabilities then it is obvious that the way we do things now is pointless, i.e. there is no need to associate a fence with a buffer object and buffers should be moved as a preamble to each cs.
Additionally, for hardware inter-device synchronization we want to refer to a certain kernel object and not just the fence number. So we can only use some kind of handle based approach, e.g. as discussed before we want to be able to turn this sequence number into an FD fence.

Inter-device is not an issue here. For inter-device you want to convert this seq number to an fd, at which point you can create a fence structure with the proper information, i.e. create a structure only if it is needed.

Really this is the only sane design. If we start allocating a structure for each cs (well, for each cs userspace cares about) this is going to play badly with memory pressure.
Mhm, what's the matter with this?
My implementation only allocates a single ring buffer for each client, which automatically resizes; apart from that we only have the fence structure, which is allocated anyway and released automatically as soon as all users go away.

This is already too much; as I said, with explicit synchronization there is never a need for any allocation. All that is needed is a seq number.

And the scheduler implementation the Intel guys are working on would use even more memory. As long as everything is accounted to the correct process I don't really see a problem with that.
Does not mean you have to copy their bad choices.
Regards, Christian.
Cheers, Jérôme
Regards, Christian.
From: Christian König christian.koenig@amd.com
The driver falls back to explicit synchronization as soon as buffers move between clients or are moved by TTM.
Signed-off-by: Christian König christian.koenig@amd.com --- drivers/gpu/drm/radeon/radeon.h | 5 ++++ drivers/gpu/drm/radeon/radeon_cs.c | 49 ++++++++++++++++++++++++++++++++++--- drivers/gpu/drm/radeon/radeon_gem.c | 1 + drivers/gpu/drm/radeon/radeon_ttm.c | 2 ++ include/uapi/drm/radeon_drm.h | 7 +++--- 5 files changed, 57 insertions(+), 7 deletions(-)
diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h index dbfd346..44a8eec 100644 --- a/drivers/gpu/drm/radeon/radeon.h +++ b/drivers/gpu/drm/radeon/radeon.h @@ -474,6 +474,9 @@ struct radeon_bo_va { struct radeon_bo *bo; };
+#define RADEON_BO_OWNER_IMPLICIT_SYNC (0l) +#define RADEON_BO_OWNER_FIRST_CS (~0l) + struct radeon_bo { /* Protected by gem.mutex */ struct list_head list; @@ -489,6 +492,7 @@ struct radeon_bo { u32 tiling_flags; u32 pitch; int surface_reg; + long owner; /* list of all virtual address to which this bo * is associated to */ @@ -1084,6 +1088,7 @@ struct radeon_cs_parser { struct radeon_cs_chunk *chunk_relocs; struct radeon_cs_chunk *chunk_flags; struct radeon_cs_chunk *chunk_const_ib; + struct radeon_cs_chunk *chunk_wait_for; struct radeon_ib ib; struct radeon_ib const_ib; void *track; diff --git a/drivers/gpu/drm/radeon/radeon_cs.c b/drivers/gpu/drm/radeon/radeon_cs.c index 0c0f0b3..8f62698 100644 --- a/drivers/gpu/drm/radeon/radeon_cs.c +++ b/drivers/gpu/drm/radeon/radeon_cs.c @@ -183,7 +183,8 @@ static int radeon_cs_parser_relocs(struct radeon_cs_parser *p) }
p->relocs[i].tv.bo = &p->relocs[i].robj->tbo; - p->relocs[i].tv.shared = !r->write_domain; + p->relocs[i].tv.shared = !r->write_domain || + !!p->chunk_wait_for; p->relocs[i].handle = r->handle;
radeon_cs_buckets_add(&buckets, &p->relocs[i].tv.head, @@ -251,16 +252,40 @@ static int radeon_cs_get_ring(struct radeon_cs_parser *p, u32 ring, s32 priority
static void radeon_cs_sync_rings(struct radeon_cs_parser *p) { + long owner = p->chunk_wait_for ? (long)p->filp : + RADEON_BO_OWNER_IMPLICIT_SYNC; int i;
+ if (p->chunk_wait_for) { + struct radeon_fpriv *fpriv = p->filp->driver_priv; + + for (i = 0; i < p->chunk_wait_for->length_dw; ++i) { + struct radeon_fence *fence; + uint64_t *id; + + id = (uint64_t *)&p->chunk_wait_for->kdata[i]; + + mutex_lock(&fpriv->seq_lock); + fence = radeon_seq_query(fpriv->seq, *id); + mutex_unlock(&fpriv->seq_lock); + + radeon_sync_fence(&p->ib.sync, fence); + } + } + for (i = 0; i < p->nrelocs; i++) { + struct radeon_cs_reloc *reloc = &p->relocs[i]; struct reservation_object *resv;
- if (!p->relocs[i].robj) + if (!reloc->robj) continue;
- resv = p->relocs[i].robj->tbo.resv; - radeon_sync_resv(&p->ib.sync, resv, p->relocs[i].tv.shared); + if (reloc->robj->owner != owner && + reloc->robj->owner != RADEON_BO_OWNER_FIRST_CS) + reloc->tv.shared = false; + + resv = reloc->robj->tbo.resv; + radeon_sync_resv(&p->ib.sync, resv, reloc->tv.shared); } }
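To make the new rule in radeon_cs_sync_rings() easier to follow, here is a standalone restatement of when a reloc is forced back to full synchronization. The helper name is hypothetical; the sketch only mirrors the condition in the hunk above.

#include <stdbool.h>

#define RADEON_BO_OWNER_IMPLICIT_SYNC (0l)   /* as added to radeon.h above */
#define RADEON_BO_OWNER_FIRST_CS      (~0l)

/* cs_owner is (long)p->filp when a WAIT_FOR chunk is present,
 * RADEON_BO_OWNER_IMPLICIT_SYNC otherwise */
static bool forces_full_sync(long bo_owner, long cs_owner)
{
	/* never submitted before: keep the sharing chosen from the
	 * write domains */
	if (bo_owner == RADEON_BO_OWNER_FIRST_CS)
		return false;

	/* owner unchanged (or both sides implicit): trust the explicit
	 * wait list respectively the normal write-domain rule */
	if (bo_owner == cs_owner)
		return false;

	/* the BO changed hands between clients, or TTM moved it and reset
	 * the owner to RADEON_BO_OWNER_IMPLICIT_SYNC: wait for all fences */
	return true;
}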
@@ -332,6 +357,10 @@ int radeon_cs_parser_init(struct radeon_cs_parser *p, void *data) if (p->chunks[i].length_dw == 0) return -EINVAL; } + if (user_chunk.chunk_id == RADEON_CHUNK_ID_WAIT_FOR) { + p->chunk_wait_for = &p->chunks[i]; + /* zero length wait for list is actually useful */ + }
size = p->chunks[i].length_dw; cdata = (void __user *)(unsigned long)user_chunk.chunk_data; @@ -413,6 +442,18 @@ static void radeon_cs_parser_fini(struct radeon_cs_parser *parser, int error, bo unsigned i;
if (!error) { + long owner = parser->chunk_wait_for ? (long)parser->filp : + RADEON_BO_OWNER_IMPLICIT_SYNC; + + for (i = 0; i < parser->nrelocs; i++) { + struct radeon_cs_reloc *reloc = &parser->relocs[i]; + + if (!reloc->robj) + continue; + + reloc->robj->owner = owner; + } + if (parser->chunk_flags && parser->chunk_flags->length_dw > 4) { struct radeon_fpriv *fpriv = parser->filp->driver_priv; diff --git a/drivers/gpu/drm/radeon/radeon_gem.c b/drivers/gpu/drm/radeon/radeon_gem.c index c100aa6..fa6efff 100644 --- a/drivers/gpu/drm/radeon/radeon_gem.c +++ b/drivers/gpu/drm/radeon/radeon_gem.c @@ -81,6 +81,7 @@ retry: } *obj = &robj->gem_base; robj->pid = task_pid_nr(current); + robj->owner = RADEON_BO_OWNER_FIRST_CS;
mutex_lock(&rdev->gem.mutex); list_add_tail(&robj->list, &rdev->gem.objects); diff --git a/drivers/gpu/drm/radeon/radeon_ttm.c b/drivers/gpu/drm/radeon/radeon_ttm.c index eca2ce6..52df6d9 100644 --- a/drivers/gpu/drm/radeon/radeon_ttm.c +++ b/drivers/gpu/drm/radeon/radeon_ttm.c @@ -230,6 +230,7 @@ static int radeon_move_blit(struct ttm_buffer_object *bo, struct ttm_mem_reg *new_mem, struct ttm_mem_reg *old_mem) { + struct radeon_bo *rbo = container_of(bo, struct radeon_bo, tbo); struct radeon_device *rdev; uint64_t old_start, new_start; struct radeon_fence *fence; @@ -275,6 +276,7 @@ static int radeon_move_blit(struct ttm_buffer_object *bo, if (IS_ERR(fence)) return PTR_ERR(fence);
+ rbo->owner = RADEON_BO_OWNER_IMPLICIT_SYNC; r = ttm_bo_move_accel_cleanup(bo, &fence->base, evict, no_wait_gpu, new_mem); radeon_fence_unref(&fence); diff --git a/include/uapi/drm/radeon_drm.h b/include/uapi/drm/radeon_drm.h index 6b2b2e7..a34e3db 100644 --- a/include/uapi/drm/radeon_drm.h +++ b/include/uapi/drm/radeon_drm.h @@ -942,10 +942,11 @@ struct drm_radeon_gem_va { uint64_t offset; };
-#define RADEON_CHUNK_ID_RELOCS 0x01 -#define RADEON_CHUNK_ID_IB 0x02 -#define RADEON_CHUNK_ID_FLAGS 0x03 +#define RADEON_CHUNK_ID_RELOCS 0x01 +#define RADEON_CHUNK_ID_IB 0x02 +#define RADEON_CHUNK_ID_FLAGS 0x03 #define RADEON_CHUNK_ID_CONST_IB 0x04 +#define RADEON_CHUNK_ID_WAIT_FOR 0x05
/* The first dword of RADEON_CHUNK_ID_FLAGS is a uint32 of these flags: */ #define RADEON_CS_KEEP_TILING_FLAGS 0x01
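A hypothetical userspace sketch (not part of the patch) of submitting a CS with an explicit wait list: the RADEON_CHUNK_ID_WAIT_FOR chunk is assumed to carry an array of 64bit fence IDs, two dwords each; how such an ID is composed from the seq/ring/wrap values of an earlier submission is not shown in this excerpt and is left to an external helper.

#include <stdint.h>
#include <string.h>
#include <xf86drm.h>
#include "radeon_drm.h"

static int submit_with_wait_list(int fd,
				 const struct drm_radeon_cs_chunk *ib_chunk,
				 const uint64_t *wait_ids, unsigned num_waits)
{
	struct drm_radeon_cs_chunk chunks[2];
	uint64_t chunk_ptrs[2];
	struct drm_radeon_cs cs;

	chunks[0] = *ib_chunk;                      /* RADEON_CHUNK_ID_IB */

	chunks[1].chunk_id = RADEON_CHUNK_ID_WAIT_FOR;
	chunks[1].length_dw = num_waits * 2;        /* two dwords per 64bit ID (assumed layout) */
	chunks[1].chunk_data = (uintptr_t)wait_ids;

	chunk_ptrs[0] = (uintptr_t)&chunks[0];
	chunk_ptrs[1] = (uintptr_t)&chunks[1];

	memset(&cs, 0, sizeof(cs));
	cs.num_chunks = 2;
	cs.chunks = (uintptr_t)chunk_ptrs;

	return drmCommandWriteRead(fd, DRM_RADEON_CS, &cs, sizeof(cs));
}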