On 14 September 2016 at 15:19, Robert Bragg <robert@sixbynine.org> wrote:
> diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
> index 87530f5..5305982 100644
> --- a/drivers/gpu/drm/i915/i915_perf.c
> +++ b/drivers/gpu/drm/i915/i915_perf.c
> @@ -28,14 +28,860 @@
> #include <linux/sizes.h>
>
> #include "i915_drv.h"
> +#include "intel_ringbuffer.h"
> +#include "intel_lrc.h"
> +#include "i915_oa_hsw.h"
> +
> +/* Must be a power of two */
> +#define OA_BUFFER_SIZE SZ_16M
It's a power of two between 128K and 16M; maybe add a BUILD_BUG_ON() and
BUILD_BUG_ON_NOT_POWER_OF_2() to check this?
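E.g. something along these lines, maybe at the top of alloc_oa_buffer() (just a
sketch, wherever makes most sense):

	/* HW only supports OA buffer sizes that are powers of two between 128K and 16M */
	BUILD_BUG_ON_NOT_POWER_OF_2(OA_BUFFER_SIZE);
	BUILD_BUG_ON(OA_BUFFER_SIZE < SZ_128K || OA_BUFFER_SIZE > SZ_16M);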
> +#define OA_TAKEN(tail, head) ((tail - head) & (OA_BUFFER_SIZE - 1))
> +
> +/* There's a HW race condition between OA unit tail pointer register updates and
> + * writes to memory whereby the tail pointer can sometimes get ahead of what's
> + * been written out to the OA buffer so far.
> + *
> + * Although this can be observed explicitly by checking for a zeroed report-id
> + * field in tail reports, it seems preferable to account for this earlier e.g.
> + * as part of the _oa_buffer_is_empty checks to minimize -EAGAIN polling cycles
> + * in this situation.
> + *
> + * To give time for the most recent reports to land before they may be copied to
> + * userspace, the driver operates as if the tail pointer effectively lags behind
> + * the HW tail pointer by 'tail_margin' bytes. The margin in bytes is calculated
> + * based on this constant in nanoseconds, the current OA sampling exponent
> + * and current report size.
> + *
> + * There is also a fallback check while reading to simply skip over reports with
> + * a zeroed report-id.
> + */
> +#define OA_TAIL_MARGIN_NSEC 100000ULL
Yikes!
> +
> +static void gen7_init_oa_buffer(struct drm_i915_private *dev_priv)
> +{
> + /* Pre-DevBDW: OABUFFER must be set with counters off,
> + * before OASTATUS1, but after OASTATUS2
> + */
> + I915_WRITE(GEN7_OASTATUS2, dev_priv->perf.oa.oa_buffer.gtt_offset |
> + OA_MEM_SELECT_GGTT); /* head */
> + I915_WRITE(GEN7_OABUFFER, dev_priv->perf.oa.oa_buffer.gtt_offset);
> + I915_WRITE(GEN7_OASTATUS1, dev_priv->perf.oa.oa_buffer.gtt_offset |
> + OABUFFER_SIZE_16M); /* tail */
> +
> + /* On Haswell we have to track which OASTATUS1 flags we've
> + * already seen since they can't be cleared while periodic
> + * sampling is enabled.
> + */
> + dev_priv->perf.oa.gen7_latched_oastatus1 = 0;
> +
> + /* NB: although the OA buffer will initially be allocated
> + * zeroed via shmfs (and so this memset is redundant when
> + * first allocating), we may re-init the OA buffer, either
> + * when re-enabling a stream or in error/reset paths.
> + *
> + * The reason we clear the buffer for each re-init is for the
> + * sanity check in gen7_append_oa_reports() that looks at the
> + * report-id field to make sure it's non-zero which relies on
> + * the assumption that new reports are being written to zeroed
> + * memory...
> + */
> + memset(dev_priv->perf.oa.oa_buffer.addr, 0, SZ_16M);
OA_BUFFER_SIZE, rather than SZ_16M?
> +
> + /* Maybe make ->pollin per-stream state if we support multiple
> + * concurrent streams in the future. */
> + atomic_set(&dev_priv->perf.oa.pollin, false);
> +}
> +
> +static int alloc_oa_buffer(struct drm_i915_private *dev_priv)
> +{
> + struct drm_i915_gem_object *bo;
> + enum i915_map_type map;
> + struct i915_vma *vma;
> + int ret;
> +
> + BUG_ON(dev_priv->perf.oa.oa_buffer.obj);
> +
> + ret = i915_mutex_lock_interruptible(&dev_priv->drm);
> + if (ret)
> + return ret;
> +
> + bo = i915_gem_object_create(&dev_priv->drm, OA_BUFFER_SIZE);
> + if (IS_ERR(bo)) {
> + DRM_ERROR("Failed to allocate OA buffer\n");
> + ret = PTR_ERR(bo);
> + goto unlock;
> + }
> + dev_priv->perf.oa.oa_buffer.obj = bo;
> +
> + ret = i915_gem_object_set_cache_level(bo, I915_CACHE_LLC);
> + if (ret)
> + goto err_unref;
> +
> + /* PreHSW required 512K alignment, HSW requires 16M */
> + vma = i915_gem_object_ggtt_pin(bo, NULL, 0, SZ_16M, PIN_MAPPABLE);
> + if (IS_ERR(vma)) {
> + ret = PTR_ERR(vma);
> + goto err_unref;
> + }
> + dev_priv->perf.oa.oa_buffer.vma = vma;
> +
> + map = HAS_LLC(dev_priv) ? I915_MAP_WB : I915_MAP_WC;
> +
> + dev_priv->perf.oa.oa_buffer.gtt_offset = i915_ggtt_offset(vma);
> + dev_priv->perf.oa.oa_buffer.addr = i915_gem_object_pin_map(bo, map);
> + if (dev_priv->perf.oa.oa_buffer.addr == NULL)
> + goto err_unpin;
i915_gem_object_pin_map can't return NULL.
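i.e. check with IS_ERR(), something like (untested):

	dev_priv->perf.oa.oa_buffer.addr = i915_gem_object_pin_map(bo, map);
	if (IS_ERR(dev_priv->perf.oa.oa_buffer.addr)) {
		ret = PTR_ERR(dev_priv->perf.oa.oa_buffer.addr);
		goto err_unpin;
	}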
> +
> + dev_priv->perf.oa.ops.init_oa_buffer(dev_priv);
> +
> + DRM_DEBUG_DRIVER("OA Buffer initialized, gtt offset = 0x%x, vaddr = %p",
> + dev_priv->perf.oa.oa_buffer.gtt_offset,
> + dev_priv->perf.oa.oa_buffer.addr);
> +
> + goto unlock;
> +
> +err_unpin:
> + __i915_vma_unpin(vma);
> +
> +err_unref:
> + i915_gem_object_put(bo);
> +
> + dev_priv->perf.oa.oa_buffer.gtt_offset = 0;
> + dev_priv->perf.oa.oa_buffer.addr = NULL;
> + dev_priv->perf.oa.oa_buffer.vma = NULL;
> + dev_priv->perf.oa.oa_buffer.obj = NULL;
> +
> +unlock:
> + mutex_unlock(&dev_priv->drm.struct_mutex);
> + return ret;
> +}
> +
> +static void config_oa_regs(struct drm_i915_private *dev_priv,
> + const struct i915_oa_reg *regs,
> + int n_regs)
> +{
> + int i;
> +
> + for (i = 0; i < n_regs; i++) {
> + const struct i915_oa_reg *reg = regs + i;
> +
> + I915_WRITE(reg->addr, reg->value);
> + }
> +}
> +
> +static int hsw_enable_metric_set(struct drm_i915_private *dev_priv)
> +{
> + int ret = i915_oa_select_metric_set_hsw(dev_priv);
> +
> + if (ret)
> + return ret;
> +
> + I915_WRITE(GDT_CHICKEN_BITS, (I915_READ(GDT_CHICKEN_BITS) |
> + GT_NOA_ENABLE));
> +
> + /* PRM:
> + *
> + * OA unit is using “crclk” for its functionality. When trunk
> + * level clock gating takes place, OA clock would be gated,
> + * unable to count the events from non-render clock domain.
> + * Render clock gating must be disabled when OA is enabled to
> + * count the events from non-render domain. Unit level clock
> + * gating for RCS should also be disabled.
> + */
> + I915_WRITE(GEN7_MISCCPCTL, (I915_READ(GEN7_MISCCPCTL) &
> + ~GEN7_DOP_CLOCK_GATE_ENABLE));
> + I915_WRITE(GEN6_UCGCTL1, (I915_READ(GEN6_UCGCTL1) |
> + GEN6_CSUNIT_CLOCK_GATE_DISABLE));
> +
> + config_oa_regs(dev_priv, dev_priv->perf.oa.mux_regs,
> + dev_priv->perf.oa.mux_regs_len);
> +
> + /* It apparently takes a fairly long time for a new MUX
> + * configuration to be applied after these register writes.
> + * This delay duration was derived empirically based on the
> + * render_basic config but hopefully it covers the maximum
> + * configuration latency.
> + *
> + * As a fallback, the checks in _append_oa_reports() to skip
> + * invalid OA reports do also seem to work to discard reports
> + * generated before this config has completed - albeit not
> + * silently.
> + *
> + * Unfortunately this is essentially a magic number, since we
> + * don't currently know of a reliable mechanism for predicting
> + * how long the MUX config will take to apply and besides
> + * seeing invalid reports we don't know of a reliable way to
> + * explicitly check that the MUX config has landed.
> + *
> + * It's even possible we've mischaracterized the underlying
> + * problem - it just seems like the simplest explanation why
> + * a delay at this location would mitigate any invalid reports.
> + */
> + usleep_range(15000, 20000);
> +
> + config_oa_regs(dev_priv, dev_priv->perf.oa.b_counter_regs,
> + dev_priv->perf.oa.b_counter_regs_len);
> +
> + return 0;
> +}
> +
> +static void hsw_disable_metric_set(struct drm_i915_private *dev_priv)
> +{
> + I915_WRITE(GEN6_UCGCTL1, (I915_READ(GEN6_UCGCTL1) &
> + ~GEN6_CSUNIT_CLOCK_GATE_DISABLE));
> + I915_WRITE(GEN7_MISCCPCTL, (I915_READ(GEN7_MISCCPCTL) |
> + GEN7_DOP_CLOCK_GATE_ENABLE));
> +
> + I915_WRITE(GDT_CHICKEN_BITS, (I915_READ(GDT_CHICKEN_BITS) &
> + ~GT_NOA_ENABLE));
> +}
> +
> +static void gen7_update_oacontrol_locked(struct drm_i915_private *dev_priv)
> +{
> + assert_spin_locked(&dev_priv->perf.hook_lock);
> +
> + if (dev_priv->perf.oa.exclusive_stream->enabled) {
> + unsigned long ctx_id = 0;
> +
> + if (dev_priv->perf.oa.exclusive_stream->ctx)
> + ctx_id = dev_priv->perf.oa.specific_ctx_id;
> +
> + if (dev_priv->perf.oa.exclusive_stream->ctx == NULL || ctx_id) {
gem context state pinned at ggtt offset zero is unlikely but still possible,
as pointed out by Chris.
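One option might be to track whether the id is valid with an explicit flag
rather than relying on a non-zero ggtt offset, something like
(specific_ctx_id_valid is a made-up field, just to illustrate):

	/* wherever specific_ctx_id is set */
	dev_priv->perf.oa.specific_ctx_id = ctx_id;
	dev_priv->perf.oa.specific_ctx_id_valid = true;

	/* ...and then the check becomes */
	if (dev_priv->perf.oa.exclusive_stream->ctx == NULL ||
	    dev_priv->perf.oa.specific_ctx_id_valid) {
		...
	}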
> +
> +static int i915_oa_stream_init(struct i915_perf_stream *stream,
> + struct drm_i915_perf_open_param *param,
> + struct perf_open_properties *props)
> +{
> + struct drm_i915_private *dev_priv = stream->dev_priv;
> + int format_size;
> + int ret;
> +
> + if (!(props->sample_flags & SAMPLE_OA_REPORT)) {
> + DRM_ERROR("Only OA report sampling supported\n");
> + return -EINVAL;
> + }
> +
> + if (!dev_priv->perf.oa.ops.init_oa_buffer) {
> + DRM_ERROR("OA unit not supported\n");
> + return -ENODEV;
> + }
> +
> + /* To avoid the complexity of having to accurately filter
> + * counter reports and marshal to the appropriate client
> + * we currently only allow exclusive access
> + */
> + if (dev_priv->perf.oa.exclusive_stream) {
> + DRM_ERROR("OA unit already in use\n");
> + return -EBUSY;
> + }
> +
> + if (!props->metrics_set) {
> + DRM_ERROR("OA metric set not specified\n");
> + return -EINVAL;
> + }
> +
> + if (!props->oa_format) {
> + DRM_ERROR("OA report format not specified\n");
> + return -EINVAL;
> + }
> +
> + stream->sample_size = sizeof(struct drm_i915_perf_record_header);
> +
> + format_size = dev_priv->perf.oa.oa_formats[props->oa_format].size;
> +
> + stream->sample_flags |= SAMPLE_OA_REPORT;
> + stream->sample_size += format_size;
> +
> + dev_priv->perf.oa.oa_buffer.format_size = format_size;
> + BUG_ON(dev_priv->perf.oa.oa_buffer.format_size == 0);
> +
> + dev_priv->perf.oa.oa_buffer.format =
> + dev_priv->perf.oa.oa_formats[props->oa_format].format;
> +
> + dev_priv->perf.oa.metrics_set = props->metrics_set;
> +
> + dev_priv->perf.oa.periodic = props->oa_periodic;
> + if (dev_priv->perf.oa.periodic) {
> + u64 period_ns = oa_exponent_to_ns(dev_priv,
> + props->oa_period_exponent);
> +
> + dev_priv->perf.oa.period_exponent = props->oa_period_exponent;
> +
> + /* See comment for OA_TAIL_MARGIN_NSEC for details
> + * about this tail_margin...
> + */
> + dev_priv->perf.oa.tail_margin =
> + ((OA_TAIL_MARGIN_NSEC / period_ns) + 1) * format_size;
> + }
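Just to put a rough number on the tail_margin: assuming I'm reading
oa_exponent_to_ns() right, the fastest period is 160ns (exponent 0 at 12.5MHz),
so with a 64 byte report format that's ((100000 / 160) + 1) * 64 = 40064 bytes
of margin, which is still tiny compared to OA_BUFFER_SIZE.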
> +
> + ret = alloc_oa_buffer(dev_priv);
> + if (ret)
> + return ret;
> +
> + /* PRM - observability performance counters:
> + *
> + * OACONTROL, performance counter enable, note:
> + *
> + * "When this bit is set, in order to have coherent counts,
> + * RC6 power state and trunk clock gating must be disabled.
> + * This can be achieved by programming MMIO registers as
> + * 0xA094=0 and 0xA090[31]=1"
> + *
> + * In our case we are expecting that taking pm + FORCEWAKE
> + * references will effectively disable RC6.
> + */
> + intel_runtime_pm_get(dev_priv);
> + intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
> +
> + ret = dev_priv->perf.oa.ops.enable_metric_set(dev_priv);
> + if (ret) {
> + intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
> + intel_runtime_pm_put(dev_priv);
> + free_oa_buffer(dev_priv);
> + return ret;
> + }
> +
> + stream->ops = &i915_oa_stream_ops;
> +
> + /* On Haswell we have to track which OASTATUS1 flags we've already
> + * seen since they can't be cleared while periodic sampling is enabled.
> + */
> + dev_priv->perf.oa.gen7_latched_oastatus1 = 0;
We already did this step in gen7_init_oa_buffer().
> +
> + dev_priv->perf.oa.exclusive_stream = stream;
> +
> + return 0;
> +}
> +
> +static void gen7_update_hw_ctx_id_locked(struct drm_i915_private *dev_priv,
> + u32 ctx_id)
> +{
> + assert_spin_locked(&dev_priv->perf.hook_lock);
> +
> + dev_priv->perf.oa.specific_ctx_id = ctx_id;
> + gen7_update_oacontrol_locked(dev_priv);
> +}
> +
> +static void
> +i915_oa_legacy_context_pin_notify_locked(struct drm_i915_private *dev_priv,
> + struct i915_gem_context *ctx)
> +{
> + assert_spin_locked(&dev_priv->perf.hook_lock);
> +
> + BUG_ON(i915.enable_execlists);
> +
> + if (dev_priv->perf.oa.ops.update_hw_ctx_id_locked == NULL)
> + return;
For consistency, if (!dev_priv->perf.oa.ops.update_hw_ctx_id_locked)
> +
> + if (dev_priv->perf.oa.exclusive_stream &&
> + dev_priv->perf.oa.exclusive_stream->ctx == ctx) {
> + struct i915_vma *vma = ctx->engine[RCS].state;
> + u32 ctx_id = i915_ggtt_offset(vma);
> +
> + dev_priv->perf.oa.ops.update_hw_ctx_id_locked(dev_priv, ctx_id);
> + }
> +}
> +
> @@ -431,8 +1367,35 @@ int i915_perf_open_ioctl(struct drm_device *dev, void *data,
>
> void i915_perf_init(struct drm_i915_private *dev_priv)
> {
> + if (!IS_HASWELL(dev_priv))
> + return;
> +
> + hrtimer_init(&dev_priv->perf.oa.poll_check_timer,
> + CLOCK_MONOTONIC, HRTIMER_MODE_REL);
> + dev_priv->perf.oa.poll_check_timer.function = oa_poll_check_timer_cb;
> + init_waitqueue_head(&dev_priv->perf.oa.poll_wq);
> +
> INIT_LIST_HEAD(&dev_priv->perf.streams);
> mutex_init(&dev_priv->perf.lock);
> + spin_lock_init(&dev_priv->perf.hook_lock);
> +
> + dev_priv->perf.oa.ops.init_oa_buffer = gen7_init_oa_buffer;
> + dev_priv->perf.oa.ops.enable_metric_set = hsw_enable_metric_set;
> + dev_priv->perf.oa.ops.disable_metric_set = hsw_disable_metric_set;
> + dev_priv->perf.oa.ops.oa_enable = gen7_oa_enable;
> + dev_priv->perf.oa.ops.oa_disable = gen7_oa_disable;
> + dev_priv->perf.oa.ops.update_hw_ctx_id_locked =
> + gen7_update_hw_ctx_id_locked;
> + dev_priv->perf.oa.ops.read = gen7_oa_read;
> + dev_priv->perf.oa.ops.oa_buffer_is_empty =
> + gen7_oa_buffer_is_empty_fop_unlocked;
> +
> + dev_priv->perf.oa.timestamp_frequency = 12500000;
Slightly magical.
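Could this get a name so it's obvious where the number comes from? Something
like (name made up):

/* Haswell's timestamp counter ticks at 12.5MHz, i.e. an 80ns period */
#define GEN7_TIMESTAMP_FREQUENCY 12500000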
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index 7a74750..22d5ff1 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -2043,8 +2043,14 @@ static int intel_ring_context_pin(struct i915_gem_context *ctx,
> if (ret)
> goto error;
>
> - ret = i915_vma_pin(ce->state, 0, ctx->ggtt_alignment,
> - PIN_GLOBAL | PIN_HIGH);
> + if (engine->id == RCS) {
> + u64 vma_flags = PIN_GLOBAL | PIN_HIGH;
> + ret = i915_gem_context_pin_legacy_rcs_state(engine->i915,
> + ctx,
> + vma_flags);
> + } else
> + ret = i915_vma_pin(ce->state, 0, ctx->ggtt_alignment,
> + PIN_GLOBAL | PIN_HIGH);
Maybe move this up a level and reuse for both paths.
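i.e. hoist the flags, roughly (untested):

	u64 flags = PIN_GLOBAL | PIN_HIGH;

	if (engine->id == RCS)
		ret = i915_gem_context_pin_legacy_rcs_state(engine->i915,
							    ctx, flags);
	else
		ret = i915_vma_pin(ce->state, 0, ctx->ggtt_alignment, flags);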
> if (ret)
> goto error;
> }