Re: [RESEND PATCH v2 2/2] drm/panfrost: Expose perf counters through debugfs

16 May 2019

On 14/05/2019 14:31, Rob Herring wrote:
...
On Tue, May 14, 2019 at 5:48 AM Boris Brezillon
boris.brezillon@collabora.com wrote:
...
Add a way to dump perf counters through debugfs. The implementation is
kept simple and has a number of limitations:

it's not designed for multi-user usage as the counter values are
reset after each dump and there's no per-user context
only accessible to root users
no counters naming/position abstraction. Things are dumped in a raw
format that has to be parsed by the user who has to know where the
relevant values are and what they mean

This implementation is intended to be used by mesa developers to help
debug perf-related issues while we work on a more generic approach that
would allow all GPU drivers to expose their counters in a consistent
way. As a result, this debugfs interface is considered unstable and
might be deprecated in the future.
Signed-off-by: Boris Brezillon boris.brezillon@collabora.com
Changes in v2:

Expose counters through debugfs and keep things simple for now (we'll
work on a generic solution in parallel)


drivers/gpu/drm/panfrost/Makefile           |   3 +-
 drivers/gpu/drm/panfrost/panfrost_device.c  |   9 +
 drivers/gpu/drm/panfrost/panfrost_device.h  |   3 +
 drivers/gpu/drm/panfrost/panfrost_drv.c     |   7 +
 drivers/gpu/drm/panfrost/panfrost_gpu.c     |   7 +
 drivers/gpu/drm/panfrost/panfrost_perfcnt.c | 339 ++++++++++++++++++++
 drivers/gpu/drm/panfrost/panfrost_perfcnt.h |  15 +
 drivers/gpu/drm/panfrost/panfrost_regs.h    |  19 ++
 8 files changed, 401 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/drm/panfrost/panfrost_perfcnt.c
 create mode 100644 drivers/gpu/drm/panfrost/panfrost_perfcnt.h
[...]
...
...
new file mode 100644
index 000000000000..8093783a5a26

--- /dev/null
+++ b/drivers/gpu/drm/panfrost/panfrost_perfcnt.c
@@ -0,0 +1,339 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright 2019 Collabora Ltd */



+#include <drm/drm_file.h>
+#include <drm/drm_gem_shmem_helper.h>
+#include <drm/panfrost_drm.h>
+#include <linux/completion.h>
+#include <linux/debugfs.h>
+#include <linux/iopoll.h>
+#include <linux/pm_runtime.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>



+#include "panfrost_device.h"
+#include "panfrost_features.h"
+#include "panfrost_gem.h"
+#include "panfrost_issues.h"
+#include "panfrost_job.h"
+#include "panfrost_mmu.h"
+#include "panfrost_regs.h"



+#define COUNTERS_PER_BLOCK             64
+#define BYTES_PER_COUNTER              4
+#define BLOCKS_PER_COREGROUP           8
+#define V4_SHADERS_PER_COREGROUP       4



+struct panfrost_perfcnt {

  struct panfrost_gem_object *bo;


  size_t bosize;


  void *buf;


  bool enabled;


  struct mutex lock;


  struct completion dump_comp;



+};



+void panfrost_perfcnt_clean_cache_done(struct panfrost_device *pfdev)
+{

  complete(&pfdev->perfcnt->dump_comp);



+}



+void panfrost_perfcnt_sample_done(struct panfrost_device *pfdev)
+{

  gpu_write(pfdev, GPU_CMD, GPU_CMD_CLEAN_CACHES);



+}



+static void panfrost_perfcnt_setup(struct panfrost_device *pfdev)
+{

  u32 cfg;



  /*


   * Always use address space 0 for now.


   * FIXME: this needs to be updated when we start using different


   * address space.


   */


  cfg = GPU_PERFCNT_CFG_AS(0);


  if (panfrost_model_cmp(pfdev, 0x1000) >= 0)



You've got a couple of these. Perhaps we should add either a
model_is_bifrost() helper or an is_bifrost variable to use.
...

          cfg |= GPU_PERFCNT_CFG_SETSEL(1);



mali_kbase only sets this if CONFIG_MALI_PRFCNT_SET_SECONDARY is defined
- i.e. this selects a different selection of counters. At at the very
least this should be an optional flag that can be set, but I suspect the
primary set are the ones you are interested in.
...
...


  gpu_write(pfdev, GPU_PERFCNT_CFG,


            cfg | GPU_PERFCNT_CFG_MODE(GPU_PERFCNT_CFG_MODE_OFF));



  if (!pfdev->perfcnt->enabled)


          return;



  gpu_write(pfdev, GPU_PRFCNT_JM_EN, 0xffffffff);


  gpu_write(pfdev, GPU_PRFCNT_SHADER_EN, 0xffffffff);


  gpu_write(pfdev, GPU_PRFCNT_MMU_L2_EN, 0xffffffff);



  /*


   * Due to PRLAM-8186 we need to disable the Tiler before we enable HW


   * counters.



Seems like you could just always apply the work-around? It doesn't
look like it would be much different.
Technically the workaround causes the driver to perform the operation in
the wrong order below (write TILER_EN when the mode is MANUAL) - this is
a workaround for the hardware issue (and therefore works on that
hardware). But I wouldn't like to say it works on other hardware which
doesn't have the issue.
...
...

   */


  if (panfrost_has_hw_issue(pfdev, HW_ISSUE_8186))


          gpu_write(pfdev, GPU_PRFCNT_TILER_EN, 0);


  else


          gpu_write(pfdev, GPU_PRFCNT_TILER_EN, 0xffffffff);



  gpu_write(pfdev, GPU_PERFCNT_CFG,


            cfg | GPU_PERFCNT_CFG_MODE(GPU_PERFCNT_CFG_MODE_MANUAL));



  if (panfrost_has_hw_issue(pfdev, HW_ISSUE_8186))


          gpu_write(pfdev, GPU_PRFCNT_TILER_EN, 0xffffffff);



+}



+static int panfrost_perfcnt_dump(struct panfrost_device *pfdev)
+{

  u64 gpuva;


  int ret;



  reinit_completion(&pfdev->perfcnt->dump_comp);


  gpuva = pfdev->perfcnt->bo->node.start << PAGE_SHIFT;


  gpu_write(pfdev, GPU_PERFCNT_BASE_LO, gpuva);


  gpu_write(pfdev, GPU_PERFCNT_BASE_HI, gpuva >> 32);


  gpu_write(pfdev, GPU_CMD, GPU_CMD_PERFCNT_SAMPLE);


  ret = wait_for_completion_interruptible_timeout(&pfdev->perfcnt->dump_comp,


                                                  msecs_to_jiffies(1000));


  if (!ret)


          ret = -ETIMEDOUT;


  else if (ret > 0)


          ret = 0;



  return ret;



+}



+void panfrost_perfcnt_resume(struct panfrost_device *pfdev)
No suspend handling needed?
...
+{

  if (pfdev->perfcnt)


          panfrost_perfcnt_setup(pfdev);



+}



+static ssize_t panfrost_perfcnt_enable_read(struct file *file,

                                      char __user *user_buf,


                                      size_t count, loff_t *ppos)



+{

  struct panfrost_device *pfdev = file->private_data;


  ssize_t ret;



  mutex_lock(&pfdev->perfcnt->lock);


  ret = simple_read_from_buffer(user_buf, count, ppos,


                                pfdev->perfcnt->enabled ? "Y\n" : "N\n",


                                2);


  mutex_unlock(&pfdev->perfcnt->lock);



  return ret;



+}



+static ssize_t panfrost_perfcnt_enable_write(struct file *file,

                                       const char __user *user_buf,


                                       size_t count, loff_t *ppos)



+{

  struct panfrost_device *pfdev = file->private_data;


  bool enable;


  int ret;



  ret = kstrtobool_from_user(user_buf, count, &enable);


  if (ret)


          return ret;



  ret = pm_runtime_get_sync(pfdev->dev);


  if (ret < 0)


          return ret;



  mutex_lock(&pfdev->perfcnt->lock);


  if (enable != pfdev->perfcnt->enabled) {


          pfdev->perfcnt->enabled = enable;


          panfrost_perfcnt_setup(pfdev);


  }


  mutex_unlock(&pfdev->perfcnt->lock);



  pm_runtime_mark_last_busy(pfdev->dev);


  pm_runtime_put_autosuspend(pfdev->dev);



  return count;



+}



+static const struct file_operations panfrost_perfcnt_enable_fops = {

  .read = panfrost_perfcnt_enable_read,


  .write = panfrost_perfcnt_enable_write,


  .open = simple_open,


  .llseek = default_llseek,



+};



+static ssize_t panfrost_perfcnt_dump_read(struct file *file,

                                    char __user *user_buf,


                                    size_t count, loff_t *ppos)



+{

  struct panfrost_device *pfdev = file->private_data;


  ssize_t ret;



  ret = pm_runtime_get_sync(pfdev->dev);


  if (ret < 0)


          return ret;



  mutex_lock(&pfdev->perfcnt->lock);


  if (!pfdev->perfcnt->enabled) {


          ret = -EINVAL;


          goto out;


  }



  ret = panfrost_perfcnt_dump(pfdev);


  if (ret)


          goto out;



  ret = simple_read_from_buffer(user_buf, count, ppos,


                                pfdev->perfcnt->buf,


                                pfdev->perfcnt->bosize);




+out:

  mutex_unlock(&pfdev->perfcnt->lock);


  pm_runtime_mark_last_busy(pfdev->dev);


  pm_runtime_put_autosuspend(pfdev->dev);



  return ret;



+}



+static const struct file_operations panfrost_perfcnt_dump_fops = {

  .read = panfrost_perfcnt_dump_read,


  .open = simple_open,


  .llseek = default_llseek,



+};



+int panfrost_perfcnt_debugfs_init(struct drm_minor *minor)
+{

  struct panfrost_device *pfdev = to_panfrost_device(minor->dev);


  struct dentry *file, *dir;



  dir = debugfs_create_dir("perfcnt", minor->debugfs_root);


  if (IS_ERR(dir))


          return PTR_ERR(dir);



  file = debugfs_create_file("dump", 0400, dir, pfdev,


                             &panfrost_perfcnt_dump_fops);


  if (IS_ERR(file))


          return PTR_ERR(file);



  file = debugfs_create_file("enable", 0600, dir, pfdev,


                             &panfrost_perfcnt_enable_fops);


  if (IS_ERR(file))


          return PTR_ERR(file);



  return 0;



+}



+int panfrost_perfcnt_init(struct panfrost_device *pfdev)
+{

  struct panfrost_perfcnt *perfcnt;


  struct drm_gem_shmem_object *bo;


  size_t size;


  u32 status;


  int ret;



  if (panfrost_has_hw_feature(pfdev, HW_FEATURE_V4)) {


          unsigned int ncoregroups;



          ncoregroups = hweight64(pfdev->features.l2_present);


          size = ncoregroups * BLOCKS_PER_COREGROUP *


                 COUNTERS_PER_BLOCK * BYTES_PER_COUNTER;


  } else {


          unsigned int nl2c, ncores;



          /*


           * TODO: define a macro to extract the number of l2 caches from


           * mem_features.


           */


          nl2c = ((pfdev->features.mem_features >> 8) & GENMASK(3, 0)) + 1;



          /*


           * The ARM driver is grouping cores per core group and then


           * only using the number of cores in group 0 to calculate the


           * size. Not sure why this is done like that, but I guess


           * shader_present will only show cores in the first group


           * anyway.


           */



I'm not sure I understand this comment. I'm not sure which version of
the driver you are looking at, but shader_present should contain all the
cores (in every core group).
The kbase driver (in panfrost's git tree[1]), contains:
base_gpu_props *props = &kbdev->gpu_props.props;
    u32 nr_l2 = props->l2_props.num_l2_slices;
    u64 core_mask = props->coherency_info.group[0].core_mask;
u32 nr_blocks = fls64(core_mask);
/* JM and tiler counter blocks are always present */
    dump_size = (2 + nr_l2 + nr_blocks) *
    		NR_CNT_PER_BLOCK *
    		NR_BYTES_PER_CNT;
[1]
https://gitlab.freedesktop.org/panfrost/mali_kbase/blob/master/driver/produc...
i.e. the number of cores + number of L2 caches (plus 2 for JM/tiler)
multiplied by the block size.
Also another difference with the below is that fls64 and hweight64 don't
return the same numbers. If the core mask is non-contiguous this will
return different values.
...
...

          ncores = hweight64(pfdev->features.shader_present);



          /*


           * There's always one JM and one Tiler block, hence the '+ 2'


           * here.


           */


          size = (nl2c + ncores + 2) *


                 COUNTERS_PER_BLOCK * BYTES_PER_COUNTER;


  }



  perfcnt = devm_kzalloc(pfdev->dev, sizeof(*perfcnt), GFP_KERNEL);


  if (!perfcnt)


          return -ENOMEM;



[...]
Steve

    

2025

2024

2023

2022

2021

2020

2019

2018

2017

2016

2015

2014

2013

2012

2011

2010

Re: [RESEND PATCH v2 2/2] drm/panfrost: Expose perf counters through debugfs

Signed-off-by: Boris Brezillon boris.brezillon@collabora.com