From: Thierry Reding treding@nvidia.com
Hi,
this fixes a couple of oddities like slightly off DMA masks and adds support for hardware cursors on newer chips as well as support for the sector layout bit in NVIDIA framebuffer modifiers.
The first patch in this set is a small helper that I think might be useful to other drivers eventually, but if it isn't it's easily something I can carry in the Tegra driver. It'd be great to get an Acked-by on it from one of the drm-misc maintainers so that I can take it through the Tegra tree.
James, I've added you on Cc on this one because it makes use of the extended framebuffer modifiers that you introduced a while back to support the sector layout mux on Tegra194. It'd be great to get your thoughts on this just so this is used as expected.
Thanks, Thierry
Thierry Reding (9): drm/fourcc: Add macro to check for the modifier vendor drm/tegra: dc: Inherit DMA mask drm/tegra: dc: Parameterize maximum resolution drm/tegra: dc: Implement hardware cursor on Tegra186 and later drm/tegra: fb: Add diagnostics for framebuffer modifiers drm/tegra: gem: Add a clarifying comment gpu: host1x: Add early init and late exit callbacks drm/tegra: Count number of display controllers at runtime drm/tegra: Support sector layout on Tegra194
drivers/gpu/drm/tegra/dc.c | 104 +++++++++++++++++++++++++++++++--- drivers/gpu/drm/tegra/dc.h | 6 ++ drivers/gpu/drm/tegra/drm.c | 13 ++++- drivers/gpu/drm/tegra/drm.h | 5 ++ drivers/gpu/drm/tegra/fb.c | 10 ++++ drivers/gpu/drm/tegra/gem.h | 6 ++ drivers/gpu/drm/tegra/hub.c | 41 +++++++++++++- drivers/gpu/drm/tegra/plane.c | 32 +++++++++++ drivers/gpu/host1x/bus.c | 31 ++++++++++ include/linux/host1x.h | 2 + include/uapi/drm/drm_fourcc.h | 3 + 11 files changed, 240 insertions(+), 13 deletions(-)
From: Thierry Reding treding@nvidia.com
This is useful for checking at runtime whether a given modifier is from a specific vendor so that any vendor-specific parsing can be done.
Signed-off-by: Thierry Reding treding@nvidia.com --- include/uapi/drm/drm_fourcc.h | 3 +++ 1 file changed, 3 insertions(+)
diff --git a/include/uapi/drm/drm_fourcc.h b/include/uapi/drm/drm_fourcc.h index f76de49c768f..567fd4566e08 100644 --- a/include/uapi/drm/drm_fourcc.h +++ b/include/uapi/drm/drm_fourcc.h @@ -366,6 +366,9 @@ extern "C" {
#define DRM_FORMAT_RESERVED ((1ULL << 56) - 1)
+#define fourcc_mod_is_vendor(modifier, vendor) \ + ((((modifier) >> 56) & 0xff) == DRM_FORMAT_MOD_VENDOR_## vendor) + #define fourcc_mod_code(vendor, val) \ ((((__u64)DRM_FORMAT_MOD_VENDOR_## vendor) << 56) | ((val) & 0x00ffffffffffffffULL))
Can we instead have a macro/function to get the vendor? This would be useful elsewhere as well, see drmGetFormatModifierVendor in a recent-ish libdrm patch [1].
[1]: https://gitlab.freedesktop.org/mesa/drm/-/merge_requests/108/diffs#91ecb12b2...
On Tue, Mar 23, 2021 at 04:04:35PM +0000, Simon Ser wrote:
Can we instead have a macro/function to get the vendor? This would be useful elsewhere as well, see drmGetFormatModifierVendor in a recent-ish libdrm patch [1].
Either way would work. I chose this because it ends up being much shorter than extracting the vendor and comparing to the constant.
Maybe we should just add both?
Thierry
From: Thierry Reding treding@nvidia.com
Inherit the DMA mask from host1x (on Tegra210 and earlier) or the display hub (on Tegra186 and later). This is necessary in order to properly map buffers without SMMU support and use the maximum IOVA space available with SMMU support.
Signed-off-by: Thierry Reding treding@nvidia.com --- drivers/gpu/drm/tegra/dc.c | 7 +++++++ drivers/gpu/drm/tegra/hub.c | 7 +++++++ 2 files changed, 14 insertions(+)
diff --git a/drivers/gpu/drm/tegra/dc.c b/drivers/gpu/drm/tegra/dc.c index 2585ba70b706..5737a0c4dc9f 100644 --- a/drivers/gpu/drm/tegra/dc.c +++ b/drivers/gpu/drm/tegra/dc.c @@ -2538,9 +2538,16 @@ static int tegra_dc_couple(struct tegra_dc *dc)
static int tegra_dc_probe(struct platform_device *pdev) { + u64 dma_mask = dma_get_mask(pdev->dev.parent); struct tegra_dc *dc; int err;
+ err = dma_coerce_mask_and_coherent(&pdev->dev, dma_mask); + if (err < 0) { + dev_err(&pdev->dev, "failed to set DMA mask: %d\n", err); + return err; + } + dc = devm_kzalloc(&pdev->dev, sizeof(*dc), GFP_KERNEL); if (!dc) return -ENOMEM; diff --git a/drivers/gpu/drm/tegra/hub.c b/drivers/gpu/drm/tegra/hub.c index 8e6d329d062b..617240032c37 100644 --- a/drivers/gpu/drm/tegra/hub.c +++ b/drivers/gpu/drm/tegra/hub.c @@ -848,12 +848,19 @@ static const struct host1x_client_ops tegra_display_hub_ops = {
static int tegra_display_hub_probe(struct platform_device *pdev) { + u64 dma_mask = dma_get_mask(pdev->dev.parent); struct device_node *child = NULL; struct tegra_display_hub *hub; struct clk *clk; unsigned int i; int err;
+ err = dma_coerce_mask_and_coherent(&pdev->dev, dma_mask); + if (err < 0) { + dev_err(&pdev->dev, "failed to set DMA mask: %d\n", err); + return err; + } + hub = devm_kzalloc(&pdev->dev, sizeof(*hub), GFP_KERNEL); if (!hub) return -ENOMEM;
From: Thierry Reding treding@nvidia.com
Tegra186 and later support a higher maximum resolution than earlier chips, so make sure to reflect that in the mode configuration.
Signed-off-by: Thierry Reding treding@nvidia.com --- drivers/gpu/drm/tegra/dc.c | 6 ++++++ drivers/gpu/drm/tegra/drm.c | 13 ++++++++++--- drivers/gpu/drm/tegra/drm.h | 1 + 3 files changed, 17 insertions(+), 3 deletions(-)
diff --git a/drivers/gpu/drm/tegra/dc.c b/drivers/gpu/drm/tegra/dc.c index 5737a0c4dc9f..1886ef1fcda7 100644 --- a/drivers/gpu/drm/tegra/dc.c +++ b/drivers/gpu/drm/tegra/dc.c @@ -2117,6 +2117,12 @@ static int tegra_dc_init(struct host1x_client *client) if (dc->soc->pitch_align > tegra->pitch_align) tegra->pitch_align = dc->soc->pitch_align;
+ /* track maximum resolution */ + if (dc->soc->has_nvdisplay) + drm->mode_config.max_width = drm->mode_config.max_height = 16384; + else + drm->mode_config.max_width = drm->mode_config.max_height = 4096; + err = tegra_dc_rgb_init(drm, dc); if (err < 0 && err != -ENODEV) { dev_err(dc->dev, "failed to initialize RGB output: %d\n", err); diff --git a/drivers/gpu/drm/tegra/drm.c b/drivers/gpu/drm/tegra/drm.c index 6015913fef83..bbc504763bd4 100644 --- a/drivers/gpu/drm/tegra/drm.c +++ b/drivers/gpu/drm/tegra/drm.c @@ -1144,9 +1144,8 @@ static int host1x_drm_probe(struct host1x_device *dev)
drm->mode_config.min_width = 0; drm->mode_config.min_height = 0; - - drm->mode_config.max_width = 4096; - drm->mode_config.max_height = 4096; + drm->mode_config.max_width = 0; + drm->mode_config.max_height = 0;
drm->mode_config.allow_fb_modifiers = true;
@@ -1165,6 +1164,14 @@ static int host1x_drm_probe(struct host1x_device *dev) if (err < 0) goto fbdev;
+ /* + * Now that all display controller have been initialized, the maximum + * supported resolution is known and the bitmask for horizontal and + * vertical bitfields can be computed. + */ + tegra->hmask = drm->mode_config.max_width - 1; + tegra->vmask = drm->mode_config.max_height - 1; + if (tegra->use_explicit_iommu) { u64 carveout_start, carveout_end, gem_start, gem_end; u64 dma_mask = dma_get_mask(&dev->dev); diff --git a/drivers/gpu/drm/tegra/drm.h b/drivers/gpu/drm/tegra/drm.h index 1af57c2016eb..34fbcd6abf2f 100644 --- a/drivers/gpu/drm/tegra/drm.h +++ b/drivers/gpu/drm/tegra/drm.h @@ -54,6 +54,7 @@ struct tegra_drm { struct tegra_fbdev *fbdev; #endif
+ unsigned int hmask, vmask; unsigned int pitch_align;
struct tegra_display_hub *hub;
From: Thierry Reding treding@nvidia.com
The hardware cursor on Tegra186 differs slightly from the implementation on older SoC generations. In particular the new implementation relies on software for clipping the cursor against the screen. Fortunately, atomic KMS already computes clipped coordinates for (cursor) planes, so this is trivial to implement.
The format supported by the hardware cursor is also slightly different.
Signed-off-by: Thierry Reding treding@nvidia.com --- drivers/gpu/drm/tegra/dc.c | 62 +++++++++++++++++++++++++++++++++----- drivers/gpu/drm/tegra/dc.h | 5 +++ 2 files changed, 59 insertions(+), 8 deletions(-)
diff --git a/drivers/gpu/drm/tegra/dc.c b/drivers/gpu/drm/tegra/dc.c index 1886ef1fcda7..4262fd9b9a15 100644 --- a/drivers/gpu/drm/tegra/dc.c +++ b/drivers/gpu/drm/tegra/dc.c @@ -832,10 +832,14 @@ static struct drm_plane *tegra_primary_plane_create(struct drm_device *drm, return &plane->base; }
-static const u32 tegra_cursor_plane_formats[] = { +static const u32 tegra_legacy_cursor_plane_formats[] = { DRM_FORMAT_RGBA8888, };
+static const u32 tegra_cursor_plane_formats[] = { + DRM_FORMAT_ARGB8888, +}; + static int tegra_cursor_atomic_check(struct drm_plane *plane, struct drm_atomic_state *state) { @@ -875,12 +879,22 @@ static void tegra_cursor_atomic_update(struct drm_plane *plane, plane); struct tegra_plane_state *tegra_plane_state = to_tegra_plane_state(new_state); struct tegra_dc *dc = to_tegra_dc(new_state->crtc); - u32 value = CURSOR_CLIP_DISPLAY; + struct tegra_drm *tegra = plane->dev->dev_private; + u64 dma_mask = *dc->dev->dma_mask; + unsigned int x, y; + u32 value = 0;
/* rien ne va plus */ if (!new_state->crtc || !new_state->fb) return;
+ /* + * Legacy display supports hardware clipping of the cursor, but + * nvdisplay relies on software to clip the cursor to the screen. + */ + if (!dc->soc->has_nvdisplay) + value |= CURSOR_CLIP_DISPLAY; + switch (new_state->crtc_w) { case 32: value |= CURSOR_SIZE_32x32; @@ -908,7 +922,7 @@ static void tegra_cursor_atomic_update(struct drm_plane *plane, tegra_dc_writel(dc, value, DC_DISP_CURSOR_START_ADDR);
#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT - value = (tegra_plane_state->iova[0] >> 32) & 0x3; + value = (tegra_plane_state->iova[0] >> 32) & (dma_mask >> 32); tegra_dc_writel(dc, value, DC_DISP_CURSOR_START_ADDR_HI); #endif
@@ -920,15 +934,42 @@ static void tegra_cursor_atomic_update(struct drm_plane *plane, value = tegra_dc_readl(dc, DC_DISP_BLEND_CURSOR_CONTROL); value &= ~CURSOR_DST_BLEND_MASK; value &= ~CURSOR_SRC_BLEND_MASK; - value |= CURSOR_MODE_NORMAL; + + if (dc->soc->has_nvdisplay) + value &= ~CURSOR_COMPOSITION_MODE_XOR; + else + value |= CURSOR_MODE_NORMAL; + value |= CURSOR_DST_BLEND_NEG_K1_TIMES_SRC; value |= CURSOR_SRC_BLEND_K1_TIMES_SRC; value |= CURSOR_ALPHA; tegra_dc_writel(dc, value, DC_DISP_BLEND_CURSOR_CONTROL);
+ /* nvdisplay relies on software for clipping */ + if (dc->soc->has_nvdisplay) { + unsigned int i, j, w, h; + + x = new_state->dst.x1; + y = new_state->dst.y1; + + i = new_state->src.x1 >> 16; + j = new_state->src.y1 >> 16; + + value = ((j & tegra->vmask) << 16) | (i & tegra->hmask); + tegra_dc_writel(dc, value, DC_DISP_PCALC_HEAD_SET_CROPPED_POINT_IN_CURSOR); + + w = (new_state->src.x2 - new_state->src.x1) >> 16; + h = (new_state->src.y2 - new_state->src.y1) >> 16; + + value = ((h & tegra->vmask) << 16) | (w & tegra->hmask); + tegra_dc_writel(dc, value, DC_DISP_PCALC_HEAD_SET_CROPPED_SIZE_IN_CURSOR); + } else { + x = new_state->crtc_x; + y = new_state->crtc_y; + } + /* position the cursor */ - value = (new_state->crtc_y & 0x3fff) << 16 | - (new_state->crtc_x & 0x3fff); + value = ((y & tegra->vmask) << 16) | (x & tegra->hmask); tegra_dc_writel(dc, value, DC_DISP_CURSOR_POSITION); }
@@ -982,8 +1023,13 @@ static struct drm_plane *tegra_dc_cursor_plane_create(struct drm_device *drm, plane->index = 6; plane->dc = dc;
- num_formats = ARRAY_SIZE(tegra_cursor_plane_formats); - formats = tegra_cursor_plane_formats; + if (!dc->soc->has_nvdisplay) { + num_formats = ARRAY_SIZE(tegra_legacy_cursor_plane_formats); + formats = tegra_legacy_cursor_plane_formats; + } else { + num_formats = ARRAY_SIZE(tegra_cursor_plane_formats); + formats = tegra_cursor_plane_formats; + }
err = drm_universal_plane_init(drm, &plane->base, possible_crtcs, &tegra_plane_funcs, formats, diff --git a/drivers/gpu/drm/tegra/dc.h b/drivers/gpu/drm/tegra/dc.h index 051d03dcb9b0..21074cd2ce5e 100644 --- a/drivers/gpu/drm/tegra/dc.h +++ b/drivers/gpu/drm/tegra/dc.h @@ -511,6 +511,8 @@ int tegra_dc_rgb_exit(struct tegra_dc *dc);
#define DC_DISP_CURSOR_START_ADDR_HI 0x4ec #define DC_DISP_BLEND_CURSOR_CONTROL 0x4f1 +#define CURSOR_COMPOSITION_MODE_BLEND (0 << 25) +#define CURSOR_COMPOSITION_MODE_XOR (1 << 25) #define CURSOR_MODE_LEGACY (0 << 24) #define CURSOR_MODE_NORMAL (1 << 24) #define CURSOR_DST_BLEND_ZERO (0 << 16) @@ -705,6 +707,9 @@ int tegra_dc_rgb_exit(struct tegra_dc *dc); #define PROTOCOL_MASK (0xf << 8) #define PROTOCOL_SINGLE_TMDS_A (0x1 << 8)
+#define DC_DISP_PCALC_HEAD_SET_CROPPED_POINT_IN_CURSOR 0x442 +#define DC_DISP_PCALC_HEAD_SET_CROPPED_SIZE_IN_CURSOR 0x446 + #define DC_WIN_CORE_WINDOWGROUP_SET_CONTROL 0x702 #define OWNER_MASK (0xf << 0) #define OWNER(x) (((x) & 0xf) << 0)
23.03.2021 18:54, Thierry Reding пишет:
@@ -920,15 +934,42 @@ static void tegra_cursor_atomic_update(struct drm_plane *plane, value = tegra_dc_readl(dc, DC_DISP_BLEND_CURSOR_CONTROL); value &= ~CURSOR_DST_BLEND_MASK; value &= ~CURSOR_SRC_BLEND_MASK;
- value |= CURSOR_MODE_NORMAL;
if (dc->soc->has_nvdisplay)
value &= ~CURSOR_COMPOSITION_MODE_XOR;
else
value |= CURSOR_MODE_NORMAL;
value |= CURSOR_DST_BLEND_NEG_K1_TIMES_SRC; value |= CURSOR_SRC_BLEND_K1_TIMES_SRC; value |= CURSOR_ALPHA; tegra_dc_writel(dc, value, DC_DISP_BLEND_CURSOR_CONTROL);
/* nvdisplay relies on software for clipping */
if (dc->soc->has_nvdisplay) {
But coordinates already should be clipped by drm_atomic_helper_check_plane_state().
On Tue, Mar 23, 2021 at 08:57:42PM +0300, Dmitry Osipenko wrote:
23.03.2021 18:54, Thierry Reding пишет:
@@ -920,15 +934,42 @@ static void tegra_cursor_atomic_update(struct drm_plane *plane, value = tegra_dc_readl(dc, DC_DISP_BLEND_CURSOR_CONTROL); value &= ~CURSOR_DST_BLEND_MASK; value &= ~CURSOR_SRC_BLEND_MASK;
- value |= CURSOR_MODE_NORMAL;
if (dc->soc->has_nvdisplay)
value &= ~CURSOR_COMPOSITION_MODE_XOR;
else
value |= CURSOR_MODE_NORMAL;
value |= CURSOR_DST_BLEND_NEG_K1_TIMES_SRC; value |= CURSOR_SRC_BLEND_K1_TIMES_SRC; value |= CURSOR_ALPHA; tegra_dc_writel(dc, value, DC_DISP_BLEND_CURSOR_CONTROL);
/* nvdisplay relies on software for clipping */
if (dc->soc->has_nvdisplay) {
But coordinates already should be clipped by drm_atomic_helper_check_plane_state().
Yes, and the driver goes on to use the clipped coordinates later on in this function.
Thierry
23.03.2021 21:24, Thierry Reding пишет:
On Tue, Mar 23, 2021 at 08:57:42PM +0300, Dmitry Osipenko wrote:
23.03.2021 18:54, Thierry Reding пишет:
@@ -920,15 +934,42 @@ static void tegra_cursor_atomic_update(struct drm_plane *plane, value = tegra_dc_readl(dc, DC_DISP_BLEND_CURSOR_CONTROL); value &= ~CURSOR_DST_BLEND_MASK; value &= ~CURSOR_SRC_BLEND_MASK;
- value |= CURSOR_MODE_NORMAL;
if (dc->soc->has_nvdisplay)
value &= ~CURSOR_COMPOSITION_MODE_XOR;
else
value |= CURSOR_MODE_NORMAL;
value |= CURSOR_DST_BLEND_NEG_K1_TIMES_SRC; value |= CURSOR_SRC_BLEND_K1_TIMES_SRC; value |= CURSOR_ALPHA; tegra_dc_writel(dc, value, DC_DISP_BLEND_CURSOR_CONTROL);
/* nvdisplay relies on software for clipping */
if (dc->soc->has_nvdisplay) {
But coordinates already should be clipped by drm_atomic_helper_check_plane_state().
Yes, and the driver goes on to use the clipped coordinates later on in this function.
I see now what it does, looks okay.
Minor nit: the i/j aren't very expressive names, something like sx/sy sw/sh should be a bit more appropriate naming.
You could also make use of drm_rect_width/height helpers.
But this doesn't deserve a v2 if there is nothing more important to improve.
On Tue, Mar 23, 2021 at 10:05:23PM +0300, Dmitry Osipenko wrote:
23.03.2021 21:24, Thierry Reding пишет:
On Tue, Mar 23, 2021 at 08:57:42PM +0300, Dmitry Osipenko wrote:
23.03.2021 18:54, Thierry Reding пишет:
@@ -920,15 +934,42 @@ static void tegra_cursor_atomic_update(struct drm_plane *plane, value = tegra_dc_readl(dc, DC_DISP_BLEND_CURSOR_CONTROL); value &= ~CURSOR_DST_BLEND_MASK; value &= ~CURSOR_SRC_BLEND_MASK;
- value |= CURSOR_MODE_NORMAL;
if (dc->soc->has_nvdisplay)
value &= ~CURSOR_COMPOSITION_MODE_XOR;
else
value |= CURSOR_MODE_NORMAL;
value |= CURSOR_DST_BLEND_NEG_K1_TIMES_SRC; value |= CURSOR_SRC_BLEND_K1_TIMES_SRC; value |= CURSOR_ALPHA; tegra_dc_writel(dc, value, DC_DISP_BLEND_CURSOR_CONTROL);
/* nvdisplay relies on software for clipping */
if (dc->soc->has_nvdisplay) {
But coordinates already should be clipped by drm_atomic_helper_check_plane_state().
Yes, and the driver goes on to use the clipped coordinates later on in this function.
I see now what it does, looks okay.
Minor nit: the i/j aren't very expressive names, something like sx/sy sw/sh should be a bit more appropriate naming.
You could also make use of drm_rect_width/height helpers.
But this doesn't deserve a v2 if there is nothing more important to improve.
As I was browsing through the drm_rect helpers, I also came across drm_rect_fp_to_int(), which helps simplify this further. As a result I was able to just get rid of most of these temporary variables in favor of just the integer version of drm_rect.
Thanks for the hint.
Thierry
From: Thierry Reding treding@nvidia.com
Add a debug message to let the user know when a framebuffer modifier is not supported.
Signed-off-by: Thierry Reding treding@nvidia.com --- drivers/gpu/drm/tegra/fb.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/drivers/gpu/drm/tegra/fb.c b/drivers/gpu/drm/tegra/fb.c index 01939c57fc74..350f33206076 100644 --- a/drivers/gpu/drm/tegra/fb.c +++ b/drivers/gpu/drm/tegra/fb.c @@ -86,6 +86,7 @@ int tegra_fb_get_tiling(struct drm_framebuffer *framebuffer, break;
default: + DRM_DEBUG_KMS("unknown format modifier: %llx\n", modifier); return -EINVAL; }
From: Thierry Reding treding@nvidia.com
Clarify when a fixed IOV address can be used and when a buffer has to be mapped before the IOVA can be used.
Signed-off-by: Thierry Reding treding@nvidia.com --- drivers/gpu/drm/tegra/plane.c | 8 ++++++++ 1 file changed, 8 insertions(+)
diff --git a/drivers/gpu/drm/tegra/plane.c b/drivers/gpu/drm/tegra/plane.c index 19e8847a164b..793da5d675d2 100644 --- a/drivers/gpu/drm/tegra/plane.c +++ b/drivers/gpu/drm/tegra/plane.c @@ -119,6 +119,14 @@ static int tegra_dc_pin(struct tegra_dc *dc, struct tegra_plane_state *state) dma_addr_t phys_addr, *phys; struct sg_table *sgt;
+ /* + * If we're not attached to a domain, we already stored the + * physical address when the buffer was allocated. If we're + * part of a group that's shared between all display + * controllers, we've also already mapped the framebuffer + * through the SMMU. In both cases we can short-circuit the + * code below and retrieve the stored IOV address. + */ if (!domain || dc->client.group) phys = &phys_addr; else
23.03.2021 18:54, Thierry Reding пишет:
From: Thierry Reding treding@nvidia.com
Clarify when a fixed IOV address can be used and when a buffer has to be mapped before the IOVA can be used.
Signed-off-by: Thierry Reding treding@nvidia.com
drivers/gpu/drm/tegra/plane.c | 8 ++++++++ 1 file changed, 8 insertions(+)
diff --git a/drivers/gpu/drm/tegra/plane.c b/drivers/gpu/drm/tegra/plane.c index 19e8847a164b..793da5d675d2 100644 --- a/drivers/gpu/drm/tegra/plane.c +++ b/drivers/gpu/drm/tegra/plane.c @@ -119,6 +119,14 @@ static int tegra_dc_pin(struct tegra_dc *dc, struct tegra_plane_state *state) dma_addr_t phys_addr, *phys; struct sg_table *sgt;
/*
* If we're not attached to a domain, we already stored the
* physical address when the buffer was allocated. If we're
* part of a group that's shared between all display
* controllers, we've also already mapped the framebuffer
* through the SMMU. In both cases we can short-circuit the
* code below and retrieve the stored IOV address.
*/ if (!domain || dc->client.group) phys = &phys_addr; else
This comment is correct, but the logic feels a bit lame because it should be wasteful to re-map DMA on each FB flip. Personally I don't care much about this since older Tegras use pinned buffers by default, but this shouldn't be good for T124+ users.
Perhaps dumb buffers should be pinned to display by default and then we should extend the Tegra UAPI to support BO mapping to display client(?).
On Wed, Mar 24, 2021 at 05:41:08PM +0300, Dmitry Osipenko wrote:
23.03.2021 18:54, Thierry Reding пишет:
From: Thierry Reding treding@nvidia.com
Clarify when a fixed IOV address can be used and when a buffer has to be mapped before the IOVA can be used.
Signed-off-by: Thierry Reding treding@nvidia.com
drivers/gpu/drm/tegra/plane.c | 8 ++++++++ 1 file changed, 8 insertions(+)
diff --git a/drivers/gpu/drm/tegra/plane.c b/drivers/gpu/drm/tegra/plane.c index 19e8847a164b..793da5d675d2 100644 --- a/drivers/gpu/drm/tegra/plane.c +++ b/drivers/gpu/drm/tegra/plane.c @@ -119,6 +119,14 @@ static int tegra_dc_pin(struct tegra_dc *dc, struct tegra_plane_state *state) dma_addr_t phys_addr, *phys; struct sg_table *sgt;
/*
* If we're not attached to a domain, we already stored the
* physical address when the buffer was allocated. If we're
* part of a group that's shared between all display
* controllers, we've also already mapped the framebuffer
* through the SMMU. In both cases we can short-circuit the
* code below and retrieve the stored IOV address.
*/ if (!domain || dc->client.group) phys = &phys_addr; else
This comment is correct, but the logic feels a bit lame because it should be wasteful to re-map DMA on each FB flip. Personally I don't care much about this since older Tegras use pinned buffers by default, but this shouldn't be good for T124+ users.
I'm not terribly thrilled by this either, but it's the only way to do this when using the DMA API because we don't know at allocation time (or import time for that matter) which of the (up to) 4 display controllers a framebuffer will be shown on. tegra_dc_pin() is the earliest where this is known and worst case that's called once per flip.
When the IOMMU API is used explicitly, we always map framebuffers into the IOMMU domain shared by all display controllers at allocation or import time and then we don't need to pin at flip time anymore.
I do have a work-in-progress patch somewhere that creates a mapping cache to mitigate this problem to some degree. I need to dig that up and do a few measurements because I vaguely recall this speeding up flips by quite a bit (well, except for the very first mapping, obviously).
Perhaps dumb buffers should be pinned to display by default and then we should extend the Tegra UAPI to support BO mapping to display client(?).
That would kind of defeat the purpose of a generic KMS UAPI.
Thierry
24.03.2021 18:02, Thierry Reding пишет:
On Wed, Mar 24, 2021 at 05:41:08PM +0300, Dmitry Osipenko wrote:
23.03.2021 18:54, Thierry Reding пишет:
From: Thierry Reding treding@nvidia.com
Clarify when a fixed IOV address can be used and when a buffer has to be mapped before the IOVA can be used.
Signed-off-by: Thierry Reding treding@nvidia.com
drivers/gpu/drm/tegra/plane.c | 8 ++++++++ 1 file changed, 8 insertions(+)
diff --git a/drivers/gpu/drm/tegra/plane.c b/drivers/gpu/drm/tegra/plane.c index 19e8847a164b..793da5d675d2 100644 --- a/drivers/gpu/drm/tegra/plane.c +++ b/drivers/gpu/drm/tegra/plane.c @@ -119,6 +119,14 @@ static int tegra_dc_pin(struct tegra_dc *dc, struct tegra_plane_state *state) dma_addr_t phys_addr, *phys; struct sg_table *sgt;
/*
* If we're not attached to a domain, we already stored the
* physical address when the buffer was allocated. If we're
* part of a group that's shared between all display
* controllers, we've also already mapped the framebuffer
* through the SMMU. In both cases we can short-circuit the
* code below and retrieve the stored IOV address.
*/ if (!domain || dc->client.group) phys = &phys_addr; else
This comment is correct, but the logic feels a bit lame because it should be wasteful to re-map DMA on each FB flip. Personally I don't care much about this since older Tegras use pinned buffers by default, but this shouldn't be good for T124+ users.
I'm not terribly thrilled by this either, but it's the only way to do this when using the DMA API because we don't know at allocation time (or import time for that matter) which of the (up to) 4 display controllers a framebuffer will be shown on. tegra_dc_pin() is the earliest where this is known and worst case that's called once per flip.
When the IOMMU API is used explicitly, we always map framebuffers into the IOMMU domain shared by all display controllers at allocation or import time and then we don't need to pin at flip time anymore.
I do have a work-in-progress patch somewhere that creates a mapping cache to mitigate this problem to some degree. I need to dig that up and do a few measurements because I vaguely recall this speeding up flips by quite a bit (well, except for the very first mapping, obviously).
Perhaps dumb buffers should be pinned to display by default and then we should extend the Tegra UAPI to support BO mapping to display client(?).
That would kind of defeat the purpose of a generic KMS UAPI.
Couldn't the BOs be mapped when FB is created, i.e. by tegra_fb_create?
On Wed, Mar 24, 2021 at 06:45:30PM +0300, Dmitry Osipenko wrote:
24.03.2021 18:02, Thierry Reding пишет:
On Wed, Mar 24, 2021 at 05:41:08PM +0300, Dmitry Osipenko wrote:
23.03.2021 18:54, Thierry Reding пишет:
From: Thierry Reding treding@nvidia.com
Clarify when a fixed IOV address can be used and when a buffer has to be mapped before the IOVA can be used.
Signed-off-by: Thierry Reding treding@nvidia.com
drivers/gpu/drm/tegra/plane.c | 8 ++++++++ 1 file changed, 8 insertions(+)
diff --git a/drivers/gpu/drm/tegra/plane.c b/drivers/gpu/drm/tegra/plane.c index 19e8847a164b..793da5d675d2 100644 --- a/drivers/gpu/drm/tegra/plane.c +++ b/drivers/gpu/drm/tegra/plane.c @@ -119,6 +119,14 @@ static int tegra_dc_pin(struct tegra_dc *dc, struct tegra_plane_state *state) dma_addr_t phys_addr, *phys; struct sg_table *sgt;
/*
* If we're not attached to a domain, we already stored the
* physical address when the buffer was allocated. If we're
* part of a group that's shared between all display
* controllers, we've also already mapped the framebuffer
* through the SMMU. In both cases we can short-circuit the
* code below and retrieve the stored IOV address.
*/ if (!domain || dc->client.group) phys = &phys_addr; else
This comment is correct, but the logic feels a bit lame because it should be wasteful to re-map DMA on each FB flip. Personally I don't care much about this since older Tegras use pinned buffers by default, but this shouldn't be good for T124+ users.
I'm not terribly thrilled by this either, but it's the only way to do this when using the DMA API because we don't know at allocation time (or import time for that matter) which of the (up to) 4 display controllers a framebuffer will be shown on. tegra_dc_pin() is the earliest where this is known and worst case that's called once per flip.
When the IOMMU API is used explicitly, we always map framebuffers into the IOMMU domain shared by all display controllers at allocation or import time and then we don't need to pin at flip time anymore.
I do have a work-in-progress patch somewhere that creates a mapping cache to mitigate this problem to some degree. I need to dig that up and do a few measurements because I vaguely recall this speeding up flips by quite a bit (well, except for the very first mapping, obviously).
Perhaps dumb buffers should be pinned to display by default and then we should extend the Tegra UAPI to support BO mapping to display client(?).
That would kind of defeat the purpose of a generic KMS UAPI.
Couldn't the BOs be mapped when FB is created, i.e. by tegra_fb_create?
I suppose that would be possible. However, tegra_fb_create() doesn't know a thing about display controllers, so we'd have to add extra code to it to iterate over all display controllers and do a dma_map_sg() of the GEM object for each of them.
It's also somewhat wasteful because now we get a mapping for each framebuffer for each display controller. So if you've got, say, a four UHD screen setup (which is something that Tegra194 supports), you could end up with 8 UHD framebuffers (two for each display, for double-buffering) at 32 MiB each for a whopping 256 MiB of memory that needs to be mapped for each of the four display controllers. That's 1 GiB worth of page table updates, whereas you really only need one fourth of that.
Granted, this will make flipping a bit faster, and IOVA space isn't really a problem on Tegra194. It would still waste a bit of RAM for all those page table entries that we don't really need, though.
A mapping cache seems like a much better compromise because the cache lookup should be quite fast compared to a mapping operation and we waste just a couple dozen bytes per mapping perhaps as opposed to a few megabytes for the gratuitous, preemptive mappings.
Thierry
24.03.2021 19:42, Thierry Reding пишет:
On Wed, Mar 24, 2021 at 06:45:30PM +0300, Dmitry Osipenko wrote:
24.03.2021 18:02, Thierry Reding пишет:
On Wed, Mar 24, 2021 at 05:41:08PM +0300, Dmitry Osipenko wrote:
23.03.2021 18:54, Thierry Reding пишет:
From: Thierry Reding treding@nvidia.com
Clarify when a fixed IOV address can be used and when a buffer has to be mapped before the IOVA can be used.
Signed-off-by: Thierry Reding treding@nvidia.com
drivers/gpu/drm/tegra/plane.c | 8 ++++++++ 1 file changed, 8 insertions(+)
diff --git a/drivers/gpu/drm/tegra/plane.c b/drivers/gpu/drm/tegra/plane.c index 19e8847a164b..793da5d675d2 100644 --- a/drivers/gpu/drm/tegra/plane.c +++ b/drivers/gpu/drm/tegra/plane.c @@ -119,6 +119,14 @@ static int tegra_dc_pin(struct tegra_dc *dc, struct tegra_plane_state *state) dma_addr_t phys_addr, *phys; struct sg_table *sgt;
/*
* If we're not attached to a domain, we already stored the
* physical address when the buffer was allocated. If we're
* part of a group that's shared between all display
* controllers, we've also already mapped the framebuffer
* through the SMMU. In both cases we can short-circuit the
* code below and retrieve the stored IOV address.
*/ if (!domain || dc->client.group) phys = &phys_addr; else
This comment is correct, but the logic feels a bit lame because it should be wasteful to re-map DMA on each FB flip. Personally I don't care much about this since older Tegras use pinned buffers by default, but this shouldn't be good for T124+ users.
I'm not terribly thrilled by this either, but it's the only way to do this when using the DMA API because we don't know at allocation time (or import time for that matter) which of the (up to) 4 display controllers a framebuffer will be shown on. tegra_dc_pin() is the earliest where this is known and worst case that's called once per flip.
When the IOMMU API is used explicitly, we always map framebuffers into the IOMMU domain shared by all display controllers at allocation or import time and then we don't need to pin at flip time anymore.
I do have a work-in-progress patch somewhere that creates a mapping cache to mitigate this problem to some degree. I need to dig that up and do a few measurements because I vaguely recall this speeding up flips by quite a bit (well, except for the very first mapping, obviously).
Perhaps dumb buffers should be pinned to display by default and then we should extend the Tegra UAPI to support BO mapping to display client(?).
That would kind of defeat the purpose of a generic KMS UAPI.
Couldn't the BOs be mapped when FB is created, i.e. by tegra_fb_create?
I suppose that would be possible. However, tegra_fb_create() doesn't know a thing about display controllers, so we'd have to add extra code to it to iterate over all display controllers and do a dma_map_sg() of the GEM object for each of them.
It's also somewhat wasteful because now we get a mapping for each framebuffer for each display controller. So if you've got, say, a four UHD screen setup (which is something that Tegra194 supports), you could end up with 8 UHD framebuffers (two for each display, for double-buffering) at 32 MiB each for a whopping 256 MiB of memory that needs to be mapped for each of the four display controllers. That's 1 GiB worth of page table updates, whereas you really only need one fourth of that.
Granted, this will make flipping a bit faster, and IOVA space isn't really a problem on Tegra194. It would still waste a bit of RAM for all those page table entries that we don't really need, though.
A mapping cache seems like a much better compromise because the cache lookup should be quite fast compared to a mapping operation and we waste just a couple dozen bytes per mapping perhaps as opposed to a few megabytes for the gratuitous, preemptive mappings.
Isn't it really possible to put displays into the same IOMMU group on T194? It doesn't make much sense to have them in separate groups on Linux.
On Wed, Mar 24, 2021 at 07:50:01PM +0300, Dmitry Osipenko wrote:
24.03.2021 19:42, Thierry Reding пишет:
On Wed, Mar 24, 2021 at 06:45:30PM +0300, Dmitry Osipenko wrote:
24.03.2021 18:02, Thierry Reding пишет:
On Wed, Mar 24, 2021 at 05:41:08PM +0300, Dmitry Osipenko wrote:
23.03.2021 18:54, Thierry Reding пишет:
From: Thierry Reding treding@nvidia.com
Clarify when a fixed IOV address can be used and when a buffer has to be mapped before the IOVA can be used.
Signed-off-by: Thierry Reding treding@nvidia.com
drivers/gpu/drm/tegra/plane.c | 8 ++++++++ 1 file changed, 8 insertions(+)
diff --git a/drivers/gpu/drm/tegra/plane.c b/drivers/gpu/drm/tegra/plane.c index 19e8847a164b..793da5d675d2 100644 --- a/drivers/gpu/drm/tegra/plane.c +++ b/drivers/gpu/drm/tegra/plane.c @@ -119,6 +119,14 @@ static int tegra_dc_pin(struct tegra_dc *dc, struct tegra_plane_state *state) dma_addr_t phys_addr, *phys; struct sg_table *sgt;
/*
* If we're not attached to a domain, we already stored the
* physical address when the buffer was allocated. If we're
* part of a group that's shared between all display
* controllers, we've also already mapped the framebuffer
* through the SMMU. In both cases we can short-circuit the
* code below and retrieve the stored IOV address.
*/
if (!domain || dc->client.group) phys = &phys_addr; else
This comment is correct, but the logic feels a bit lame because it should be wasteful to re-map DMA on each FB flip. Personally I don't care much about this since older Tegras use pinned buffers by default, but this shouldn't be good for T124+ users.
I'm not terribly thrilled by this either, but it's the only way to do this when using the DMA API because we don't know at allocation time (or import time for that matter) which of the (up to) 4 display controllers a framebuffer will be shown on. tegra_dc_pin() is the earliest where this is known and worst case that's called once per flip.
When the IOMMU API is used explicitly, we always map framebuffers into the IOMMU domain shared by all display controllers at allocation or import time and then we don't need to pin at flip time anymore.
I do have a work-in-progress patch somewhere that creates a mapping cache to mitigate this problem to some degree. I need to dig that up and do a few measurements because I vaguely recall this speeding up flips by quite a bit (well, except for the very first mapping, obviously).
Perhaps dumb buffers should be pinned to display by default and then we should extend the Tegra UAPI to support BO mapping to display client(?).
That would kind of defeat the purpose of a generic KMS UAPI.
Couldn't the BOs be mapped when FB is created, i.e. by tegra_fb_create?
I suppose that would be possible. However, tegra_fb_create() doesn't know a thing about display controllers, so we'd have to add extra code to it to iterate over all display controllers and do a dma_map_sg() of the GEM object for each of them.
It's also somewhat wasteful because now we get a mapping for each framebuffer for each display controller. So if you've got, say, a four UHD screen setup (which is something that Tegra194 supports), you could end up with 8 UHD framebuffers (two for each display, for double-buffering) at 32 MiB each for a whopping 256 MiB of memory that needs to be mapped for each of the four display controllers. That's 1 GiB worth of page table updates, whereas you really only need one fourth of that.
Granted, this will make flipping a bit faster, and IOVA space isn't really a problem on Tegra194. It would still waste a bit of RAM for all those page table entries that we don't really need, though.
A mapping cache seems like a much better compromise because the cache lookup should be quite fast compared to a mapping operation and we waste just a couple dozen bytes per mapping perhaps as opposed to a few megabytes for the gratuitous, preemptive mappings.
Isn't it really possible to put displays into the same IOMMU group on T194? It doesn't make much sense to have them in separate groups on Linux.
It is possible and in fact that's what's already happening. However, the problem isn't that these devices are not in the same group, the problem is that the DMA API doesn't know anything about groups. It works on struct device and if you've got DMA API debugging enabled it may even flag incorrect usage as errors.
So from a DMA API point of view, if a device wants to use a buffer, that buffer first has to be mapped for that device, even if it was already mapped for a different device that happens to be in the same IOMMU group and hence share an IOMMU domain.
Thierry
26.03.2021 19:37, Thierry Reding пишет:
On Wed, Mar 24, 2021 at 07:50:01PM +0300, Dmitry Osipenko wrote:
24.03.2021 19:42, Thierry Reding пишет:
On Wed, Mar 24, 2021 at 06:45:30PM +0300, Dmitry Osipenko wrote:
24.03.2021 18:02, Thierry Reding пишет:
On Wed, Mar 24, 2021 at 05:41:08PM +0300, Dmitry Osipenko wrote:
23.03.2021 18:54, Thierry Reding пишет: > From: Thierry Reding treding@nvidia.com > > Clarify when a fixed IOV address can be used and when a buffer has to > be mapped before the IOVA can be used. > > Signed-off-by: Thierry Reding treding@nvidia.com > --- > drivers/gpu/drm/tegra/plane.c | 8 ++++++++ > 1 file changed, 8 insertions(+) > > diff --git a/drivers/gpu/drm/tegra/plane.c b/drivers/gpu/drm/tegra/plane.c > index 19e8847a164b..793da5d675d2 100644 > --- a/drivers/gpu/drm/tegra/plane.c > +++ b/drivers/gpu/drm/tegra/plane.c > @@ -119,6 +119,14 @@ static int tegra_dc_pin(struct tegra_dc *dc, struct tegra_plane_state *state) > dma_addr_t phys_addr, *phys; > struct sg_table *sgt; > > + /* > + * If we're not attached to a domain, we already stored the > + * physical address when the buffer was allocated. If we're > + * part of a group that's shared between all display > + * controllers, we've also already mapped the framebuffer > + * through the SMMU. In both cases we can short-circuit the > + * code below and retrieve the stored IOV address. > + */ > if (!domain || dc->client.group) > phys = &phys_addr; > else >
This comment is correct, but the logic feels a bit lame because it should be wasteful to re-map DMA on each FB flip. Personally I don't care much about this since older Tegras use pinned buffers by default, but this shouldn't be good for T124+ users.
I'm not terribly thrilled by this either, but it's the only way to do this when using the DMA API because we don't know at allocation time (or import time for that matter) which of the (up to) 4 display controllers a framebuffer will be shown on. tegra_dc_pin() is the earliest where this is known and worst case that's called once per flip.
When the IOMMU API is used explicitly, we always map framebuffers into the IOMMU domain shared by all display controllers at allocation or import time and then we don't need to pin at flip time anymore.
I do have a work-in-progress patch somewhere that creates a mapping cache to mitigate this problem to some degree. I need to dig that up and do a few measurements because I vaguely recall this speeding up flips by quite a bit (well, except for the very first mapping, obviously).
Perhaps dumb buffers should be pinned to display by default and then we should extend the Tegra UAPI to support BO mapping to display client(?).
That would kind of defeat the purpose of a generic KMS UAPI.
Couldn't the BOs be mapped when FB is created, i.e. by tegra_fb_create?
I suppose that would be possible. However, tegra_fb_create() doesn't know a thing about display controllers, so we'd have to add extra code to it to iterate over all display controllers and do a dma_map_sg() of the GEM object for each of them.
It's also somewhat wasteful because now we get a mapping for each framebuffer for each display controller. So if you've got, say, a four UHD screen setup (which is something that Tegra194 supports), you could end up with 8 UHD framebuffers (two for each display, for double-buffering) at 32 MiB each for a whopping 256 MiB of memory that needs to be mapped for each of the four display controllers. That's 1 GiB worth of page table updates, whereas you really only need one fourth of that.
Granted, this will make flipping a bit faster, and IOVA space isn't really a problem on Tegra194. It would still waste a bit of RAM for all those page table entries that we don't really need, though.
A mapping cache seems like a much better compromise because the cache lookup should be quite fast compared to a mapping operation and we waste just a couple dozen bytes per mapping perhaps as opposed to a few megabytes for the gratuitous, preemptive mappings.
Isn't it really possible to put displays into the same IOMMU group on T194? It doesn't make much sense to have them in separate groups on Linux.
It is possible and in fact that's what's already happening. However, the problem isn't that these devices are not in the same group, the problem is that the DMA API doesn't know anything about groups. It works on struct device and if you've got DMA API debugging enabled it may even flag incorrect usage as errors.
So from a DMA API point of view, if a device wants to use a buffer, that buffer first has to be mapped for that device, even if it was already mapped for a different device that happens to be in the same IOMMU group and hence share an IOMMU domain.
This sounds to me like something which needs to be addressed first, i.e. to make DMA API aware that it's okay to re-use mappings by sibling devices within the same IOMMU group. Although, I assume that you already considered this variant, didn't you?
From: Thierry Reding treding@nvidia.com
These callbacks can be used by client drivers to run code during early init and during late exit. Early init callbacks are run prior to the regular init callbacks while late exit callbacks run after the regular exit callbacks.
Signed-off-by: Thierry Reding treding@nvidia.com --- drivers/gpu/host1x/bus.c | 31 +++++++++++++++++++++++++++++++ include/linux/host1x.h | 2 ++ 2 files changed, 33 insertions(+)
diff --git a/drivers/gpu/host1x/bus.c b/drivers/gpu/host1x/bus.c index 4c18874f7c4a..c9077cfbfef9 100644 --- a/drivers/gpu/host1x/bus.c +++ b/drivers/gpu/host1x/bus.c @@ -196,6 +196,17 @@ int host1x_device_init(struct host1x_device *device)
mutex_lock(&device->clients_lock);
+ list_for_each_entry(client, &device->clients, list) { + if (client->ops && client->ops->early_init) { + err = client->ops->early_init(client); + if (err < 0) { + dev_err(&device->dev, "failed to early initialize %s: %d\n", + dev_name(client->dev), err); + goto teardown_late; + } + } + } + list_for_each_entry(client, &device->clients, list) { if (client->ops && client->ops->init) { err = client->ops->init(client); @@ -217,6 +228,14 @@ int host1x_device_init(struct host1x_device *device) if (client->ops->exit) client->ops->exit(client);
+ /* reset client to end of list for late teardown */ + client = list_entry(&device->clients, struct host1x_client, list); + +teardown_late: + list_for_each_entry_continue_reverse(client, &device->clients, list) + if (client->ops->late_exit) + client->ops->late_exit(client); + mutex_unlock(&device->clients_lock); return err; } @@ -251,6 +270,18 @@ int host1x_device_exit(struct host1x_device *device) } }
+ list_for_each_entry_reverse(client, &device->clients, list) { + if (client->ops && client->ops->late_exit) { + err = client->ops->late_exit(client); + if (err < 0) { + dev_err(&device->dev, "failed to late cleanup %s: %d\n", + dev_name(client->dev), err); + mutex_unlock(&device->clients_lock); + return err; + } + } + } + mutex_unlock(&device->clients_lock);
return 0; diff --git a/include/linux/host1x.h b/include/linux/host1x.h index 5890f91dd286..74970681ecdb 100644 --- a/include/linux/host1x.h +++ b/include/linux/host1x.h @@ -31,8 +31,10 @@ u64 host1x_get_dma_mask(struct host1x *host1x); * @resume: host1x client resume code */ struct host1x_client_ops { + int (*early_init)(struct host1x_client *client); int (*init)(struct host1x_client *client); int (*exit)(struct host1x_client *client); + int (*late_exit)(struct host1x_client *client); int (*suspend)(struct host1x_client *client); int (*resume)(struct host1x_client *client); };
From: Thierry Reding treding@nvidia.com
In order to be able to attach planes to all possible display controllers the exact number of CRTCs must be known. Keep track of the number of display controllers that register during initialization.
Signed-off-by: Thierry Reding treding@nvidia.com --- drivers/gpu/drm/tegra/dc.c | 22 ++++++++++++++++++++++ drivers/gpu/drm/tegra/drm.h | 1 + drivers/gpu/drm/tegra/hub.c | 6 ++++-- 3 files changed, 27 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/tegra/dc.c b/drivers/gpu/drm/tegra/dc.c index 4262fd9b9a15..927e6f5a2c24 100644 --- a/drivers/gpu/drm/tegra/dc.c +++ b/drivers/gpu/drm/tegra/dc.c @@ -2081,6 +2081,16 @@ static bool tegra_dc_has_window_groups(struct tegra_dc *dc) return false; }
+static int tegra_dc_early_init(struct host1x_client *client) +{ + struct drm_device *drm = dev_get_drvdata(client->host); + struct tegra_drm *tegra = drm->dev_private; + + tegra->num_crtcs++; + + return 0; +} + static int tegra_dc_init(struct host1x_client *client) { struct drm_device *drm = dev_get_drvdata(client->host); @@ -2229,6 +2239,16 @@ static int tegra_dc_exit(struct host1x_client *client) return 0; }
+static int tegra_dc_late_exit(struct host1x_client *client) +{ + struct drm_device *drm = dev_get_drvdata(client->host); + struct tegra_drm *tegra = drm->dev_private; + + tegra->num_crtcs--; + + return 0; +} + static int tegra_dc_runtime_suspend(struct host1x_client *client) { struct tegra_dc *dc = host1x_client_to_dc(client); @@ -2293,8 +2313,10 @@ static int tegra_dc_runtime_resume(struct host1x_client *client) }
static const struct host1x_client_ops dc_client_ops = { + .early_init = tegra_dc_early_init, .init = tegra_dc_init, .exit = tegra_dc_exit, + .late_exit = tegra_dc_late_exit, .suspend = tegra_dc_runtime_suspend, .resume = tegra_dc_runtime_resume, }; diff --git a/drivers/gpu/drm/tegra/drm.h b/drivers/gpu/drm/tegra/drm.h index 34fbcd6abf2f..9a089b93da24 100644 --- a/drivers/gpu/drm/tegra/drm.h +++ b/drivers/gpu/drm/tegra/drm.h @@ -56,6 +56,7 @@ struct tegra_drm {
unsigned int hmask, vmask; unsigned int pitch_align; + unsigned int num_crtcs;
struct tegra_display_hub *hub; }; diff --git a/drivers/gpu/drm/tegra/hub.c b/drivers/gpu/drm/tegra/hub.c index 617240032c37..500c9d37e654 100644 --- a/drivers/gpu/drm/tegra/hub.c +++ b/drivers/gpu/drm/tegra/hub.c @@ -562,9 +562,8 @@ struct drm_plane *tegra_shared_plane_create(struct drm_device *drm, enum drm_plane_type type = DRM_PLANE_TYPE_OVERLAY; struct tegra_drm *tegra = drm->dev_private; struct tegra_display_hub *hub = tegra->hub; - /* planes can be assigned to arbitrary CRTCs */ - unsigned int possible_crtcs = 0x7; struct tegra_shared_plane *plane; + unsigned int possible_crtcs; unsigned int num_formats; const u64 *modifiers; struct drm_plane *p; @@ -583,6 +582,9 @@ struct drm_plane *tegra_shared_plane_create(struct drm_device *drm,
p = &plane->base.base;
+ /* planes can be assigned to arbitrary CRTCs */ + possible_crtcs = BIT(tegra->num_crtcs) - 1; + num_formats = ARRAY_SIZE(tegra_shared_plane_formats); formats = tegra_shared_plane_formats; modifiers = tegra_shared_plane_modifiers;
From: Thierry Reding treding@nvidia.com
Tegra194 has a special physical address bit that enables some memory swizzling logic to support different sector layouts. Support the bit that selects the sector layout which is passed in the framebuffer modifier.
Signed-off-by: Thierry Reding treding@nvidia.com --- drivers/gpu/drm/tegra/dc.c | 7 +++++++ drivers/gpu/drm/tegra/dc.h | 1 + drivers/gpu/drm/tegra/drm.h | 3 +++ drivers/gpu/drm/tegra/fb.c | 9 +++++++++ drivers/gpu/drm/tegra/gem.h | 6 ++++++ drivers/gpu/drm/tegra/hub.c | 28 ++++++++++++++++++++++++++++ drivers/gpu/drm/tegra/plane.c | 24 ++++++++++++++++++++++++ 7 files changed, 78 insertions(+)
diff --git a/drivers/gpu/drm/tegra/dc.c b/drivers/gpu/drm/tegra/dc.c index 927e6f5a2c24..742608b30527 100644 --- a/drivers/gpu/drm/tegra/dc.c +++ b/drivers/gpu/drm/tegra/dc.c @@ -2326,6 +2326,7 @@ static const struct tegra_dc_soc_info tegra20_dc_soc_info = { .supports_interlacing = false, .supports_cursor = false, .supports_block_linear = false, + .supports_sector_layout = false, .has_legacy_blending = true, .pitch_align = 8, .has_powergate = false, @@ -2345,6 +2346,7 @@ static const struct tegra_dc_soc_info tegra30_dc_soc_info = { .supports_interlacing = false, .supports_cursor = false, .supports_block_linear = false, + .supports_sector_layout = false, .has_legacy_blending = true, .pitch_align = 8, .has_powergate = false, @@ -2364,6 +2366,7 @@ static const struct tegra_dc_soc_info tegra114_dc_soc_info = { .supports_interlacing = false, .supports_cursor = false, .supports_block_linear = false, + .supports_sector_layout = false, .has_legacy_blending = true, .pitch_align = 64, .has_powergate = true, @@ -2383,6 +2386,7 @@ static const struct tegra_dc_soc_info tegra124_dc_soc_info = { .supports_interlacing = true, .supports_cursor = true, .supports_block_linear = true, + .supports_sector_layout = false, .has_legacy_blending = false, .pitch_align = 64, .has_powergate = true, @@ -2402,6 +2406,7 @@ static const struct tegra_dc_soc_info tegra210_dc_soc_info = { .supports_interlacing = true, .supports_cursor = true, .supports_block_linear = true, + .supports_sector_layout = false, .has_legacy_blending = false, .pitch_align = 64, .has_powergate = true, @@ -2455,6 +2460,7 @@ static const struct tegra_dc_soc_info tegra186_dc_soc_info = { .supports_interlacing = true, .supports_cursor = true, .supports_block_linear = true, + .supports_sector_layout = false, .has_legacy_blending = false, .pitch_align = 64, .has_powergate = false, @@ -2503,6 +2509,7 @@ static const struct tegra_dc_soc_info tegra194_dc_soc_info = { .supports_interlacing = true, .supports_cursor = true, 
.supports_block_linear = true, + .supports_sector_layout = true, .has_legacy_blending = false, .pitch_align = 64, .has_powergate = false, diff --git a/drivers/gpu/drm/tegra/dc.h b/drivers/gpu/drm/tegra/dc.h index 21074cd2ce5e..29f19c3c6149 100644 --- a/drivers/gpu/drm/tegra/dc.h +++ b/drivers/gpu/drm/tegra/dc.h @@ -52,6 +52,7 @@ struct tegra_dc_soc_info { bool supports_interlacing; bool supports_cursor; bool supports_block_linear; + bool supports_sector_layout; bool has_legacy_blending; unsigned int pitch_align; bool has_powergate; diff --git a/drivers/gpu/drm/tegra/drm.h b/drivers/gpu/drm/tegra/drm.h index 9a089b93da24..fe1a37e95bfa 100644 --- a/drivers/gpu/drm/tegra/drm.h +++ b/drivers/gpu/drm/tegra/drm.h @@ -24,6 +24,9 @@ #include "hub.h" #include "trace.h"
+/* XXX move to include/uapi/drm/drm_fourcc.h? */ +#define DRM_FORMAT_MOD_NVIDIA_SECTOR_LAYOUT BIT(22) + struct reset_control;
#ifdef CONFIG_DRM_FBDEV_EMULATION diff --git a/drivers/gpu/drm/tegra/fb.c b/drivers/gpu/drm/tegra/fb.c index 350f33206076..c04dda8353fd 100644 --- a/drivers/gpu/drm/tegra/fb.c +++ b/drivers/gpu/drm/tegra/fb.c @@ -44,6 +44,15 @@ int tegra_fb_get_tiling(struct drm_framebuffer *framebuffer, { uint64_t modifier = framebuffer->modifier;
+ if (fourcc_mod_is_vendor(modifier, NVIDIA)) { + if ((modifier & DRM_FORMAT_MOD_NVIDIA_SECTOR_LAYOUT) == 0) + tiling->sector_layout = TEGRA_BO_SECTOR_LAYOUT_TEGRA; + else + tiling->sector_layout = TEGRA_BO_SECTOR_LAYOUT_GPU; + + modifier &= ~DRM_FORMAT_MOD_NVIDIA_SECTOR_LAYOUT; + } + switch (modifier) { case DRM_FORMAT_MOD_LINEAR: tiling->mode = TEGRA_BO_TILING_MODE_PITCH; diff --git a/drivers/gpu/drm/tegra/gem.h b/drivers/gpu/drm/tegra/gem.h index fafb5724499b..c15fd99d6cb2 100644 --- a/drivers/gpu/drm/tegra/gem.h +++ b/drivers/gpu/drm/tegra/gem.h @@ -21,9 +21,15 @@ enum tegra_bo_tiling_mode { TEGRA_BO_TILING_MODE_BLOCK, };
+enum tegra_bo_sector_layout { + TEGRA_BO_SECTOR_LAYOUT_TEGRA, + TEGRA_BO_SECTOR_LAYOUT_GPU, +}; + struct tegra_bo_tiling { enum tegra_bo_tiling_mode mode; unsigned long value; + enum tegra_bo_sector_layout sector_layout; };
struct tegra_bo { diff --git a/drivers/gpu/drm/tegra/hub.c b/drivers/gpu/drm/tegra/hub.c index 500c9d37e654..79bff8b48271 100644 --- a/drivers/gpu/drm/tegra/hub.c +++ b/drivers/gpu/drm/tegra/hub.c @@ -55,6 +55,18 @@ static const u64 tegra_shared_plane_modifiers[] = { DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK(3), DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK(4), DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK(5), + /* + * The GPU sector layout is only supported on Tegra194, but these will + * be filtered out later on by ->format_mod_supported() on SoCs where + * it isn't supported. + */ + DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK(0) | DRM_FORMAT_MOD_NVIDIA_SECTOR_LAYOUT, + DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK(1) | DRM_FORMAT_MOD_NVIDIA_SECTOR_LAYOUT, + DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK(2) | DRM_FORMAT_MOD_NVIDIA_SECTOR_LAYOUT, + DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK(3) | DRM_FORMAT_MOD_NVIDIA_SECTOR_LAYOUT, + DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK(4) | DRM_FORMAT_MOD_NVIDIA_SECTOR_LAYOUT, + DRM_FORMAT_MOD_NVIDIA_16BX2_BLOCK(5) | DRM_FORMAT_MOD_NVIDIA_SECTOR_LAYOUT, + /* sentinel */ DRM_FORMAT_MOD_INVALID };
@@ -366,6 +378,12 @@ static int tegra_shared_plane_atomic_check(struct drm_plane *plane, return -EINVAL; }
+ if (tiling->sector_layout == TEGRA_BO_SECTOR_LAYOUT_GPU && + !dc->soc->supports_sector_layout) { + DRM_ERROR("hardware doesn't support GPU sector layout\n"); + return -EINVAL; + } + /* * Tegra doesn't support different strides for U and V planes so we * error out if the user tries to display a framebuffer with such a @@ -485,6 +503,16 @@ static void tegra_shared_plane_atomic_update(struct drm_plane *plane,
base = tegra_plane_state->iova[0] + fb->offsets[0];
+#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT + /* + * Physical address bit 39 in Tegra194 is used as a switch for special + * logic that swizzles the memory using either the legacy Tegra or the + * dGPU sector layout. + */ + if (tegra_plane_state->tiling.sector_layout == TEGRA_BO_SECTOR_LAYOUT_GPU) + base |= BIT(39); +#endif + tegra_plane_writel(p, tegra_plane_state->format, DC_WIN_COLOR_DEPTH); tegra_plane_writel(p, 0, DC_WIN_PRECOMP_WGRP_PARAMS);
diff --git a/drivers/gpu/drm/tegra/plane.c b/drivers/gpu/drm/tegra/plane.c index 793da5d675d2..1e0eae8b4342 100644 --- a/drivers/gpu/drm/tegra/plane.c +++ b/drivers/gpu/drm/tegra/plane.c @@ -83,6 +83,22 @@ static void tegra_plane_atomic_destroy_state(struct drm_plane *plane, kfree(state); }
+static bool tegra_plane_supports_sector_layout(struct drm_plane *plane) +{ + struct drm_crtc *crtc; + + drm_for_each_crtc(crtc, plane->dev) { + if (plane->possible_crtcs & drm_crtc_mask(crtc)) { + struct tegra_dc *dc = to_tegra_dc(crtc); + + if (!dc->soc->supports_sector_layout) + return false; + } + } + + return true; +} + static bool tegra_plane_format_mod_supported(struct drm_plane *plane, uint32_t format, uint64_t modifier) @@ -92,6 +108,14 @@ static bool tegra_plane_format_mod_supported(struct drm_plane *plane, if (modifier == DRM_FORMAT_MOD_LINEAR) return true;
+ /* check for the sector layout bit */ + if (fourcc_mod_is_vendor(modifier, NVIDIA)) { + if (modifier & DRM_FORMAT_MOD_NVIDIA_SECTOR_LAYOUT) { + if (!tegra_plane_supports_sector_layout(plane)) + return false; + } + } + if (info->num_planes == 1) return true;
dri-devel@lists.freedesktop.org