From: Thierry Reding treding@nvidia.com
This series is a continuation of the work to move host1x and Tegra DRM towards being able to use the IOMMU-backed DMA API.
The first two patches are required to workaround the shortage of IOMMU domains on older Tegra SoC generations. The remainder of the patches is mostly preparatory work to smoothen the transition to the DMA API. With all of these patches applied, it's possible for the drivers to either use the IOMMU API explicitly, or, if already attached to an DMA IOMMU domain, continue to use that existing mapping with the DMA API.
These patches apply on top of linux-next and the ->load()/->unload() removal patch from here:
https://patchwork.freedesktop.org/patch/337896/
Thierry
Thierry Reding (12): memory: tegra: Add gr2d and gr3d to DRM IOMMU group drm/tegra: Simplify IOMMU group selection gpu: host1x: Overhaul host1x_bo_{pin,unpin}() API gpu: host1x: Clean up debugfs on removal gpu: host1x: Add direction flags to relocations gpu: host1x: Allocate gather copy for host1x gpu: host1x: Support DMA mapping of buffers gpu: host1x: Set DMA mask based on IOMMU setup drm/tegra: Remove memory allocation from Falcon library drm/tegra: falcon: Clarify address usage drm/tegra: Support DMA API for display controllers drm/tegra: Optionally attach clients to the IOMMU
drivers/gpu/drm/tegra/dc.c | 10 +- drivers/gpu/drm/tegra/drm.c | 82 ++++++++---- drivers/gpu/drm/tegra/drm.h | 4 +- drivers/gpu/drm/tegra/falcon.c | 64 ++------- drivers/gpu/drm/tegra/falcon.h | 16 +-- drivers/gpu/drm/tegra/gem.c | 46 ++++++- drivers/gpu/drm/tegra/gr2d.c | 2 +- drivers/gpu/drm/tegra/gr3d.c | 2 +- drivers/gpu/drm/tegra/hub.c | 6 +- drivers/gpu/drm/tegra/plane.c | 104 +++++++++++++++ drivers/gpu/drm/tegra/plane.h | 8 ++ drivers/gpu/drm/tegra/vic.c | 91 ++++++++----- drivers/gpu/host1x/dev.c | 223 +++++++++++++++++++------------- drivers/gpu/host1x/dev.h | 1 + drivers/gpu/host1x/job.c | 91 +++++++++++-- drivers/gpu/host1x/job.h | 4 + drivers/memory/tegra/tegra114.c | 10 +- drivers/memory/tegra/tegra124.c | 8 +- drivers/memory/tegra/tegra30.c | 11 +- include/linux/host1x.h | 21 ++- 20 files changed, 552 insertions(+), 252 deletions(-)
From: Thierry Reding treding@nvidia.com
All of the devices making up the Tegra DRM device want to share a single IOMMU domain. Put them into a single group to allow them to do that.
Signed-off-by: Thierry Reding treding@nvidia.com --- drivers/memory/tegra/tegra114.c | 10 ++++++---- drivers/memory/tegra/tegra124.c | 8 +++++--- drivers/memory/tegra/tegra30.c | 11 +++++++---- 3 files changed, 18 insertions(+), 11 deletions(-)
diff --git a/drivers/memory/tegra/tegra114.c b/drivers/memory/tegra/tegra114.c index ac8351b5beeb..48ef01c3ff90 100644 --- a/drivers/memory/tegra/tegra114.c +++ b/drivers/memory/tegra/tegra114.c @@ -909,16 +909,18 @@ static const struct tegra_smmu_swgroup tegra114_swgroups[] = { { .name = "tsec", .swgroup = TEGRA_SWGROUP_TSEC, .reg = 0x294 }, };
-static const unsigned int tegra114_group_display[] = { +static const unsigned int tegra114_group_drm[] = { TEGRA_SWGROUP_DC, TEGRA_SWGROUP_DCB, + TEGRA_SWGROUP_G2, + TEGRA_SWGROUP_NV, };
static const struct tegra_smmu_group_soc tegra114_groups[] = { { - .name = "display", - .swgroups = tegra114_group_display, - .num_swgroups = ARRAY_SIZE(tegra114_group_display), + .name = "drm", + .swgroups = tegra114_group_drm, + .num_swgroups = ARRAY_SIZE(tegra114_group_drm), }, };
diff --git a/drivers/memory/tegra/tegra124.c b/drivers/memory/tegra/tegra124.c index 5d0ccb2be206..62b30b1b9677 100644 --- a/drivers/memory/tegra/tegra124.c +++ b/drivers/memory/tegra/tegra124.c @@ -974,16 +974,18 @@ static const struct tegra_smmu_swgroup tegra124_swgroups[] = { { .name = "vi", .swgroup = TEGRA_SWGROUP_VI, .reg = 0x280 }, };
-static const unsigned int tegra124_group_display[] = { +static const unsigned int tegra124_group_drm[] = { TEGRA_SWGROUP_DC, TEGRA_SWGROUP_DCB, + TEGRA_SWGROUP_GPU, + TEGRA_SWGROUP_VIC, };
static const struct tegra_smmu_group_soc tegra124_groups[] = { { .name = "display", - .swgroups = tegra124_group_display, - .num_swgroups = ARRAY_SIZE(tegra124_group_display), + .swgroups = tegra124_group_drm, + .num_swgroups = ARRAY_SIZE(tegra124_group_drm), }, };
diff --git a/drivers/memory/tegra/tegra30.c b/drivers/memory/tegra/tegra30.c index 14788fc2f9e8..8947bee6d032 100644 --- a/drivers/memory/tegra/tegra30.c +++ b/drivers/memory/tegra/tegra30.c @@ -931,16 +931,19 @@ static const struct tegra_smmu_swgroup tegra30_swgroups[] = { { .name = "isp", .swgroup = TEGRA_SWGROUP_ISP, .reg = 0x258 }, };
-static const unsigned int tegra30_group_display[] = { +static const unsigned int tegra30_group_drm[] = { TEGRA_SWGROUP_DC, TEGRA_SWGROUP_DCB, + TEGRA_SWGROUP_G2, + TEGRA_SWGROUP_NV, + TEGRA_SWGROUP_NV2, };
static const struct tegra_smmu_group_soc tegra30_groups[] = { { - .name = "display", - .swgroups = tegra30_group_display, - .num_swgroups = ARRAY_SIZE(tegra30_group_display), + .name = "drm", + .swgroups = tegra30_group_drm, + .num_swgroups = ARRAY_SIZE(tegra30_group_drm), }, };
28.10.2019 15:37, Thierry Reding пишет:
From: Thierry Reding treding@nvidia.com
All of the devices making up the Tegra DRM device want to share a single IOMMU domain. Put them into a single group to allow them to do that.
Signed-off-by: Thierry Reding treding@nvidia.com
drivers/memory/tegra/tegra114.c | 10 ++++++---- drivers/memory/tegra/tegra124.c | 8 +++++--- drivers/memory/tegra/tegra30.c | 11 +++++++---- 3 files changed, 18 insertions(+), 11 deletions(-)
diff --git a/drivers/memory/tegra/tegra114.c b/drivers/memory/tegra/tegra114.c index ac8351b5beeb..48ef01c3ff90 100644 --- a/drivers/memory/tegra/tegra114.c +++ b/drivers/memory/tegra/tegra114.c @@ -909,16 +909,18 @@ static const struct tegra_smmu_swgroup tegra114_swgroups[] = { { .name = "tsec", .swgroup = TEGRA_SWGROUP_TSEC, .reg = 0x294 }, };
-static const unsigned int tegra114_group_display[] = { +static const unsigned int tegra114_group_drm[] = { TEGRA_SWGROUP_DC, TEGRA_SWGROUP_DCB,
- TEGRA_SWGROUP_G2,
- TEGRA_SWGROUP_NV,
};
static const struct tegra_smmu_group_soc tegra114_groups[] = { {
.name = "display",
.swgroups = tegra114_group_display,
.num_swgroups = ARRAY_SIZE(tegra114_group_display),
.name = "drm",
.swgroups = tegra114_group_drm,
},.num_swgroups = ARRAY_SIZE(tegra114_group_drm),
};
diff --git a/drivers/memory/tegra/tegra124.c b/drivers/memory/tegra/tegra124.c index 5d0ccb2be206..62b30b1b9677 100644 --- a/drivers/memory/tegra/tegra124.c +++ b/drivers/memory/tegra/tegra124.c @@ -974,16 +974,18 @@ static const struct tegra_smmu_swgroup tegra124_swgroups[] = { { .name = "vi", .swgroup = TEGRA_SWGROUP_VI, .reg = 0x280 }, };
-static const unsigned int tegra124_group_display[] = { +static const unsigned int tegra124_group_drm[] = { TEGRA_SWGROUP_DC, TEGRA_SWGROUP_DCB,
- TEGRA_SWGROUP_GPU,
- TEGRA_SWGROUP_VIC,
};
static const struct tegra_smmu_group_soc tegra124_groups[] = { { .name = "display",
.swgroups = tegra124_group_display,
.num_swgroups = ARRAY_SIZE(tegra124_group_display),
.swgroups = tegra124_group_drm,
},.num_swgroups = ARRAY_SIZE(tegra124_group_drm),
The "display" -> "drm" group's renaming is missed for T124.
[snip]
On Wed, Oct 30, 2019 at 06:05:48PM +0300, Dmitry Osipenko wrote:
28.10.2019 15:37, Thierry Reding пишет:
From: Thierry Reding treding@nvidia.com
All of the devices making up the Tegra DRM device want to share a single IOMMU domain. Put them into a single group to allow them to do that.
Signed-off-by: Thierry Reding treding@nvidia.com
drivers/memory/tegra/tegra114.c | 10 ++++++---- drivers/memory/tegra/tegra124.c | 8 +++++--- drivers/memory/tegra/tegra30.c | 11 +++++++---- 3 files changed, 18 insertions(+), 11 deletions(-)
diff --git a/drivers/memory/tegra/tegra114.c b/drivers/memory/tegra/tegra114.c index ac8351b5beeb..48ef01c3ff90 100644 --- a/drivers/memory/tegra/tegra114.c +++ b/drivers/memory/tegra/tegra114.c @@ -909,16 +909,18 @@ static const struct tegra_smmu_swgroup tegra114_swgroups[] = { { .name = "tsec", .swgroup = TEGRA_SWGROUP_TSEC, .reg = 0x294 }, };
-static const unsigned int tegra114_group_display[] = { +static const unsigned int tegra114_group_drm[] = { TEGRA_SWGROUP_DC, TEGRA_SWGROUP_DCB,
- TEGRA_SWGROUP_G2,
- TEGRA_SWGROUP_NV,
};
static const struct tegra_smmu_group_soc tegra114_groups[] = { {
.name = "display",
.swgroups = tegra114_group_display,
.num_swgroups = ARRAY_SIZE(tegra114_group_display),
.name = "drm",
.swgroups = tegra114_group_drm,
},.num_swgroups = ARRAY_SIZE(tegra114_group_drm),
};
diff --git a/drivers/memory/tegra/tegra124.c b/drivers/memory/tegra/tegra124.c index 5d0ccb2be206..62b30b1b9677 100644 --- a/drivers/memory/tegra/tegra124.c +++ b/drivers/memory/tegra/tegra124.c @@ -974,16 +974,18 @@ static const struct tegra_smmu_swgroup tegra124_swgroups[] = { { .name = "vi", .swgroup = TEGRA_SWGROUP_VI, .reg = 0x280 }, };
-static const unsigned int tegra124_group_display[] = { +static const unsigned int tegra124_group_drm[] = { TEGRA_SWGROUP_DC, TEGRA_SWGROUP_DCB,
- TEGRA_SWGROUP_GPU,
- TEGRA_SWGROUP_VIC,
};
static const struct tegra_smmu_group_soc tegra124_groups[] = { { .name = "display",
.swgroups = tegra124_group_display,
.num_swgroups = ARRAY_SIZE(tegra124_group_display),
.swgroups = tegra124_group_drm,
},.num_swgroups = ARRAY_SIZE(tegra124_group_drm),
The "display" -> "drm" group's renaming is missed for T124.
Good catch! Fixed now.
Thanks, Thierry
From: Thierry Reding treding@nvidia.com
All the devices that make up the DRM device are now part of the same IOMMU group. This simplifies the handling of the IOMMU attachment and also avoids exhausting the number of IOMMUs available on early Tegra SoC generations.
Signed-off-by: Thierry Reding treding@nvidia.com --- drivers/gpu/drm/tegra/dc.c | 2 +- drivers/gpu/drm/tegra/drm.c | 34 ++++++++++++++++++++-------------- drivers/gpu/drm/tegra/drm.h | 3 +-- drivers/gpu/drm/tegra/gr2d.c | 2 +- drivers/gpu/drm/tegra/gr3d.c | 2 +- drivers/gpu/drm/tegra/vic.c | 2 +- 6 files changed, 25 insertions(+), 20 deletions(-)
diff --git a/drivers/gpu/drm/tegra/dc.c b/drivers/gpu/drm/tegra/dc.c index 54966f538141..36c36b295ab1 100644 --- a/drivers/gpu/drm/tegra/dc.c +++ b/drivers/gpu/drm/tegra/dc.c @@ -2014,7 +2014,7 @@ static int tegra_dc_init(struct host1x_client *client) if (!dc->syncpt) dev_warn(dc->dev, "failed to allocate syncpoint\n");
- err = host1x_client_iommu_attach(client, true); + err = host1x_client_iommu_attach(client); if (err < 0) { dev_err(client->dev, "failed to attach to domain: %d\n", err); return err; diff --git a/drivers/gpu/drm/tegra/drm.c b/drivers/gpu/drm/tegra/drm.c index 7480f575188d..9a1c1694604a 100644 --- a/drivers/gpu/drm/tegra/drm.c +++ b/drivers/gpu/drm/tegra/drm.c @@ -904,7 +904,7 @@ int tegra_drm_unregister_client(struct tegra_drm *tegra, return 0; }
-int host1x_client_iommu_attach(struct host1x_client *client, bool shared) +int host1x_client_iommu_attach(struct host1x_client *client) { struct drm_device *drm = dev_get_drvdata(client->parent); struct tegra_drm *tegra = drm->dev_private; @@ -912,29 +912,30 @@ int host1x_client_iommu_attach(struct host1x_client *client, bool shared) int err;
if (tegra->domain) { + struct iommu_domain *domain; + group = iommu_group_get(client->dev); if (!group) { dev_err(client->dev, "failed to get IOMMU group\n"); return -ENODEV; }
- if (!shared || (shared && (group != tegra->group))) { #if IS_ENABLED(CONFIG_ARM_DMA_USE_IOMMU) - if (client->dev->archdata.mapping) { - struct dma_iommu_mapping *mapping = - to_dma_iommu_mapping(client->dev); - arm_iommu_detach_device(client->dev); - arm_iommu_release_mapping(mapping); - } + if (client->dev->archdata.mapping) { + struct dma_iommu_mapping *mapping = + to_dma_iommu_mapping(client->dev); + arm_iommu_detach_device(client->dev); + arm_iommu_release_mapping(mapping); + } #endif + + domain = iommu_get_domain_for_dev(client->dev); + if (domain != tegra->domain) { err = iommu_attach_group(tegra->domain, group); if (err < 0) { iommu_group_put(group); return err; } - - if (shared && !tegra->group) - tegra->group = group; } }
@@ -947,12 +948,17 @@ void host1x_client_iommu_detach(struct host1x_client *client) { struct drm_device *drm = dev_get_drvdata(client->parent); struct tegra_drm *tegra = drm->dev_private; + struct iommu_domain *domain;
if (client->group) { - if (client->group == tegra->group) { + /* + * Devices that are part of the same group may no longer be + * attached to a domain at this point because their group may + * have been detached by an earlier client. + */ + domain = iommu_get_domain_for_dev(client->dev); + if (domain) iommu_detach_group(tegra->domain, client->group); - tegra->group = NULL; - }
iommu_group_put(client->group); } diff --git a/drivers/gpu/drm/tegra/drm.h b/drivers/gpu/drm/tegra/drm.h index 8b812bb52e5b..28f2820a7371 100644 --- a/drivers/gpu/drm/tegra/drm.h +++ b/drivers/gpu/drm/tegra/drm.h @@ -36,7 +36,6 @@ struct tegra_drm { struct drm_device *drm;
struct iommu_domain *domain; - struct iommu_group *group; struct mutex mm_lock; struct drm_mm mm;
@@ -100,7 +99,7 @@ int tegra_drm_register_client(struct tegra_drm *tegra, struct tegra_drm_client *client); int tegra_drm_unregister_client(struct tegra_drm *tegra, struct tegra_drm_client *client); -int host1x_client_iommu_attach(struct host1x_client *client, bool shared); +int host1x_client_iommu_attach(struct host1x_client *client); void host1x_client_iommu_detach(struct host1x_client *client);
int tegra_drm_init(struct tegra_drm *tegra, struct drm_device *drm); diff --git a/drivers/gpu/drm/tegra/gr2d.c b/drivers/gpu/drm/tegra/gr2d.c index 5d5af9a05c18..1fc4e56c7cc5 100644 --- a/drivers/gpu/drm/tegra/gr2d.c +++ b/drivers/gpu/drm/tegra/gr2d.c @@ -50,7 +50,7 @@ static int gr2d_init(struct host1x_client *client) goto put; }
- err = host1x_client_iommu_attach(client, false); + err = host1x_client_iommu_attach(client); if (err < 0) { dev_err(client->dev, "failed to attach to domain: %d\n", err); goto free; diff --git a/drivers/gpu/drm/tegra/gr3d.c b/drivers/gpu/drm/tegra/gr3d.c index c249a6bd8d51..24fae0f64032 100644 --- a/drivers/gpu/drm/tegra/gr3d.c +++ b/drivers/gpu/drm/tegra/gr3d.c @@ -59,7 +59,7 @@ static int gr3d_init(struct host1x_client *client) goto put; }
- err = host1x_client_iommu_attach(client, false); + err = host1x_client_iommu_attach(client); if (err < 0) { dev_err(client->dev, "failed to attach to domain: %d\n", err); goto free; diff --git a/drivers/gpu/drm/tegra/vic.c b/drivers/gpu/drm/tegra/vic.c index d34b1ada422c..603f41ed4b81 100644 --- a/drivers/gpu/drm/tegra/vic.c +++ b/drivers/gpu/drm/tegra/vic.c @@ -187,7 +187,7 @@ static int vic_init(struct host1x_client *client) struct vic *vic = to_vic(drm); int err;
- err = host1x_client_iommu_attach(client, false); + err = host1x_client_iommu_attach(client); if (err < 0) { dev_err(vic->dev, "failed to attach to domain: %d\n", err); return err;
From: Thierry Reding treding@nvidia.com
The host1x_bo_pin() and host1x_bo_unpin() APIs are used to pin and unpin buffers during host1x job submission. Pinning currently returns the SG table and the DMA address (an IOVA if an IOMMU is used or a physical address if no IOMMU is used) of the buffer. The DMA address is only used for buffers that are relocated, whereas the host1x driver will map gather buffers into its own IOVA space so that they can be processed by the CDMA engine.
This approach has a couple of issues. On one hand it's not very useful to return a DMA address for the buffer if host1x doesn't need it. On the other hand, returning the SG table of the buffer is suboptimal because a single SG table cannot be shared for multiple mappings, because the DMA address is stored within the SG table, and the DMA address may be different for different devices.
Subsequent patches will move the host1x driver over to the DMA API which doesn't work with a single shared SG table. Fix this by returning a new SG table each time a buffer is pinned. This allows the buffer to be referenced by multiple jobs for different engines.
Change the prototypes of host1x_bo_pin() and host1x_bo_unpin() to take a struct device *, specifying the device for which the buffer should be pinned. This is required in order to be able to properly construct the SG table. While at it, make host1x_bo_pin() return the SG table because that allows us to return an ERR_PTR()-encoded error code if we need to, or return NULL to signal that we don't need the SG table to be remapped and can simply use the DMA address as-is. At the same time, returning the DMA address is made optional because in the example of command buffers, host1x doesn't need to know the DMA address since it will have to create its own mapping anyway.
Signed-off-by: Thierry Reding treding@nvidia.com --- drivers/gpu/drm/tegra/gem.c | 34 ++++++++++++++++++++++++++++++---- drivers/gpu/host1x/job.c | 15 ++++++++++++--- include/linux/host1x.h | 17 ++++++++++------- 3 files changed, 52 insertions(+), 14 deletions(-)
diff --git a/drivers/gpu/drm/tegra/gem.c b/drivers/gpu/drm/tegra/gem.c index d2f88cc3134f..564ef60f67c2 100644 --- a/drivers/gpu/drm/tegra/gem.c +++ b/drivers/gpu/drm/tegra/gem.c @@ -27,17 +27,43 @@ static void tegra_bo_put(struct host1x_bo *bo) drm_gem_object_put_unlocked(&obj->gem); }
-static dma_addr_t tegra_bo_pin(struct host1x_bo *bo, struct sg_table **sgt) +static struct sg_table *tegra_bo_pin(struct device *dev, struct host1x_bo *bo, + dma_addr_t *phys) { struct tegra_bo *obj = host1x_to_tegra_bo(bo); + struct sg_table *sgt; + int err; + + if (phys) + *phys = obj->iova;
- *sgt = obj->sgt; + sgt = kzalloc(sizeof(*sgt), GFP_KERNEL); + if (!sgt) + return ERR_PTR(-ENOMEM);
- return obj->iova; + if (obj->pages) { + err = sg_alloc_table_from_pages(sgt, obj->pages, obj->num_pages, + 0, obj->gem.size, GFP_KERNEL); + if (err < 0) + goto free; + } else { + err = dma_get_sgtable(dev, sgt, obj->vaddr, obj->iova, + obj->gem.size); + if (err < 0) + goto free; + } + + return sgt; + +free: + kfree(sgt); + return ERR_PTR(err); }
-static void tegra_bo_unpin(struct host1x_bo *bo, struct sg_table *sgt) +static void tegra_bo_unpin(struct device *dev, struct sg_table *sgt) { + sg_free_table(sgt); + kfree(sgt); }
static void *tegra_bo_mmap(struct host1x_bo *bo) diff --git a/drivers/gpu/host1x/job.c b/drivers/gpu/host1x/job.c index eaa5c3352c13..90dd592fdfca 100644 --- a/drivers/gpu/host1x/job.c +++ b/drivers/gpu/host1x/job.c @@ -99,6 +99,7 @@ EXPORT_SYMBOL(host1x_job_add_gather);
static unsigned int pin_job(struct host1x *host, struct host1x_job *job) { + struct device *dev = job->client->dev; unsigned int i; int err;
@@ -115,7 +116,11 @@ static unsigned int pin_job(struct host1x *host, struct host1x_job *job) goto unpin; }
- phys_addr = host1x_bo_pin(reloc->target.bo, &sgt); + sgt = host1x_bo_pin(dev, reloc->target.bo, &phys_addr); + if (IS_ERR(sgt)) { + err = PTR_ERR(sgt); + goto unpin; + }
job->addr_phys[job->num_unpins] = phys_addr; job->unpins[job->num_unpins].bo = reloc->target.bo; @@ -139,7 +144,11 @@ static unsigned int pin_job(struct host1x *host, struct host1x_job *job) goto unpin; }
- phys_addr = host1x_bo_pin(g->bo, &sgt); + sgt = host1x_bo_pin(host->dev, g->bo, &phys_addr); + if (IS_ERR(sgt)) { + err = PTR_ERR(sgt); + goto unpin; + }
if (!IS_ENABLED(CONFIG_TEGRA_HOST1X_FIREWALL) && host->domain) { for_each_sg(sgt->sgl, sg, sgt->nents, j) @@ -566,7 +575,7 @@ void host1x_job_unpin(struct host1x_job *job) iova_pfn(&host->iova, job->addr_phys[i])); }
- host1x_bo_unpin(unpin->bo, unpin->sgt); + host1x_bo_unpin(host->dev, unpin->bo, unpin->sgt); host1x_bo_put(unpin->bo); }
diff --git a/include/linux/host1x.h b/include/linux/host1x.h index df6e613ba715..1ba23a6a2021 100644 --- a/include/linux/host1x.h +++ b/include/linux/host1x.h @@ -67,8 +67,9 @@ struct sg_table; struct host1x_bo_ops { struct host1x_bo *(*get)(struct host1x_bo *bo); void (*put)(struct host1x_bo *bo); - dma_addr_t (*pin)(struct host1x_bo *bo, struct sg_table **sgt); - void (*unpin)(struct host1x_bo *bo, struct sg_table *sgt); + struct sg_table *(*pin)(struct device *dev, struct host1x_bo *bo, + dma_addr_t *phys); + void (*unpin)(struct device *dev, struct sg_table *sgt); void *(*mmap)(struct host1x_bo *bo); void (*munmap)(struct host1x_bo *bo, void *addr); void *(*kmap)(struct host1x_bo *bo, unsigned int pagenum); @@ -95,15 +96,17 @@ static inline void host1x_bo_put(struct host1x_bo *bo) bo->ops->put(bo); }
-static inline dma_addr_t host1x_bo_pin(struct host1x_bo *bo, - struct sg_table **sgt) +static inline struct sg_table *host1x_bo_pin(struct device *dev, + struct host1x_bo *bo, + dma_addr_t *phys) { - return bo->ops->pin(bo, sgt); + return bo->ops->pin(dev, bo, phys); }
-static inline void host1x_bo_unpin(struct host1x_bo *bo, struct sg_table *sgt) +static inline void host1x_bo_unpin(struct device *dev, struct host1x_bo *bo, + struct sg_table *sgt) { - bo->ops->unpin(bo, sgt); + bo->ops->unpin(dev, sgt); }
static inline void *host1x_bo_mmap(struct host1x_bo *bo)
From: Thierry Reding treding@nvidia.com
The debugfs files created for host1x are never removed, causing these files to be left dangling in debugfs. This results in a crash when any of these files are accessed after the host1x driver has been removed, as well as a failure to create the debugfs entries when they are added again on driver probe.
Signed-off-by: Thierry Reding treding@nvidia.com --- drivers/gpu/host1x/dev.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/drivers/gpu/host1x/dev.c b/drivers/gpu/host1x/dev.c index 452ee5d64021..f30b8447a319 100644 --- a/drivers/gpu/host1x/dev.c +++ b/drivers/gpu/host1x/dev.c @@ -402,6 +402,7 @@ static int host1x_remove(struct platform_device *pdev) struct host1x *host = platform_get_drvdata(pdev);
host1x_unregister(host); + host1x_debug_deinit(host); host1x_intr_deinit(host); host1x_syncpt_deinit(host); reset_control_assert(host->rst);
From: Thierry Reding treding@nvidia.com
Add direction flags to host1x relocations performed during job pinning. These flags indicate the kinds of accesses that hardware is allowed to perform on the relocated buffers.
Signed-off-by: Thierry Reding treding@nvidia.com --- drivers/gpu/drm/tegra/drm.c | 2 ++ include/linux/host1x.h | 4 ++++ 2 files changed, 6 insertions(+)
diff --git a/drivers/gpu/drm/tegra/drm.c b/drivers/gpu/drm/tegra/drm.c index 9a1c1694604a..efc8a27b9e6a 100644 --- a/drivers/gpu/drm/tegra/drm.c +++ b/drivers/gpu/drm/tegra/drm.c @@ -149,6 +149,8 @@ static int host1x_reloc_copy_from_user(struct host1x_reloc *dest, if (err < 0) return err;
+ dest->flags = HOST1X_RELOC_READ | HOST1X_RELOC_WRITE; + dest->cmdbuf.bo = host1x_bo_lookup(file, cmdbuf); if (!dest->cmdbuf.bo) return -ENOENT; diff --git a/include/linux/host1x.h b/include/linux/host1x.h index 1ba23a6a2021..6f8d772591ba 100644 --- a/include/linux/host1x.h +++ b/include/linux/host1x.h @@ -173,6 +173,9 @@ int host1x_job_submit(struct host1x_job *job); * host1x job */
+#define HOST1X_RELOC_READ (1 << 0) +#define HOST1X_RELOC_WRITE (1 << 1) + struct host1x_reloc { struct { struct host1x_bo *bo; @@ -183,6 +186,7 @@ struct host1x_reloc { unsigned long offset; } target; unsigned long shift; + unsigned long flags; };
struct host1x_job {
From: Thierry Reding treding@nvidia.com
Currently when the gather buffers are copied, they are copied to a buffer that is allocated for the host1x client that wants to execute the command streams in the buffers. However, the gather buffers will be read by the host1x device, which causes SMMU faults if the DMA API is backed by an IOMMU.
Fix this by allocating the gather buffer copy for the host1x device, which makes sure that it will be mapped into the host1x's IOVA space if the DMA API is backed by an IOMMU.
Signed-off-by: Thierry Reding treding@nvidia.com --- drivers/gpu/host1x/job.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-)
diff --git a/drivers/gpu/host1x/job.c b/drivers/gpu/host1x/job.c index 90dd592fdfca..2e0c3e0ca1fa 100644 --- a/drivers/gpu/host1x/job.c +++ b/drivers/gpu/host1x/job.c @@ -445,7 +445,8 @@ static int validate(struct host1x_firewall *fw, struct host1x_job_gather *g) return err; }
-static inline int copy_gathers(struct host1x_job *job, struct device *dev) +static inline int copy_gathers(struct device *host, struct host1x_job *job, + struct device *dev) { struct host1x_firewall fw; size_t size = 0; @@ -468,12 +469,12 @@ static inline int copy_gathers(struct host1x_job *job, struct device *dev) * Try a non-blocking allocation from a higher priority pools first, * as awaiting for the allocation here is a major performance hit. */ - job->gather_copy_mapped = dma_alloc_wc(dev, size, &job->gather_copy, + job->gather_copy_mapped = dma_alloc_wc(host, size, &job->gather_copy, GFP_NOWAIT);
/* the higher priority allocation failed, try the generic-blocking */ if (!job->gather_copy_mapped) - job->gather_copy_mapped = dma_alloc_wc(dev, size, + job->gather_copy_mapped = dma_alloc_wc(host, size, &job->gather_copy, GFP_KERNEL); if (!job->gather_copy_mapped) @@ -521,7 +522,7 @@ int host1x_job_pin(struct host1x_job *job, struct device *dev) goto out;
if (IS_ENABLED(CONFIG_TEGRA_HOST1X_FIREWALL)) { - err = copy_gathers(job, dev); + err = copy_gathers(host->dev, job, dev); if (err) goto out; } @@ -582,7 +583,7 @@ void host1x_job_unpin(struct host1x_job *job) job->num_unpins = 0;
if (job->gather_copy_size) - dma_free_wc(job->channel->dev, job->gather_copy_size, + dma_free_wc(host->dev, job->gather_copy_size, job->gather_copy_mapped, job->gather_copy); } EXPORT_SYMBOL(host1x_job_unpin);
From: Thierry Reding treding@nvidia.com
If host1x_bo_pin() returns an SG table, create a DMA mapping for the buffer. For buffers that the host1x client has already mapped itself, host1x_bo_pin() returns NULL and the existing DMA address is used.
Signed-off-by: Thierry Reding treding@nvidia.com --- drivers/gpu/drm/tegra/gem.c | 18 +++++++-- drivers/gpu/host1x/dev.c | 16 ++------ drivers/gpu/host1x/job.c | 73 ++++++++++++++++++++++++++++++++----- drivers/gpu/host1x/job.h | 4 ++ 4 files changed, 87 insertions(+), 24 deletions(-)
diff --git a/drivers/gpu/drm/tegra/gem.c b/drivers/gpu/drm/tegra/gem.c index 564ef60f67c2..746dae32c484 100644 --- a/drivers/gpu/drm/tegra/gem.c +++ b/drivers/gpu/drm/tegra/gem.c @@ -34,9 +34,19 @@ static struct sg_table *tegra_bo_pin(struct device *dev, struct host1x_bo *bo, struct sg_table *sgt; int err;
- if (phys) + /* + * If we've manually mapped the buffer object through the IOMMU, make + * sure to return the IOVA address of our mapping. + */ + if (phys && obj->mm) { *phys = obj->iova; + return NULL; + }
+ /* + * If we don't have a mapping for this buffer yet, return an SG table + * so that host1x can do the mapping for us via the DMA API. + */ sgt = kzalloc(sizeof(*sgt), GFP_KERNEL); if (!sgt) return ERR_PTR(-ENOMEM); @@ -62,8 +72,10 @@ static struct sg_table *tegra_bo_pin(struct device *dev, struct host1x_bo *bo,
static void tegra_bo_unpin(struct device *dev, struct sg_table *sgt) { - sg_free_table(sgt); - kfree(sgt); + if (sgt) { + sg_free_table(sgt); + kfree(sgt); + } }
static void *tegra_bo_mmap(struct host1x_bo *bo) diff --git a/drivers/gpu/host1x/dev.c b/drivers/gpu/host1x/dev.c index f30b8447a319..5bdc484398f4 100644 --- a/drivers/gpu/host1x/dev.c +++ b/drivers/gpu/host1x/dev.c @@ -18,10 +18,6 @@ #include <trace/events/host1x.h> #undef CREATE_TRACE_POINTS
-#if IS_ENABLED(CONFIG_ARM_DMA_USE_IOMMU) -#include <asm/dma-iommu.h> -#endif - #include "bus.h" #include "channel.h" #include "debug.h" @@ -276,17 +272,13 @@ static int host1x_probe(struct platform_device *pdev) dev_err(&pdev->dev, "failed to get reset: %d\n", err); return err; } -#if IS_ENABLED(CONFIG_ARM_DMA_USE_IOMMU) - if (host->dev->archdata.mapping) { - struct dma_iommu_mapping *mapping = - to_dma_iommu_mapping(host->dev); - arm_iommu_detach_device(host->dev); - arm_iommu_release_mapping(mapping); - } -#endif + if (IS_ENABLED(CONFIG_TEGRA_HOST1X_FIREWALL)) goto skip_iommu;
+ if (iommu_get_domain_for_dev(&pdev->dev)) + goto skip_iommu; + host->group = iommu_group_get(&pdev->dev); if (host->group) { struct iommu_domain_geometry *geometry; diff --git a/drivers/gpu/host1x/job.c b/drivers/gpu/host1x/job.c index 2e0c3e0ca1fa..25ca54de8fc5 100644 --- a/drivers/gpu/host1x/job.c +++ b/drivers/gpu/host1x/job.c @@ -99,7 +99,8 @@ EXPORT_SYMBOL(host1x_job_add_gather);
static unsigned int pin_job(struct host1x *host, struct host1x_job *job) { - struct device *dev = job->client->dev; + struct host1x_client *client = job->client; + struct device *dev = client->dev; unsigned int i; int err;
@@ -107,8 +108,8 @@ static unsigned int pin_job(struct host1x *host, struct host1x_job *job)
for (i = 0; i < job->num_relocs; i++) { struct host1x_reloc *reloc = &job->relocs[i]; + dma_addr_t phys_addr, *phys; struct sg_table *sgt; - dma_addr_t phys_addr;
reloc->target.bo = host1x_bo_get(reloc->target.bo); if (!reloc->target.bo) { @@ -116,12 +117,51 @@ static unsigned int pin_job(struct host1x *host, struct host1x_job *job) goto unpin; }
- sgt = host1x_bo_pin(dev, reloc->target.bo, &phys_addr); + if (client->group) + phys = &phys_addr; + else + phys = NULL; + + sgt = host1x_bo_pin(dev, reloc->target.bo, phys); if (IS_ERR(sgt)) { err = PTR_ERR(sgt); goto unpin; }
+ if (sgt) { + unsigned long mask = HOST1X_RELOC_READ | + HOST1X_RELOC_WRITE; + enum dma_data_direction dir; + + switch (reloc->flags & mask) { + case HOST1X_RELOC_READ: + dir = DMA_TO_DEVICE; + break; + + case HOST1X_RELOC_WRITE: + dir = DMA_FROM_DEVICE; + break; + + case HOST1X_RELOC_READ | HOST1X_RELOC_WRITE: + dir = DMA_BIDIRECTIONAL; + break; + + default: + err = -EINVAL; + goto unpin; + } + + err = dma_map_sg(dev, sgt->sgl, sgt->nents, dir); + if (!err) { + err = -ENOMEM; + goto unpin; + } + + job->unpins[job->num_unpins].dev = dev; + job->unpins[job->num_unpins].dir = dir; + phys_addr = sg_dma_address(sgt->sgl); + } + job->addr_phys[job->num_unpins] = phys_addr; job->unpins[job->num_unpins].bo = reloc->target.bo; job->unpins[job->num_unpins].sgt = sgt; @@ -144,7 +184,7 @@ static unsigned int pin_job(struct host1x *host, struct host1x_job *job) goto unpin; }
- sgt = host1x_bo_pin(host->dev, g->bo, &phys_addr); + sgt = host1x_bo_pin(host->dev, g->bo, NULL); if (IS_ERR(sgt)) { err = PTR_ERR(sgt); goto unpin; @@ -172,15 +212,24 @@ static unsigned int pin_job(struct host1x *host, struct host1x_job *job) goto unpin; }
- job->addr_phys[job->num_unpins] = - iova_dma_addr(&host->iova, alloc); job->unpins[job->num_unpins].size = gather_size; + phys_addr = iova_dma_addr(&host->iova, alloc); } else { - job->addr_phys[job->num_unpins] = phys_addr; + err = dma_map_sg(host->dev, sgt->sgl, sgt->nents, + DMA_TO_DEVICE); + if (!err) { + err = -ENOMEM; + goto unpin; + } + + job->unpins[job->num_unpins].dev = host->dev; + phys_addr = sg_dma_address(sgt->sgl); }
- job->gather_addr_phys[i] = job->addr_phys[job->num_unpins]; + job->addr_phys[job->num_unpins] = phys_addr; + job->gather_addr_phys[i] = phys_addr;
+ job->unpins[job->num_unpins].dir = DMA_TO_DEVICE; job->unpins[job->num_unpins].bo = g->bo; job->unpins[job->num_unpins].sgt = sgt; job->num_unpins++; @@ -567,6 +616,8 @@ void host1x_job_unpin(struct host1x_job *job)
for (i = 0; i < job->num_unpins; i++) { struct host1x_job_unpin_data *unpin = &job->unpins[i]; + struct device *dev = unpin->dev ?: host->dev; + struct sg_table *sgt = unpin->sgt;
if (!IS_ENABLED(CONFIG_TEGRA_HOST1X_FIREWALL) && unpin->size && host->domain) { @@ -576,7 +627,11 @@ void host1x_job_unpin(struct host1x_job *job) iova_pfn(&host->iova, job->addr_phys[i])); }
- host1x_bo_unpin(host->dev, unpin->bo, unpin->sgt); + if (unpin->dev && sgt) + dma_unmap_sg(unpin->dev, sgt->sgl, sgt->nents, + unpin->dir); + + host1x_bo_unpin(dev, unpin->bo, sgt); host1x_bo_put(unpin->bo); }
diff --git a/drivers/gpu/host1x/job.h b/drivers/gpu/host1x/job.h index 62b8805e6b35..94bc2e4ae241 100644 --- a/drivers/gpu/host1x/job.h +++ b/drivers/gpu/host1x/job.h @@ -8,6 +8,8 @@ #ifndef __HOST1X_JOB_H #define __HOST1X_JOB_H
+#include <linux/dma-direction.h> + struct host1x_job_gather { unsigned int words; dma_addr_t base; @@ -19,7 +21,9 @@ struct host1x_job_gather { struct host1x_job_unpin_data { struct host1x_bo *bo; struct sg_table *sgt; + struct device *dev; size_t size; + enum dma_data_direction dir; };
/*
From: Thierry Reding treding@nvidia.com
If the Tegra DRM clients are backed by an IOMMU, push buffers are likely to be allocated beyond the 32-bit boundary if sufficient system memory is available. This is problematic on earlier generations of Tegra where host1x supports a maximum of 32 address bits for the GATHER opcode. More recent versions of Tegra (Tegra186 and later) have a wide variant of the GATHER opcode, which allows addressing up to 64 bits of memory.
If host1x itself is behind an IOMMU as well this doesn't matter because the IOMMU's input address space is restricted to 32 bits on generations without support for wide GATHER opcodes.
However, if host1x is not behind an IOMMU, it won't be able to process push buffers beyond the 32-bit boundary on Tegra generations that don't support wide GATHER opcodes. Restrict the DMA mask to 32 bits on these generations prevents buffers from being allocated from beyond the 32-bit boundary.
Signed-off-by: Thierry Reding treding@nvidia.com --- drivers/gpu/host1x/dev.c | 214 ++++++++++++++++++++++++--------------- drivers/gpu/host1x/dev.h | 1 + 2 files changed, 136 insertions(+), 79 deletions(-)
diff --git a/drivers/gpu/host1x/dev.c b/drivers/gpu/host1x/dev.c index 5bdc484398f4..a738ea55e407 100644 --- a/drivers/gpu/host1x/dev.c +++ b/drivers/gpu/host1x/dev.c @@ -73,6 +73,7 @@ static const struct host1x_info host1x01_info = { .init = host1x01_init, .sync_offset = 0x3000, .dma_mask = DMA_BIT_MASK(32), + .has_wide_gather = false, .has_hypervisor = false, .num_sid_entries = 0, .sid_table = NULL, @@ -86,6 +87,7 @@ static const struct host1x_info host1x02_info = { .init = host1x02_init, .sync_offset = 0x3000, .dma_mask = DMA_BIT_MASK(32), + .has_wide_gather = false, .has_hypervisor = false, .num_sid_entries = 0, .sid_table = NULL, @@ -99,6 +101,7 @@ static const struct host1x_info host1x04_info = { .init = host1x04_init, .sync_offset = 0x2100, .dma_mask = DMA_BIT_MASK(34), + .has_wide_gather = false, .has_hypervisor = false, .num_sid_entries = 0, .sid_table = NULL, @@ -112,6 +115,7 @@ static const struct host1x_info host1x05_info = { .init = host1x05_init, .sync_offset = 0x2100, .dma_mask = DMA_BIT_MASK(34), + .has_wide_gather = false, .has_hypervisor = false, .num_sid_entries = 0, .sid_table = NULL, @@ -134,6 +138,7 @@ static const struct host1x_info host1x06_info = { .init = host1x06_init, .sync_offset = 0x0, .dma_mask = DMA_BIT_MASK(40), + .has_wide_gather = true, .has_hypervisor = true, .num_sid_entries = ARRAY_SIZE(tegra186_sid_table), .sid_table = tegra186_sid_table, @@ -156,6 +161,7 @@ static const struct host1x_info host1x07_info = { .init = host1x07_init, .sync_offset = 0x0, .dma_mask = DMA_BIT_MASK(40), + .has_wide_gather = true, .has_hypervisor = true, .num_sid_entries = ARRAY_SIZE(tegra194_sid_table), .sid_table = tegra194_sid_table, @@ -186,6 +192,117 @@ static void host1x_setup_sid_table(struct host1x *host) } }
+static struct iommu_domain *host1x_iommu_attach(struct host1x *host) +{ + struct iommu_domain *domain = iommu_get_domain_for_dev(host->dev); + int err; + + /* + * If the host1x firewall is enabled, there's no need to enable IOMMU + * support. Similarly, if host1x is already attached to an IOMMU (via + * the DMA API), don't try to attach again. + */ + if (IS_ENABLED(CONFIG_TEGRA_HOST1X_FIREWALL) || domain) + return domain; + + host->group = iommu_group_get(host->dev); + if (host->group) { + struct iommu_domain_geometry *geometry; + dma_addr_t start, end; + unsigned long order; + + err = iova_cache_get(); + if (err < 0) + goto put_group; + + host->domain = iommu_domain_alloc(&platform_bus_type); + if (!host->domain) { + err = -ENOMEM; + goto put_cache; + } + + err = iommu_attach_group(host->domain, host->group); + if (err) { + if (err == -ENODEV) + err = 0; + + goto free_domain; + } + + geometry = &host->domain->geometry; + start = geometry->aperture_start & host->info->dma_mask; + end = geometry->aperture_end & host->info->dma_mask; + + order = __ffs(host->domain->pgsize_bitmap); + init_iova_domain(&host->iova, 1UL << order, start >> order); + host->iova_end = end; + + domain = host->domain; + } + + return domain; + +free_domain: + iommu_domain_free(host->domain); + host->domain = NULL; +put_cache: + iova_cache_put(); +put_group: + iommu_group_put(host->group); + host->group = NULL; + + return ERR_PTR(err); +} + +static int host1x_iommu_init(struct host1x *host) +{ + u64 mask = host->info->dma_mask; + struct iommu_domain *domain; + int err; + + domain = host1x_iommu_attach(host); + if (IS_ERR(domain)) { + err = PTR_ERR(domain); + dev_err(host->dev, "failed to attach to IOMMU: %d\n", err); + return err; + } + + /* + * If we're not behind an IOMMU make sure we don't get push buffers + * that are allocated outside of the range addressable by the GATHER + * opcode. + * + * Newer generations of Tegra (Tegra186 and later) support a wide + * variant of the GATHER opcode that allows addressing more bits. + */ + if (!domain && !host->info->has_wide_gather) + mask = DMA_BIT_MASK(32); + + err = dma_coerce_mask_and_coherent(host->dev, mask); + if (err < 0) { + dev_err(host->dev, "failed to set DMA mask: %d\n", err); + return err; + } + + return 0; +} + +static void host1x_iommu_exit(struct host1x *host) +{ + if (host->domain) { + put_iova_domain(&host->iova); + iommu_detach_group(host->domain, host->group); + + iommu_domain_free(host->domain); + host->domain = NULL; + + iova_cache_put(); + + iommu_group_put(host->group); + host->group = NULL; + } +} + static int host1x_probe(struct platform_device *pdev) { struct host1x *host; @@ -248,8 +365,6 @@ static int host1x_probe(struct platform_device *pdev) host->dev->dma_parms = &host->dma_parms; dma_set_max_seg_size(host->dev, UINT_MAX);
- dma_set_mask_and_coherent(host->dev, host->info->dma_mask); - if (host->info->init) { err = host->info->init(host); if (err) @@ -273,82 +388,41 @@ static int host1x_probe(struct platform_device *pdev) return err; }
- if (IS_ENABLED(CONFIG_TEGRA_HOST1X_FIREWALL)) - goto skip_iommu; - - if (iommu_get_domain_for_dev(&pdev->dev)) - goto skip_iommu; - - host->group = iommu_group_get(&pdev->dev); - if (host->group) { - struct iommu_domain_geometry *geometry; - u64 mask = dma_get_mask(host->dev); - dma_addr_t start, end; - unsigned long order; - - err = iova_cache_get(); - if (err < 0) - goto put_group; - - host->domain = iommu_domain_alloc(&platform_bus_type); - if (!host->domain) { - err = -ENOMEM; - goto put_cache; - } - - err = iommu_attach_group(host->domain, host->group); - if (err) { - if (err == -ENODEV) { - iommu_domain_free(host->domain); - host->domain = NULL; - iova_cache_put(); - iommu_group_put(host->group); - host->group = NULL; - goto skip_iommu; - } - - goto fail_free_domain; - } - - geometry = &host->domain->geometry; - start = geometry->aperture_start & mask; - end = geometry->aperture_end & mask; - - order = __ffs(host->domain->pgsize_bitmap); - init_iova_domain(&host->iova, 1UL << order, start >> order); - host->iova_end = end; + err = host1x_iommu_init(host); + if (err < 0) { + dev_err(&pdev->dev, "failed to setup IOMMU: %d\n", err); + return err; }
-skip_iommu: err = host1x_channel_list_init(&host->channel_list, host->info->nb_channels); if (err) { dev_err(&pdev->dev, "failed to initialize channel list\n"); - goto fail_detach_device; + goto iommu_exit; }
err = clk_prepare_enable(host->clk); if (err < 0) { dev_err(&pdev->dev, "failed to enable clock\n"); - goto fail_free_channels; + goto free_channels; }
err = reset_control_deassert(host->rst); if (err < 0) { dev_err(&pdev->dev, "failed to deassert reset: %d\n", err); - goto fail_unprepare_disable; + goto unprepare_disable; }
err = host1x_syncpt_init(host); if (err) { dev_err(&pdev->dev, "failed to initialize syncpts\n"); - goto fail_reset_assert; + goto reset_assert; }
err = host1x_intr_init(host, syncpt_irq); if (err) { dev_err(&pdev->dev, "failed to initialize interrupts\n"); - goto fail_deinit_syncpt; + goto deinit_syncpt; }
host1x_debug_init(host); @@ -358,33 +432,22 @@ static int host1x_probe(struct platform_device *pdev)
err = host1x_register(host); if (err < 0) - goto fail_deinit_intr; + goto deinit_intr;
return 0;
-fail_deinit_intr: +deinit_intr: host1x_intr_deinit(host); -fail_deinit_syncpt: +deinit_syncpt: host1x_syncpt_deinit(host); -fail_reset_assert: +reset_assert: reset_control_assert(host->rst); -fail_unprepare_disable: +unprepare_disable: clk_disable_unprepare(host->clk); -fail_free_channels: +free_channels: host1x_channel_list_free(&host->channel_list); -fail_detach_device: - if (host->group && host->domain) { - put_iova_domain(&host->iova); - iommu_detach_group(host->domain, host->group); - } -fail_free_domain: - if (host->domain) - iommu_domain_free(host->domain); -put_cache: - if (host->group) - iova_cache_put(); -put_group: - iommu_group_put(host->group); +iommu_exit: + host1x_iommu_exit(host);
return err; } @@ -399,14 +462,7 @@ static int host1x_remove(struct platform_device *pdev) host1x_syncpt_deinit(host); reset_control_assert(host->rst); clk_disable_unprepare(host->clk); - - if (host->domain) { - put_iova_domain(&host->iova); - iommu_detach_group(host->domain, host->group); - iommu_domain_free(host->domain); - iova_cache_put(); - iommu_group_put(host->group); - } + host1x_iommu_exit(host);
return 0; } diff --git a/drivers/gpu/host1x/dev.h b/drivers/gpu/host1x/dev.h index abafde7c665e..f781a9b0f39d 100644 --- a/drivers/gpu/host1x/dev.h +++ b/drivers/gpu/host1x/dev.h @@ -97,6 +97,7 @@ struct host1x_info { int (*init)(struct host1x *host1x); /* initialize per SoC ops */ unsigned int sync_offset; /* offset of syncpoint registers */ u64 dma_mask; /* mask of addressable memory */ + bool has_wide_gather; /* supports GATHER_W opcode */ bool has_hypervisor; /* has hypervisor registers */ unsigned int num_sid_entries; const struct host1x_sid_entry *sid_table;
From: Thierry Reding treding@nvidia.com
Having to provide allocator hooks to the Falcon library is somewhat cumbersome and it doesn't give the users of the library a lot of flexibility to deal with allocations. Instead, remove the notion of Falcon "operations" and let drivers deal with the memory allocations themselves.
Signed-off-by: Thierry Reding treding@nvidia.com --- drivers/gpu/drm/tegra/falcon.c | 50 ++----------------- drivers/gpu/drm/tegra/falcon.h | 11 ---- drivers/gpu/drm/tegra/vic.c | 91 ++++++++++++++++++++++++---------- 3 files changed, 68 insertions(+), 84 deletions(-)
diff --git a/drivers/gpu/drm/tegra/falcon.c b/drivers/gpu/drm/tegra/falcon.c index f49ad36e24db..a5b25e4ecbd2 100644 --- a/drivers/gpu/drm/tegra/falcon.c +++ b/drivers/gpu/drm/tegra/falcon.c @@ -59,26 +59,11 @@ static void falcon_copy_firmware_image(struct falcon *falcon, const struct firmware *firmware) { u32 *firmware_vaddr = falcon->firmware.vaddr; - dma_addr_t daddr; size_t i; - int err;
/* copy the whole thing taking into account endianness */ for (i = 0; i < firmware->size / sizeof(u32); i++) firmware_vaddr[i] = le32_to_cpu(((u32 *)firmware->data)[i]); - - /* ensure that caches are flushed and falcon can see the firmware */ - daddr = dma_map_single(falcon->dev, firmware_vaddr, - falcon->firmware.size, DMA_TO_DEVICE); - err = dma_mapping_error(falcon->dev, daddr); - if (err) { - dev_err(falcon->dev, "failed to map firmware: %d\n", err); - return; - } - dma_sync_single_for_device(falcon->dev, daddr, - falcon->firmware.size, DMA_TO_DEVICE); - dma_unmap_single(falcon->dev, daddr, falcon->firmware.size, - DMA_TO_DEVICE); }
static int falcon_parse_firmware_image(struct falcon *falcon) @@ -125,6 +110,8 @@ int falcon_read_firmware(struct falcon *falcon, const char *name) if (err < 0) return err;
+ falcon->firmware.size = falcon->firmware.firmware->size; + return 0; }
@@ -133,16 +120,6 @@ int falcon_load_firmware(struct falcon *falcon) const struct firmware *firmware = falcon->firmware.firmware; int err;
- falcon->firmware.size = firmware->size; - - /* allocate iova space for the firmware */ - falcon->firmware.vaddr = falcon->ops->alloc(falcon, firmware->size, - &falcon->firmware.paddr); - if (IS_ERR(falcon->firmware.vaddr)) { - dev_err(falcon->dev, "DMA memory mapping failed\n"); - return PTR_ERR(falcon->firmware.vaddr); - } - /* copy firmware image into local area. this also ensures endianness */ falcon_copy_firmware_image(falcon, firmware);
@@ -150,27 +127,17 @@ int falcon_load_firmware(struct falcon *falcon) err = falcon_parse_firmware_image(falcon); if (err < 0) { dev_err(falcon->dev, "failed to parse firmware image\n"); - goto err_setup_firmware_image; + return err; }
release_firmware(firmware); falcon->firmware.firmware = NULL;
return 0; - -err_setup_firmware_image: - falcon->ops->free(falcon, falcon->firmware.size, - falcon->firmware.paddr, falcon->firmware.vaddr); - - return err; }
int falcon_init(struct falcon *falcon) { - /* check mandatory ops */ - if (!falcon->ops || !falcon->ops->alloc || !falcon->ops->free) - return -EINVAL; - falcon->firmware.vaddr = NULL;
return 0; @@ -178,17 +145,8 @@ int falcon_init(struct falcon *falcon)
void falcon_exit(struct falcon *falcon) { - if (falcon->firmware.firmware) { + if (falcon->firmware.firmware) release_firmware(falcon->firmware.firmware); - falcon->firmware.firmware = NULL; - } - - if (falcon->firmware.vaddr) { - falcon->ops->free(falcon, falcon->firmware.size, - falcon->firmware.paddr, - falcon->firmware.vaddr); - falcon->firmware.vaddr = NULL; - } }
int falcon_boot(struct falcon *falcon) diff --git a/drivers/gpu/drm/tegra/falcon.h b/drivers/gpu/drm/tegra/falcon.h index 3d1243217410..92491a1e90df 100644 --- a/drivers/gpu/drm/tegra/falcon.h +++ b/drivers/gpu/drm/tegra/falcon.h @@ -74,15 +74,6 @@ struct falcon_fw_os_header_v1 { u32 data_size; };
-struct falcon; - -struct falcon_ops { - void *(*alloc)(struct falcon *falcon, size_t size, - dma_addr_t *paddr); - void (*free)(struct falcon *falcon, size_t size, - dma_addr_t paddr, void *vaddr); -}; - struct falcon_firmware_section { unsigned long offset; size_t size; @@ -107,8 +98,6 @@ struct falcon { /* Set by falcon client */ struct device *dev; void __iomem *regs; - const struct falcon_ops *ops; - void *data;
struct falcon_firmware firmware; }; diff --git a/drivers/gpu/drm/tegra/vic.c b/drivers/gpu/drm/tegra/vic.c index 603f41ed4b81..4345b8054617 100644 --- a/drivers/gpu/drm/tegra/vic.c +++ b/drivers/gpu/drm/tegra/vic.c @@ -158,27 +158,6 @@ static int vic_boot(struct vic *vic) return 0; }
-static void *vic_falcon_alloc(struct falcon *falcon, size_t size, - dma_addr_t *iova) -{ - struct tegra_drm *tegra = falcon->data; - - return tegra_drm_alloc(tegra, size, iova); -} - -static void vic_falcon_free(struct falcon *falcon, size_t size, - dma_addr_t iova, void *va) -{ - struct tegra_drm *tegra = falcon->data; - - return tegra_drm_free(tegra, size, va, iova); -} - -static const struct falcon_ops vic_falcon_ops = { - .alloc = vic_falcon_alloc, - .free = vic_falcon_free -}; - static int vic_init(struct host1x_client *client) { struct tegra_drm_client *drm = host1x_to_drm_client(client); @@ -246,6 +225,15 @@ static int vic_exit(struct host1x_client *client) host1x_channel_put(vic->channel); host1x_client_iommu_detach(client);
+ if (client->group) + tegra_drm_free(tegra, vic->falcon.firmware.size, + vic->falcon.firmware.vaddr, + vic->falcon.firmware.paddr); + else + dma_free_coherent(vic->dev, vic->falcon.firmware.size, + vic->falcon.firmware.vaddr, + vic->falcon.firmware.paddr); + return 0; }
@@ -256,25 +244,75 @@ static const struct host1x_client_ops vic_client_ops = {
static int vic_load_firmware(struct vic *vic) { + struct host1x_client *client = &vic->client.base; + struct tegra_drm *tegra = vic->client.drm; + dma_addr_t phys; + size_t size; + void *virt; int err;
- if (vic->falcon.data) + if (vic->falcon.firmware.vaddr) return 0;
- vic->falcon.data = vic->client.drm; - err = falcon_read_firmware(&vic->falcon, vic->config->firmware); if (err < 0) - goto cleanup; + return err; + + size = vic->falcon.firmware.size; + + if (!client->group) { + virt = dma_alloc_coherent(vic->dev, size, &phys, GFP_KERNEL); + + err = dma_mapping_error(vic->dev, phys); + if (err < 0) + return err; + } else { + virt = tegra_drm_alloc(tegra, size, &phys); + } + + vic->falcon.firmware.vaddr = virt; + vic->falcon.firmware.paddr = phys;
err = falcon_load_firmware(&vic->falcon); if (err < 0) goto cleanup;
+ /* + * In this case we have received an IOVA from the shared domain, so we + * need to make sure to get the physical address so that the DMA API + * knows what memory pages to flush the cache for. + */ + if (client->group) { + phys = dma_map_single(vic->dev, virt, size, DMA_TO_DEVICE); + + err = dma_mapping_error(vic->dev, phys); + if (err < 0) + goto cleanup; + + /* + * If the DMA API mapped this through a bounce buffer, the + * dma_sync_single_for_device() call below will not be able + * to flush the caches for the right memory pages. Output a + * big warning in that case so that the DMA mask can be set + * properly and the bounce buffer avoided. + */ + WARN(phys != vic->falcon.firmware.paddr, + "check DMA mask setting for %s\n", dev_name(vic->dev)); + } + + dma_sync_single_for_device(vic->dev, phys, size, DMA_TO_DEVICE); + + if (client->group) + dma_unmap_single(vic->dev, phys, size, DMA_TO_DEVICE); + return 0;
cleanup: - vic->falcon.data = NULL; + if (!client->group) + dma_free_coherent(vic->dev, size, virt, phys); + else + tegra_drm_free(tegra, size, virt, phys); + return err; }
@@ -415,7 +453,6 @@ static int vic_probe(struct platform_device *pdev)
vic->falcon.dev = dev; vic->falcon.regs = vic->regs; - vic->falcon.ops = &vic_falcon_ops;
err = falcon_init(&vic->falcon); if (err < 0)
From: Thierry Reding treding@nvidia.com
Rename paddr -> iova and vaddr -> virt to make it clearer how these addresses are used. This is important for a subsequent patch that makes a distinction between the physical address (physical address of the system memory from the CPU's point of view) and the IOVA (physical address of the system memory from the device's point of view).
Signed-off-by: Thierry Reding treding@nvidia.com --- drivers/gpu/drm/tegra/falcon.c | 14 ++++----- drivers/gpu/drm/tegra/falcon.h | 5 +-- drivers/gpu/drm/tegra/vic.c | 56 +++++++++++++++------------------- 3 files changed, 34 insertions(+), 41 deletions(-)
diff --git a/drivers/gpu/drm/tegra/falcon.c b/drivers/gpu/drm/tegra/falcon.c index a5b25e4ecbd2..56edef06c48e 100644 --- a/drivers/gpu/drm/tegra/falcon.c +++ b/drivers/gpu/drm/tegra/falcon.c @@ -58,17 +58,17 @@ static int falcon_copy_chunk(struct falcon *falcon, static void falcon_copy_firmware_image(struct falcon *falcon, const struct firmware *firmware) { - u32 *firmware_vaddr = falcon->firmware.vaddr; + u32 *virt = falcon->firmware.virt; size_t i;
/* copy the whole thing taking into account endianness */ for (i = 0; i < firmware->size / sizeof(u32); i++) - firmware_vaddr[i] = le32_to_cpu(((u32 *)firmware->data)[i]); + virt[i] = le32_to_cpu(((u32 *)firmware->data)[i]); }
static int falcon_parse_firmware_image(struct falcon *falcon) { - struct falcon_fw_bin_header_v1 *bin = (void *)falcon->firmware.vaddr; + struct falcon_fw_bin_header_v1 *bin = (void *)falcon->firmware.virt; struct falcon_fw_os_header_v1 *os;
/* endian problems would show up right here */ @@ -89,7 +89,7 @@ static int falcon_parse_firmware_image(struct falcon *falcon) return -EINVAL; }
- os = falcon->firmware.vaddr + bin->os_header_offset; + os = falcon->firmware.virt + bin->os_header_offset;
falcon->firmware.bin_data.size = bin->os_size; falcon->firmware.bin_data.offset = bin->os_data_offset; @@ -138,7 +138,7 @@ int falcon_load_firmware(struct falcon *falcon)
int falcon_init(struct falcon *falcon) { - falcon->firmware.vaddr = NULL; + falcon->firmware.virt = NULL;
return 0; } @@ -155,7 +155,7 @@ int falcon_boot(struct falcon *falcon) u32 value; int err;
- if (!falcon->firmware.vaddr) + if (!falcon->firmware.virt) return -EINVAL;
err = readl_poll_timeout(falcon->regs + FALCON_DMACTL, value, @@ -168,7 +168,7 @@ int falcon_boot(struct falcon *falcon) falcon_writel(falcon, 0, FALCON_DMACTL);
/* setup the address of the binary data so Falcon can access it later */ - falcon_writel(falcon, (falcon->firmware.paddr + + falcon_writel(falcon, (falcon->firmware.iova + falcon->firmware.bin_data.offset) >> 8, FALCON_DMATRFBASE);
diff --git a/drivers/gpu/drm/tegra/falcon.h b/drivers/gpu/drm/tegra/falcon.h index 92491a1e90df..c56ee32d92ee 100644 --- a/drivers/gpu/drm/tegra/falcon.h +++ b/drivers/gpu/drm/tegra/falcon.h @@ -84,8 +84,9 @@ struct falcon_firmware { const struct firmware *firmware;
/* Raw firmware data */ - dma_addr_t paddr; - void *vaddr; + dma_addr_t iova; + dma_addr_t phys; + void *virt; size_t size;
/* Parsed firmware information */ diff --git a/drivers/gpu/drm/tegra/vic.c b/drivers/gpu/drm/tegra/vic.c index 4345b8054617..9444ba183990 100644 --- a/drivers/gpu/drm/tegra/vic.c +++ b/drivers/gpu/drm/tegra/vic.c @@ -133,9 +133,9 @@ static int vic_boot(struct vic *vic) if (err < 0) return err;
- hdr = vic->falcon.firmware.vaddr; + hdr = vic->falcon.firmware.virt; fce_bin_data_offset = *(u32 *)(hdr + VIC_UCODE_FCE_DATA_OFFSET); - hdr = vic->falcon.firmware.vaddr + + hdr = vic->falcon.firmware.virt + *(u32 *)(hdr + VIC_UCODE_FCE_HEADER_OFFSET); fce_ucode_size = *(u32 *)(hdr + FCE_UCODE_SIZE_OFFSET);
@@ -143,7 +143,7 @@ static int vic_boot(struct vic *vic) falcon_execute_method(&vic->falcon, VIC_SET_FCE_UCODE_SIZE, fce_ucode_size); falcon_execute_method(&vic->falcon, VIC_SET_FCE_UCODE_OFFSET, - (vic->falcon.firmware.paddr + fce_bin_data_offset) + (vic->falcon.firmware.iova + fce_bin_data_offset) >> 8);
err = falcon_wait_idle(&vic->falcon); @@ -225,14 +225,17 @@ static int vic_exit(struct host1x_client *client) host1x_channel_put(vic->channel); host1x_client_iommu_detach(client);
- if (client->group) + if (client->group) { + dma_unmap_single(vic->dev, vic->falcon.firmware.phys, + vic->falcon.firmware.size, DMA_TO_DEVICE); tegra_drm_free(tegra, vic->falcon.firmware.size, - vic->falcon.firmware.vaddr, - vic->falcon.firmware.paddr); - else + vic->falcon.firmware.virt, + vic->falcon.firmware.iova); + } else { dma_free_coherent(vic->dev, vic->falcon.firmware.size, - vic->falcon.firmware.vaddr, - vic->falcon.firmware.paddr); + vic->falcon.firmware.virt, + vic->falcon.firmware.iova); + }
return 0; } @@ -246,12 +249,12 @@ static int vic_load_firmware(struct vic *vic) { struct host1x_client *client = &vic->client.base; struct tegra_drm *tegra = vic->client.drm; - dma_addr_t phys; + dma_addr_t iova; size_t size; void *virt; int err;
- if (vic->falcon.firmware.vaddr) + if (vic->falcon.firmware.virt) return 0;
err = falcon_read_firmware(&vic->falcon, vic->config->firmware); @@ -261,17 +264,17 @@ static int vic_load_firmware(struct vic *vic) size = vic->falcon.firmware.size;
if (!client->group) { - virt = dma_alloc_coherent(vic->dev, size, &phys, GFP_KERNEL); + virt = dma_alloc_coherent(vic->dev, size, &iova, GFP_KERNEL);
- err = dma_mapping_error(vic->dev, phys); + err = dma_mapping_error(vic->dev, iova); if (err < 0) return err; } else { - virt = tegra_drm_alloc(tegra, size, &phys); + virt = tegra_drm_alloc(tegra, size, &iova); }
- vic->falcon.firmware.vaddr = virt; - vic->falcon.firmware.paddr = phys; + vic->falcon.firmware.virt = virt; + vic->falcon.firmware.iova = iova;
err = falcon_load_firmware(&vic->falcon); if (err < 0) @@ -283,35 +286,24 @@ static int vic_load_firmware(struct vic *vic) * knows what memory pages to flush the cache for. */ if (client->group) { + dma_addr_t phys; + phys = dma_map_single(vic->dev, virt, size, DMA_TO_DEVICE);
err = dma_mapping_error(vic->dev, phys); if (err < 0) goto cleanup;
- /* - * If the DMA API mapped this through a bounce buffer, the - * dma_sync_single_for_device() call below will not be able - * to flush the caches for the right memory pages. Output a - * big warning in that case so that the DMA mask can be set - * properly and the bounce buffer avoided. - */ - WARN(phys != vic->falcon.firmware.paddr, - "check DMA mask setting for %s\n", dev_name(vic->dev)); + vic->falcon.firmware.phys = phys; }
- dma_sync_single_for_device(vic->dev, phys, size, DMA_TO_DEVICE); - - if (client->group) - dma_unmap_single(vic->dev, phys, size, DMA_TO_DEVICE); - return 0;
cleanup: if (!client->group) - dma_free_coherent(vic->dev, size, virt, phys); + dma_free_coherent(vic->dev, size, virt, iova); else - tegra_drm_free(tegra, size, virt, phys); + tegra_drm_free(tegra, size, virt, iova);
return err; }
From: Thierry Reding treding@nvidia.com
If a display controller is not attached to an explicit IOMMU domain, which usually means that it's connected to an IOMMU domain controlled by the DMA API, make sure to map the framebuffer to the display controller address space. This allows us to transparently handle setups where the display controller is attached to an IOMMU or setups where it isn't. It also allows the driver to work with a DMA API that is backed by an IOMMU.
Signed-off-by: Thierry Reding treding@nvidia.com --- drivers/gpu/drm/tegra/dc.c | 8 ++- drivers/gpu/drm/tegra/hub.c | 6 +- drivers/gpu/drm/tegra/plane.c | 104 ++++++++++++++++++++++++++++++++++ drivers/gpu/drm/tegra/plane.h | 8 +++ 4 files changed, 120 insertions(+), 6 deletions(-)
diff --git a/drivers/gpu/drm/tegra/dc.c b/drivers/gpu/drm/tegra/dc.c index 36c36b295ab1..5b1f9ff97576 100644 --- a/drivers/gpu/drm/tegra/dc.c +++ b/drivers/gpu/drm/tegra/dc.c @@ -715,9 +715,7 @@ static void tegra_plane_atomic_update(struct drm_plane *plane, window.swap = state->swap;
for (i = 0; i < fb->format->num_planes; i++) { - struct tegra_bo *bo = tegra_fb_get_plane(fb, i); - - window.base[i] = bo->iova + fb->offsets[i]; + window.base[i] = state->iova[i] + fb->offsets[i];
/* * Tegra uses a shared stride for UV planes. Framebuffers are @@ -732,6 +730,8 @@ static void tegra_plane_atomic_update(struct drm_plane *plane, }
static const struct drm_plane_helper_funcs tegra_plane_helper_funcs = { + .prepare_fb = tegra_plane_prepare_fb, + .cleanup_fb = tegra_plane_cleanup_fb, .atomic_check = tegra_plane_atomic_check, .atomic_disable = tegra_plane_atomic_disable, .atomic_update = tegra_plane_atomic_update, @@ -914,6 +914,8 @@ static void tegra_cursor_atomic_disable(struct drm_plane *plane, }
static const struct drm_plane_helper_funcs tegra_cursor_plane_helper_funcs = { + .prepare_fb = tegra_plane_prepare_fb, + .cleanup_fb = tegra_plane_cleanup_fb, .atomic_check = tegra_cursor_atomic_check, .atomic_update = tegra_cursor_atomic_update, .atomic_disable = tegra_cursor_atomic_disable, diff --git a/drivers/gpu/drm/tegra/hub.c b/drivers/gpu/drm/tegra/hub.c index 104115e42190..2b4082d0bc9e 100644 --- a/drivers/gpu/drm/tegra/hub.c +++ b/drivers/gpu/drm/tegra/hub.c @@ -413,7 +413,6 @@ static void tegra_shared_plane_atomic_update(struct drm_plane *plane, unsigned int zpos = plane->state->normalized_zpos; struct drm_framebuffer *fb = plane->state->fb; struct tegra_plane *p = to_tegra_plane(plane); - struct tegra_bo *bo; dma_addr_t base; u32 value;
@@ -456,8 +455,7 @@ static void tegra_shared_plane_atomic_update(struct drm_plane *plane, /* disable compression */ tegra_plane_writel(p, 0, DC_WINBUF_CDE_CONTROL);
- bo = tegra_fb_get_plane(fb, 0); - base = bo->iova; + base = state->iova[0] + fb->offsets[0];
tegra_plane_writel(p, state->format, DC_WIN_COLOR_DEPTH); tegra_plane_writel(p, 0, DC_WIN_PRECOMP_WGRP_PARAMS); @@ -521,6 +519,8 @@ static void tegra_shared_plane_atomic_update(struct drm_plane *plane, }
static const struct drm_plane_helper_funcs tegra_shared_plane_helper_funcs = { + .prepare_fb = tegra_plane_prepare_fb, + .cleanup_fb = tegra_plane_cleanup_fb, .atomic_check = tegra_shared_plane_atomic_check, .atomic_update = tegra_shared_plane_atomic_update, .atomic_disable = tegra_shared_plane_atomic_disable, diff --git a/drivers/gpu/drm/tegra/plane.c b/drivers/gpu/drm/tegra/plane.c index 6bab71d6e81d..163b590be224 100644 --- a/drivers/gpu/drm/tegra/plane.c +++ b/drivers/gpu/drm/tegra/plane.c @@ -6,6 +6,7 @@ #include <drm/drm_atomic.h> #include <drm/drm_atomic_helper.h> #include <drm/drm_fourcc.h> +#include <drm/drm_gem_framebuffer_helper.h> #include <drm/drm_plane_helper.h>
#include "dc.h" @@ -23,6 +24,7 @@ static void tegra_plane_reset(struct drm_plane *plane) { struct tegra_plane *p = to_tegra_plane(plane); struct tegra_plane_state *state; + unsigned int i;
if (plane->state) __drm_atomic_helper_plane_destroy_state(plane->state); @@ -36,6 +38,9 @@ static void tegra_plane_reset(struct drm_plane *plane) plane->state->plane = plane; plane->state->zpos = p->index; plane->state->normalized_zpos = p->index; + + for (i = 0; i < 3; i++) + state->iova[i] = DMA_MAPPING_ERROR; } }
@@ -60,6 +65,11 @@ tegra_plane_atomic_duplicate_state(struct drm_plane *plane) for (i = 0; i < 2; i++) copy->blending[i] = state->blending[i];
+ for (i = 0; i < 3; i++) { + copy->iova[i] = DMA_MAPPING_ERROR; + copy->sgt[i] = NULL; + } + return ©->base; }
@@ -95,6 +105,100 @@ const struct drm_plane_funcs tegra_plane_funcs = { .format_mod_supported = tegra_plane_format_mod_supported, };
+static int tegra_dc_pin(struct tegra_dc *dc, struct tegra_plane_state *state) +{ + unsigned int i; + int err; + + for (i = 0; i < state->base.fb->format->num_planes; i++) { + struct tegra_bo *bo = tegra_fb_get_plane(state->base.fb, i); + + if (!dc->client.group) { + struct sg_table *sgt; + + sgt = host1x_bo_pin(dc->dev, &bo->base, NULL); + if (IS_ERR(sgt)) { + err = PTR_ERR(sgt); + goto unpin; + } + + err = dma_map_sg(dc->dev, sgt->sgl, sgt->nents, + DMA_TO_DEVICE); + if (err == 0) { + err = -ENOMEM; + goto unpin; + } + + state->iova[i] = sg_dma_address(sgt->sgl); + state->sgt[i] = sgt; + } else { + state->iova[i] = bo->iova; + } + } + + return 0; + +unpin: + dev_err(dc->dev, "failed to map plane %u: %d\n", i, err); + + while (i--) { + struct tegra_bo *bo = tegra_fb_get_plane(state->base.fb, i); + struct sg_table *sgt = state->sgt[i]; + + dma_unmap_sg(dc->dev, sgt->sgl, sgt->nents, DMA_TO_DEVICE); + host1x_bo_unpin(dc->dev, &bo->base, sgt); + + state->iova[i] = DMA_MAPPING_ERROR; + state->sgt[i] = NULL; + } + + return err; +} + +static void tegra_dc_unpin(struct tegra_dc *dc, struct tegra_plane_state *state) +{ + unsigned int i; + + for (i = 0; i < state->base.fb->format->num_planes; i++) { + struct tegra_bo *bo = tegra_fb_get_plane(state->base.fb, i); + + if (!dc->client.group) { + struct sg_table *sgt = state->sgt[i]; + + if (sgt) { + dma_unmap_sg(dc->dev, sgt->sgl, sgt->nents, + DMA_TO_DEVICE); + host1x_bo_unpin(dc->dev, &bo->base, sgt); + } + } + + state->iova[i] = DMA_MAPPING_ERROR; + state->sgt[i] = NULL; + } +} + +int tegra_plane_prepare_fb(struct drm_plane *plane, + struct drm_plane_state *state) +{ + struct tegra_dc *dc = to_tegra_dc(state->crtc); + + if (!state->fb) + return 0; + + drm_gem_fb_prepare_fb(plane, state); + + return tegra_dc_pin(dc, to_tegra_plane_state(state)); +} + +void tegra_plane_cleanup_fb(struct drm_plane *plane, + struct drm_plane_state *state) +{ + struct tegra_dc *dc = to_tegra_dc(state->crtc); + + if (dc) + tegra_dc_unpin(dc, to_tegra_plane_state(state)); +} + int tegra_plane_state_add(struct tegra_plane *plane, struct drm_plane_state *state) { diff --git a/drivers/gpu/drm/tegra/plane.h b/drivers/gpu/drm/tegra/plane.h index 510c394e6d9a..a158a915109a 100644 --- a/drivers/gpu/drm/tegra/plane.h +++ b/drivers/gpu/drm/tegra/plane.h @@ -39,6 +39,9 @@ struct tegra_plane_legacy_blending_state { struct tegra_plane_state { struct drm_plane_state base;
+ struct sg_table *sgt[3]; + dma_addr_t iova[3]; + struct tegra_bo_tiling tiling; u32 format; u32 swap; @@ -61,6 +64,11 @@ to_tegra_plane_state(struct drm_plane_state *state)
extern const struct drm_plane_funcs tegra_plane_funcs;
+int tegra_plane_prepare_fb(struct drm_plane *plane, + struct drm_plane_state *state); +void tegra_plane_cleanup_fb(struct drm_plane *plane, + struct drm_plane_state *state); + int tegra_plane_state_add(struct tegra_plane *plane, struct drm_plane_state *state);
From: Thierry Reding treding@nvidia.com
If a client is already attached to an IOMMU domain that is not the shared domain, don't try to attach it again. This allows using the IOMMU-backed DMA API.
Since the IOMMU-backed DMA API is now supported and there's no way to detach from it on 64-bit ARM, don't bother to detach from it on 32-bit ARM either.
Signed-off-by: Thierry Reding treding@nvidia.com --- drivers/gpu/drm/tegra/drm.c | 66 +++++++++++++++++++++++++++---------- drivers/gpu/drm/tegra/drm.h | 1 + 2 files changed, 49 insertions(+), 18 deletions(-)
diff --git a/drivers/gpu/drm/tegra/drm.c b/drivers/gpu/drm/tegra/drm.c index efc8a27b9e6a..56e5e7a5c108 100644 --- a/drivers/gpu/drm/tegra/drm.c +++ b/drivers/gpu/drm/tegra/drm.c @@ -20,10 +20,6 @@ #include <drm/drm_prime.h> #include <drm/drm_vblank.h>
-#if IS_ENABLED(CONFIG_ARM_DMA_USE_IOMMU) -#include <asm/dma-iommu.h> -#endif - #include "drm.h" #include "gem.h"
@@ -908,30 +904,27 @@ int tegra_drm_unregister_client(struct tegra_drm *tegra,
int host1x_client_iommu_attach(struct host1x_client *client) { + struct iommu_domain *domain = iommu_get_domain_for_dev(client->dev); struct drm_device *drm = dev_get_drvdata(client->parent); struct tegra_drm *tegra = drm->dev_private; struct iommu_group *group = NULL; int err;
- if (tegra->domain) { - struct iommu_domain *domain; + /* + * If the host1x client is already attached to an IOMMU domain that is + * not the shared IOMMU domain, don't try to attach it to a different + * domain. This allows using the IOMMU-backed DMA API. + */ + if (domain && domain != tegra->domain) + return 0;
+ if (tegra->domain) { group = iommu_group_get(client->dev); if (!group) { dev_err(client->dev, "failed to get IOMMU group\n"); return -ENODEV; }
-#if IS_ENABLED(CONFIG_ARM_DMA_USE_IOMMU) - if (client->dev->archdata.mapping) { - struct dma_iommu_mapping *mapping = - to_dma_iommu_mapping(client->dev); - arm_iommu_detach_device(client->dev); - arm_iommu_release_mapping(mapping); - } -#endif - - domain = iommu_get_domain_for_dev(client->dev); if (domain != tegra->domain) { err = iommu_attach_group(tegra->domain, group); if (err < 0) { @@ -939,6 +932,8 @@ int host1x_client_iommu_attach(struct host1x_client *client) return err; } } + + tegra->use_explicit_iommu = true; }
client->group = group; @@ -963,6 +958,7 @@ void host1x_client_iommu_detach(struct host1x_client *client) iommu_detach_group(tegra->domain, client->group);
iommu_group_put(client->group); + client->group = NULL; } }
@@ -1046,6 +1042,7 @@ void tegra_drm_free(struct tegra_drm *tegra, size_t size, void *virt, static int host1x_drm_probe(struct host1x_device *dev) { struct drm_driver *driver = &tegra_drm_driver; + struct iommu_domain *domain; struct tegra_drm *tegra; struct drm_device *drm; int err; @@ -1060,7 +1057,36 @@ static int host1x_drm_probe(struct host1x_device *dev) goto put; }
- if (iommu_present(&platform_bus_type)) { + /* + * If the Tegra DRM clients are backed by an IOMMU, push buffers are + * likely to be allocated beyond the 32-bit boundary if sufficient + * system memory is available. This is problematic on earlier Tegra + * generations where host1x supports a maximum of 32 address bits in + * the GATHER opcode. In this case, unless host1x is behind an IOMMU + * as well it won't be able to process buffers allocated beyond the + * 32-bit boundary. + * + * The DMA API will use bounce buffers in this case, so that could + * perhaps still be made to work, even if less efficient, but there + * is another catch: in order to perform cache maintenance on pages + * allocated for discontiguous buffers we need to map and unmap the + * SG table representing these buffers. This is fine for something + * small like a push buffer, but it exhausts the bounce buffer pool + * (typically on the order of a few MiB) for framebuffers (many MiB + * for any modern resolution). + * + * Work around this by making sure that Tegra DRM clients only use + * an IOMMU if the parent host1x also uses an IOMMU. + * + * Note that there's still a small gap here that we don't cover: if + * the DMA API is backed by an IOMMU there's no way to control which + * device is attached to an IOMMU and which isn't, except via wiring + * up the device tree appropriately. This is considered an problem + * of integration, so care must be taken for the DT to be consistent. + */ + domain = iommu_get_domain_for_dev(drm->dev->parent); + + if (domain && iommu_present(&platform_bus_type)) { tegra->domain = iommu_domain_alloc(&platform_bus_type); if (!tegra->domain) { err = -ENOMEM; @@ -1104,7 +1130,7 @@ static int host1x_drm_probe(struct host1x_device *dev) if (err < 0) goto fbdev;
- if (tegra->domain) { + if (tegra->use_explicit_iommu) { u64 carveout_start, carveout_end, gem_start, gem_end; u64 dma_mask = dma_get_mask(&dev->dev); dma_addr_t start, end; @@ -1132,6 +1158,10 @@ static int host1x_drm_probe(struct host1x_device *dev) DRM_DEBUG_DRIVER(" GEM: %#llx-%#llx\n", gem_start, gem_end); DRM_DEBUG_DRIVER(" Carveout: %#llx-%#llx\n", carveout_start, carveout_end); + } else if (tegra->domain) { + iommu_domain_free(tegra->domain); + tegra->domain = NULL; + iova_cache_put(); }
if (tegra->hub) { diff --git a/drivers/gpu/drm/tegra/drm.h b/drivers/gpu/drm/tegra/drm.h index 28f2820a7371..d941553f7a3d 100644 --- a/drivers/gpu/drm/tegra/drm.h +++ b/drivers/gpu/drm/tegra/drm.h @@ -36,6 +36,7 @@ struct tegra_drm { struct drm_device *drm;
struct iommu_domain *domain; + bool use_explicit_iommu; struct mutex mm_lock; struct drm_mm mm;
dri-devel@lists.freedesktop.org