Every set_pages_array_wb call resulted in cross-core interrupts and TLB flushes. Merge more of them for less overhead.
This reduces the time needed to free a 1.6 GiB GTT WC buffer as part of Vulkan CTS from ~2 sec to < 0.25 sec. (Allocation still takes more than 2 sec though)
Signed-off-by: Bas Nieuwenhuizen <basni@chromium.org>
---
 drivers/gpu/drm/ttm/ttm_page_alloc_dma.c | 31 ++++++++++++++++++------
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
index 4c659405a008a..9440ba0a55116 100644
--- a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
+++ b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
@@ -299,6 +299,25 @@ static int set_pages_array_uc(struct page **pages, int addrinarray)
 #endif
         return 0;
 }
+
+static int ttm_set_page_range_wb(struct page *p, unsigned long numpages)
+{
+#if IS_ENABLED(CONFIG_AGP)
+        unsigned long i;
+
+        for (i = 0; i < numpages; i++)
+                unmap_page_from_agp(p + i);
+#endif
+        return 0;
+}
+
+#else /* for !CONFIG_X86 */
+
+static int ttm_set_page_range_wb(struct page *p, unsigned long numpages)
+{
+        return set_memory_wb((unsigned long)page_address(p), numpages);
+}
+
 #endif /* for !CONFIG_X86 */
 
 static int ttm_set_pages_caching(struct dma_pool *pool,
@@ -387,18 +406,16 @@ static void ttm_pool_update_free_locked(struct dma_pool *pool,
 static void ttm_dma_page_put(struct dma_pool *pool, struct dma_page *d_page)
 {
         struct page *page = d_page->p;
-        unsigned i, num_pages;
+        unsigned num_pages;
         int ret;
 
         /* Don't set WB on WB page pool. */
         if (!(pool->type & IS_CACHED)) {
                 num_pages = pool->size / PAGE_SIZE;
-                for (i = 0; i < num_pages; ++i, ++page) {
-                        ret = set_pages_array_wb(&page, 1);
-                        if (ret) {
-                                pr_err("%s: Failed to set %d pages to wb!\n",
-                                       pool->dev_name, 1);
-                        }
+                ret = ttm_set_page_range_wb(page, num_pages);
+                if (ret) {
+                        pr_err("%s: Failed to set %d pages to wb!\n",
+                               pool->dev_name, num_pages);
                 }
         }
On Wed, Jul 25, 2018 at 10:29:50PM +0200, Bas Nieuwenhuizen wrote:
Every set_pages_array_wb call resulted in cross-core interrupts and TLB flushes. Merge more of them for less overhead.
This reduces the time needed to free a 1.6 GiB GTT WC buffer as part of Vulkan CTS from ~2 sec to < 0.25 sec. (Allocation still takes more than 2 sec though)
Signed-off-by: Bas Nieuwenhuizen <basni@chromium.org>

Reviewed-by: Huang Rui <ray.huang@amd.com>
 drivers/gpu/drm/ttm/ttm_page_alloc_dma.c | 31 ++++++++++++++++++------
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
index 4c659405a008a..9440ba0a55116 100644
--- a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
+++ b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
@@ -299,6 +299,25 @@ static int set_pages_array_uc(struct page **pages, int addrinarray)
 #endif
         return 0;
 }
+
+static int ttm_set_page_range_wb(struct page *p, unsigned long numpages)
+{
+#if IS_ENABLED(CONFIG_AGP)
+        unsigned long i;
+
+        for (i = 0; i < numpages; i++)
+                unmap_page_from_agp(p + i);
+#endif
+        return 0;
+}
+
+#else /* for !CONFIG_X86 */
+
+static int ttm_set_page_range_wb(struct page *p, unsigned long numpages)
+{
+        return set_memory_wb((unsigned long)page_address(p), numpages);
+}
+
 #endif /* for !CONFIG_X86 */
 
 static int ttm_set_pages_caching(struct dma_pool *pool,
@@ -387,18 +406,16 @@ static void ttm_pool_update_free_locked(struct dma_pool *pool,
 static void ttm_dma_page_put(struct dma_pool *pool, struct dma_page *d_page)
 {
         struct page *page = d_page->p;
-        unsigned i, num_pages;
+        unsigned num_pages;
         int ret;
 
         /* Don't set WB on WB page pool. */
         if (!(pool->type & IS_CACHED)) {
                 num_pages = pool->size / PAGE_SIZE;
-                for (i = 0; i < num_pages; ++i, ++page) {
-                        ret = set_pages_array_wb(&page, 1);
-                        if (ret) {
-                                pr_err("%s: Failed to set %d pages to wb!\n",
-                                       pool->dev_name, 1);
-                        }
+                ret = ttm_set_page_range_wb(page, num_pages);
+                if (ret) {
+                        pr_err("%s: Failed to set %d pages to wb!\n",
+                               pool->dev_name, num_pages);
                 }
         }
--
2.18.0.233.g985f88cf7e-goog
On 07/26/2018 04:29 AM, Bas Nieuwenhuizen wrote:
Every set_pages_array_wb call resulted in cross-core interrupts and TLB flushes. Merge more of them for less overhead.
This reduces the time needed to free a 1.6 GiB GTT WC buffer as part of Vulkan CTS from ~2 sec to < 0.25 sec. (Allocation still takes more than 2 sec though)
Signed-off-by: Bas Nieuwenhuizen <basni@chromium.org>

 drivers/gpu/drm/ttm/ttm_page_alloc_dma.c | 31 ++++++++++++++++++------
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
index 4c659405a008a..9440ba0a55116 100644
--- a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
+++ b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
@@ -299,6 +299,25 @@ static int set_pages_array_uc(struct page **pages, int addrinarray)
 #endif
         return 0;
 }
+
+static int ttm_set_page_range_wb(struct page *p, unsigned long numpages)
+{
+#if IS_ENABLED(CONFIG_AGP)
+        unsigned long i;
+
+        for (i = 0; i < numpages; i++)
+                unmap_page_from_agp(p + i);
+#endif
+        return 0;
+}
+
+#else /* for !CONFIG_X86 */
+
+static int ttm_set_page_range_wb(struct page *p, unsigned long numpages)
+{
+        return set_memory_wb((unsigned long)page_address(p), numpages);
+}
+
 #endif /* for !CONFIG_X86 */
 
 static int ttm_set_pages_caching(struct dma_pool *pool,
@@ -387,18 +406,16 @@ static void ttm_pool_update_free_locked(struct dma_pool *pool,
 static void ttm_dma_page_put(struct dma_pool *pool, struct dma_page *d_page)
 {
         struct page *page = d_page->p;
-        unsigned i, num_pages;
+        unsigned num_pages;
         int ret;
 
         /* Don't set WB on WB page pool. */
         if (!(pool->type & IS_CACHED)) {
                 num_pages = pool->size / PAGE_SIZE;
-                for (i = 0; i < num_pages; ++i, ++page) {
-                        ret = set_pages_array_wb(&page, 1);
-                        if (ret) {
-                                pr_err("%s: Failed to set %d pages to wb!\n",
-                                       pool->dev_name, 1);
-                        }
+                ret = ttm_set_page_range_wb(page, num_pages);
With AGP enabled, set_pages_array_wc() could work like that by passing "num_pages" instead of "1". In the X86 case, we may use set_pages_array_wb() from arch/x86/mm/pageattr.c.
so, does it work as below?
ret = set_pages_array_wb(page, num_pages);
Jerry
+                if (ret) {
+                        pr_err("%s: Failed to set %d pages to wb!\n",
+                               pool->dev_name, num_pages);
                 }
         }
On Thu, Jul 26, 2018 at 7:52 AM, Zhang, Jerry (Junwei) Jerry.Zhang@amd.com wrote:
On 07/26/2018 04:29 AM, Bas Nieuwenhuizen wrote:
Every set_pages_array_wb call resulted in cross-core interrupts and TLB flushes. Merge more of them for less overhead.
This reduces the time needed to free a 1.6 GiB GTT WC buffer as part of Vulkan CTS from ~2 sec to < 0.25 sec. (Allocation still takes more than 2 sec though)
Signed-off-by: Bas Nieuwenhuizen <basni@chromium.org>

 drivers/gpu/drm/ttm/ttm_page_alloc_dma.c | 31 ++++++++++++++++++------
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
index 4c659405a008a..9440ba0a55116 100644
--- a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
+++ b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
@@ -299,6 +299,25 @@ static int set_pages_array_uc(struct page **pages, int addrinarray)
 #endif
         return 0;
 }
+
+static int ttm_set_page_range_wb(struct page *p, unsigned long numpages)
+{
+#if IS_ENABLED(CONFIG_AGP)
+        unsigned long i;
+
+        for (i = 0; i < numpages; i++)
+                unmap_page_from_agp(p + i);
+#endif
+        return 0;
+}
+
+#else /* for !CONFIG_X86 */
+
+static int ttm_set_page_range_wb(struct page *p, unsigned long numpages)
+{
+        return set_memory_wb((unsigned long)page_address(p), numpages);
+}
+
 #endif /* for !CONFIG_X86 */
 
 static int ttm_set_pages_caching(struct dma_pool *pool,
@@ -387,18 +406,16 @@ static void ttm_pool_update_free_locked(struct dma_pool *pool,
 static void ttm_dma_page_put(struct dma_pool *pool, struct dma_page *d_page)
 {
         struct page *page = d_page->p;
-        unsigned i, num_pages;
+        unsigned num_pages;
         int ret;
 
         /* Don't set WB on WB page pool. */
         if (!(pool->type & IS_CACHED)) {
                 num_pages = pool->size / PAGE_SIZE;
-                for (i = 0; i < num_pages; ++i, ++page) {
-                        ret = set_pages_array_wb(&page, 1);
-                        if (ret) {
-                                pr_err("%s: Failed to set %d pages to wb!\n",
-                                       pool->dev_name, 1);
-                        }
+                ret = ttm_set_page_range_wb(page, num_pages);
With AGP enabled, set_pages_array_wc() could work like that by passing "num_pages" instead of "1". In the X86 case, we may use set_pages_array_wb() from arch/x86/mm/pageattr.c.
so, does it work as below?
ret = set_pages_array_wb(page, num_pages);
No, that would not work. Note that we have an array of page structs here, while set_pages_array_wb() wants an array of pointers to page structs. We could allocate a temporary array and fill in the pointers, but that seems unnecessarily inefficient to me, and it probably would not reduce the amount of code either.
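To make the mismatch concrete, reusing set_pages_array_wb() here would look roughly like this (untested sketch, not part of the patch; it assumes the ttm_dma_page_put() context with page, num_pages and ret in scope):

        /* set_pages_array_wb() has the signature
         *   int set_pages_array_wb(struct page **pages, int addrinarray);
         * so for a contiguous struct page range we would first have to
         * build a temporary pointer array just to call it:
         */
        struct page **pages = kmalloc_array(num_pages, sizeof(*pages),
                                            GFP_KERNEL);
        unsigned j;

        if (pages) {
                for (j = 0; j < num_pages; ++j)
                        pages[j] = page + j;    /* pointer to each struct page */
                ret = set_pages_array_wb(pages, num_pages);
                kfree(pages);
        }

That extra allocation and fill loop is exactly the overhead the ranged helper avoids.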
Jerry
+                if (ret) {
+                        pr_err("%s: Failed to set %d pages to wb!\n",
+                               pool->dev_name, num_pages);
                 }
         }
On 25.07.2018 at 22:29, Bas Nieuwenhuizen wrote:
Every set_pages_array_wb call resulted in cross-core interrupts and TLB flushes. Merge more of them for less overhead.
This reduces the time needed to free a 1.6 GiB GTT WC buffer as part of Vulkan CTS from ~2 sec to < 0.25 sec. (Allocation still takes more than 2 sec though)
Yeah, I was already wondering when I originally implemented this if there isn't a better approach.
This needs a bit of cleanup I think, e.g. use set_pages_wb() instead of set_memory_wb() and we should move the non-x86 abstraction into a common header for both ttm_page_alloc_dma.c and ttm_page_alloc.c.
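Roughly something like this, perhaps (only a sketch; the header name and the wrapper are hypothetical, not existing code):

/* hypothetical shared header, say ttm_set_memory.h, usable from both
 * ttm_page_alloc.c and ttm_page_alloc_dma.c
 */
#ifdef CONFIG_X86
#include <asm/set_memory.h>

static inline int ttm_set_pages_wb(struct page *page, int numpages)
{
        return set_pages_wb(page, numpages);
}
#else
#if IS_ENABLED(CONFIG_AGP)
#include <asm/agp.h>
#endif

static inline int ttm_set_pages_wb(struct page *page, int numpages)
{
#if IS_ENABLED(CONFIG_AGP)
        int i;

        for (i = 0; i < numpages; ++i)
                unmap_page_from_agp(page + i);
#endif
        return 0;
}
#endif /* CONFIG_X86 */

That way both pool allocators get the same ranged helper and the #ifdef maze lives in one place.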
Bas, do you want to tackle this or should just I take a look?
Christian.
Signed-off-by: Bas Nieuwenhuizen <basni@chromium.org>

 drivers/gpu/drm/ttm/ttm_page_alloc_dma.c | 31 ++++++++++++++++++------
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
index 4c659405a008a..9440ba0a55116 100644
--- a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
+++ b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
@@ -299,6 +299,25 @@ static int set_pages_array_uc(struct page **pages, int addrinarray)
 #endif
         return 0;
 }
+
+static int ttm_set_page_range_wb(struct page *p, unsigned long numpages)
+{
+#if IS_ENABLED(CONFIG_AGP)
+        unsigned long i;
+
+        for (i = 0; i < numpages; i++)
+                unmap_page_from_agp(p + i);
+#endif
+        return 0;
+}
+
+#else /* for !CONFIG_X86 */
+
+static int ttm_set_page_range_wb(struct page *p, unsigned long numpages)
+{
+        return set_memory_wb((unsigned long)page_address(p), numpages);
+}
+
 #endif /* for !CONFIG_X86 */
 
 static int ttm_set_pages_caching(struct dma_pool *pool,
@@ -387,18 +406,16 @@ static void ttm_pool_update_free_locked(struct dma_pool *pool,
 static void ttm_dma_page_put(struct dma_pool *pool, struct dma_page *d_page)
 {
         struct page *page = d_page->p;
-        unsigned i, num_pages;
+        unsigned num_pages;
         int ret;
 
         /* Don't set WB on WB page pool. */
         if (!(pool->type & IS_CACHED)) {
                 num_pages = pool->size / PAGE_SIZE;
-                for (i = 0; i < num_pages; ++i, ++page) {
-                        ret = set_pages_array_wb(&page, 1);
-                        if (ret) {
-                                pr_err("%s: Failed to set %d pages to wb!\n",
-                                       pool->dev_name, 1);
-                        }
+                ret = ttm_set_page_range_wb(page, num_pages);
+                if (ret) {
+                        pr_err("%s: Failed to set %d pages to wb!\n",
+                               pool->dev_name, num_pages);
                 }
         }
On Thu, Jul 26, 2018 at 08:37:45AM +0200, Christian König wrote:
Am 25.07.2018 um 22:29 schrieb Bas Nieuwenhuizen:
Every set_pages_array_wb call resulted in cross-core interrupts and TLB flushes. Merge more of them for less overhead.
This reduces the time needed to free a 1.6 GiB GTT WC buffer as part of Vulkan CTS from ~2 sec to < 0.25 sec. (Allocation still takes more than 2 sec though)
Yeah, I was already wondering when I originally implemented this if there isn't a better approach.
This needs a bit of cleanup I think, e.g. use set_pages_wb() instead of set_memory_wb() and we should move the non-x86 abstraction into a common header for both ttm_page_alloc_dma.c and ttm_page_alloc.c.
Agree. At first glance I almost got it wrong by misreading "#ifndef CONFIG_X86", so it makes sense to move the non-x86 definitions into another header.
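For reference, this is the structure that is easy to misread, condensed from the patch (note that the #else branch is actually the CONFIG_X86 case, despite its comment):

#ifndef CONFIG_X86
/* this branch really is !CONFIG_X86: fall back to AGP unmapping */
static int ttm_set_page_range_wb(struct page *p, unsigned long numpages)
{
#if IS_ENABLED(CONFIG_AGP)
        unsigned long i;

        for (i = 0; i < numpages; i++)
                unmap_page_from_agp(p + i);
#endif
        return 0;
}
#else /* for !CONFIG_X86 */
/* despite the comment above, this branch is the CONFIG_X86 case */
static int ttm_set_page_range_wb(struct page *p, unsigned long numpages)
{
        return set_memory_wb((unsigned long)page_address(p), numpages);
}
#endif /* for !CONFIG_X86 */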
Thanks,
Ray
Bas, do you want to tackle this or should just I take a look?
Christian.
Signed-off-by: Bas Nieuwenhuizen <basni@chromium.org>

 drivers/gpu/drm/ttm/ttm_page_alloc_dma.c | 31 ++++++++++++++++++------
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
index 4c659405a008a..9440ba0a55116 100644
--- a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
+++ b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
@@ -299,6 +299,25 @@ static int set_pages_array_uc(struct page **pages, int addrinarray)
 #endif
         return 0;
 }
+
+static int ttm_set_page_range_wb(struct page *p, unsigned long numpages)
+{
+#if IS_ENABLED(CONFIG_AGP)
+        unsigned long i;
+
+        for (i = 0; i < numpages; i++)
+                unmap_page_from_agp(p + i);
+#endif
+        return 0;
+}
+
+#else /* for !CONFIG_X86 */
+
+static int ttm_set_page_range_wb(struct page *p, unsigned long numpages)
+{
+        return set_memory_wb((unsigned long)page_address(p), numpages);
+}
+
 #endif /* for !CONFIG_X86 */
 
 static int ttm_set_pages_caching(struct dma_pool *pool,
@@ -387,18 +406,16 @@ static void ttm_pool_update_free_locked(struct dma_pool *pool,
 static void ttm_dma_page_put(struct dma_pool *pool, struct dma_page *d_page)
 {
         struct page *page = d_page->p;
-        unsigned i, num_pages;
+        unsigned num_pages;
         int ret;
 
         /* Don't set WB on WB page pool. */
         if (!(pool->type & IS_CACHED)) {
                 num_pages = pool->size / PAGE_SIZE;
-                for (i = 0; i < num_pages; ++i, ++page) {
-                        ret = set_pages_array_wb(&page, 1);
-                        if (ret) {
-                                pr_err("%s: Failed to set %d pages to wb!\n",
-                                       pool->dev_name, 1);
-                        }
+                ret = ttm_set_page_range_wb(page, num_pages);
+                if (ret) {
+                        pr_err("%s: Failed to set %d pages to wb!\n",
+                               pool->dev_name, num_pages);
                 }
         }