Every set_pages_array_wb call resulted in cross-core interrupts and TLB flushes. Merge more of them for less overhead.
This reduces the time needed to free a 1.6 GiB GTT WC buffer as part of Vulkan CTS from ~2 sec to < 0.25 sec. (Allocation still takes more than 2 sec though)
Signed-off-by: Bas Nieuwenhuizen <basni@chromium.org>
---
 drivers/gpu/drm/ttm/ttm_page_alloc_dma.c | 31 ++++++++++++++++++------
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
index 4c659405a008a..9440ba0a55116 100644
--- a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
+++ b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
@@ -299,6 +299,25 @@ static int set_pages_array_uc(struct page **pages, int addrinarray)
 #endif
         return 0;
 }
+
+static int ttm_set_page_range_wb(struct page *p, unsigned long numpages)
+{
+#if IS_ENABLED(CONFIG_AGP)
+        unsigned long i;
+
+        for (i = 0; i < numpages; i++)
+                unmap_page_from_agp(p + i);
+#endif
+        return 0;
+}
+
+#else /* for !CONFIG_X86 */
+
+static int ttm_set_page_range_wb(struct page *p, unsigned long numpages)
+{
+        return set_memory_wb((unsigned long)page_address(p), numpages);
+}
+
 #endif /* for !CONFIG_X86 */
 
 static int ttm_set_pages_caching(struct dma_pool *pool,
@@ -387,18 +406,16 @@ static void ttm_pool_update_free_locked(struct dma_pool *pool,
 static void ttm_dma_page_put(struct dma_pool *pool, struct dma_page *d_page)
 {
         struct page *page = d_page->p;
-        unsigned i, num_pages;
+        unsigned num_pages;
         int ret;
 
         /* Don't set WB on WB page pool. */
         if (!(pool->type & IS_CACHED)) {
                 num_pages = pool->size / PAGE_SIZE;
-                for (i = 0; i < num_pages; ++i, ++page) {
-                        ret = set_pages_array_wb(&page, 1);
-                        if (ret) {
-                                pr_err("%s: Failed to set %d pages to wb!\n",
-                                       pool->dev_name, 1);
-                        }
+                ret = ttm_set_page_range_wb(page, num_pages);
+                if (ret) {
+                        pr_err("%s: Failed to set %d pages to wb!\n",
+                               pool->dev_name, num_pages);
                 }
         }
On Wed, Jul 25, 2018 at 10:29:50PM +0200, Bas Nieuwenhuizen wrote:
Every set_pages_array_wb call resulted in cross-core interrupts and TLB flushes. Merge more of them for less overhead.
This reduces the time needed to free a 1.6 GiB GTT WC buffer as part of Vulkan CTS from ~2 sec to < 0.25 sec. (Allocation still takes more than 2 sec though)
Signed-off-by: Bas Nieuwenhuizen <basni@chromium.org>

Reviewed-by: Huang Rui <ray.huang@amd.com>
 drivers/gpu/drm/ttm/ttm_page_alloc_dma.c | 31 ++++++++++++++++++------
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
index 4c659405a008a..9440ba0a55116 100644
--- a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
+++ b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
@@ -299,6 +299,25 @@ static int set_pages_array_uc(struct page **pages, int addrinarray)
 #endif
         return 0;
 }
+
+static int ttm_set_page_range_wb(struct page *p, unsigned long numpages)
+{
+#if IS_ENABLED(CONFIG_AGP)
+        unsigned long i;
+
+        for (i = 0; i < numpages; i++)
+                unmap_page_from_agp(p + i);
+#endif
+        return 0;
+}
+
+#else /* for !CONFIG_X86 */
+
+static int ttm_set_page_range_wb(struct page *p, unsigned long numpages)
+{
+        return set_memory_wb((unsigned long)page_address(p), numpages);
+}
+
 #endif /* for !CONFIG_X86 */
 
 static int ttm_set_pages_caching(struct dma_pool *pool,
@@ -387,18 +406,16 @@ static void ttm_pool_update_free_locked(struct dma_pool *pool,
 static void ttm_dma_page_put(struct dma_pool *pool, struct dma_page *d_page)
 {
         struct page *page = d_page->p;
-        unsigned i, num_pages;
+        unsigned num_pages;
         int ret;
 
         /* Don't set WB on WB page pool. */
         if (!(pool->type & IS_CACHED)) {
                 num_pages = pool->size / PAGE_SIZE;
-                for (i = 0; i < num_pages; ++i, ++page) {
-                        ret = set_pages_array_wb(&page, 1);
-                        if (ret) {
-                                pr_err("%s: Failed to set %d pages to wb!\n",
-                                       pool->dev_name, 1);
-                        }
+                ret = ttm_set_page_range_wb(page, num_pages);
+                if (ret) {
+                        pr_err("%s: Failed to set %d pages to wb!\n",
+                               pool->dev_name, num_pages);
                 }
         }
--
2.18.0.233.g985f88cf7e-goog
On 07/26/2018 04:29 AM, Bas Nieuwenhuizen wrote:
Every set_pages_array_wb call resulted in cross-core interrupts and TLB flushes. Merge more of them for less overhead.
This reduces the time needed to free a 1.6 GiB GTT WC buffer as part of Vulkan CTS from ~2 sec to < 0.25 sec. (Allocation still takes more than 2 sec though)
Signed-off-by: Bas Nieuwenhuizen <basni@chromium.org>

 drivers/gpu/drm/ttm/ttm_page_alloc_dma.c | 31 ++++++++++++++++++------
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
index 4c659405a008a..9440ba0a55116 100644
--- a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
+++ b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
@@ -299,6 +299,25 @@ static int set_pages_array_uc(struct page **pages, int addrinarray)
 #endif
         return 0;
 }
+
+static int ttm_set_page_range_wb(struct page *p, unsigned long numpages)
+{
+#if IS_ENABLED(CONFIG_AGP)
+        unsigned long i;
+
+        for (i = 0; i < numpages; i++)
+                unmap_page_from_agp(p + i);
+#endif
+        return 0;
+}
+
+#else /* for !CONFIG_X86 */
+
+static int ttm_set_page_range_wb(struct page *p, unsigned long numpages)
+{
+        return set_memory_wb((unsigned long)page_address(p), numpages);
+}
+
 #endif /* for !CONFIG_X86 */
 
 static int ttm_set_pages_caching(struct dma_pool *pool,
@@ -387,18 +406,16 @@ static void ttm_pool_update_free_locked(struct dma_pool *pool,
 static void ttm_dma_page_put(struct dma_pool *pool, struct dma_page *d_page)
 {
         struct page *page = d_page->p;
-        unsigned i, num_pages;
+        unsigned num_pages;
         int ret;
 
         /* Don't set WB on WB page pool. */
         if (!(pool->type & IS_CACHED)) {
                 num_pages = pool->size / PAGE_SIZE;
-                for (i = 0; i < num_pages; ++i, ++page) {
-                        ret = set_pages_array_wb(&page, 1);
-                        if (ret) {
-                                pr_err("%s: Failed to set %d pages to wb!\n",
-                                       pool->dev_name, 1);
-                        }
+                ret = ttm_set_page_range_wb(page, num_pages);
With AGP enabled, set_pages_array_wc() could work like that by passing "num_pages" instead of "1". In the X86 case, we may use set_pages_array_wb() from arch/x86/mm/pageattr.c.
so, does it work as below?
ret = set_pages_array_wb(page, num_pages);
Jerry
+                if (ret) {
+                        pr_err("%s: Failed to set %d pages to wb!\n",
+                               pool->dev_name, num_pages);
                 }
         }
On Thu, Jul 26, 2018 at 7:52 AM, Zhang, Jerry (Junwei) Jerry.Zhang@amd.com wrote:
On 07/26/2018 04:29 AM, Bas Nieuwenhuizen wrote:
Every set_pages_array_wb call resulted in cross-core interrupts and TLB flushes. Merge more of them for less overhead.
This reduces the time needed to free a 1.6 GiB GTT WC buffer as part of Vulkan CTS from ~2 sec to < 0.25 sec. (Allocation still takes more than 2 sec though)
Signed-off-by: Bas Nieuwenhuizen <basni@chromium.org>

 drivers/gpu/drm/ttm/ttm_page_alloc_dma.c | 31 ++++++++++++++++++------
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
index 4c659405a008a..9440ba0a55116 100644
--- a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
+++ b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
@@ -299,6 +299,25 @@ static int set_pages_array_uc(struct page **pages, int addrinarray)
 #endif
         return 0;
 }
+
+static int ttm_set_page_range_wb(struct page *p, unsigned long numpages)
+{
+#if IS_ENABLED(CONFIG_AGP)
+        unsigned long i;
+
+        for (i = 0; i < numpages; i++)
+                unmap_page_from_agp(p + i);
+#endif
+        return 0;
+}
+
+#else /* for !CONFIG_X86 */
+
+static int ttm_set_page_range_wb(struct page *p, unsigned long numpages)
+{
+        return set_memory_wb((unsigned long)page_address(p), numpages);
+}
+
 #endif /* for !CONFIG_X86 */
 
 static int ttm_set_pages_caching(struct dma_pool *pool,
@@ -387,18 +406,16 @@ static void ttm_pool_update_free_locked(struct dma_pool *pool,
 static void ttm_dma_page_put(struct dma_pool *pool, struct dma_page *d_page)
 {
         struct page *page = d_page->p;
-        unsigned i, num_pages;
+        unsigned num_pages;
         int ret;
 
         /* Don't set WB on WB page pool. */
         if (!(pool->type & IS_CACHED)) {
                 num_pages = pool->size / PAGE_SIZE;
-                for (i = 0; i < num_pages; ++i, ++page) {
-                        ret = set_pages_array_wb(&page, 1);
-                        if (ret) {
-                                pr_err("%s: Failed to set %d pages to wb!\n",
-                                       pool->dev_name, 1);
-                        }
+                ret = ttm_set_page_range_wb(page, num_pages);
With AGP enabled, set_pages_array_wc() could work like that by passing "num_pages" instead of "1". In the X86 case, we may use set_pages_array_wb() from arch/x86/mm/pageattr.c.
so, does it work as below?
ret = set_pages_array_wb(page, num_pages);
No, that would not work. Note that we have an array of page structs here, while set_pages_array_wb() wants an array of pointers to page structs. We could allocate a temporary array and fill in the pointers, but that seems unnecessarily inefficient to me, and it probably would not reduce the amount of code either.
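To make the mismatch concrete, reusing set_pages_array_wb() here would look roughly like this (untested sketch, not part of the patch; it assumes the ttm_dma_page_put() context with page, num_pages and ret in scope):

        /* set_pages_array_wb() has the signature
         *   int set_pages_array_wb(struct page **pages, int addrinarray);
         * so for a contiguous struct page range we would first have to
         * build a temporary pointer array just to call it:
         */
        struct page **pages = kmalloc_array(num_pages, sizeof(*pages),
                                            GFP_KERNEL);
        unsigned j;

        if (pages) {
                for (j = 0; j < num_pages; ++j)
                        pages[j] = page + j;    /* pointer to each struct page */
                ret = set_pages_array_wb(pages, num_pages);
                kfree(pages);
        }

That extra allocation and fill loop is exactly the overhead the ranged helper avoids.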
Jerry
+                if (ret) {
+                        pr_err("%s: Failed to set %d pages to wb!\n",
+                               pool->dev_name, num_pages);
                 }
         }
On 25.07.2018 at 22:29, Bas Nieuwenhuizen wrote:
Every set_pages_array_wb call resulted in cross-core interrupts and TLB flushes. Merge more of them for less overhead.
This reduces the time needed to free a 1.6 GiB GTT WC buffer as part of Vulkan CTS from ~2 sec to < 0.25 sec. (Allocation still takes more than 2 sec though)
Yeah, I was already wondering when I originally implemented this if there isn't a better approach.
This needs a bit of cleanup I think, e.g. use set_pages_wb() instead of set_memory_wb() and we should move the non-x86 abstraction into a common header for both ttm_page_alloc_dma.c and ttm_page_alloc.c.
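Roughly something like this, perhaps (only a sketch; the header name and the wrapper are hypothetical, not existing code):

/* hypothetical shared header, say ttm_set_memory.h, usable from both
 * ttm_page_alloc.c and ttm_page_alloc_dma.c
 */
#ifdef CONFIG_X86
#include <asm/set_memory.h>

static inline int ttm_set_pages_wb(struct page *page, int numpages)
{
        return set_pages_wb(page, numpages);
}
#else
#if IS_ENABLED(CONFIG_AGP)
#include <asm/agp.h>
#endif

static inline int ttm_set_pages_wb(struct page *page, int numpages)
{
#if IS_ENABLED(CONFIG_AGP)
        int i;

        for (i = 0; i < numpages; ++i)
                unmap_page_from_agp(page + i);
#endif
        return 0;
}
#endif /* CONFIG_X86 */

That way both pool allocators get the same ranged helper and the #ifdef maze lives in one place.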
Bas, do you want to tackle this or should just I take a look?
Christian.
Signed-off-by: Bas Nieuwenhuizen <basni@chromium.org>

 drivers/gpu/drm/ttm/ttm_page_alloc_dma.c | 31 ++++++++++++++++++------
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
index 4c659405a008a..9440ba0a55116 100644
--- a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
+++ b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
@@ -299,6 +299,25 @@ static int set_pages_array_uc(struct page **pages, int addrinarray)
 #endif
         return 0;
 }
+
+static int ttm_set_page_range_wb(struct page *p, unsigned long numpages)
+{
+#if IS_ENABLED(CONFIG_AGP)
+        unsigned long i;
+
+        for (i = 0; i < numpages; i++)
+                unmap_page_from_agp(p + i);
+#endif
+        return 0;
+}
+
+#else /* for !CONFIG_X86 */
+
+static int ttm_set_page_range_wb(struct page *p, unsigned long numpages)
+{
+        return set_memory_wb((unsigned long)page_address(p), numpages);
+}
+
 #endif /* for !CONFIG_X86 */
 
 static int ttm_set_pages_caching(struct dma_pool *pool,
@@ -387,18 +406,16 @@ static void ttm_pool_update_free_locked(struct dma_pool *pool,
 static void ttm_dma_page_put(struct dma_pool *pool, struct dma_page *d_page)
 {
         struct page *page = d_page->p;
-        unsigned i, num_pages;
+        unsigned num_pages;
         int ret;
 
         /* Don't set WB on WB page pool. */
         if (!(pool->type & IS_CACHED)) {
                 num_pages = pool->size / PAGE_SIZE;
-                for (i = 0; i < num_pages; ++i, ++page) {
-                        ret = set_pages_array_wb(&page, 1);
-                        if (ret) {
-                                pr_err("%s: Failed to set %d pages to wb!\n",
-                                       pool->dev_name, 1);
-                        }
+                ret = ttm_set_page_range_wb(page, num_pages);
+                if (ret) {
+                        pr_err("%s: Failed to set %d pages to wb!\n",
+                               pool->dev_name, num_pages);
                 }
         }
On Thu, Jul 26, 2018 at 08:37:45AM +0200, Christian König wrote:
Am 25.07.2018 um 22:29 schrieb Bas Nieuwenhuizen:
Every set_pages_array_wb call resulted in cross-core interrupts and TLB flushes. Merge more of them for less overhead.
This reduces the time needed to free a 1.6 GiB GTT WC buffer as part of Vulkan CTS from ~2 sec to < 0.25 sec. (Allocation still takes more than 2 sec though)
Yeah, I was already wondering when I originally implemented this if there isn't a better approach.
This needs a bit of cleanup I think, e.g. use set_pages_wb() instead of set_memory_wb() and we should move the non-x86 abstraction into a common header for both ttm_page_alloc_dma.c and ttm_page_alloc.c.
Agree. At first glance I almost got it wrong by misreading "#ifndef CONFIG_X86", so it makes sense to move the non-x86 definitions into another header.
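For reference, this is the structure that is easy to misread, condensed from the patch (note that the #else branch is actually the CONFIG_X86 case, despite its comment):

#ifndef CONFIG_X86
/* this branch really is !CONFIG_X86: fall back to AGP unmapping */
static int ttm_set_page_range_wb(struct page *p, unsigned long numpages)
{
#if IS_ENABLED(CONFIG_AGP)
        unsigned long i;

        for (i = 0; i < numpages; i++)
                unmap_page_from_agp(p + i);
#endif
        return 0;
}
#else /* for !CONFIG_X86 */
/* despite the comment above, this branch is the CONFIG_X86 case */
static int ttm_set_page_range_wb(struct page *p, unsigned long numpages)
{
        return set_memory_wb((unsigned long)page_address(p), numpages);
}
#endif /* for !CONFIG_X86 */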
Thanks,
Ray
Bas, do you want to tackle this or should just I take a look?
Christian.
Signed-off-by: Bas Nieuwenhuizen <basni@chromium.org>

 drivers/gpu/drm/ttm/ttm_page_alloc_dma.c | 31 ++++++++++++++++++------
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
index 4c659405a008a..9440ba0a55116 100644
--- a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
+++ b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
@@ -299,6 +299,25 @@ static int set_pages_array_uc(struct page **pages, int addrinarray)
 #endif
         return 0;
 }
+
+static int ttm_set_page_range_wb(struct page *p, unsigned long numpages)
+{
+#if IS_ENABLED(CONFIG_AGP)
+        unsigned long i;
+
+        for (i = 0; i < numpages; i++)
+                unmap_page_from_agp(p + i);
+#endif
+        return 0;
+}
+
+#else /* for !CONFIG_X86 */
+
+static int ttm_set_page_range_wb(struct page *p, unsigned long numpages)
+{
+        return set_memory_wb((unsigned long)page_address(p), numpages);
+}
+
 #endif /* for !CONFIG_X86 */
 
 static int ttm_set_pages_caching(struct dma_pool *pool,
@@ -387,18 +406,16 @@ static void ttm_pool_update_free_locked(struct dma_pool *pool,
 static void ttm_dma_page_put(struct dma_pool *pool, struct dma_page *d_page)
 {
         struct page *page = d_page->p;
-        unsigned i, num_pages;
+        unsigned num_pages;
         int ret;
 
         /* Don't set WB on WB page pool. */
         if (!(pool->type & IS_CACHED)) {
                 num_pages = pool->size / PAGE_SIZE;
-                for (i = 0; i < num_pages; ++i, ++page) {
-                        ret = set_pages_array_wb(&page, 1);
-                        if (ret) {
-                                pr_err("%s: Failed to set %d pages to wb!\n",
-                                       pool->dev_name, 1);
-                        }
+                ret = ttm_set_page_range_wb(page, num_pages);
+                if (ret) {
+                        pr_err("%s: Failed to set %d pages to wb!\n",
+                               pool->dev_name, num_pages);
                 }
         }