The cost of faulting in all memory to be locked can be very high when working with large mappings. If only portions of the mapping will be used, pre-faulting the whole range imposes a needless penalty for locking.

For the large-file example, this is the usage pattern of a large statistical language model (and probably of other statistical or graphical models as well). For the security example, consider any application transacting in data that cannot be swapped out (credit card data, medical records, etc.).
This patch introduces the ability to request that pages are not pre-faulted, but are placed on the unevictable LRU when they are finally faulted in. The VM_LOCKONFAULT flag will be used together with VM_LOCKED and has no effect when set without VM_LOCKED. Setting VM_LOCKONFAULT on a VMA causes its pages to be added to the unevictable LRU when they are faulted in, or immediately if they are already present, but does not cause any missing pages to be faulted in.

Exposing this new lock state means that we can no longer overload the meaning of the FOLL_POPULATE flag. Prior to this patch it was used to mean that the VMA for a fault was locked, so we need the new FOLL_MLOCK flag to communicate the locked state of a VMA. FOLL_POPULATE will now only control whether the VMA should be populated, and in the VM_LOCKONFAULT case it will not be set.
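For illustration, an application would use lock-on-fault roughly as follows. This is only a sketch: it assumes the mlock2() syscall with an MLOCK_ONFAULT flag proposed elsewhere in this series, and the MLOCK_ONFAULT value and __NR_mlock2 number below are placeholders (there is no glibc wrapper yet):

#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef MLOCK_ONFAULT
#define MLOCK_ONFAULT 0x01	/* placeholder value for the proposed flag */
#endif

int main(void)
{
	size_t len = 1UL << 30;	/* large mapping, only partially used */
	char *model = mmap(NULL, len, PROT_READ | PROT_WRITE,
			   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (model == MAP_FAILED)
		return 1;

	/*
	 * Request lock-on-fault: nothing is pre-faulted here; each page
	 * is moved to the unevictable LRU only when first touched.
	 * __NR_mlock2 is whatever syscall number the series assigns.
	 */
	if (syscall(__NR_mlock2, model, len, MLOCK_ONFAULT))
		return 1;

	model[0] = 1;	/* faults in and locks just this first page */
	return 0;
}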
Signed-off-by: Eric B Munson <emunson@akamai.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: linux-kernel@vger.kernel.org
Cc: dri-devel@lists.freedesktop.org
Cc: linux-mm@kvack.org
Cc: linux-api@vger.kernel.org
---
 drivers/gpu/drm/drm_vm.c |  8 +++++++-
 fs/proc/task_mmu.c       |  1 +
 include/linux/mm.h       |  2 ++
 kernel/fork.c            |  2 +-
 mm/debug.c               |  1 +
 mm/gup.c                 | 10 ++++++++--
 mm/huge_memory.c         |  2 +-
 mm/hugetlb.c             |  4 ++--
 mm/mlock.c               |  2 +-
 mm/mmap.c                |  2 +-
 mm/rmap.c                |  4 ++--
 11 files changed, 27 insertions(+), 11 deletions(-)
diff --git a/drivers/gpu/drm/drm_vm.c b/drivers/gpu/drm/drm_vm.c
index aab49ee..103a5f6 100644
--- a/drivers/gpu/drm/drm_vm.c
+++ b/drivers/gpu/drm/drm_vm.c
@@ -699,9 +699,15 @@ int drm_vma_info(struct seq_file *m, void *data)
 		   (void *)(unsigned long)virt_to_phys(high_memory));

 	list_for_each_entry(pt, &dev->vmalist, head) {
+		char lock_flag = '-';
+
 		vma = pt->vma;
 		if (!vma)
 			continue;
+		if (vma->vm_flags & VM_LOCKONFAULT)
+			lock_flag = 'f';
+		else if (vma->vm_flags & VM_LOCKED)
+			lock_flag = 'l';
 		seq_printf(m, "\n%5d 0x%pK-0x%pK %c%c%c%c%c%c 0x%08lx000",
 			   pt->pid,
@@ -710,7 +716,7 @@ int drm_vma_info(struct seq_file *m, void *data)
 			   vma->vm_flags & VM_WRITE ? 'w' : '-',
 			   vma->vm_flags & VM_EXEC ? 'x' : '-',
 			   vma->vm_flags & VM_MAYSHARE ? 's' : 'p',
-			   vma->vm_flags & VM_LOCKED ? 'l' : '-',
+			   lock_flag,
 			   vma->vm_flags & VM_IO ? 'i' : '-',
 			   vma->vm_pgoff);

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index ca1e091..38d69fc 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -579,6 +579,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
 #ifdef CONFIG_X86_INTEL_MPX
 		[ilog2(VM_MPX)]		= "mp",
 #endif
+		[ilog2(VM_LOCKONFAULT)]	= "lf",
 		[ilog2(VM_LOCKED)]	= "lo",
 		[ilog2(VM_IO)]		= "io",
 		[ilog2(VM_SEQ_READ)]	= "sr",
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2e872f9..c2f3551 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -127,6 +127,7 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_PFNMAP	0x00000400	/* Page-ranges managed without "struct page", just pure PFN */
 #define VM_DENYWRITE	0x00000800	/* ETXTBSY on write attempts.. */

+#define VM_LOCKONFAULT	0x00001000	/* Lock the pages covered when they are faulted in */
 #define VM_LOCKED	0x00002000
 #define VM_IO		0x00004000	/* Memory mapped I/O or similar */

@@ -2043,6 +2044,7 @@ static inline struct page *follow_page(struct vm_area_struct *vma,
 #define FOLL_NUMA	0x200	/* force NUMA hinting page fault */
 #define FOLL_MIGRATION	0x400	/* wait for page to replace migration entry */
 #define FOLL_TRIED	0x800	/* a retry, previous pass started an IO */
+#define FOLL_MLOCK	0x1000	/* lock present pages */

 typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
 			void *data);
diff --git a/kernel/fork.c b/kernel/fork.c
index dbd9b8d..a949228 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -454,7 +454,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 		tmp->vm_mm = mm;
 		if (anon_vma_fork(tmp, mpnt))
 			goto fail_nomem_anon_vma_fork;
-		tmp->vm_flags &= ~VM_LOCKED;
+		tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
 		tmp->vm_next = tmp->vm_prev = NULL;
 		file = tmp->vm_file;
 		if (file) {
diff --git a/mm/debug.c b/mm/debug.c
index 76089dd..25176bb 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -121,6 +121,7 @@ static const struct trace_print_flags vmaflags_names[] = {
 	{VM_GROWSDOWN,			"growsdown"	},
 	{VM_PFNMAP,			"pfnmap"	},
 	{VM_DENYWRITE,			"denywrite"	},
+	{VM_LOCKONFAULT,		"lockonfault"	},
 	{VM_LOCKED,			"locked"	},
 	{VM_IO,				"io"		},
 	{VM_SEQ_READ,			"seqread"	},
diff --git a/mm/gup.c b/mm/gup.c
index 6297f6b..e632908 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -92,7 +92,7 @@ retry:
 		 */
 		mark_page_accessed(page);
 	}
-	if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) {
+	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
 		/*
 		 * The preliminary mapping check is mainly to avoid the
 		 * pointless overhead of lock_page on the ZERO_PAGE
@@ -265,6 +265,9 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
 	unsigned int fault_flags = 0;
 	int ret;

+	/* mlock all present pages, but do not fault in new pages */
+	if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
+		return -ENOENT;
 	/* For mm_populate(), just skip the stack guard page. */
 	if ((*flags & FOLL_POPULATE) &&
 			(stack_guard_page_start(vma, address) ||
@@ -850,7 +853,10 @@ long populate_vma_page_range(struct vm_area_struct *vma,
 	VM_BUG_ON_VMA(end   > vma->vm_end, vma);
 	VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);

-	gup_flags = FOLL_TOUCH | FOLL_POPULATE;
+	gup_flags = FOLL_TOUCH | FOLL_MLOCK;
+	if ((vma->vm_flags & (VM_LOCKED | VM_LOCKONFAULT)) == VM_LOCKED)
+		gup_flags |= FOLL_POPULATE;
+
 	/*
 	 * We want to touch writable mappings with a write fault in order
 	 * to break COW, except for shared mappings because these don't COW
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index c107094..98ee786 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1238,7 +1238,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 					  pmd, _pmd, 1))
 			update_mmu_cache_pmd(vma, addr, pmd);
 	}
-	if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) {
+	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
 		if (page->mapping && trylock_page(page)) {
 			lru_add_drain();
 			if (page->mapping)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a8c3087..82caa48 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3764,8 +3764,8 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma,
 	unsigned long s_end = sbase + PUD_SIZE;

 	/* Allow segments to share if only one is marked locked */
-	unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED;
-	unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED;
+	unsigned long vm_flags = vma->vm_flags & ~(VM_LOCKED | VM_LOCKONFAULT);
+	unsigned long svm_flags = svma->vm_flags & ~(VM_LOCKED | VM_LOCKONFAULT);

 	/*
 	 * match the virtual addresses, permission and the alignment of the
diff --git a/mm/mlock.c b/mm/mlock.c
index 807f986..2a3a860 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -422,7 +422,7 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
 void munlock_vma_pages_range(struct vm_area_struct *vma,
 			     unsigned long start, unsigned long end)
 {
-	vma->vm_flags &= ~VM_LOCKED;
+	vma->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);

 	while (start < end) {
 		struct page *page = NULL;
diff --git a/mm/mmap.c b/mm/mmap.c
index aa632ad..bdbefc3 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1651,7 +1651,7 @@ out:
 					vma == get_gate_vma(current->mm)))
 			mm->locked_vm += (len >> PAGE_SHIFT);
 		else
-			vma->vm_flags &= ~VM_LOCKED;
+			vma->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
 	}

 	if (file)
diff --git a/mm/rmap.c b/mm/rmap.c
index 171b687..47c855a 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -744,7 +744,7 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,

 		if (vma->vm_flags & VM_LOCKED) {
 			spin_unlock(ptl);
-			pra->vm_flags |= VM_LOCKED;
+			pra->vm_flags |= (vma->vm_flags & (VM_LOCKED | VM_LOCKONFAULT));
 			return SWAP_FAIL; /* To break the loop */
 		}

@@ -765,7 +765,7 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma,

 		if (vma->vm_flags & VM_LOCKED) {
 			pte_unmap_unlock(pte, ptl);
-			pra->vm_flags |= VM_LOCKED;
+			pra->vm_flags |= (vma->vm_flags & (VM_LOCKED | VM_LOCKONFAULT));
 			return SWAP_FAIL; /* To break the loop */
 		}
On 07/29/2015 05:42 PM, Eric B Munson wrote:
[...]
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index ca1e091..38d69fc 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -579,6 +579,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
This function has the following comment:
Don't forget to update Documentation/ on changes.
[...]
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -92,7 +92,7 @@ retry:
 		 */
 		mark_page_accessed(page);
 	}
-	if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) {
+	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
 		/*
 		 * The preliminary mapping check is mainly to avoid the
 		 * pointless overhead of lock_page on the ZERO_PAGE

@@ -265,6 +265,9 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
 	unsigned int fault_flags = 0;
 	int ret;

+	/* mlock all present pages, but do not fault in new pages */
+	if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
+		return -ENOENT;
 	/* For mm_populate(), just skip the stack guard page. */
 	if ((*flags & FOLL_POPULATE) &&
 			(stack_guard_page_start(vma, address) ||

@@ -850,7 +853,10 @@ long populate_vma_page_range(struct vm_area_struct *vma,
 	VM_BUG_ON_VMA(end > vma->vm_end, vma);
 	VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);

-	gup_flags = FOLL_TOUCH | FOLL_POPULATE;
+	gup_flags = FOLL_TOUCH | FOLL_MLOCK;
+	if ((vma->vm_flags & (VM_LOCKED | VM_LOCKONFAULT)) == VM_LOCKED)
+		gup_flags |= FOLL_POPULATE;
+
 	/*
 	 * We want to touch writable mappings with a write fault in order
 	 * to break COW, except for shared mappings because these don't COW
I think this might be breaking the populate part of the mmap(MAP_POPULATE & ~MAP_LOCKED) case, if I follow the execution correctly (it's far from simple...):

SYSCALL_DEFINE6(mmap_pgoff, ...) with MAP_POPULATE
  vm_mmap_pgoff(..., MAP_POPULATE, ...)
    do_mmap_pgoff(... MAP_POPULATE ... &populate) -> populate == true
  mm_populate()
    __mm_populate()
      populate_vma_page_range()
Previously, this path would have FOLL_POPULATE in gup_flags and continue with __get_user_pages() and faultin_page() (actually regardless of FOLL_POPULATE) which would fault in the pages.
After your patch, populate_vma_page_range() will set FOLL_MLOCK, but since VM_LOCKED is not set, FOLL_POPULATE won't be set either. Then faultin_page() will return on the new check:
(*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK
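If so, it should be easy to see from userspace; a minimal check might look like this sketch (untested, and mincore() residency is only a proxy for what mm_populate() does):

#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	unsigned char vec[16];
	/* MAP_POPULATE without MAP_LOCKED: all pages should be pre-faulted */
	void *p = mmap(NULL, 16 * psz, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0);

	if (p == MAP_FAILED || mincore(p, 16 * psz, vec))
		return 1;

	for (int i = 0; i < 16; i++)
		if (!(vec[i] & 1)) {
			printf("page %d not resident: populate is broken\n", i);
			return 1;
		}
	printf("all pages resident\n");
	return 0;
}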
On Thu, Aug 06, 2015 at 05:33:54PM +0200, Vlastimil Babka wrote:
[...]
I think this might be breaking the populate part of the mmap(MAP_POPULATE & ~MAP_LOCKED) case, if I follow the execution correctly (it's far from simple...):

SYSCALL_DEFINE6(mmap_pgoff, ...) with MAP_POPULATE
  vm_mmap_pgoff(..., MAP_POPULATE, ...)
    do_mmap_pgoff(... MAP_POPULATE ... &populate) -> populate == true
  mm_populate()
    __mm_populate()
      populate_vma_page_range()

Previously, this path would have FOLL_POPULATE in gup_flags and continue with __get_user_pages() and faultin_page() (actually regardless of FOLL_POPULATE) which would fault in the pages.

After your patch, populate_vma_page_range() will set FOLL_MLOCK, but since VM_LOCKED is not set, FOLL_POPULATE won't be set either. Then faultin_page() will return on the new check:

(*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK
Good catch!
I guess it should be something like:
	gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK;
	if (vma->vm_flags & VM_LOCKONFAULT)
		gup_flags &= ~FOLL_POPULATE;
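If I read the surrounding code right, the three interesting cases then work out as follows (assuming the mlock path in follow_page_pte() keeps checking VM_LOCKED as it does above):

- VM_LOCKED alone: FOLL_POPULATE | FOLL_MLOCK, so pages are faulted in and mlocked, as before.
- VM_LOCKED | VM_LOCKONFAULT: FOLL_MLOCK alone, so present pages are mlocked but missing ones are not faulted in.
- neither flag (plain MAP_POPULATE): FOLL_POPULATE | FOLL_MLOCK, so pages are faulted in; the stray FOLL_MLOCK is harmless because the mlock path also requires VM_LOCKED.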
On Thu, 06 Aug 2015, Vlastimil Babka wrote:
[...]
I am on vacation at the moment, but I will try to respin this series after making sure there aren't any more FOLL flag issues.
Thanks for keeping with these :)
Eric
On Wed, Jul 29, 2015 at 11:42:52AM -0400, Eric B Munson wrote: [...]
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index ca1e091..38d69fc 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -579,6 +579,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
 #ifdef CONFIG_X86_INTEL_MPX
 		[ilog2(VM_MPX)]		= "mp",
 #endif
+		[ilog2(VM_LOCKONFAULT)]	= "lf",
 		[ilog2(VM_LOCKED)]	= "lo",
 		[ilog2(VM_IO)]		= "io",
 		[ilog2(VM_SEQ_READ)]	= "sr",
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2e872f9..c2f3551 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -127,6 +127,7 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_PFNMAP	0x00000400	/* Page-ranges managed without "struct page", just pure PFN */
 #define VM_DENYWRITE	0x00000800	/* ETXTBSY on write attempts.. */

+#define VM_LOCKONFAULT	0x00001000	/* Lock the pages covered when they are faulted in */
 #define VM_LOCKED	0x00002000
 #define VM_IO		0x00004000	/* Memory mapped I/O or similar */
This clashes with another change currently in linux-next:
81d056997385 userfaultfd: add VM_UFFD_MISSING and VM_UFFD_WP
Adding Andrea for visibility.
I noticed this because I was trying to get selftests/vm/mlock2-tests working, to see if the new mlock2 syscall would work on ARM. It didn't, so I had to investigate and noticed that two symbolic names resolve to the same value, which results in the VM_UFFD_WP entry overwriting the VM_LOCKONFAULT entry in the mnemonics table (first hunk above).
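Concretely, both flags are defined as 0x00001000, so ilog2(0x00001000) yields 12 for both, the two designated initializers target the same array slot, and per C initializer semantics the later one silently wins:

	const char *mnemonic[] = {
		[12] = "lf",	/* ilog2(VM_LOCKONFAULT) == 12 */
		[12] = "uw",	/* ilog2(VM_UFFD_WP) == 12, overwrites "lf" */
	};

(The "uw" mnemonic above is only illustrative; the point is that the last initializer for a given index wins.)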
I've applied the following patch locally to fix this up.
Andrew, I think both of those patches came in via your tree, so perhaps the best thing would be to squash the below (provided everybody agrees that it's the right fix) into Eric's patch that adds the VM_LOCKONFAULT flag?
Thierry
---- >8 ----
From a0003ebfeb15f91094d17961633cabb4e1beed21 Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Fri, 7 Aug 2015 14:23:42 +0200
Subject: [PATCH] mm: Fix VM_LOCKONFAULT clash with VM_UFFD_WP
Currently two patches in linux-next add new VM flags and unfortunately two flags end up using the same value. This results for example in the /proc/pid/smaps file not listing the VM_LOCKONFAULT flag, which breaks tools/testing/selftests/vm/mlock2-tests.
Signed-off-by: Thierry Reding <treding@nvidia.com>
---
 fs/proc/task_mmu.c | 2 +-
 include/linux/mm.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index bdd7e48a85f0..893e4b9bb2da 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -592,13 +592,13 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
 #ifdef CONFIG_X86_INTEL_MPX
 		[ilog2(VM_MPX)]		= "mp",
 #endif
-		[ilog2(VM_LOCKONFAULT)]	= "lf",
 		[ilog2(VM_LOCKED)]	= "lo",
 		[ilog2(VM_IO)]		= "io",
 		[ilog2(VM_SEQ_READ)]	= "sr",
 		[ilog2(VM_RAND_READ)]	= "rr",
 		[ilog2(VM_DONTCOPY)]	= "dc",
 		[ilog2(VM_DONTEXPAND)]	= "de",
+		[ilog2(VM_LOCKONFAULT)]	= "lf",
 		[ilog2(VM_ACCOUNT)]	= "ac",
 		[ilog2(VM_NORESERVE)]	= "nr",
 		[ilog2(VM_HUGETLB)]	= "ht",
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 363ea2cda35f..cb4e1737d669 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -129,7 +129,6 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_DENYWRITE	0x00000800	/* ETXTBSY on write attempts.. */
 #define VM_UFFD_WP	0x00001000	/* wrprotect pages tracking */

-#define VM_LOCKONFAULT	0x00001000	/* Lock the pages covered when they are faulted in */
 #define VM_LOCKED	0x00002000
 #define VM_IO		0x00004000	/* Memory mapped I/O or similar */

@@ -139,6 +138,7 @@ extern unsigned int kobjsize(const void *objp);

 #define VM_DONTCOPY	0x00020000	/* Do not copy this vma on fork */
 #define VM_DONTEXPAND	0x00040000	/* Cannot expand with mremap() */
+#define VM_LOCKONFAULT	0x00080000	/* Lock the pages covered when they are faulted in */
 #define VM_ACCOUNT	0x00100000	/* Is a VM accounted object */
 #define VM_NORESERVE	0x00200000	/* should the VM suppress accounting */
 #define VM_HUGETLB	0x00400000	/* Huge TLB Page VM */
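For reference, verifying the fix boils down to something like the following sketch. It is simplified relative to the real mlock2-tests (it checks for "lf" on any VmFlags line rather than the one for the locked VMA), and MLOCK_ONFAULT/__NR_mlock2 are placeholders assumed from the mlock2() series:

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef MLOCK_ONFAULT
#define MLOCK_ONFAULT 0x01	/* placeholder value for the proposed flag */
#endif

int main(void)
{
	char line[1024];
	int found = 0;
	long psz = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, psz, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	FILE *f;

	if (p == MAP_FAILED || syscall(__NR_mlock2, p, psz, MLOCK_ONFAULT))
		return 1;

	/* Scan every VmFlags line for the "lf" mnemonic. */
	f = fopen("/proc/self/smaps", "r");
	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		if (!strncmp(line, "VmFlags:", 8) && strstr(line, " lf"))
			found = 1;
	fclose(f);

	printf("VM_LOCKONFAULT %s in smaps\n", found ? "shown" : "missing");
	return !found;
}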