[S390] pgtables: Fix race in enable_sie vs. page table ops
The current enable_sie code sets the mm->context.pgstes bit to tell dup_mm that the new mm should have extended page tables. This bit is also used by the s390-specific page table primitives to decide about the page table layout — which means context.pgstes has two meanings. This can cause all kinds of bugs. For example, shrink_zone can call ptep_clear_flush_young while enable_sie is running. ptep_clear_flush_young will test context.pgstes; since enable_sie changed that value on the old struct mm without changing the page table layout, ptep_clear_flush_young will do the wrong thing. The solution is to split pgstes into two bits: one for the allocation, one for the current state. Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com> Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
This commit is contained in:
committed by
Martin Schwidefsky
parent
2c78091405
commit
250cf776f7
@@ -7,7 +7,8 @@ typedef struct {
|
|||||||
unsigned long asce_bits;
|
unsigned long asce_bits;
|
||||||
unsigned long asce_limit;
|
unsigned long asce_limit;
|
||||||
int noexec;
|
int noexec;
|
||||||
int pgstes;
|
int has_pgste; /* The mmu context has extended page tables */
|
||||||
|
int alloc_pgste; /* cloned contexts will have extended page tables */
|
||||||
} mm_context_t;
|
} mm_context_t;
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
@@ -20,12 +20,25 @@ static inline int init_new_context(struct task_struct *tsk,
|
|||||||
#ifdef CONFIG_64BIT
|
#ifdef CONFIG_64BIT
|
||||||
mm->context.asce_bits |= _ASCE_TYPE_REGION3;
|
mm->context.asce_bits |= _ASCE_TYPE_REGION3;
|
||||||
#endif
|
#endif
|
||||||
if (current->mm->context.pgstes) {
|
if (current->mm->context.alloc_pgste) {
|
||||||
|
/*
|
||||||
|
* alloc_pgste indicates, that any NEW context will be created
|
||||||
|
* with extended page tables. The old context is unchanged. The
|
||||||
|
* page table allocation and the page table operations will
|
||||||
|
* look at has_pgste to distinguish normal and extended page
|
||||||
|
* tables. The only way to create extended page tables is to
|
||||||
|
* set alloc_pgste and then create a new context (e.g. dup_mm).
|
||||||
|
* The page table allocation is called after init_new_context
|
||||||
|
* and if has_pgste is set, it will create extended page
|
||||||
|
* tables.
|
||||||
|
*/
|
||||||
mm->context.noexec = 0;
|
mm->context.noexec = 0;
|
||||||
mm->context.pgstes = 1;
|
mm->context.has_pgste = 1;
|
||||||
|
mm->context.alloc_pgste = 1;
|
||||||
} else {
|
} else {
|
||||||
mm->context.noexec = s390_noexec;
|
mm->context.noexec = s390_noexec;
|
||||||
mm->context.pgstes = 0;
|
mm->context.has_pgste = 0;
|
||||||
|
mm->context.alloc_pgste = 0;
|
||||||
}
|
}
|
||||||
mm->context.asce_limit = STACK_TOP_MAX;
|
mm->context.asce_limit = STACK_TOP_MAX;
|
||||||
crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm));
|
crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm));
|
||||||
|
@@ -679,7 +679,7 @@ static inline void pmd_clear(pmd_t *pmd)
|
|||||||
|
|
||||||
static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
|
static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
|
||||||
{
|
{
|
||||||
if (mm->context.pgstes)
|
if (mm->context.has_pgste)
|
||||||
ptep_rcp_copy(ptep);
|
ptep_rcp_copy(ptep);
|
||||||
pte_val(*ptep) = _PAGE_TYPE_EMPTY;
|
pte_val(*ptep) = _PAGE_TYPE_EMPTY;
|
||||||
if (mm->context.noexec)
|
if (mm->context.noexec)
|
||||||
@@ -763,7 +763,7 @@ static inline int kvm_s390_test_and_clear_page_dirty(struct mm_struct *mm,
|
|||||||
struct page *page;
|
struct page *page;
|
||||||
unsigned int skey;
|
unsigned int skey;
|
||||||
|
|
||||||
if (!mm->context.pgstes)
|
if (!mm->context.has_pgste)
|
||||||
return -EINVAL;
|
return -EINVAL;
|
||||||
rcp_lock(ptep);
|
rcp_lock(ptep);
|
||||||
pgste = (unsigned long *) (ptep + PTRS_PER_PTE);
|
pgste = (unsigned long *) (ptep + PTRS_PER_PTE);
|
||||||
@@ -794,7 +794,7 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
|
|||||||
int young;
|
int young;
|
||||||
unsigned long *pgste;
|
unsigned long *pgste;
|
||||||
|
|
||||||
if (!vma->vm_mm->context.pgstes)
|
if (!vma->vm_mm->context.has_pgste)
|
||||||
return 0;
|
return 0;
|
||||||
physpage = pte_val(*ptep) & PAGE_MASK;
|
physpage = pte_val(*ptep) & PAGE_MASK;
|
||||||
pgste = (unsigned long *) (ptep + PTRS_PER_PTE);
|
pgste = (unsigned long *) (ptep + PTRS_PER_PTE);
|
||||||
@@ -844,7 +844,7 @@ static inline void __ptep_ipte(unsigned long address, pte_t *ptep)
|
|||||||
static inline void ptep_invalidate(struct mm_struct *mm,
|
static inline void ptep_invalidate(struct mm_struct *mm,
|
||||||
unsigned long address, pte_t *ptep)
|
unsigned long address, pte_t *ptep)
|
||||||
{
|
{
|
||||||
if (mm->context.pgstes) {
|
if (mm->context.has_pgste) {
|
||||||
rcp_lock(ptep);
|
rcp_lock(ptep);
|
||||||
__ptep_ipte(address, ptep);
|
__ptep_ipte(address, ptep);
|
||||||
ptep_rcp_copy(ptep);
|
ptep_rcp_copy(ptep);
|
||||||
|
@@ -169,7 +169,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
|
|||||||
unsigned long *table;
|
unsigned long *table;
|
||||||
unsigned long bits;
|
unsigned long bits;
|
||||||
|
|
||||||
bits = (mm->context.noexec || mm->context.pgstes) ? 3UL : 1UL;
|
bits = (mm->context.noexec || mm->context.has_pgste) ? 3UL : 1UL;
|
||||||
spin_lock(&mm->page_table_lock);
|
spin_lock(&mm->page_table_lock);
|
||||||
page = NULL;
|
page = NULL;
|
||||||
if (!list_empty(&mm->context.pgtable_list)) {
|
if (!list_empty(&mm->context.pgtable_list)) {
|
||||||
@@ -186,7 +186,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
|
|||||||
pgtable_page_ctor(page);
|
pgtable_page_ctor(page);
|
||||||
page->flags &= ~FRAG_MASK;
|
page->flags &= ~FRAG_MASK;
|
||||||
table = (unsigned long *) page_to_phys(page);
|
table = (unsigned long *) page_to_phys(page);
|
||||||
if (mm->context.pgstes)
|
if (mm->context.has_pgste)
|
||||||
clear_table_pgstes(table);
|
clear_table_pgstes(table);
|
||||||
else
|
else
|
||||||
clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE);
|
clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE);
|
||||||
@@ -210,7 +210,7 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
|
|||||||
struct page *page;
|
struct page *page;
|
||||||
unsigned long bits;
|
unsigned long bits;
|
||||||
|
|
||||||
bits = (mm->context.noexec || mm->context.pgstes) ? 3UL : 1UL;
|
bits = (mm->context.noexec || mm->context.has_pgste) ? 3UL : 1UL;
|
||||||
bits <<= (__pa(table) & (PAGE_SIZE - 1)) / 256 / sizeof(unsigned long);
|
bits <<= (__pa(table) & (PAGE_SIZE - 1)) / 256 / sizeof(unsigned long);
|
||||||
page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
|
page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
|
||||||
spin_lock(&mm->page_table_lock);
|
spin_lock(&mm->page_table_lock);
|
||||||
@@ -257,7 +257,7 @@ int s390_enable_sie(void)
|
|||||||
struct mm_struct *mm, *old_mm;
|
struct mm_struct *mm, *old_mm;
|
||||||
|
|
||||||
/* Do we have pgstes? if yes, we are done */
|
/* Do we have pgstes? if yes, we are done */
|
||||||
if (tsk->mm->context.pgstes)
|
if (tsk->mm->context.has_pgste)
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
/* lets check if we are allowed to replace the mm */
|
/* lets check if we are allowed to replace the mm */
|
||||||
@@ -269,14 +269,14 @@ int s390_enable_sie(void)
|
|||||||
}
|
}
|
||||||
task_unlock(tsk);
|
task_unlock(tsk);
|
||||||
|
|
||||||
/* we copy the mm with pgstes enabled */
|
/* we copy the mm and let dup_mm create the page tables with pgstes */
|
||||||
tsk->mm->context.pgstes = 1;
|
tsk->mm->context.alloc_pgste = 1;
|
||||||
mm = dup_mm(tsk);
|
mm = dup_mm(tsk);
|
||||||
tsk->mm->context.pgstes = 0;
|
tsk->mm->context.alloc_pgste = 0;
|
||||||
if (!mm)
|
if (!mm)
|
||||||
return -ENOMEM;
|
return -ENOMEM;
|
||||||
|
|
||||||
/* Now lets check again if somebody attached ptrace etc */
|
/* Now lets check again if something happened */
|
||||||
task_lock(tsk);
|
task_lock(tsk);
|
||||||
if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
|
if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
|
||||||
tsk->mm != tsk->active_mm || tsk->mm->ioctx_list) {
|
tsk->mm != tsk->active_mm || tsk->mm->ioctx_list) {
|
||||||
|
Reference in New Issue
Block a user