$NetBSD: patch-XSA286,v 1.2 2020/11/12 11:29:25 bouyer Exp $

From: Jan Beulich <jbeulich@suse.com>
Subject: x86: don't allow clearing of TF_kernel_mode for other than 64-bit PV

The flag is really only meant for those, both HVM and 32-bit PV tell
kernel from user mode based on CPL/RPL. Remove the all-question-marks
comment and let's be on the safe side here and also suppress clearing
for 32-bit PV (this isn't a fast path after all).

Remove no longer necessary is_pv_32bit_*() from sh_update_cr3() and
sh_walk_guest_tables(). Note that shadow_one_bit_disable() already
assumes the new behavior.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Wei Liu <wei.liu2@citrix.com>
Acked-by: George Dunlap <george.dunlap@citrix.com>
Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index 35857dbe86..1d0ac81c5b 100644
--- xen/arch/x86/domain.c.orig
+++ xen/arch/x86/domain.c
@@ -804,9 +804,15 @@ int arch_set_info_guest(

v->fpu_initialised = !!(flags & VGCF_I387_VALID);

- v->arch.flags &= ~TF_kernel_mode;
- if ( (flags & VGCF_in_kernel) || is_hvm_domain(d)/*???*/ )
- v->arch.flags |= TF_kernel_mode;
+ v->arch.flags |= TF_kernel_mode;
+ if ( unlikely(!(flags & VGCF_in_kernel)) &&
+ /*
+ * TF_kernel_mode is only allowed to be clear for 64-bit PV. See
+ * update_cr3(), sh_update_cr3(), sh_walk_guest_tables(), and
+ * shadow_one_bit_disable() for why that is.
+ */
+ !is_hvm_domain(d) && !is_pv_32bit_domain(d) )
+ v->arch.flags &= ~TF_kernel_mode;

v->arch.vgc_flags = flags;

diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c
index 8ab343d16e..a2ebb4943f 100644
--- xen/arch/x86/mm/shadow/multi.c.orig
+++ xen/arch/x86/mm/shadow/multi.c
@@ -180,7 +180,7 @@ sh_walk_guest_tables(struct vcpu *v, unsigned long va, walk_t *gw,
INVALID_MFN, v->arch.paging.shadow.gl3e);
#else /* 32 or 64 */
const struct domain *d = v->domain;
- mfn_t root_mfn = ((v->arch.flags & TF_kernel_mode) || is_pv_32bit_domain(d)
+ mfn_t root_mfn = (v->arch.flags & TF_kernel_mode
? pagetable_get_mfn(v->arch.guest_table)
: pagetable_get_mfn(v->arch.guest_table_user));
void *root_map = map_domain_page(root_mfn);
@@ -4018,7 +4018,7 @@ sh_update_cr3(struct vcpu *v, int do_locking, bool noflush)
v, (unsigned long)pagetable_get_pfn(v->arch.guest_table));

#if GUEST_PAGING_LEVELS == 4
- if ( !(v->arch.flags & TF_kernel_mode) && !is_pv_32bit_domain(d) )
+ if ( !(v->arch.flags & TF_kernel_mode) )
gmfn = pagetable_get_mfn(v->arch.guest_table_user);
else
#endif
From: Jan Beulich <jbeulich@suse.com>
Subject: x86/mm: split L4 and L3 parts of the walk out of do_page_walk()

The L3 one at least is going to be re-used by a subsequent patch, and
splitting the L4 one then as well seems only natural.

This is part of XSA-286.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
index 3bd157967a..e73daa55e4 100644
--- xen/arch/x86/x86_64/mm.c.orig
+++ xen/arch/x86/x86_64/mm.c
@@ -44,26 +44,47 @@ unsigned int __read_mostly m2p_compat_vstart = __HYPERVISOR_COMPAT_VIRT_START;

l2_pgentry_t *compat_idle_pg_table_l2;

-void *do_page_walk(struct vcpu *v, unsigned long addr)
+static l4_pgentry_t page_walk_get_l4e(pagetable_t root, unsigned long addr)
{
- unsigned long mfn = pagetable_get_pfn(v->arch.guest_table);
- l4_pgentry_t l4e, *l4t;
- l3_pgentry_t l3e, *l3t;
- l2_pgentry_t l2e, *l2t;
- l1_pgentry_t l1e, *l1t;
+ unsigned long mfn = pagetable_get_pfn(root);
+ l4_pgentry_t *l4t, l4e;

- if ( !is_pv_vcpu(v) || !is_canonical_address(addr) )
- return NULL;
+ if ( !is_canonical_address(addr) )
+ return l4e_empty();

l4t = map_domain_page(_mfn(mfn));
l4e = l4t[l4_table_offset(addr)];
unmap_domain_page(l4t);
+
+ return l4e;
+}
+
+static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr)
+{
+ l4_pgentry_t l4e = page_walk_get_l4e(root, addr);
+ l3_pgentry_t *l3t, l3e;
+
if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
- return NULL;
+ return l3e_empty();

l3t = map_l3t_from_l4e(l4e);
l3e = l3t[l3_table_offset(addr)];
unmap_domain_page(l3t);
+
+ return l3e;
+}
+
+void *do_page_walk(struct vcpu *v, unsigned long addr)
+{
+ l3_pgentry_t l3e;
+ l2_pgentry_t l2e, *l2t;
+ l1_pgentry_t l1e, *l1t;
+ unsigned long mfn;
+
+ if ( !is_pv_vcpu(v) )
+ return NULL;
+
+ l3e = page_walk_get_l3e(v->arch.guest_table, addr);
mfn = l3e_get_pfn(l3e);
if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
return NULL;
From: Jan Beulich <jbeulich@suse.com>
Subject: x86/mm: check page types in do_page_walk()

For page table entries read to be guaranteed valid, transiently locking
the pages and validating their types is necessary. Note that guest use
of linear page tables is intentionally not taken into account here, as
ordinary data (guest stacks) can't possibly live inside page tables.

This is part of XSA-286.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
index e73daa55e4..1ca9547d68 100644
--- xen/arch/x86/x86_64/mm.c.orig
+++ xen/arch/x86/x86_64/mm.c
@@ -46,15 +46,29 @@ l2_pgentry_t *compat_idle_pg_table_l2;

static l4_pgentry_t page_walk_get_l4e(pagetable_t root, unsigned long addr)
{
- unsigned long mfn = pagetable_get_pfn(root);
- l4_pgentry_t *l4t, l4e;
+ mfn_t mfn = pagetable_get_mfn(root);
+ /* current's root page table can't disappear under our feet. */
+ bool need_lock = !mfn_eq(mfn, pagetable_get_mfn(current->arch.guest_table));
+ struct page_info *pg;
+ l4_pgentry_t l4e = l4e_empty();

if ( !is_canonical_address(addr) )
return l4e_empty();

- l4t = map_domain_page(_mfn(mfn));
- l4e = l4t[l4_table_offset(addr)];
- unmap_domain_page(l4t);
+ pg = mfn_to_page(mfn);
+ if ( need_lock && !page_lock(pg) )
+ return l4e_empty();
+
+ if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l4_page_table )
+ {
+ l4_pgentry_t *l4t = map_domain_page(mfn);
+
+ l4e = l4t[l4_table_offset(addr)];
+ unmap_domain_page(l4t);
+ }
+
+ if ( need_lock )
+ page_unlock(pg);

return l4e;
}
@@ -62,14 +76,26 @@ static l4_pgentry_t page_walk_get_l4e(pagetable_t root, unsigned long addr)
static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr)
{
l4_pgentry_t l4e = page_walk_get_l4e(root, addr);
- l3_pgentry_t *l3t, l3e;
+ mfn_t mfn = l4e_get_mfn(l4e);
+ struct page_info *pg;
+ l3_pgentry_t l3e = l3e_empty();

if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
return l3e_empty();

- l3t = map_l3t_from_l4e(l4e);
- l3e = l3t[l3_table_offset(addr)];
- unmap_domain_page(l3t);
+ pg = mfn_to_page(mfn);
+ if ( !page_lock(pg) )
+ return l3e_empty();
+
+ if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l3_page_table )
+ {
+ l3_pgentry_t *l3t = map_domain_page(mfn);
+
+ l3e = l3t[l3_table_offset(addr)];
+ unmap_domain_page(l3t);
+ }
+
+ page_unlock(pg);

return l3e;
}
@@ -77,44 +103,67 @@ static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr)
void *do_page_walk(struct vcpu *v, unsigned long addr)
{
l3_pgentry_t l3e;
- l2_pgentry_t l2e, *l2t;
- l1_pgentry_t l1e, *l1t;
- unsigned long mfn;
+ l2_pgentry_t l2e = l2e_empty();
+ l1_pgentry_t l1e = l1e_empty();
+ mfn_t mfn;
+ struct page_info *pg;

if ( !is_pv_vcpu(v) )
return NULL;

l3e = page_walk_get_l3e(v->arch.guest_table, addr);
- mfn = l3e_get_pfn(l3e);
- if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
+ mfn = l3e_get_mfn(l3e);
+ if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(mfn) )
return NULL;
if ( (l3e_get_flags(l3e) & _PAGE_PSE) )
{
- mfn += PFN_DOWN(addr & ((1UL << L3_PAGETABLE_SHIFT) - 1));
+ mfn = mfn_add(mfn, PFN_DOWN(addr & ((1UL << L3_PAGETABLE_SHIFT) - 1)));
goto ret;
}

- l2t = map_domain_page(_mfn(mfn));
- l2e = l2t[l2_table_offset(addr)];
- unmap_domain_page(l2t);
- mfn = l2e_get_pfn(l2e);
- if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
+ pg = mfn_to_page(mfn);
+ if ( !page_lock(pg) )
+ return NULL;
+
+ if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l2_page_table )
+ {
+ const l2_pgentry_t *l2t = map_domain_page(mfn);
+
+ l2e = l2t[l2_table_offset(addr)];
+ unmap_domain_page(l2t);
+ }
+
+ page_unlock(pg);
+
+ mfn = l2e_get_mfn(l2e);
+ if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || !mfn_valid(mfn) )
return NULL;
if ( (l2e_get_flags(l2e) & _PAGE_PSE) )
{
- mfn += PFN_DOWN(addr & ((1UL << L2_PAGETABLE_SHIFT) - 1));
+ mfn = mfn_add(mfn, PFN_DOWN(addr & ((1UL << L2_PAGETABLE_SHIFT) - 1)));
goto ret;
}

- l1t = map_domain_page(_mfn(mfn));
- l1e = l1t[l1_table_offset(addr)];
- unmap_domain_page(l1t);
- mfn = l1e_get_pfn(l1e);
- if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
+ pg = mfn_to_page(mfn);
+ if ( !page_lock(pg) )
+ return NULL;
+
+ if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table )
+ {
+ const l1_pgentry_t *l1t = map_domain_page(mfn);
+
+ l1e = l1t[l1_table_offset(addr)];
+ unmap_domain_page(l1t);
+ }
+
+ page_unlock(pg);
+
+ mfn = l1e_get_mfn(l1e);
+ if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !mfn_valid(mfn) )
return NULL;

ret:
- return map_domain_page(_mfn(mfn)) + (addr & ~PAGE_MASK);
+ return map_domain_page(mfn) + (addr & ~PAGE_MASK);
}

/*
From: Jan Beulich <jbeulich@suse.com>
Subject: x86/mm: avoid using linear page tables in map_guest_l1e()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the linear L2 table access by an actual page walk.

This is part of XSA-286.

Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

diff --git a/xen/arch/x86/pv/mm.c b/xen/arch/x86/pv/mm.c
index 80bf280fb2..ee08c13881 100644
--- xen/arch/x86/pv/mm.c.orig
+++ xen/arch/x86/pv/mm.c
@@ -40,11 +40,14 @@ l1_pgentry_t *map_guest_l1e(unsigned long linear, mfn_t *gl1mfn)
if ( unlikely(!__addr_ok(linear)) )
return NULL;

- /* Find this l1e and its enclosing l1mfn in the linear map. */
- if ( __copy_from_user(&l2e,
- &__linear_l2_table[l2_linear_offset(linear)],
- sizeof(l2_pgentry_t)) )
+ if ( unlikely(!(current->arch.flags & TF_kernel_mode)) )
+ {
+ ASSERT_UNREACHABLE();
return NULL;
+ }
+
+ /* Find this l1e and its enclosing l1mfn. */
+ l2e = page_walk_get_l2e(current->arch.guest_table, linear);

/* Check flags that it will be safe to read the l1e. */
if ( (l2e_get_flags(l2e) & (_PAGE_PRESENT | _PAGE_PSE)) != _PAGE_PRESENT )
diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
index 1ca9547d68..dfa33ba894 100644
--- xen/arch/x86/x86_64/mm.c.orig
+++ xen/arch/x86/x86_64/mm.c
@@ -100,6 +100,34 @@ static l3_pgentry_t page_walk_get_l3e(pagetable_t root, unsigned long addr)
return l3e;
}

+l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr)
+{
+ l3_pgentry_t l3e = page_walk_get_l3e(root, addr);
+ mfn_t mfn = l3e_get_mfn(l3e);
+ struct page_info *pg;
+ l2_pgentry_t l2e = l2e_empty();
+
+ if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) ||
+ (l3e_get_flags(l3e) & _PAGE_PSE) )
+ return l2e_empty();
+
+ pg = mfn_to_page(mfn);
+ if ( !page_lock(pg) )
+ return l2e_empty();
+
+ if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l2_page_table )
+ {
+ l2_pgentry_t *l2t = map_domain_page(mfn);
+
+ l2e = l2t[l2_table_offset(addr)];
+ unmap_domain_page(l2t);
+ }
+
+ page_unlock(pg);
+
+ return l2e;
+}
+
void *do_page_walk(struct vcpu *v, unsigned long addr)
{
l3_pgentry_t l3e;
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index 7825691d06..afafe87fe7 100644
--- xen/include/asm-x86/mm.h.orig
+++ xen/include/asm-x86/mm.h
@@ -585,7 +585,9 @@ void audit_domains(void);
void make_cr3(struct vcpu *v, mfn_t mfn);
void update_cr3(struct vcpu *v);
int vcpu_destroy_pagetables(struct vcpu *);
+
void *do_page_walk(struct vcpu *v, unsigned long addr);
+l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr);

int __sync_local_execstate(void);

From: Jan Beulich <jbeulich@suse.com>
Subject: x86/mm: avoid using linear page tables in guest_get_eff_kern_l1e()

First of all drop guest_get_eff_l1e() entirely - there's no actual user
of it: pv_ro_page_fault() has a guest_kernel_mode() conditional around
its only call site.

Then replace the linear L1 table access by an actual page walk.

This is part of XSA-286.

Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

diff --git a/xen/arch/x86/pv/mm.c b/xen/arch/x86/pv/mm.c
index ee08c13881..c70785d0cf 100644
--- xen/arch/x86/pv/mm.c.orig
+++ xen/arch/x86/pv/mm.c
@@ -59,27 +59,6 @@ l1_pgentry_t *map_guest_l1e(unsigned long linear, mfn_t *gl1mfn)
}

/*
- * Read the guest's l1e that maps this address, from the kernel-mode
- * page tables.
- */
-static l1_pgentry_t guest_get_eff_kern_l1e(unsigned long linear)
-{
- struct vcpu *curr = current;
- const bool user_mode = !(curr->arch.flags & TF_kernel_mode);
- l1_pgentry_t l1e;
-
- if ( user_mode )
- toggle_guest_pt(curr);
-
- l1e = guest_get_eff_l1e(linear);
-
- if ( user_mode )
- toggle_guest_pt(curr);
-
- return l1e;
-}
-
-/*
* Map a guest's LDT page (covering the byte at @offset from start of the LDT)
* into Xen's virtual range. Returns true if the mapping changed, false
* otherwise.
diff --git a/xen/arch/x86/pv/mm.h b/xen/arch/x86/pv/mm.h
index 976209ba4c..cc4ee1affb 100644
--- xen/arch/x86/pv/mm.h.orig
+++ xen/arch/x86/pv/mm.h
@@ -5,19 +5,19 @@ l1_pgentry_t *map_guest_l1e(unsigned long linear, mfn_t *gl1mfn);

int new_guest_cr3(mfn_t mfn);

-/* Read a PV guest's l1e that maps this linear address. */
-static inline l1_pgentry_t guest_get_eff_l1e(unsigned long linear)
+/*
+ * Read the guest's l1e that maps this address, from the kernel-mode
+ * page tables.
+ */
+static inline l1_pgentry_t guest_get_eff_kern_l1e(unsigned long linear)
{
- l1_pgentry_t l1e;
+ l1_pgentry_t l1e = l1e_empty();

ASSERT(!paging_mode_translate(current->domain));
ASSERT(!paging_mode_external(current->domain));

- if ( unlikely(!__addr_ok(linear)) ||
- __copy_from_user(&l1e,
- &__linear_l1_table[l1_linear_offset(linear)],
- sizeof(l1_pgentry_t)) )
- l1e = l1e_empty();
+ if ( likely(__addr_ok(linear)) )
+ l1e = page_walk_get_l1e(current->arch.guest_table, linear);

return l1e;
}
diff --git a/xen/arch/x86/pv/ro-page-fault.c b/xen/arch/x86/pv/ro-page-fault.c
index a3c0c2dd19..c9ee5156f8 100644
--- xen/arch/x86/pv/ro-page-fault.c.orig
+++ xen/arch/x86/pv/ro-page-fault.c
@@ -357,7 +357,7 @@ int pv_ro_page_fault(unsigned long addr, struct cpu_user_regs *regs)
bool mmio_ro;

/* Attempt to read the PTE that maps the VA being accessed. */
- pte = guest_get_eff_l1e(addr);
+ pte = guest_get_eff_kern_l1e(addr);

/* We are only looking for read-only mappings */
if ( ((l1e_get_flags(pte) & (_PAGE_PRESENT | _PAGE_RW)) != _PAGE_PRESENT) )
diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
index dfa33ba894..cca7ea6e9d 100644
--- xen/arch/x86/x86_64/mm.c.orig
+++ xen/arch/x86/x86_64/mm.c
@@ -128,6 +128,62 @@ l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr)
return l2e;
}

+/*
+ * For now no "set_accessed" parameter, as all callers want it set to true.
+ * For now also no "set_dirty" parameter, as all callers deal with r/o
+ * mappings, and we don't want to set the dirty bit there (conflicts with
+ * CET-SS). However, as there are CPUs which may set the dirty bit on r/o
+ * PTEs, the logic below tolerates the bit becoming set "behind our backs".
+ */
+l1_pgentry_t page_walk_get_l1e(pagetable_t root, unsigned long addr)
+{
+ l2_pgentry_t l2e = page_walk_get_l2e(root, addr);
+ mfn_t mfn = l2e_get_mfn(l2e);
+ struct page_info *pg;
+ l1_pgentry_t l1e = l1e_empty();
+
+ if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) ||
+ (l2e_get_flags(l2e) & _PAGE_PSE) )
+ return l1e_empty();
+
+ pg = mfn_to_page(mfn);
+ if ( !page_lock(pg) )
+ return l1e_empty();
+
+ if ( (pg->u.inuse.type_info & PGT_type_mask) == PGT_l1_page_table )
+ {
+ l1_pgentry_t *l1t = map_domain_page(mfn);
+
+ l1e = l1t[l1_table_offset(addr)];
+
+ if ( (l1e_get_flags(l1e) & (_PAGE_ACCESSED | _PAGE_PRESENT)) ==
+ _PAGE_PRESENT )
+ {
+ l1_pgentry_t ol1e = l1e;
+
+ l1e_add_flags(l1e, _PAGE_ACCESSED);
+ /*
+ * Best effort only; with the lock held the page shouldn't
+ * change anyway, except for the dirty bit to perhaps become set.
+ */
+ while ( cmpxchg(&l1e_get_intpte(l1t[l1_table_offset(addr)]),
+ l1e_get_intpte(ol1e), l1e_get_intpte(l1e)) !=
+ l1e_get_intpte(ol1e) &&
+ !(l1e_get_flags(l1e) & _PAGE_DIRTY) )
+ {
+ l1e_add_flags(ol1e, _PAGE_DIRTY);
+ l1e_add_flags(l1e, _PAGE_DIRTY);
+ }
+ }
+
+ unmap_domain_page(l1t);
+ }
+
+ page_unlock(pg);
+
+ return l1e;
+}
+
void *do_page_walk(struct vcpu *v, unsigned long addr)
{
l3_pgentry_t l3e;
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index afafe87fe7..423313ae3a 100644
--- xen/include/asm-x86/mm.h.orig
+++ xen/include/asm-x86/mm.h
@@ -588,6 +588,7 @@ int vcpu_destroy_pagetables(struct vcpu *);

void *do_page_walk(struct vcpu *v, unsigned long addr);
l2_pgentry_t page_walk_get_l2e(pagetable_t root, unsigned long addr);
+l1_pgentry_t page_walk_get_l1e(pagetable_t root, unsigned long addr);

int __sync_local_execstate(void);

From: Jan Beulich <jbeulich@suse.com>
Subject: x86/mm: avoid using top level linear page tables in
{,un}map_domain_page()

Move the page table recursion two levels down. This entails avoiding
to free the recursive mapping prematurely in free_perdomain_mappings().

This is part of XSA-286.

Reported-by: Jann Horn <jannh@google.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

diff --git a/xen/arch/x86/domain_page.c b/xen/arch/x86/domain_page.c
index 0c24530ed9..d89fa27f8e 100644
--- xen/arch/x86/domain_page.c.orig
+++ xen/arch/x86/domain_page.c
@@ -65,7 +65,8 @@ void __init mapcache_override_current(struct vcpu *v)
#define mapcache_l2_entry(e) ((e) >> PAGETABLE_ORDER)
#define MAPCACHE_L2_ENTRIES (mapcache_l2_entry(MAPCACHE_ENTRIES - 1) + 1)
#define MAPCACHE_L1ENT(idx) \
- __linear_l1_table[l1_linear_offset(MAPCACHE_VIRT_START + pfn_to_paddr(idx))]
+ ((l1_pgentry_t *)(MAPCACHE_VIRT_START | \
+ ((L2_PAGETABLE_ENTRIES - 1) << L2_PAGETABLE_SHIFT)))[idx]

void *map_domain_page(mfn_t mfn)
{
@@ -235,6 +236,7 @@ int mapcache_domain_init(struct domain *d)
{
struct mapcache_domain *dcache = &d->arch.pv_domain.mapcache;
unsigned int bitmap_pages;
+ int rc;

ASSERT(is_pv_domain(d));

@@ -243,8 +245,10 @@ int mapcache_domain_init(struct domain *d)
return 0;
#endif

+ BUILD_BUG_ON(MAPCACHE_VIRT_START & ((1 << L3_PAGETABLE_SHIFT) - 1));
BUILD_BUG_ON(MAPCACHE_VIRT_END + PAGE_SIZE * (3 +
- 2 * PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long))) >
+ 2 * PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long))) +
+ (1U << L2_PAGETABLE_SHIFT) >
MAPCACHE_VIRT_START + (PERDOMAIN_SLOT_MBYTES << 20));
bitmap_pages = PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long));
dcache->inuse = (void *)MAPCACHE_VIRT_END + PAGE_SIZE;
@@ -253,9 +257,25 @@ int mapcache_domain_init(struct domain *d)

spin_lock_init(&dcache->lock);

- return create_perdomain_mapping(d, (unsigned long)dcache->inuse,
- 2 * bitmap_pages + 1,
- NIL(l1_pgentry_t *), NULL);
+ rc = create_perdomain_mapping(d, (unsigned long)dcache->inuse,
+ 2 * bitmap_pages + 1,
+ NIL(l1_pgentry_t *), NULL);
+ if ( !rc )
+ {
+ /*
+ * Install mapping of our L2 table into its own last slot, for easy
+ * access to the L1 entries via MAPCACHE_L1ENT().
+ */
+ l3_pgentry_t *l3t = __map_domain_page(d->arch.perdomain_l3_pg);
+ l3_pgentry_t l3e = l3t[l3_table_offset(MAPCACHE_VIRT_END)];
+ l2_pgentry_t *l2t = map_l2t_from_l3e(l3e);
+
+ l2e_get_intpte(l2t[L2_PAGETABLE_ENTRIES - 1]) = l3e_get_intpte(l3e);
+ unmap_domain_page(l2t);
+ unmap_domain_page(l3t);
+ }
+
+ return rc;
}

int mapcache_vcpu_init(struct vcpu *v)
@@ -346,7 +366,7 @@ mfn_t domain_page_map_to_mfn(const void *ptr)
else
{
ASSERT(va >= MAPCACHE_VIRT_START && va < MAPCACHE_VIRT_END);
- pl1e = &__linear_l1_table[l1_linear_offset(va)];
+ pl1e = &MAPCACHE_L1ENT(PFN_DOWN(va - MAPCACHE_VIRT_START));
}

return l1e_get_mfn(*pl1e);
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 626768a950..8f975a747d 100644
--- xen/arch/x86/mm.c.orig
+++ xen/arch/x86/mm.c
@@ -6038,6 +6038,10 @@ void free_perdomain_mappings(struct domain *d)
{
struct page_info *l1pg = l2e_get_page(l2tab[j]);

+ /* mapcache_domain_init() installs a recursive entry. */
+ if ( l1pg == l2pg )
+ continue;
+
if ( l2e_get_flags(l2tab[j]) & _PAGE_AVAIL0 )
{
l1_pgentry_t *l1tab = __map_domain_page(l1pg);
From: Jan Beulich <jbeulich@suse.com>
Subject: x86/mm: restrict use of linear page tables to shadow mode code

Other code does not require them to be set up anymore, so restrict when
to populate the respective L4 slot and reduce visibility of the
accessors.

While with the removal of all uses the vulnerability is actually fixed,
removing the creation of the linear mapping adds an extra layer of
protection. Similarly reducing visibility of the accessors mostly
eliminates the risk of undue re-introduction of uses of the linear
mappings.

This is (not strictly) part of XSA-286.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: George Dunlap <george.dunlap@citrix.com>
Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 8f975a747d..10175764e8 100644
--- xen/arch/x86/mm.c.orig
+++ xen/arch/x86/mm.c
@@ -1755,9 +1755,10 @@ void init_xen_l4_slots(l4_pgentry_t *l4t, mfn_t l4mfn,
l4t[l4_table_offset(PCI_MCFG_VIRT_START)] =
idle_pg_table[l4_table_offset(PCI_MCFG_VIRT_START)];

- /* Slot 258: Self linear mappings. */
+ /* Slot 258: Self linear mappings (shadow pt only). */
ASSERT(!mfn_eq(l4mfn, INVALID_MFN));
l4t[l4_table_offset(LINEAR_PT_VIRT_START)] =
+ !shadow_mode_external(d) ? l4e_empty() :
l4e_from_mfn(l4mfn, __PAGE_HYPERVISOR_RW);

/* Slot 259: Shadow linear mappings (if applicable) .*/
diff --git a/xen/arch/x86/mm/shadow/private.h b/xen/arch/x86/mm/shadow/private.h
index c7fa18925b..1933a6a2a2 100644
--- xen/arch/x86/mm/shadow/private.h.orig
+++ xen/arch/x86/mm/shadow/private.h
@@ -137,6 +137,15 @@ enum {
# define GUEST_PTE_SIZE 4
#endif

+/* Where to find each level of the linear mapping */
+#define __linear_l1_table ((l1_pgentry_t *)(LINEAR_PT_VIRT_START))
+#define __linear_l2_table \
+ ((l2_pgentry_t *)(__linear_l1_table + l1_linear_offset(LINEAR_PT_VIRT_START)))
+#define __linear_l3_table \
+ ((l3_pgentry_t *)(__linear_l2_table + l2_linear_offset(LINEAR_PT_VIRT_START)))
+#define __linear_l4_table \
+ ((l4_pgentry_t *)(__linear_l3_table + l3_linear_offset(LINEAR_PT_VIRT_START)))
+
/******************************************************************************
* Auditing routines
*/
diff --git a/xen/arch/x86/x86_64/mm.c b/xen/arch/x86/x86_64/mm.c
index cca7ea6e9d..d7551e594a 100644
--- xen/arch/x86/x86_64/mm.c.orig
+++ xen/arch/x86/x86_64/mm.c
@@ -833,9 +833,6 @@ void __init paging_init(void)

machine_to_phys_mapping_valid = 1;

- /* Set up linear page table mapping. */
- l4e_write(&idle_pg_table[l4_table_offset(LINEAR_PT_VIRT_START)],
- l4e_from_paddr(__pa(idle_pg_table), __PAGE_HYPERVISOR_RW));
return;

nomem:
diff --git a/xen/include/asm-x86/config.h b/xen/include/asm-x86/config.h
index 9ef9d03ca7..4670ab99f6 100644
--- xen/include/asm-x86/config.h.orig
+++ xen/include/asm-x86/config.h
@@ -193,7 +193,7 @@ extern unsigned char boot_edid_info[128];
*/
#define PCI_MCFG_VIRT_START (PML4_ADDR(257))
#define PCI_MCFG_VIRT_END (PCI_MCFG_VIRT_START + PML4_ENTRY_BYTES)
-/* Slot 258: linear page table (guest table). */
+/* Slot 258: linear page table (monitor table, HVM only). */
#define LINEAR_PT_VIRT_START (PML4_ADDR(258))
#define LINEAR_PT_VIRT_END (LINEAR_PT_VIRT_START + PML4_ENTRY_BYTES)
/* Slot 259: linear page table (shadow table). */
diff --git a/xen/include/asm-x86/page.h b/xen/include/asm-x86/page.h
index c1e92937c0..e72c277b9f 100644
--- xen/include/asm-x86/page.h.orig
+++ xen/include/asm-x86/page.h
@@ -274,19 +274,6 @@ void copy_page_sse2(void *, const void *);
#define vmap_to_mfn(va) _mfn(l1e_get_pfn(*virt_to_xen_l1e((unsigned long)(va))))
#define vmap_to_page(va) mfn_to_page(vmap_to_mfn(va))

-#endif /* !defined(__ASSEMBLY__) */
-
-/* Where to find each level of the linear mapping */
-#define __linear_l1_table ((l1_pgentry_t *)(LINEAR_PT_VIRT_START))
-#define __linear_l2_table \
- ((l2_pgentry_t *)(__linear_l1_table + l1_linear_offset(LINEAR_PT_VIRT_START)))
-#define __linear_l3_table \
- ((l3_pgentry_t *)(__linear_l2_table + l2_linear_offset(LINEAR_PT_VIRT_START)))
-#define __linear_l4_table \
- ((l4_pgentry_t *)(__linear_l3_table + l3_linear_offset(LINEAR_PT_VIRT_START)))
-
-
-#ifndef __ASSEMBLY__
extern root_pgentry_t idle_pg_table[ROOT_PAGETABLE_ENTRIES];
extern l2_pgentry_t *compat_idle_pg_table_l2;
extern unsigned int m2p_compat_vstart;
From 1d021db3c8712d25e25f078833baa160c90f260f Mon Sep 17 00:00:00 2001
From: Andrew Cooper <andrew.cooper3@citrix.com>
Date: Thu, 22 Oct 2020 11:28:58 +0100
Subject: [PATCH 1/2] x86/pv: Drop FLUSH_TLB_GLOBAL in do_mmu_update() for XPTI

c/s 9d1d31ad9498 "x86: slightly reduce Meltdown band-aid overhead" removed the
use of Global TLB flushes on the Xen entry path, but added a FLUSH_TLB_GLOBAL
to the L4 path in do_mmu_update().

However, this was unnecessary.

It is the guest's responsibility to perform appropriate TLB flushing if the L4
modification altered an established mapping in a flush-relevant way. In this
case, an MMUEXT_OP hypercall will follow. The case which Xen needs to cover
is when new mappings are created, and the resync on the exit-to-guest path
covers this correctly.

There is a corner case with multiple vCPUs in hypercalls at the same time,
which 9d1d31ad9498 changed, and this patch changes back to its original XPTI
behaviour.

Architecturally, established TLB entries can continue to be used until the
broadcast flush has completed. Therefore, even with concurrent hypercalls,
the guest cannot depend on older mappings not being used until an MMUEXT_OP
hypercall completes. Xen's implementation of guest-initiated flushes will
take correct effect on top of an in-progress hypercall, picking up new mapping
setting before the other vCPU's MMUEXT_OP completes.

Note: The correctness of this change is not impacted by whether XPTI uses
global mappings or not. Correctness there depends on the behaviour of Xen on
the entry/exit paths when switching to/from the XPTI "shadow" pagetables.

This is (not really) XSA-286 (but necessary to simplify the logic).

Fixes: 9d1d31ad9498 ("x86: slightly reduce Meltdown band-aid overhead")
Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
(cherry picked from commit 055e1c3a3d95b1e753148369fbc4ba48782dd602)
---
xen/arch/x86/mm.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 5ca5c8c9a2..129da1e648 100644
--- xen/arch/x86/mm.c.orig
+++ xen/arch/x86/mm.c
@@ -4279,7 +4279,7 @@ long do_mmu_update(

cpumask_andnot(mask, pt_owner->dirty_cpumask, cpumask_of(cpu));
if ( !cpumask_empty(mask) )
- flush_mask(mask, FLUSH_TLB_GLOBAL | FLUSH_ROOT_PGTBL);
+ flush_mask(mask, FLUSH_ROOT_PGTBL);
}

perfc_add(num_page_updates, i);
--
2.20.1

From e274c8bdc12eb596e55233040e8b49da27150f31 Mon Sep 17 00:00:00 2001
From: Andrew Cooper <andrew.cooper3@citrix.com>
Date: Mon, 19 Oct 2020 15:51:22 +0100
Subject: [PATCH 2/2] x86/pv: Flush TLB in response to paging structure changes

With MMU_UPDATE, a PV guest can make changes to higher level pagetables. This
is safe from Xen's point of view (as the update only affects guest mappings),
and the guest is required to flush (if necessary) after making updates.

However, Xen's use of linear pagetables (UPDATE_VA_MAPPING, GNTTABOP_map,
writeable pagetables, etc.) is an implementation detail outside of the
API/ABI.

Changes in the paging structure require invalidations in the linear pagetable
range for subsequent accesses into the linear pagetables to access non-stale
mappings. Xen must provide suitable flushing to prevent intermixed guest
actions from accidentally accessing/modifying the wrong pagetable.

For all L2 and higher modifications, flush the TLB. PV guests cannot create
L2 or higher entries with the Global bit set, so no mappings established in
the linear range can be global. (This could in principle be an order 39 flush
starting at LINEAR_PT_VIRT_START, but no such mechanism exists in practice.)

Express the necessary flushes as a set of booleans which accumulate across the
operation. Comment the flushing logic extensively.

This is XSA-286.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
(cherry picked from commit 16a20963b3209788f2c0d3a3eebb7d92f03f5883)
---
xen/arch/x86/mm.c | 69 ++++++++++++++++++++++++++++++++++++++++-------
1 file changed, 59 insertions(+), 10 deletions(-)

diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 129da1e648..3528cf6b85 100644
--- xen/arch/x86/mm.c.orig
+++ xen/arch/x86/mm.c
@@ -3983,7 +3983,8 @@ long do_mmu_update(
struct vcpu *curr = current, *v = curr;
struct domain *d = v->domain, *pt_owner = d, *pg_owner;
mfn_t map_mfn = INVALID_MFN;
- bool sync_guest = false;
+ bool flush_linear_pt = false, flush_root_pt_local = false,
+ flush_root_pt_others = false;
uint32_t xsm_needed = 0;
uint32_t xsm_checked = 0;
int rc = put_old_guest_table(curr);
@@ -4133,6 +4134,8 @@ long do_mmu_update(
break;
rc = mod_l2_entry(va, l2e_from_intpte(req.val), mfn,
cmd == MMU_PT_UPDATE_PRESERVE_AD, v);
+ if ( !rc )
+ flush_linear_pt = true;
break;

case PGT_l3_page_table:
@@ -4140,6 +4143,8 @@ long do_mmu_update(
break;
rc = mod_l3_entry(va, l3e_from_intpte(req.val), mfn,
cmd == MMU_PT_UPDATE_PRESERVE_AD, v);
+ if ( !rc )
+ flush_linear_pt = true;
break;

case PGT_l4_page_table:
@@ -4147,6 +4152,8 @@ long do_mmu_update(
break;
rc = mod_l4_entry(va, l4e_from_intpte(req.val), mfn,
cmd == MMU_PT_UPDATE_PRESERVE_AD, v);
+ if ( !rc )
+ flush_linear_pt = true;
if ( !rc && pt_owner->arch.pv_domain.xpti )
{
bool local_in_use = false;

if ( pagetable_get_pfn(curr->arch.guest_table) == mfn )
{
local_in_use = true;
- get_cpu_info()->root_pgt_changed = true;
+ flush_root_pt_local = true;
}

/*
@@ -4166,7 +4173,7 @@ long do_mmu_update(
(1 + !!(page->u.inuse.type_info & PGT_pinned) +
(pagetable_get_pfn(curr->arch.guest_table_user) ==
mfn) + local_in_use) )
- sync_guest = true;
+ flush_root_pt_others = true;
}
break;

@@ -4268,19 +4275,61 @@ long do_mmu_update(
if ( va )
unmap_domain_page(va);

- if ( sync_guest )
+ /*
+ * Perform required TLB maintenance.
+ *
+ * This logic currently depend on flush_linear_pt being a superset of the
+ * flush_root_pt_* conditions.
+ *
+ * pt_owner may not be current->domain. This may occur during
+ * construction of 32bit PV guests, or debugging of PV guests. The
+ * behaviour cannot be correct with domain unpaused. We therefore expect
+ * pt_owner->dirty_cpumask to be empty, but it is a waste of effort to
+ * explicitly check for, and exclude, this corner case.
+ *
+ * flush_linear_pt requires a FLUSH_TLB to all dirty CPUs. The flush must
+ * be performed now to maintain correct behaviour across a multicall.
+ * i.e. we cannot relax FLUSH_TLB to FLUSH_ROOT_PGTBL, given that the
+ * former is a side effect of the latter, because the resync (which is in
+ * the return-to-guest path) happens too late.
+ *
+ * flush_root_pt_* requires FLUSH_ROOT_PGTBL on either the local CPU
+ * (implies pt_owner == current->domain and current->processor set in
+ * pt_owner->dirty_cpumask), and/or all *other* dirty CPUs as there are
+ * references we can't account for locally.
+ */
+ if ( flush_linear_pt /* || flush_root_pt_local || flush_root_pt_others */ )
{
+ unsigned int cpu = smp_processor_id();
+ cpumask_t *mask = pt_owner->dirty_cpumask;
+
/*
- * Force other vCPU-s of the affected guest to pick up L4 entry
- * changes (if any).
+ * Always handle local flushing separately (if applicable), to
+ * separate the flush invocations appropriately for scope of the two
+ * flush_root_pt_* variables.
*/
- unsigned int cpu = smp_processor_id();
- cpumask_t *mask = per_cpu(scratch_cpumask, cpu);
+ if ( likely(cpumask_test_cpu(cpu, mask)) )
+ {
+ mask = per_cpu(scratch_cpumask, cpu);

- cpumask_andnot(mask, pt_owner->dirty_cpumask, cpumask_of(cpu));
+ cpumask_copy(mask, pt_owner->dirty_cpumask);
+ __cpumask_clear_cpu(cpu, mask);
+
+ flush_local(FLUSH_TLB |
+ (flush_root_pt_local ? FLUSH_ROOT_PGTBL : 0));
+ }
+ else
+ /* Sanity check. flush_root_pt_local implies local cpu is dirty. */
+ ASSERT(!flush_root_pt_local);
+
+ /* Flush the remote dirty CPUs. Does not include the local CPU. */
if ( !cpumask_empty(mask) )
- flush_mask(mask, FLUSH_ROOT_PGTBL);
+ flush_mask(mask, FLUSH_TLB |
+ (flush_root_pt_others ? FLUSH_ROOT_PGTBL : 0));
}
+ else
+ /* Sanity check. flush_root_pt_* implies flush_linear_pt. */
+ ASSERT(!flush_root_pt_local && !flush_root_pt_others);

perfc_add(num_page_updates, i);

--
2.20.1